5 Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
7 Automatically created from "patches.kernel.org/patch-2.6.25" by xen-port-patches.py
9 --- sle11-2009-10-16.orig/arch/x86/Kconfig 2009-02-16 16:18:36.000000000 +0100
10 +++ sle11-2009-10-16/arch/x86/Kconfig 2009-03-16 16:33:40.000000000 +0100
11 @@ -27,7 +27,7 @@ config X86
12 select HAVE_KRETPROBES
13 select HAVE_DYNAMIC_FTRACE
15 - select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
16 + select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) && !XEN
17 select HAVE_ARCH_KGDB if !X86_VOYAGER
18 select HAVE_ARCH_TRACEHOOK
19 select HAVE_GENERIC_DMA_COHERENT if X86_32
20 @@ -211,14 +211,12 @@ config X86_TRAMPOLINE
37 @@ -728,9 +726,8 @@ config X86_VISWS_APIC
38 depends on X86_32 && X86_VISWS
40 config X86_XEN_GENAPIC
47 bool "Machine Check Exception"
48 @@ -1117,7 +1114,7 @@ config ARCH_DISCONTIGMEM_DEFAULT
50 config ARCH_SPARSEMEM_DEFAULT
53 + depends on X86_64 && !X86_64_XEN
55 config ARCH_SPARSEMEM_ENABLE
57 @@ -1747,10 +1744,10 @@ config PCI_MMCONFIG
58 depends on X86_64 && PCI && ACPI
60 config XEN_PCIDEV_FRONTEND
61 - bool "Xen PCI Frontend" if X86_64
63 + prompt "Xen PCI Frontend" if X86_64
64 depends on PCI && XEN && (PCI_GOXEN_FE || PCI_GOANY || X86_64)
68 The PCI device frontend driver allows the kernel to import arbitrary
69 PCI devices from a PCI backend to support PCI driver domains.
70 @@ -1758,7 +1755,6 @@ config XEN_PCIDEV_FRONTEND
71 config XEN_PCIDEV_FE_DEBUG
72 bool "Xen PCI Frontend Debugging"
73 depends on XEN_PCIDEV_FRONTEND
76 Enables some debug statements within the PCI Frontend.
78 --- sle11-2009-10-16.orig/arch/x86/Kconfig.debug 2009-02-02 09:40:56.000000000 +0100
79 +++ sle11-2009-10-16/arch/x86/Kconfig.debug 2009-03-16 16:33:40.000000000 +0100
80 @@ -279,6 +279,7 @@ config DEBUG_BOOT_PARAMS
81 bool "Debug boot parameters"
82 depends on DEBUG_KERNEL
86 This option will cause struct boot_params to be exported via debugfs.
88 --- sle11-2009-10-16.orig/arch/x86/ia32/ia32entry-xen.S 2009-02-16 16:18:36.000000000 +0100
89 +++ sle11-2009-10-16/arch/x86/ia32/ia32entry-xen.S 2009-03-16 16:33:40.000000000 +0100
91 #include <asm/ia32_unistd.h>
92 #include <asm/thread_info.h>
93 #include <asm/segment.h>
94 -#include <asm/vsyscall32.h>
95 #include <asm/irqflags.h>
96 #include <linux/linkage.h>
98 @@ -99,10 +98,11 @@ ENTRY(ia32_sysenter_target)
100 movl %ebp,%ebp /* zero extension */
102 + movl 48-THREAD_SIZE+threadinfo_sysenter_return(%rsp),%r10d
103 movl $__USER32_DS,40(%rsp)
105 movl $__USER32_CS,16(%rsp)
106 - movl $VSYSCALL32_SYSEXIT,8(%rsp)
111 @@ -582,8 +582,8 @@ ia32_sys_call_table:
112 .quad compat_sys_futex /* 240 */
113 .quad compat_sys_sched_setaffinity
114 .quad compat_sys_sched_getaffinity
115 - .quad sys32_set_thread_area
116 - .quad sys32_get_thread_area
117 + .quad sys_set_thread_area
118 + .quad sys_get_thread_area
119 .quad compat_sys_io_setup /* 245 */
121 .quad compat_sys_io_getevents
122 @@ -661,7 +661,9 @@ ia32_sys_call_table:
123 .quad sys_epoll_pwait
124 .quad compat_sys_utimensat /* 320 */
125 .quad compat_sys_signalfd
126 - .quad compat_sys_timerfd
127 + .quad sys_timerfd_create
129 .quad sys32_fallocate
130 + .quad compat_sys_timerfd_settime /* 325 */
131 + .quad compat_sys_timerfd_gettime
133 --- sle11-2009-10-16.orig/arch/x86/kernel/Makefile 2009-02-16 16:18:36.000000000 +0100
134 +++ sle11-2009-10-16/arch/x86/kernel/Makefile 2009-03-16 16:33:40.000000000 +0100
135 @@ -120,11 +120,10 @@ ifeq ($(CONFIG_X86_64),y)
137 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
139 + obj-$(CONFIG_XEN) += nmi_64.o
140 time_64-$(CONFIG_XEN) += time_32.o
141 pci-dma_64-$(CONFIG_XEN) += pci-dma_32.o
144 disabled-obj-$(CONFIG_XEN) := early-quirks.o hpet.o i8253.o i8259_$(BITS).o reboot.o \
145 smpboot_$(BITS).o tsc_$(BITS).o tsc_sync.o
146 -disabled-obj-$(CONFIG_XEN_UNPRIVILEGED_GUEST) += mpparse_64.o
147 -%/head_64.o %/head_64.s: asflags-$(CONFIG_XEN) :=
148 --- sle11-2009-10-16.orig/arch/x86/kernel/acpi/boot.c 2008-12-01 11:11:08.000000000 +0100
149 +++ sle11-2009-10-16/arch/x86/kernel/acpi/boot.c 2009-03-16 16:33:40.000000000 +0100
150 @@ -133,6 +133,9 @@ char *__init __acpi_map_table(unsigned l
152 if (phys+size <= (max_low_pfn_mapped << PAGE_SHIFT))
155 + if (phys + size <= (NR_FIX_ISAMAPS << PAGE_SHIFT))
156 + return isa_bus_to_virt(phys);
159 offset = phys & (PAGE_SIZE - 1);
160 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
161 +++ sle11-2009-10-16/arch/x86/kernel/acpi/sleep-xen.c 2009-03-16 16:33:40.000000000 +0100
164 + * sleep.c - x86-specific ACPI sleep support.
166 + * Copyright (C) 2001-2003 Patrick Mochel
167 + * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
170 +#include <linux/acpi.h>
171 +#include <linux/bootmem.h>
172 +#include <linux/dmi.h>
173 +#include <linux/cpumask.h>
175 +#include <asm/smp.h>
177 +#ifndef CONFIG_ACPI_PV_SLEEP
178 +/* address in low memory of the wakeup routine. */
179 +unsigned long acpi_wakeup_address = 0;
180 +unsigned long acpi_realmode_flags;
181 +extern char wakeup_start, wakeup_end;
183 +extern unsigned long acpi_copy_wakeup_routine(unsigned long);
187 + * acpi_save_state_mem - save kernel state
189 + * Create an identity mapped page table and copy the wakeup routine to
192 +int acpi_save_state_mem(void)
194 +#ifndef CONFIG_ACPI_PV_SLEEP
195 + if (!acpi_wakeup_address) {
196 + printk(KERN_ERR "Could not allocate memory during boot, S3 disabled\n");
199 + memcpy((void *)acpi_wakeup_address, &wakeup_start,
200 + &wakeup_end - &wakeup_start);
201 + acpi_copy_wakeup_routine(acpi_wakeup_address);
208 + * acpi_restore_state_mem - undo effects of acpi_save_state_mem
210 +void acpi_restore_state_mem(void)
216 + * acpi_reserve_bootmem - do _very_ early ACPI initialisation
218 + * We allocate two pages from the first 1MB of memory for the wakeup
219 + * routine for when we come back from a sleep state. The
220 + * runtime allocator allows specification of <16MB pages, but not
223 +void __init acpi_reserve_bootmem(void)
225 +#ifndef CONFIG_ACPI_PV_SLEEP
226 + if ((&wakeup_end - &wakeup_start) > PAGE_SIZE*2) {
228 + "ACPI: Wakeup code way too big, S3 disabled.\n");
232 + acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
233 + if (!acpi_wakeup_address)
234 + printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
239 +#ifndef CONFIG_ACPI_PV_SLEEP
240 +static int __init acpi_sleep_setup(char *str)
242 + while ((str != NULL) && (*str != '\0')) {
243 + if (strncmp(str, "s3_bios", 7) == 0)
244 + acpi_realmode_flags |= 1;
245 + if (strncmp(str, "s3_mode", 7) == 0)
246 + acpi_realmode_flags |= 2;
247 + if (strncmp(str, "s3_beep", 7) == 0)
248 + acpi_realmode_flags |= 4;
249 + str = strchr(str, ',');
251 + str += strspn(str, ", \t");
256 +__setup("acpi_sleep=", acpi_sleep_setup);
257 +#endif /* CONFIG_ACPI_PV_SLEEP */
258 --- sle11-2009-10-16.orig/arch/x86/kernel/acpi/sleep_32-xen.c 2009-02-16 16:18:36.000000000 +0100
259 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
262 - * sleep.c - x86-specific ACPI sleep support.
264 - * Copyright (C) 2001-2003 Patrick Mochel
265 - * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
268 -#include <linux/acpi.h>
269 -#include <linux/bootmem.h>
270 -#include <linux/dmi.h>
271 -#include <linux/cpumask.h>
273 -#include <asm/smp.h>
275 -#ifndef CONFIG_ACPI_PV_SLEEP
276 -/* address in low memory of the wakeup routine. */
277 -unsigned long acpi_wakeup_address = 0;
278 -unsigned long acpi_realmode_flags;
279 -extern char wakeup_start, wakeup_end;
281 -extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
285 - * acpi_save_state_mem - save kernel state
287 - * Create an identity mapped page table and copy the wakeup routine to
290 -int acpi_save_state_mem(void)
292 -#ifndef CONFIG_ACPI_PV_SLEEP
293 - if (!acpi_wakeup_address)
295 - memcpy((void *)acpi_wakeup_address, &wakeup_start,
296 - &wakeup_end - &wakeup_start);
297 - acpi_copy_wakeup_routine(acpi_wakeup_address);
303 - * acpi_restore_state - undo effects of acpi_save_state_mem
305 -void acpi_restore_state_mem(void)
310 - * acpi_reserve_bootmem - do _very_ early ACPI initialisation
312 - * We allocate a page from the first 1MB of memory for the wakeup
313 - * routine for when we come back from a sleep state. The
314 - * runtime allocator allows specification of <16MB pages, but not
317 -void __init acpi_reserve_bootmem(void)
319 -#ifndef CONFIG_ACPI_PV_SLEEP
320 - if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) {
322 - "ACPI: Wakeup code way too big, S3 disabled.\n");
326 - acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
327 - if (!acpi_wakeup_address)
328 - printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
332 -#ifndef CONFIG_ACPI_PV_SLEEP
333 -static int __init acpi_sleep_setup(char *str)
335 - while ((str != NULL) && (*str != '\0')) {
336 - if (strncmp(str, "s3_bios", 7) == 0)
337 - acpi_realmode_flags |= 1;
338 - if (strncmp(str, "s3_mode", 7) == 0)
339 - acpi_realmode_flags |= 2;
340 - if (strncmp(str, "s3_beep", 7) == 0)
341 - acpi_realmode_flags |= 4;
342 - str = strchr(str, ',');
344 - str += strspn(str, ", \t");
349 -__setup("acpi_sleep=", acpi_sleep_setup);
351 -/* Ouch, we want to delete this. We already have better version in userspace, in
352 - s2ram from suspend.sf.net project */
353 -static __init int reset_videomode_after_s3(const struct dmi_system_id *d)
355 - acpi_realmode_flags |= 2;
359 -static __initdata struct dmi_system_id acpisleep_dmi_table[] = {
360 - { /* Reset video mode after returning from ACPI S3 sleep */
361 - .callback = reset_videomode_after_s3,
362 - .ident = "Toshiba Satellite 4030cdt",
364 - DMI_MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"),
370 -static int __init acpisleep_dmi_init(void)
372 - dmi_check_system(acpisleep_dmi_table);
376 -core_initcall(acpisleep_dmi_init);
377 -#endif /* CONFIG_ACPI_PV_SLEEP */
378 --- sle11-2009-10-16.orig/arch/x86/kernel/acpi/sleep_64-xen.c 2009-02-16 16:18:36.000000000 +0100
379 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
382 - * acpi.c - Architecture-Specific Low-Level ACPI Support
384 - * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
385 - * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
386 - * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
387 - * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port)
388 - * Copyright (C) 2003 Pavel Machek, SuSE Labs
390 - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
392 - * This program is free software; you can redistribute it and/or modify
393 - * it under the terms of the GNU General Public License as published by
394 - * the Free Software Foundation; either version 2 of the License, or
395 - * (at your option) any later version.
397 - * This program is distributed in the hope that it will be useful,
398 - * but WITHOUT ANY WARRANTY; without even the implied warranty of
399 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
400 - * GNU General Public License for more details.
402 - * You should have received a copy of the GNU General Public License
403 - * along with this program; if not, write to the Free Software
404 - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
406 - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
409 -#include <linux/kernel.h>
410 -#include <linux/init.h>
411 -#include <linux/types.h>
412 -#include <linux/stddef.h>
413 -#include <linux/slab.h>
414 -#include <linux/pci.h>
415 -#include <linux/bootmem.h>
416 -#include <linux/acpi.h>
417 -#include <linux/cpumask.h>
419 -#include <asm/mpspec.h>
421 -#include <asm/apic.h>
422 -#include <asm/apicdef.h>
423 -#include <asm/page.h>
424 -#include <asm/pgtable.h>
425 -#include <asm/pgalloc.h>
426 -#include <asm/io_apic.h>
427 -#include <asm/proto.h>
428 -#include <asm/tlbflush.h>
430 -/* --------------------------------------------------------------------------
431 - Low-Level Sleep Support
432 - -------------------------------------------------------------------------- */
434 -#ifndef CONFIG_ACPI_PV_SLEEP
435 -/* address in low memory of the wakeup routine. */
436 -unsigned long acpi_wakeup_address = 0;
437 -unsigned long acpi_realmode_flags;
438 -extern char wakeup_start, wakeup_end;
440 -extern unsigned long acpi_copy_wakeup_routine(unsigned long);
444 - * acpi_save_state_mem - save kernel state
446 - * Create an identity mapped page table and copy the wakeup routine to
449 -int acpi_save_state_mem(void)
451 -#ifndef CONFIG_ACPI_PV_SLEEP
452 - memcpy((void *)acpi_wakeup_address, &wakeup_start,
453 - &wakeup_end - &wakeup_start);
454 - acpi_copy_wakeup_routine(acpi_wakeup_address);
460 - * acpi_restore_state
462 -void acpi_restore_state_mem(void)
467 - * acpi_reserve_bootmem - do _very_ early ACPI initialisation
469 - * We allocate a page in low memory for the wakeup
470 - * routine for when we come back from a sleep state. The
471 - * runtime allocator allows specification of <16M pages, but not
474 -void __init acpi_reserve_bootmem(void)
476 -#ifndef CONFIG_ACPI_PV_SLEEP
477 - acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
478 - if ((&wakeup_end - &wakeup_start) > (PAGE_SIZE*2))
480 - "ACPI: Wakeup code way too big, will crash on attempt"
485 -#ifndef CONFIG_ACPI_PV_SLEEP
486 -static int __init acpi_sleep_setup(char *str)
488 - while ((str != NULL) && (*str != '\0')) {
489 - if (strncmp(str, "s3_bios", 7) == 0)
490 - acpi_realmode_flags |= 1;
491 - if (strncmp(str, "s3_mode", 7) == 0)
492 - acpi_realmode_flags |= 2;
493 - if (strncmp(str, "s3_beep", 7) == 0)
494 - acpi_realmode_flags |= 4;
495 - str = strchr(str, ',');
497 - str += strspn(str, ", \t");
503 -__setup("acpi_sleep=", acpi_sleep_setup);
504 -#endif /* CONFIG_ACPI_PV_SLEEP */
506 --- sle11-2009-10-16.orig/arch/x86/kernel/apic_32-xen.c 2008-12-15 11:27:22.000000000 +0100
507 +++ sle11-2009-10-16/arch/x86/kernel/apic_32-xen.c 2009-03-16 16:33:40.000000000 +0100
508 @@ -86,7 +86,7 @@ int setup_profiling_timer(unsigned int m
509 * This initializes the IO-APIC and APIC hardware if this is
512 -int __init APIC_init_uniprocessor (void)
513 +int __init APIC_init_uniprocessor(void)
515 #ifdef CONFIG_X86_IO_APIC
516 if (smp_found_config)
517 --- sle11-2009-10-16.orig/arch/x86/kernel/apic_64-xen.c 2009-02-16 16:18:36.000000000 +0100
518 +++ sle11-2009-10-16/arch/x86/kernel/apic_64-xen.c 2009-03-16 16:33:40.000000000 +0100
520 #include <asm/hpet.h>
521 #include <asm/idle.h>
527 - * 'what should we do if we get a hw irq event on an illegal vector'.
528 - * each architecture has to answer this themselves.
529 + * Debug level, exported for io_apic.c
531 -void ack_bad_irq(unsigned int irq)
533 - printk("unexpected IRQ trap at irq %02x\n", irq);
535 - * Currently unexpected vectors happen only on SMP and APIC.
536 - * We _must_ ack these because every local APIC has only N
537 - * irq slots per priority level, and a 'hanging, unacked' IRQ
538 - * holds up an irq slot - in excessive cases (when multiple
539 - * unexpected vectors occur) that might lock up the APIC
541 - * But don't ack when the APIC is disabled. -AK
547 -int setup_profiling_timer(unsigned int multiplier)
553 -void smp_local_timer_interrupt(void)
555 + * The guts of the apic timer interrupt
557 +static void local_apic_timer_interrupt(void)
560 int cpu = smp_processor_id();
561 @@ -121,11 +104,34 @@ void smp_apic_timer_interrupt(struct pt_
565 - smp_local_timer_interrupt();
566 + local_apic_timer_interrupt();
568 set_irq_regs(old_regs);
571 +int setup_profiling_timer(unsigned int multiplier)
577 + * This initializes the IO-APIC and APIC hardware if this is
580 +int __init APIC_init_uniprocessor(void)
582 +#ifdef CONFIG_X86_IO_APIC
583 + if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
591 + * Local APIC interrupts
595 * This interrupt should _never_ happen with our APIC/SMP architecture
597 @@ -150,7 +156,6 @@ asmlinkage void smp_spurious_interrupt(v
599 * This interrupt should never happen with our APIC/SMP architecture
602 asmlinkage void smp_error_interrupt(void)
605 @@ -178,19 +183,3 @@ asmlinkage void smp_error_interrupt(void
606 smp_processor_id(), v , v1);
613 - * This initializes the IO-APIC and APIC hardware if this is
616 -int __init APIC_init_uniprocessor (void)
618 -#ifdef CONFIG_X86_IO_APIC
619 - if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
625 --- sle11-2009-10-16.orig/arch/x86/kernel/asm-offsets_32.c 2009-02-16 16:17:21.000000000 +0100
626 +++ sle11-2009-10-16/arch/x86/kernel/asm-offsets_32.c 2009-03-16 16:33:40.000000000 +0100
628 #include <xen/interface/xen.h>
631 +#ifdef CONFIG_LGUEST_GUEST
632 #include <linux/lguest.h>
633 #include "../../../drivers/lguest/lg.h"
636 /* workaround for a warning with -Wmissing-prototypes */
638 --- sle11-2009-10-16.orig/arch/x86/kernel/cpu/common-xen.c 2009-02-16 16:18:36.000000000 +0100
639 +++ sle11-2009-10-16/arch/x86/kernel/cpu/common-xen.c 2009-03-16 16:33:40.000000000 +0100
643 DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
644 - [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 },
645 - [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 },
646 - [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 },
647 - [GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 },
648 + [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
649 + [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
650 + [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
651 + [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
654 * Segments used for calling PnP BIOS have byte granularity.
655 * They code segments and data segments have fixed 64k limits,
656 * the transfer segment sizes are set at run time.
658 - [GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
659 - [GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */
660 - [GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */
661 - [GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */
662 - [GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */
664 + [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
666 + [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
668 + [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
670 + [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
672 + [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
674 * The APM segments have byte granularity and their bases
675 * are set at run time. All have 64k limits.
677 - [GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
679 + [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
681 - [GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 },
682 - [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */
683 + [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
685 + [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
687 - [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 },
688 + [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
690 - [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 },
691 + [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
693 EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
695 +__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
697 static int cachesize_override __cpuinitdata = -1;
698 -static int disable_x86_fxsr __cpuinitdata;
699 static int disable_x86_serial_nr __cpuinitdata = 1;
700 -static int disable_x86_sep __cpuinitdata;
702 struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
704 -extern int disable_pse;
706 static void __cpuinit default_init(struct cpuinfo_x86 * c)
708 /* Not much we can do here... */
709 @@ -214,16 +219,8 @@ static void __cpuinit get_cpu_vendor(str
711 static int __init x86_fxsr_setup(char * s)
713 - /* Tell all the other CPUs to not use it... */
714 - disable_x86_fxsr = 1;
717 - * ... and clear the bits early in the boot_cpu_data
718 - * so that the bootup process doesn't try to do this
721 - clear_bit(X86_FEATURE_FXSR, boot_cpu_data.x86_capability);
722 - clear_bit(X86_FEATURE_XMM, boot_cpu_data.x86_capability);
723 + setup_clear_cpu_cap(X86_FEATURE_FXSR);
724 + setup_clear_cpu_cap(X86_FEATURE_XMM);
727 __setup("nofxsr", x86_fxsr_setup);
728 @@ -231,7 +228,7 @@ __setup("nofxsr", x86_fxsr_setup);
730 static int __init x86_sep_setup(char * s)
732 - disable_x86_sep = 1;
733 + setup_clear_cpu_cap(X86_FEATURE_SEP);
736 __setup("nosep", x86_sep_setup);
737 @@ -268,10 +265,10 @@ static int __cpuinit have_cpuid_p(void)
738 void __init cpu_detect(struct cpuinfo_x86 *c)
740 /* Get vendor name */
741 - cpuid(0x00000000, &c->cpuid_level,
742 - (int *)&c->x86_vendor_id[0],
743 - (int *)&c->x86_vendor_id[8],
744 - (int *)&c->x86_vendor_id[4]);
745 + cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
746 + (unsigned int *)&c->x86_vendor_id[0],
747 + (unsigned int *)&c->x86_vendor_id[8],
748 + (unsigned int *)&c->x86_vendor_id[4]);
751 if (c->cpuid_level >= 0x00000001) {
752 @@ -284,9 +281,38 @@ void __init cpu_detect(struct cpuinfo_x8
754 c->x86_model += ((tfms >> 16) & 0xF) << 4;
755 c->x86_mask = tfms & 15;
756 - if (cap0 & (1<<19))
757 + if (cap0 & (1<<19)) {
758 c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
759 + c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
763 +static void __cpuinit early_get_cap(struct cpuinfo_x86 *c)
768 + memset(&c->x86_capability, 0, sizeof c->x86_capability);
769 + if (have_cpuid_p()) {
770 + /* Intel-defined flags: level 0x00000001 */
771 + if (c->cpuid_level >= 0x00000001) {
772 + u32 capability, excap;
773 + cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
774 + c->x86_capability[0] = capability;
775 + c->x86_capability[4] = excap;
778 + /* AMD-defined flags: level 0x80000001 */
779 + xlvl = cpuid_eax(0x80000000);
780 + if ((xlvl & 0xffff0000) == 0x80000000) {
781 + if (xlvl >= 0x80000001) {
782 + c->x86_capability[1] = cpuid_edx(0x80000001);
783 + c->x86_capability[6] = cpuid_ecx(0x80000001);
791 /* Do minimum CPU detection early.
792 @@ -300,6 +326,7 @@ static void __init early_cpu_detect(void
793 struct cpuinfo_x86 *c = &boot_cpu_data;
795 c->x86_cache_alignment = 32;
796 + c->x86_clflush_size = 32;
800 @@ -307,19 +334,30 @@ static void __init early_cpu_detect(void
803 get_cpu_vendor(c, 1);
805 + switch (c->x86_vendor) {
806 + case X86_VENDOR_AMD:
809 + case X86_VENDOR_INTEL:
810 + early_init_intel(c);
817 static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
823 if (have_cpuid_p()) {
824 /* Get vendor name */
825 - cpuid(0x00000000, &c->cpuid_level,
826 - (int *)&c->x86_vendor_id[0],
827 - (int *)&c->x86_vendor_id[8],
828 - (int *)&c->x86_vendor_id[4]);
829 + cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
830 + (unsigned int *)&c->x86_vendor_id[0],
831 + (unsigned int *)&c->x86_vendor_id[8],
832 + (unsigned int *)&c->x86_vendor_id[4]);
834 get_cpu_vendor(c, 0);
835 /* Initialize the standard set of capabilities */
836 @@ -364,8 +402,6 @@ static void __cpuinit generic_identify(s
837 init_scattered_cpuid_features(c);
840 - early_intel_workaround(c);
843 c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
845 @@ -399,7 +435,7 @@ __setup("serialnumber", x86_serial_nr_se
847 * This does the hard work of actually picking apart the CPU stuff...
849 -static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
850 +void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
854 @@ -425,20 +461,9 @@ static void __cpuinit identify_cpu(struc
858 - printk(KERN_DEBUG "CPU: After generic identify, caps:");
859 - for (i = 0; i < NCAPINTS; i++)
860 - printk(" %08lx", c->x86_capability[i]);
863 - if (this_cpu->c_identify) {
864 + if (this_cpu->c_identify)
865 this_cpu->c_identify(c);
867 - printk(KERN_DEBUG "CPU: After vendor identify, caps:");
868 - for (i = 0; i < NCAPINTS; i++)
869 - printk(" %08lx", c->x86_capability[i]);
874 * Vendor-specific initialization. In this section we
875 * canonicalize the feature flags, meaning if there are
876 @@ -460,23 +485,6 @@ static void __cpuinit identify_cpu(struc
877 * we do "generic changes."
880 - /* TSC disabled? */
882 - clear_bit(X86_FEATURE_TSC, c->x86_capability);
884 - /* FXSR disabled? */
885 - if (disable_x86_fxsr) {
886 - clear_bit(X86_FEATURE_FXSR, c->x86_capability);
887 - clear_bit(X86_FEATURE_XMM, c->x86_capability);
890 - /* SEP disabled? */
891 - if (disable_x86_sep)
892 - clear_bit(X86_FEATURE_SEP, c->x86_capability);
895 - clear_bit(X86_FEATURE_PSE, c->x86_capability);
897 /* If the model name is still unset, do table lookup. */
898 if ( !c->x86_model_id[0] ) {
900 @@ -489,13 +497,6 @@ static void __cpuinit identify_cpu(struc
901 c->x86, c->x86_model);
904 - /* Now the feature flags better reflect actual CPU features! */
906 - printk(KERN_DEBUG "CPU: After all inits, caps:");
907 - for (i = 0; i < NCAPINTS; i++)
908 - printk(" %08lx", c->x86_capability[i]);
912 * On SMP, boot_cpu_data holds the common feature set between
913 * all CPUs; so make sure that we indicate which features are
914 @@ -508,8 +509,14 @@ static void __cpuinit identify_cpu(struc
915 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
918 + /* Clear all flags overridden by options */
919 + for (i = 0; i < NCAPINTS; i++)
920 + c->x86_capability[i] &= ~cleared_cpu_caps[i];
922 /* Init Machine Check Exception if available. */
925 + select_idle_routine(c);
928 void __init identify_boot_cpu(void)
929 @@ -517,7 +524,6 @@ void __init identify_boot_cpu(void)
930 identify_cpu(&boot_cpu_data);
936 void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
937 @@ -574,6 +580,13 @@ void __cpuinit detect_ht(struct cpuinfo_
941 +static __init int setup_noclflush(char *arg)
943 + setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
946 +__setup("noclflush", setup_noclflush);
948 void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
951 @@ -597,6 +610,17 @@ void __cpuinit print_cpu_info(struct cpu
955 +static __init int setup_disablecpuid(char *arg)
958 + if (get_option(&arg, &bit) && bit < NCAPINTS*32)
959 + setup_clear_cpu_cap(bit);
964 +__setup("clearcpuid=", setup_disablecpuid);
966 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
969 @@ -606,16 +630,6 @@ cpumask_t cpu_initialized __cpuinitdata
970 * They will insert themselves into the cpu_devs structure.
971 * Then, when cpu_init() is called, we can just iterate over that array.
974 -extern int intel_cpu_init(void);
975 -extern int cyrix_init_cpu(void);
976 -extern int nsc_init_cpu(void);
977 -extern int amd_init_cpu(void);
978 -extern int centaur_init_cpu(void);
979 -extern int transmeta_init_cpu(void);
980 -extern int nexgen_init_cpu(void);
981 -extern int umc_init_cpu(void);
983 void __init early_cpu_init(void)
986 @@ -627,21 +641,13 @@ void __init early_cpu_init(void)
991 -#ifdef CONFIG_DEBUG_PAGEALLOC
992 - /* pse is not compatible with on-the-fly unmapping,
993 - * disable it even if the cpus claim to support it.
995 - clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
1000 /* Make sure %fs is initialized properly in idle threads */
1001 -struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
1002 +struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
1004 memset(regs, 0, sizeof(struct pt_regs));
1005 - regs->xfs = __KERNEL_PERCPU;
1006 + regs->fs = __KERNEL_PERCPU;
1010 @@ -649,7 +655,7 @@ struct pt_regs * __devinit idle_regs(str
1011 * it's on the real one. */
1012 void switch_to_new_gdt(void)
1014 - struct Xgt_desc_struct gdt_descr;
1015 + struct desc_ptr gdt_descr;
1016 unsigned long va, frames[16];
1019 @@ -692,12 +698,6 @@ void __cpuinit cpu_init(void)
1021 if (cpu_has_vme || cpu_has_de)
1022 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
1023 - if (tsc_disable && cpu_has_tsc) {
1024 - printk(KERN_NOTICE "Disabling TSC...\n");
1025 - /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
1026 - clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
1027 - set_in_cr4(X86_CR4_TSD);
1030 switch_to_new_gdt();
1032 @@ -710,7 +710,7 @@ void __cpuinit cpu_init(void)
1034 enter_lazy_tlb(&init_mm, curr);
1036 - load_esp0(t, thread);
1037 + load_sp0(t, thread);
1039 load_LDT(&init_mm.context);
1041 --- sle11-2009-10-16.orig/arch/x86/kernel/cpu/mtrr/main-xen.c 2009-02-16 16:17:21.000000000 +0100
1042 +++ sle11-2009-10-16/arch/x86/kernel/cpu/mtrr/main-xen.c 2009-03-16 16:33:40.000000000 +0100
1043 @@ -33,7 +33,7 @@ struct mtrr_ops generic_mtrr_ops = {
1045 struct mtrr_ops *mtrr_if = &generic_mtrr_ops;
1046 unsigned int num_var_ranges;
1047 -unsigned int *usage_table;
1048 +unsigned int mtrr_usage_table[MAX_VAR_RANGES];
1050 static void __init set_num_var_ranges(void)
1052 @@ -52,17 +52,12 @@ static void __init init_table(void)
1055 max = num_var_ranges;
1056 - if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
1058 - printk(KERN_ERR "mtrr: could not allocate\n");
1061 for (i = 0; i < max; i++)
1062 - usage_table[i] = 0;
1063 + mtrr_usage_table[i] = 0;
1066 int mtrr_add_page(unsigned long base, unsigned long size,
1067 - unsigned int type, char increment)
1068 + unsigned int type, bool increment)
1071 struct xen_platform_op op;
1072 @@ -81,7 +76,7 @@ int mtrr_add_page(unsigned long base, un
1076 - ++usage_table[op.u.add_memtype.reg];
1077 + ++mtrr_usage_table[op.u.add_memtype.reg];
1079 mutex_unlock(&mtrr_mutex);
1081 @@ -103,7 +98,7 @@ static int mtrr_check(unsigned long base
1084 mtrr_add(unsigned long base, unsigned long size, unsigned int type,
1088 if (mtrr_check(base, size))
1090 @@ -136,11 +131,11 @@ int mtrr_del_page(int reg, unsigned long
1094 - if (usage_table[reg] < 1) {
1095 + if (mtrr_usage_table[reg] < 1) {
1096 printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
1099 - if (--usage_table[reg] < 1) {
1100 + if (--mtrr_usage_table[reg] < 1) {
1101 op.cmd = XENPF_del_memtype;
1102 op.u.del_memtype.handle = 0;
1103 op.u.del_memtype.reg = reg;
1104 --- sle11-2009-10-16.orig/arch/x86/kernel/e820_32-xen.c 2009-02-16 16:18:36.000000000 +0100
1105 +++ sle11-2009-10-16/arch/x86/kernel/e820_32-xen.c 2009-03-16 16:33:40.000000000 +0100
1107 #include <linux/kexec.h>
1108 #include <linux/module.h>
1109 #include <linux/mm.h>
1110 -#include <linux/efi.h>
1111 #include <linux/pfn.h>
1112 #include <linux/uaccess.h>
1113 #include <linux/suspend.h>
1115 #include <asm/setup.h>
1116 #include <xen/interface/memory.h>
1119 -int efi_enabled = 0;
1120 -EXPORT_SYMBOL(efi_enabled);
1123 struct e820map e820;
1124 struct change_member {
1125 struct e820entry *pbios; /* pointer to original bios entry */
1126 @@ -38,26 +32,6 @@ unsigned long pci_mem_start = 0x10000000
1127 EXPORT_SYMBOL(pci_mem_start);
1129 extern int user_defined_memmap;
1130 -struct resource data_resource = {
1131 - .name = "Kernel data",
1134 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
1137 -struct resource code_resource = {
1138 - .name = "Kernel code",
1141 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
1144 -struct resource bss_resource = {
1145 - .name = "Kernel bss",
1148 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
1151 static struct resource system_rom_resource = {
1152 .name = "System ROM",
1153 @@ -112,60 +86,6 @@ static struct resource video_rom_resourc
1154 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
1157 -static struct resource video_ram_resource = {
1158 - .name = "Video RAM area",
1161 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
1164 -static struct resource standard_io_resources[] = { {
1168 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
1173 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
1178 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
1183 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
1185 - .name = "keyboard",
1188 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
1190 - .name = "dma page reg",
1193 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
1198 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
1203 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
1208 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
1211 #define ROMSIGNATURE 0xaa55
1213 static int __init romsignature(const unsigned char *rom)
1214 @@ -272,10 +192,9 @@ static struct e820map machine_e820;
1215 * Request address space for all standard RAM and ROM resources
1216 * and also for regions reported as reserved by the e820.
1219 -legacy_init_iomem_resources(struct resource *code_resource,
1220 - struct resource *data_resource,
1221 - struct resource *bss_resource)
1222 +void __init init_iomem_resources(struct resource *code_resource,
1223 + struct resource *data_resource,
1224 + struct resource *bss_resource)
1228 @@ -324,39 +243,6 @@ legacy_init_iomem_resources(struct resou
1233 - * Request address space for all standard resources
1235 - * This is called just before pcibios_init(), which is also a
1236 - * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
1238 -static int __init request_standard_resources(void)
1242 - /* Nothing to do if not running in dom0. */
1243 - if (!is_initial_xendomain())
1246 - printk("Setting up standard PCI resources\n");
1248 - efi_initialize_iomem_resources(&code_resource,
1249 - &data_resource, &bss_resource);
1251 - legacy_init_iomem_resources(&code_resource,
1252 - &data_resource, &bss_resource);
1254 - /* EFI systems may still have VGA */
1255 - request_resource(&iomem_resource, &video_ram_resource);
1257 - /* request I/O space for devices used on all i[345]86 PCs */
1258 - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
1259 - request_resource(&ioport_resource, &standard_io_resources[i]);
1263 -subsys_initcall(request_standard_resources);
1265 #if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
1267 * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
1268 @@ -393,19 +279,17 @@ void __init add_memory_region(unsigned l
1272 - if (!efi_enabled) {
1275 - if (x == E820MAX) {
1276 - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
1281 - e820.map[x].addr = start;
1282 - e820.map[x].size = size;
1283 - e820.map[x].type = type;
1285 + if (x == E820MAX) {
1286 + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
1290 + e820.map[x].addr = start;
1291 + e820.map[x].size = size;
1292 + e820.map[x].type = type;
1294 } /* add_memory_region */
1297 @@ -642,29 +526,6 @@ int __init copy_e820_map(struct e820entr
1301 - * Callback for efi_memory_walk.
1304 -efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
1306 - unsigned long *max_pfn = arg, pfn;
1308 - if (start < end) {
1309 - pfn = PFN_UP(end -1);
1310 - if (pfn > *max_pfn)
1317 -efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
1319 - memory_present(0, PFN_UP(start), PFN_DOWN(end));
1324 * Find the highest page frame number we have available
1326 void __init find_max_pfn(void)
1327 @@ -672,11 +533,6 @@ void __init find_max_pfn(void)
1331 - if (efi_enabled) {
1332 - efi_memmap_walk(efi_find_max_pfn, &max_pfn);
1333 - efi_memmap_walk(efi_memory_present_wrapper, NULL);
1337 for (i = 0; i < e820.nr_map; i++) {
1338 unsigned long start, end;
1339 @@ -694,34 +550,12 @@ void __init find_max_pfn(void)
1343 - * Free all available memory for boot time allocation. Used
1344 - * as a callback function by efi_memory_walk()
1348 -free_available_memory(unsigned long start, unsigned long end, void *arg)
1350 - /* check max_low_pfn */
1351 - if (start >= (max_low_pfn << PAGE_SHIFT))
1353 - if (end >= (max_low_pfn << PAGE_SHIFT))
1354 - end = max_low_pfn << PAGE_SHIFT;
1356 - free_bootmem(start, end - start);
1361 * Register fully available low RAM pages with the bootmem allocator.
1363 void __init register_bootmem_low_pages(unsigned long max_low_pfn)
1367 - if (efi_enabled) {
1368 - efi_memmap_walk(free_available_memory, NULL);
1371 for (i = 0; i < e820.nr_map; i++) {
1372 unsigned long curr_pfn, last_pfn, size;
1374 @@ -855,56 +689,12 @@ void __init print_memory_map(char *who)
1378 -static __init __always_inline void efi_limit_regions(unsigned long long size)
1380 - unsigned long long current_addr = 0;
1381 - efi_memory_desc_t *md, *next_md;
1387 - for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
1390 - current_addr = md->phys_addr +
1391 - PFN_PHYS(md->num_pages);
1392 - if (is_available_memory(md)) {
1393 - if (md->phys_addr >= size) continue;
1394 - memcpy(next_md, md, memmap.desc_size);
1395 - if (current_addr >= size) {
1396 - next_md->num_pages -=
1397 - PFN_UP(current_addr-size);
1399 - p1 += memmap.desc_size;
1402 - } else if ((md->attribute & EFI_MEMORY_RUNTIME) ==
1403 - EFI_MEMORY_RUNTIME) {
1404 - /* In order to make runtime services
1405 - * available we have to include runtime
1406 - * memory regions in memory map */
1407 - memcpy(next_md, md, memmap.desc_size);
1408 - p1 += memmap.desc_size;
1413 - memmap.nr_map = j;
1414 - memmap.map_end = memmap.map +
1415 - (memmap.nr_map * memmap.desc_size);
1418 void __init limit_regions(unsigned long long size)
1420 unsigned long long current_addr = 0;
1423 print_memory_map("limit_regions start");
1424 - if (efi_enabled) {
1425 - efi_limit_regions(size);
1428 for (i = 0; i < e820.nr_map; i++) {
1429 current_addr = e820.map[i].addr + e820.map[i].size;
1430 if (current_addr < size)
1431 @@ -1056,3 +846,44 @@ static int __init parse_memmap(char *arg
1434 early_param("memmap", parse_memmap);
1437 +void __init update_memory_range(u64 start, u64 size, unsigned old_type,
1438 + unsigned new_type)
1442 + BUG_ON(old_type == new_type);
1444 + for (i = 0; i < e820.nr_map; i++) {
1445 + struct e820entry *ei = &e820.map[i];
1446 + u64 final_start, final_end;
1447 + if (ei->type != old_type)
1449 + /* totally covered? */
1450 + if (ei->addr >= start && ei->size <= size) {
1451 + ei->type = new_type;
1454 + /* partially covered */
1455 + final_start = max(start, ei->addr);
1456 + final_end = min(start + size, ei->addr + ei->size);
1457 + if (final_start >= final_end)
1459 + add_memory_region(final_start, final_end - final_start,
1464 +void __init update_e820(void)
1468 + nr_map = e820.nr_map;
1469 + if (sanitize_e820_map(e820.map, &nr_map))
1471 + e820.nr_map = nr_map;
1472 + printk(KERN_INFO "modified physical RAM map:\n");
1473 + print_memory_map("modified");
1476 --- sle11-2009-10-16.orig/arch/x86/kernel/e820_64-xen.c 2009-02-16 16:18:36.000000000 +0100
1477 +++ sle11-2009-10-16/arch/x86/kernel/e820_64-xen.c 2009-03-16 16:33:40.000000000 +0100
1481 * Handle the memory map.
1482 * The functions here do the job until bootmem takes over.
1485 #include <asm/proto.h>
1486 #include <asm/setup.h>
1487 #include <asm/sections.h>
1488 +#include <asm/kdebug.h>
1489 #include <xen/interface/memory.h>
1491 struct e820map e820 __initdata;
1492 @@ -33,96 +34,103 @@ struct e820map e820 __initdata;
1493 struct e820map machine_e820;
1498 * PFN of last memory page.
1500 -unsigned long end_pfn;
1501 -EXPORT_SYMBOL(end_pfn);
1502 +unsigned long end_pfn;
1506 * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
1507 * The direct mapping extends to end_pfn_map, so that we can directly access
1508 * apertures, ACPI and other tables without having to play with fixmaps.
1510 -unsigned long end_pfn_map;
1512 +unsigned long end_pfn_map;
1516 * Last pfn which the user wants to use.
1518 static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
1520 -extern struct resource code_resource, data_resource, bss_resource;
1522 -/* Check for some hardcoded bad areas that early boot is not allowed to touch */
1523 -static inline int bad_addr(unsigned long *addrp, unsigned long size)
1525 - unsigned long addr = *addrp, last = addr + size;
1527 + * Early reserved memory areas.
1529 +#define MAX_EARLY_RES 20
1532 + unsigned long start, end;
1535 +static struct early_res early_res[MAX_EARLY_RES] __initdata = {
1537 - /* various gunk below that needed for SMP startup */
1538 - if (addr < 0x8000) {
1539 - *addrp = PAGE_ALIGN(0x8000);
1543 - /* direct mapping tables of the kernel */
1544 - if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) {
1545 - *addrp = PAGE_ALIGN(table_end << PAGE_SHIFT);
1550 -#ifdef CONFIG_BLK_DEV_INITRD
1551 - if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
1552 - unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
1553 - unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
1554 - unsigned long ramdisk_end = ramdisk_image+ramdisk_size;
1556 - if (last >= ramdisk_image && addr < ramdisk_end) {
1557 - *addrp = PAGE_ALIGN(ramdisk_end);
1561 + { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
1563 + { SMP_TRAMPOLINE_BASE, SMP_TRAMPOLINE_BASE + 2*PAGE_SIZE, "SMP_TRAMPOLINE" },
1566 - if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) {
1567 - *addrp = PAGE_ALIGN(__pa_symbol(&_end));
1574 - if (last >= ebda_addr && addr < ebda_addr + ebda_size) {
1575 - *addrp = PAGE_ALIGN(ebda_addr + ebda_size);
1577 +void __init reserve_early(unsigned long start, unsigned long end, char *name)
1580 + struct early_res *r;
1581 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
1582 + r = &early_res[i];
1583 + if (end > r->start && start < r->end)
1584 + panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n",
1585 + start, end - 1, name?name:"", r->start, r->end - 1, r->name);
1587 + if (i >= MAX_EARLY_RES)
1588 + panic("Too many early reservations");
1589 + r = &early_res[i];
1593 + strncpy(r->name, name, sizeof(r->name) - 1);
1597 - /* NUMA memory to node map */
1598 - if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) {
1599 - *addrp = nodemap_addr + nodemap_size;
1601 +void __init early_res_to_bootmem(void)
1604 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
1605 + struct early_res *r = &early_res[i];
1606 + printk(KERN_INFO "early res: %d [%lx-%lx] %s\n", i,
1607 + r->start, r->end - 1, r->name);
1608 + reserve_bootmem_generic(r->start, r->end - r->start);
1611 - /* XXX ramdisk image here? */
1613 - if (last < (table_end<<PAGE_SHIFT)) {
1614 - *addrp = table_end << PAGE_SHIFT;
1618 +/* Check for already reserved areas */
1619 +static inline int bad_addr(unsigned long *addrp, unsigned long size)
1622 + unsigned long addr = *addrp, last;
1625 + last = addr + size;
1626 + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
1627 + struct early_res *r = &early_res[i];
1628 + if (last >= r->start && addr < r->end) {
1629 + *addrp = addr = r->end;
1641 * This function checks if any part of the range <start,end> is mapped
1644 -int e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
1647 +e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
1652 - for (i = 0; i < e820.nr_map; i++) {
1653 - struct e820entry *ei = &e820.map[i];
1654 + for (i = 0; i < e820.nr_map; i++) {
1655 + struct e820entry *ei = &e820.map[i];
1657 if (!is_initial_xendomain())
1659 @@ -130,12 +138,12 @@ int e820_any_mapped(unsigned long start,
1660 const struct e820entry *ei = &machine_e820.map[i];
1663 - if (type && ei->type != type)
1664 + if (type && ei->type != type)
1666 if (ei->addr >= end || ei->addr + ei->size <= start)
1675 EXPORT_SYMBOL_GPL(e820_any_mapped);
1676 @@ -146,7 +154,8 @@ EXPORT_SYMBOL_GPL(e820_any_mapped);
1677 * Note: this function only works correct if the e820 table is sorted and
1678 * not-overlapping, which is the case
1680 -int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type)
1681 +int __init e820_all_mapped(unsigned long start, unsigned long end,
1686 @@ -171,65 +180,77 @@ int __init e820_all_mapped(unsigned long
1688 if (ei->addr <= start)
1689 start = ei->addr + ei->size;
1690 - /* if start is now at or beyond end, we're done, full coverage */
1692 + * if start is now at or beyond end, we're done, full
1696 - return 1; /* we're done */
1703 - * Find a free area in a specific range.
1705 -unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size)
1708 - for (i = 0; i < e820.nr_map; i++) {
1709 - struct e820entry *ei = &e820.map[i];
1710 - unsigned long addr = ei->addr, last;
1711 - if (ei->type != E820_RAM)
1715 + * Find a free area with specified alignment in a specific range.
1717 +unsigned long __init find_e820_area(unsigned long start, unsigned long end,
1718 + unsigned size, unsigned long align)
1721 + unsigned long mask = ~(align - 1);
1723 + for (i = 0; i < e820.nr_map; i++) {
1724 + struct e820entry *ei = &e820.map[i];
1725 + unsigned long addr = ei->addr, last;
1727 + if (ei->type != E820_RAM)
1731 - if (addr > ei->addr + ei->size)
1733 + if (addr > ei->addr + ei->size)
1735 while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
1737 - last = PAGE_ALIGN(addr) + size;
1738 + addr = (addr + align - 1) & mask;
1739 + last = addr + size;
1740 if (last > ei->addr + ei->size)
1755 * Find the highest page frame number we have available
1757 unsigned long __init e820_end_of_ram(void)
1759 - unsigned long end_pfn = 0;
1760 + unsigned long end_pfn;
1762 end_pfn = find_max_pfn_with_active_regions();
1764 - if (end_pfn > end_pfn_map)
1766 + if (end_pfn > end_pfn_map)
1767 end_pfn_map = end_pfn;
1768 if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
1769 end_pfn_map = MAXMEM>>PAGE_SHIFT;
1770 if (end_pfn > end_user_pfn)
1771 end_pfn = end_user_pfn;
1772 - if (end_pfn > end_pfn_map)
1773 - end_pfn = end_pfn_map;
1774 + if (end_pfn > end_pfn_map)
1775 + end_pfn = end_pfn_map;
1777 - printk("end_pfn_map = %lu\n", end_pfn_map);
1779 + printk(KERN_INFO "end_pfn_map = %lu\n", end_pfn_map);
1784 * Mark e820 reserved areas as busy for the resource manager.
1786 -void __init e820_reserve_resources(struct e820entry *e820, int nr_map)
1787 +void __init e820_reserve_resources(struct e820entry *e820, int nr_map,
1788 + struct resource *code_resource,
1789 + struct resource *data_resource,
1790 + struct resource *bss_resource)
1793 for (i = 0; i < nr_map; i++) {
1794 @@ -247,14 +268,14 @@ void __init e820_reserve_resources(struc
1795 request_resource(&iomem_resource, res);
1796 if (e820[i].type == E820_RAM) {
1798 - * We don't know which RAM region contains kernel data,
1799 - * so we try it repeatedly and let the resource manager
1801 + * We don't know which RAM region contains kernel data,
1802 + * so we try it repeatedly and let the resource manager
1806 - request_resource(res, &code_resource);
1807 - request_resource(res, &data_resource);
1808 - request_resource(res, &bss_resource);
1809 + request_resource(res, code_resource);
1810 + request_resource(res, data_resource);
1811 + request_resource(res, bss_resource);
1814 if (crashk_res.start != crashk_res.end)
1815 @@ -357,9 +378,9 @@ e820_register_active_regions(int nid, un
1816 add_active_range(nid, ei_startpfn, ei_endpfn);
1821 * Add a memory region to the kernel e820 map.
1824 void __init add_memory_region(unsigned long start, unsigned long size, int type)
1826 int x = e820.nr_map;
1827 @@ -384,9 +405,7 @@ unsigned long __init e820_hole_size(unsi
1829 unsigned long start_pfn = start >> PAGE_SHIFT;
1830 unsigned long end_pfn = end >> PAGE_SHIFT;
1831 - unsigned long ei_startpfn;
1832 - unsigned long ei_endpfn;
1833 - unsigned long ram = 0;
1834 + unsigned long ei_startpfn, ei_endpfn, ram = 0;
1837 for (i = 0; i < e820.nr_map; i++) {
1838 @@ -398,28 +417,31 @@ unsigned long __init e820_hole_size(unsi
1839 return end - start - (ram << PAGE_SHIFT);
1842 -void __init e820_print_map(char *who)
1843 +static void __init e820_print_map(char *who)
1847 for (i = 0; i < e820.nr_map; i++) {
1848 printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
1849 - (unsigned long long) e820.map[i].addr,
1850 - (unsigned long long) (e820.map[i].addr + e820.map[i].size));
1851 + (unsigned long long) e820.map[i].addr,
1852 + (unsigned long long)
1853 + (e820.map[i].addr + e820.map[i].size));
1854 switch (e820.map[i].type) {
1855 - case E820_RAM: printk("(usable)\n");
1858 + printk(KERN_CONT "(usable)\n");
1861 - printk("(reserved)\n");
1863 + printk(KERN_CONT "(reserved)\n");
1866 - printk("(ACPI data)\n");
1868 + printk(KERN_CONT "(ACPI data)\n");
1871 - printk("(ACPI NVS)\n");
1873 - default: printk("type %u\n", e820.map[i].type);
1875 + printk(KERN_CONT "(ACPI NVS)\n");
1878 + printk(KERN_CONT "type %u\n", e820.map[i].type);
1883 @@ -427,11 +449,11 @@ void __init e820_print_map(char *who)
1885 * Sanitize the BIOS e820 map.
1887 - * Some e820 responses include overlapping entries. The following
1888 + * Some e820 responses include overlapping entries. The following
1889 * replaces the original e820 map with a new one, removing overlaps.
1892 -static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
1893 +static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
1895 struct change_member {
1896 struct e820entry *pbios; /* pointer to original bios entry */
1897 @@ -451,7 +473,8 @@ static int __init sanitize_e820_map(stru
1901 - Visually we're performing the following (1,2,3,4 = memory types)...
1902 + Visually we're performing the following
1903 + (1,2,3,4 = memory types)...
1905 Sample memory map (w/overlaps):
1906 ____22__________________
1907 @@ -493,22 +516,23 @@ static int __init sanitize_e820_map(stru
1910 /* bail out if we find any unreasonable addresses in bios map */
1911 - for (i=0; i<old_nr; i++)
1912 + for (i = 0; i < old_nr; i++)
1913 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
1916 /* create pointers for initial change-point information (for sorting) */
1917 - for (i=0; i < 2*old_nr; i++)
1918 + for (i = 0; i < 2 * old_nr; i++)
1919 change_point[i] = &change_point_list[i];
1921 /* record all known change-points (starting and ending addresses),
1922 omitting those that are for empty memory regions */
1924 - for (i=0; i < old_nr; i++) {
1925 + for (i = 0; i < old_nr; i++) {
1926 if (biosmap[i].size != 0) {
1927 change_point[chgidx]->addr = biosmap[i].addr;
1928 change_point[chgidx++]->pbios = &biosmap[i];
1929 - change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
1930 + change_point[chgidx]->addr = biosmap[i].addr +
1932 change_point[chgidx++]->pbios = &biosmap[i];
1935 @@ -518,75 +542,106 @@ static int __init sanitize_e820_map(stru
1937 while (still_changing) {
1939 - for (i=1; i < chg_nr; i++) {
1940 - /* if <current_addr> > <last_addr>, swap */
1941 - /* or, if current=<start_addr> & last=<end_addr>, swap */
1942 - if ((change_point[i]->addr < change_point[i-1]->addr) ||
1943 - ((change_point[i]->addr == change_point[i-1]->addr) &&
1944 - (change_point[i]->addr == change_point[i]->pbios->addr) &&
1945 - (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
1948 + for (i = 1; i < chg_nr; i++) {
1949 + unsigned long long curaddr, lastaddr;
1950 + unsigned long long curpbaddr, lastpbaddr;
1952 + curaddr = change_point[i]->addr;
1953 + lastaddr = change_point[i - 1]->addr;
1954 + curpbaddr = change_point[i]->pbios->addr;
1955 + lastpbaddr = change_point[i - 1]->pbios->addr;
1958 + * swap entries, when:
1960 + * curaddr < lastaddr or
1961 + * curaddr == lastaddr and curaddr == curpbaddr and
1962 + * lastaddr != lastpbaddr
1964 + if (curaddr < lastaddr ||
1965 + (curaddr == lastaddr && curaddr == curpbaddr &&
1966 + lastaddr != lastpbaddr)) {
1967 change_tmp = change_point[i];
1968 change_point[i] = change_point[i-1];
1969 change_point[i-1] = change_tmp;
1971 + still_changing = 1;
1976 /* create a new bios memory map, removing overlaps */
1977 - overlap_entries=0; /* number of entries in the overlap table */
1978 - new_bios_entry=0; /* index for creating new bios map entries */
1979 + overlap_entries = 0; /* number of entries in the overlap table */
1980 + new_bios_entry = 0; /* index for creating new bios map entries */
1981 last_type = 0; /* start with undefined memory type */
1982 last_addr = 0; /* start with 0 as last starting address */
1984 /* loop through change-points, determining affect on the new bios map */
1985 - for (chgidx=0; chgidx < chg_nr; chgidx++)
1987 + for (chgidx = 0; chgidx < chg_nr; chgidx++) {
1988 /* keep track of all overlapping bios entries */
1989 - if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
1991 - /* add map entry to overlap list (> 1 entry implies an overlap) */
1992 - overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
1996 - /* remove entry from list (order independent, so swap with last) */
1997 - for (i=0; i<overlap_entries; i++)
1999 - if (overlap_list[i] == change_point[chgidx]->pbios)
2000 - overlap_list[i] = overlap_list[overlap_entries-1];
2001 + if (change_point[chgidx]->addr ==
2002 + change_point[chgidx]->pbios->addr) {
2004 + * add map entry to overlap list (> 1 entry
2005 + * implies an overlap)
2007 + overlap_list[overlap_entries++] =
2008 + change_point[chgidx]->pbios;
2011 + * remove entry from list (order independent,
2012 + * so swap with last)
2014 + for (i = 0; i < overlap_entries; i++) {
2015 + if (overlap_list[i] ==
2016 + change_point[chgidx]->pbios)
2018 + overlap_list[overlap_entries-1];
2022 - /* if there are overlapping entries, decide which "type" to use */
2023 - /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
2025 + * if there are overlapping entries, decide which
2026 + * "type" to use (larger value takes precedence --
2027 + * 1=usable, 2,3,4,4+=unusable)
2030 - for (i=0; i<overlap_entries; i++)
2031 + for (i = 0; i < overlap_entries; i++)
2032 if (overlap_list[i]->type > current_type)
2033 current_type = overlap_list[i]->type;
2034 - /* continue building up new bios map based on this information */
2036 + * continue building up new bios map based on this
2039 if (current_type != last_type) {
2040 if (last_type != 0) {
2041 new_bios[new_bios_entry].size =
2042 change_point[chgidx]->addr - last_addr;
2043 - /* move forward only if the new size was non-zero */
2045 + * move forward only if the new size
2048 if (new_bios[new_bios_entry].size != 0)
2050 + * no more space left for new
2053 if (++new_bios_entry >= E820MAX)
2054 - break; /* no more space left for new bios entries */
2057 if (current_type != 0) {
2058 - new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
2059 + new_bios[new_bios_entry].addr =
2060 + change_point[chgidx]->addr;
2061 new_bios[new_bios_entry].type = current_type;
2062 - last_addr=change_point[chgidx]->addr;
2063 + last_addr = change_point[chgidx]->addr;
2065 last_type = current_type;
2068 - new_nr = new_bios_entry; /* retain count for new bios entries */
2069 + /* retain count for new bios entries */
2070 + new_nr = new_bios_entry;
2072 /* copy new bios mapping into original location */
2073 - memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
2074 + memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
2078 @@ -601,7 +656,7 @@ static int __init sanitize_e820_map(stru
2079 * will have given us a memory map that we can use to properly
2080 * set up memory. If we aren't, we'll fake a memory map.
2082 -static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
2083 +static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
2086 /* Only one memory region (or negative)? Ignore it */
2087 @@ -622,7 +677,7 @@ static int __init copy_e820_map(struct e
2090 add_memory_region(start, size, type);
2091 - } while (biosmap++,--nr_map);
2092 + } while (biosmap++, --nr_map);
2095 if (is_initial_xendomain()) {
2096 @@ -641,15 +696,17 @@ static int __init copy_e820_map(struct e
2100 -void early_panic(char *msg)
2101 +static void early_panic(char *msg)
2108 -void __init setup_memory_region(void)
2109 +/* We're not void only for x86 32-bit compat */
2110 +char * __init machine_specific_memory_setup(void)
2113 + char *who = "BIOS-e820";
2115 * Try to copy the BIOS-supplied E820-map.
2117 @@ -659,14 +716,8 @@ void __init setup_memory_region(void)
2118 sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
2119 if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
2120 early_panic("Cannot find a valid memory map");
2121 - printk(KERN_INFO "BIOS-provided physical RAM map:\n");
2122 - e820_print_map("BIOS-e820");
2125 #else /* CONFIG_XEN */
2127 -void __init setup_memory_region(void)
2129 + char *who = "Xen";
2131 struct xen_memory_map memmap;
2133 @@ -694,11 +745,13 @@ void __init setup_memory_region(void)
2135 if (copy_e820_map(map, (char)memmap.nr_entries) < 0)
2136 early_panic("Cannot find a valid memory map");
2139 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
2140 - e820_print_map("Xen");
2141 + e820_print_map(who);
2143 + /* In case someone cares... */
2148 static int __init parse_memopt(char *p)
2150 @@ -709,7 +762,7 @@ static int __init parse_memopt(char *p)
2153 end_user_pfn = memparse(p, &p);
2154 - end_user_pfn >>= PAGE_SHIFT;
2155 + end_user_pfn >>= PAGE_SHIFT;
2157 end = end_user_pfn<<PAGE_SHIFT;
2159 @@ -727,7 +780,7 @@ static int __init parse_memopt(char *p)
2165 early_param("mem", parse_memopt);
2167 static int userdef __initdata;
2168 @@ -739,9 +792,9 @@ static int __init parse_memmap_opt(char
2170 if (!strcmp(p, "exactmap")) {
2171 #ifdef CONFIG_CRASH_DUMP
2172 - /* If we are doing a crash dump, we
2173 - * still need to know the real mem
2174 - * size before original memory map is
2176 + * If we are doing a crash dump, we still need to know
2177 + * the real mem size before original memory map is
2180 e820_register_active_regions(0, 0, -1UL);
2181 @@ -758,6 +811,8 @@ static int __init parse_memmap_opt(char
2182 mem_size = memparse(p, &p);
2188 start_at = memparse(p+1, &p);
2189 add_memory_region(start_at, mem_size, E820_RAM);
2190 @@ -777,11 +832,58 @@ early_param("memmap", parse_memmap_opt);
2191 void __init finish_e820_parsing(void)
2194 + char nr = e820.nr_map;
2196 + if (sanitize_e820_map(e820.map, &nr) < 0)
2197 + early_panic("Invalid user supplied memory map");
2200 printk(KERN_INFO "user-defined physical RAM map:\n");
2201 e820_print_map("user");
2206 +void __init update_memory_range(u64 start, u64 size, unsigned old_type,
2207 + unsigned new_type)
2211 + BUG_ON(old_type == new_type);
2213 + for (i = 0; i < e820.nr_map; i++) {
2214 + struct e820entry *ei = &e820.map[i];
2215 + u64 final_start, final_end;
2216 + if (ei->type != old_type)
2218 + /* totally covered? */
2219 + if (ei->addr >= start && ei->size <= size) {
2220 + ei->type = new_type;
2223 + /* partially covered */
2224 + final_start = max(start, ei->addr);
2225 + final_end = min(start + size, ei->addr + ei->size);
2226 + if (final_start >= final_end)
2228 + add_memory_region(final_start, final_end - final_start,
2233 +void __init update_e820(void)
2237 + nr_map = e820.nr_map;
2238 + if (sanitize_e820_map(e820.map, &nr_map))
2240 + e820.nr_map = nr_map;
2241 + printk(KERN_INFO "modified physical RAM map:\n");
2242 + e820_print_map("modified");
2246 unsigned long pci_mem_start = 0xaeedbabe;
2247 EXPORT_SYMBOL(pci_mem_start);
2249 @@ -825,8 +927,10 @@ __init void e820_setup_gap(struct e820en
2252 gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
2253 - printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n"
2254 - KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n");
2255 + printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
2257 + KERN_ERR "PCI: Unassigned devices with 32bit resource "
2258 + "registers may break!\n");
2262 @@ -839,8 +943,9 @@ __init void e820_setup_gap(struct e820en
2263 /* Fun with two's complement */
2264 pci_mem_start = (gapstart + round) & -round;
2266 - printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
2267 - pci_mem_start, gapstart, gapsize);
2269 + "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
2270 + pci_mem_start, gapstart, gapsize);
2273 int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
2274 --- sle11-2009-10-16.orig/arch/x86/kernel/early_printk-xen.c 2009-09-24 10:27:18.000000000 +0200
2275 +++ sle11-2009-10-16/arch/x86/kernel/early_printk-xen.c 2009-03-16 16:33:40.000000000 +0100
2276 @@ -222,7 +222,7 @@ static struct console simnow_console = {
2279 /* Direct interface for emergencies */
2280 -struct console *early_console = &early_vga_console;
2281 +static struct console *early_console = &early_vga_console;
2282 static int early_console_initialized = 0;
2284 void early_printk(const char *fmt, ...)
2285 --- sle11-2009-10-16.orig/arch/x86/kernel/entry_32-xen.S 2009-05-14 11:18:18.000000000 +0200
2286 +++ sle11-2009-10-16/arch/x86/kernel/entry_32-xen.S 2009-05-14 11:18:32.000000000 +0200
2288 * for paravirtualization. The following will never clobber any registers:
2289 * INTERRUPT_RETURN (aka. "iret")
2290 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
2291 - * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
2292 + * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit").
2294 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
2295 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
2296 @@ -282,16 +282,21 @@ END(resume_kernel)
2300 + .macro test_tif ti_reg # system call tracing in operation / emulation
2301 + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
2302 + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(\ti_reg)
2305 /* SYSENTER_RETURN points to after the "sysenter" instruction in
2306 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
2308 # sysenter call handler stub
2309 -ENTRY(sysenter_entry)
2310 +ENTRY(ia32_sysenter_target)
2311 CFI_STARTPROC simple
2314 CFI_REGISTER esp, ebp
2315 - movl SYSENTER_stack_esp0(%esp),%esp
2316 + movl SYSENTER_stack_sp0(%esp),%esp
2319 * No need to follow this irqs on/off section: the syscall
2320 @@ -334,9 +339,7 @@ sysenter_past_esp:
2321 CFI_ADJUST_CFA_OFFSET 4
2323 GET_THREAD_INFO(%ebp)
2325 - /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
2326 - testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
2328 jnz syscall_trace_entry
2329 cmpl $(nr_syscalls), %eax
2331 @@ -354,7 +357,7 @@ sysenter_past_esp:
2334 1: mov PT_FS(%esp), %fs
2335 - ENABLE_INTERRUPTS_SYSEXIT
2336 + ENABLE_INTERRUPTS_SYSCALL_RET
2338 .pushsection .fixup,"ax"
2339 2: movl $0,PT_FS(%esp)
2340 @@ -363,10 +366,10 @@ sysenter_past_esp:
2344 -ENDPROC(sysenter_entry)
2345 +ENDPROC(ia32_sysenter_target)
2347 # pv sysenter call handler stub
2348 -ENTRY(sysenter_entry_pv)
2349 +ENTRY(ia32pv_sysenter_target)
2351 movl $__USER_DS,16(%esp)
2353 @@ -389,7 +392,7 @@ ENTRY(sysenter_entry_pv)
2357 -ENDPROC(sysenter_entry_pv)
2358 +ENDPROC(ia32pv_sysenter_target)
2360 # system call handler stub
2362 @@ -398,9 +401,7 @@ ENTRY(system_call)
2363 CFI_ADJUST_CFA_OFFSET 4
2365 GET_THREAD_INFO(%ebp)
2366 - # system call tracing in operation / emulation
2367 - /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
2368 - testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
2370 jnz syscall_trace_entry
2371 cmpl $(nr_syscalls), %eax
2373 @@ -452,7 +453,8 @@ restore_nocheck_notrace:
2375 addl $4, %esp # skip orig_eax/error_code
2376 CFI_ADJUST_CFA_OFFSET -4
2377 -1: INTERRUPT_RETURN
2380 .section .fixup,"ax"
2382 pushl $0 # no error code
2383 @@ -461,7 +463,7 @@ iret_exc:
2385 .section __ex_table,"a"
2388 + .long irq_return,iret_exc
2392 @@ -657,7 +659,7 @@ END(syscall_badsys)
2393 * Build the entry stubs and pointer table with
2394 * some assembler magic.
2397 +.section .rodata,"a"
2401 @@ -963,7 +965,7 @@ END(device_not_available)
2402 * that sets up the real kernel stack. Check here, since we can't
2403 * allow the wrong stack to be used.
2405 - * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have
2406 + * "SYSENTER_stack_sp0+12" is because the NMI/debug handler will have
2407 * already pushed 3 words if it hits on the sysenter instruction:
2408 * eflags, cs and eip.
2410 @@ -975,7 +977,7 @@ END(device_not_available)
2411 cmpw $__KERNEL_CS,4(%esp); \
2414 - movl SYSENTER_stack_esp0+offset(%esp),%esp; \
2415 + movl SYSENTER_stack_sp0+offset(%esp),%esp; \
2416 CFI_DEF_CFA esp, 0; \
2417 CFI_UNDEFINED eip; \
2419 @@ -990,7 +992,7 @@ label: \
2423 - cmpl $sysenter_entry,(%esp)
2424 + cmpl $ia32_sysenter_target,(%esp)
2425 jne debug_stack_correct
2426 FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
2427 debug_stack_correct:
2428 @@ -1023,7 +1025,7 @@ KPROBE_ENTRY(nmi)
2430 CFI_ADJUST_CFA_OFFSET -4
2432 - cmpl $sysenter_entry,(%esp)
2433 + cmpl $ia32_sysenter_target,(%esp)
2436 CFI_ADJUST_CFA_OFFSET 4
2437 @@ -1036,7 +1038,7 @@ KPROBE_ENTRY(nmi)
2439 CFI_ADJUST_CFA_OFFSET -4
2440 jae nmi_stack_correct
2441 - cmpl $sysenter_entry,12(%esp)
2442 + cmpl $ia32_sysenter_target,12(%esp)
2443 je nmi_debug_stack_check
2445 /* We have a RING0_INT_FRAME here */
2446 @@ -1089,12 +1091,8 @@ nmi_espfix_stack:
2448 lss 12+4(%esp), %esp # back to espfix stack
2449 CFI_ADJUST_CFA_OFFSET -24
2450 -1: INTERRUPT_RETURN
2453 -.section __ex_table,"a"
2460 @@ -1112,17 +1110,17 @@ KPROBE_END(nmi)
2462 #ifdef CONFIG_PARAVIRT
2466 .section __ex_table,"a"
2469 + .long native_iret, iret_exc
2473 -ENTRY(native_irq_enable_sysexit)
2474 +ENTRY(native_irq_enable_syscall_ret)
2477 -END(native_irq_enable_sysexit)
2478 +END(native_irq_enable_syscall_ret)
2482 @@ -1271,7 +1269,144 @@ ENTRY(kernel_thread_helper)
2484 ENDPROC(kernel_thread_helper)
2486 +#include <asm/alternative-asm.h>
2488 + # pv syscall call handler stub
2489 +ENTRY(ia32pv_cstar_target)
2491 + movl $__USER_DS,16(%esp)
2493 + movl $__USER_CS,4(%esp)
2494 + movl 12(%esp),%ebp
2495 + pushl %eax # save orig_eax
2496 + CFI_ADJUST_CFA_OFFSET 4
2498 + * Load the potential sixth argument from user stack.
2499 + * Careful about security.
2501 + cmpl $__PAGE_OFFSET-4,%ebp
2502 + CFI_REMEMBER_STATE
2504 +1: movl (%ebp),%ebp
2505 +.section __ex_table,"a"
2507 + .long 1b,cstar_fault
2510 + GET_THREAD_INFO(%ebp)
2512 + jnz cstar_trace_entry
2513 + cmpl $nr_syscalls,%eax
2516 + btl %eax,cstar_special
2517 + jc .Lcstar_special
2518 + call *cstar_call_table(,%eax,4)
2519 + movl %eax,PT_EAX(%esp) # store the return value
2521 + movl PT_ECX(%esp),%ecx
2522 + movl %ecx,PT_EBP(%esp) # put user EBP back in place
2525 + movl PT_ECX(%esp),%ecx
2526 + movl %ecx,PT_EBP(%esp) # put user EBP back in place
2529 + movl $cstar_clear_tif,(%esp) # replace return address
2531 + orl $_TIF_CSTAR,TI_flags(%ebp)
2532 + jmp *sys_call_table(,%eax,4)
2534 + movl %eax,PT_EAX(%esp) # store the return value
2536 + andl $~_TIF_CSTAR,TI_flags(%ebp)
2539 + movl $-ENOSYS,PT_EAX(%esp)
2540 + cmpl $nr_syscalls,%eax
2542 + btl %eax,cstar_special
2543 + jc .Lcstar_trace_special
2547 + orl $_TIF_CSTAR,TI_flags(%ebp)
2548 + call do_syscall_trace
2550 + andl $~_TIF_CSTAR,TI_flags(%ebp)
2552 + jne .Lcstar_resume # ret != 0 -> running under PTRACE_SYSEMU,
2553 + # so must skip actual syscall
2554 + movl PT_ORIG_EAX(%esp),%eax
2555 + cmpl $nr_syscalls,%eax
2558 +.Lcstar_trace_special:
2559 + movl PT_ECX(%esp),%ecx
2562 + movl %ecx,PT_EBP(%esp) # put user EBP back in place
2563 + call do_syscall_trace
2565 + jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
2566 + # so must skip actual syscall
2567 + movl PT_ORIG_EAX(%esp),%eax
2568 + cmpl $nr_syscalls,%eax
2572 + movl $-ENOSYS,PT_EAX(%esp)
2574 + movl PT_ECX(%esp),%ecx
2575 + movl %ecx,PT_EBP(%esp) # put user EBP back in place
2576 + jmp resume_userspace
2579 + movl $-EFAULT,%eax
2581 + GET_THREAD_INFO(%ebp)
2582 + jmp .Lcstar_resume
2584 +ENDPROC(ia32pv_cstar_target)
2586 +ENTRY(cstar_ret_from_fork)
2588 + movl PT_ECX(%esp),%ecx
2589 + GET_THREAD_INFO(%ebp)
2590 + movl %ecx,PT_EBP(%esp) # put user EBP back in place
2592 + andl $~_TIF_CSTAR,TI_flags(%ebp)
2597 .section .rodata,"a"
2598 #include "syscall_table_32.S"
2600 syscall_table_size=(.-sys_call_table)
2602 +#include <asm/unistd.h>
2606 +.rept nr_syscalls+31
2607 + .irp n, __NR_sigreturn, __NR_rt_sigreturn
2609 + mask = mask | (1 << (\n & 31))
2613 + .if (nr & 31) == 0
2618 +#define sys_call_table cstar_call_table
2619 +#define sys_fork cstar_set_tif
2620 +#define sys_clone cstar_set_tif
2621 +#define sys_vfork cstar_set_tif
2622 +#include "syscall_table_32.S"
2623 +#undef sys_call_table
2627 --- sle11-2009-10-16.orig/arch/x86/kernel/entry_64-xen.S 2009-02-16 16:18:36.000000000 +0100
2628 +++ sle11-2009-10-16/arch/x86/kernel/entry_64-xen.S 2009-03-16 16:33:40.000000000 +0100
2630 #include <asm/page.h>
2631 #include <asm/irqflags.h>
2632 #include <asm/errno.h>
2633 -#include <xen/interface/arch-x86_64.h>
2634 +#include <xen/interface/xen.h>
2635 #include <xen/interface/features.h>
2637 -#include "xen_entry_64.S"
2641 #ifndef CONFIG_PREEMPT
2642 #define retint_kernel retint_restore_args
2645 +#ifdef CONFIG_PARAVIRT
2646 +ENTRY(native_irq_enable_syscall_ret)
2647 + movq %gs:pda_oldrsp,%rsp
2650 +#endif /* CONFIG_PARAVIRT */
2653 .macro TRACE_IRQS_IRETQ offset=ARGOFFSET
2654 #ifdef CONFIG_TRACE_IRQFLAGS
2655 @@ -277,7 +282,7 @@ ret_from_sys_call:
2658 GET_THREAD_INFO(%rcx)
2659 - XEN_BLOCK_EVENTS(%rsi)
2660 + DISABLE_INTERRUPTS(CLBR_NONE)
2662 movl threadinfo_flags(%rcx),%edx
2664 @@ -287,7 +292,7 @@ sysret_check:
2665 * sysretq will re-enable interrupts:
2668 - XEN_UNBLOCK_EVENTS(%rsi)
2669 + ENABLE_INTERRUPTS(CLBR_NONE)
2671 HYPERVISOR_IRET VGCF_IN_SYSCALL
2673 @@ -298,7 +303,7 @@ sysret_careful:
2674 bt $TIF_NEED_RESCHED,%edx
2677 - XEN_UNBLOCK_EVENTS(%rsi)
2678 + ENABLE_INTERRUPTS(CLBR_NONE)
2680 CFI_ADJUST_CFA_OFFSET 8
2682 @@ -309,9 +314,8 @@ sysret_careful:
2683 /* Handle a signal */
2687 - XEN_UNBLOCK_EVENTS(%rsi)
2688 - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
2689 + ENABLE_INTERRUPTS(CLBR_NONE)
2690 + testl $_TIF_DO_NOTIFY_MASK,%edx
2693 /* Really a signal */
2694 @@ -323,7 +327,7 @@ sysret_signal:
2695 1: movl $_TIF_NEED_RESCHED,%edi
2696 /* Use IRET because user could have changed frame. This
2697 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
2698 - XEN_BLOCK_EVENTS(%rsi)
2699 + DISABLE_INTERRUPTS(CLBR_NONE)
2703 @@ -355,7 +359,7 @@ tracesys:
2705 .globl int_ret_from_sys_call
2706 int_ret_from_sys_call:
2707 - XEN_BLOCK_EVENTS(%rsi)
2708 + DISABLE_INTERRUPTS(CLBR_NONE)
2710 testb $3,CS-ARGOFFSET(%rsp)
2712 @@ -381,22 +385,20 @@ int_careful:
2713 bt $TIF_NEED_RESCHED,%edx
2714 jnc int_very_careful
2717 - XEN_UNBLOCK_EVENTS(%rsi)
2718 + ENABLE_INTERRUPTS(CLBR_NONE)
2720 CFI_ADJUST_CFA_OFFSET 8
2723 CFI_ADJUST_CFA_OFFSET -8
2724 - XEN_BLOCK_EVENTS(%rsi)
2725 + DISABLE_INTERRUPTS(CLBR_NONE)
2729 /* handle signals and tracing -- both require a full stack frame */
2733 - XEN_UNBLOCK_EVENTS(%rsi)
2734 + ENABLE_INTERRUPTS(CLBR_NONE)
2736 /* Check for syscall exit trace */
2737 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
2738 @@ -411,7 +413,7 @@ int_very_careful:
2739 jmp int_restore_rest
2742 - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
2743 + testl $_TIF_DO_NOTIFY_MASK,%edx
2745 movq %rsp,%rdi # &ptregs -> arg1
2746 xorl %esi,%esi # oldset -> arg2
2747 @@ -419,7 +421,7 @@ int_signal:
2748 1: movl $_TIF_NEED_RESCHED,%edi
2751 - XEN_BLOCK_EVENTS(%rsi)
2752 + DISABLE_INTERRUPTS(CLBR_NONE)
2756 @@ -474,6 +476,7 @@ ENTRY(stub_execve)
2757 CFI_REGISTER rip, r11
2759 FIXUP_TOP_OF_STACK %r11
2762 RESTORE_TOP_OF_STACK %r11
2764 @@ -526,11 +529,10 @@ retint_check:
2765 retint_restore_args: /* return to kernel space */
2766 movl EFLAGS-REST_SKIP(%rsp), %eax
2767 shr $9, %eax # EAX[0] == IRET_EFLAGS.IF
2768 - XEN_GET_VCPU_INFO(%rsi)
2770 andb evtchn_upcall_mask(%rsi),%al
2771 andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask
2772 jnz restore_all_enable_events # != 0 => enable event delivery
2773 - XEN_PUT_VCPU_INFO(%rsi)
2777 @@ -541,31 +543,29 @@ retint_careful:
2778 bt $TIF_NEED_RESCHED,%edx
2781 - XEN_UNBLOCK_EVENTS(%rsi)
2783 + ENABLE_INTERRUPTS(CLBR_NONE)
2785 CFI_ADJUST_CFA_OFFSET 8
2788 CFI_ADJUST_CFA_OFFSET -8
2789 GET_THREAD_INFO(%rcx)
2790 - XEN_BLOCK_EVENTS(%rsi)
2792 + DISABLE_INTERRUPTS(CLBR_NONE)
2797 - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
2798 + testl $_TIF_DO_NOTIFY_MASK,%edx
2799 jz retint_restore_args
2801 - XEN_UNBLOCK_EVENTS(%rsi)
2802 + ENABLE_INTERRUPTS(CLBR_NONE)
2804 movq $-1,ORIG_RAX(%rsp)
2805 xorl %esi,%esi # oldset
2806 movq %rsp,%rdi # &pt_regs
2807 call do_notify_resume
2809 - XEN_BLOCK_EVENTS(%rsi)
2810 + DISABLE_INTERRUPTS(CLBR_NONE)
2812 movl $_TIF_NEED_RESCHED,%edi
2813 GET_THREAD_INFO(%rcx)
2814 @@ -702,7 +702,7 @@ END(spurious_interrupt)
2823 @@ -719,8 +719,7 @@ END(spurious_interrupt)
2825 addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
2828 - XEN_BLOCK_EVENTS(%rsi)
2829 + DISABLE_INTERRUPTS(CLBR_NONE)
2833 @@ -749,10 +748,10 @@ paranoid_swapgs\trace:
2838 + SWAPGS_UNSAFE_STACK
2839 paranoid_restore\trace:
2843 paranoid_userspace\trace:
2844 GET_THREAD_INFO(%rcx)
2845 movl threadinfo_flags(%rcx),%ebx
2846 @@ -767,11 +766,11 @@ paranoid_userspace\trace:
2851 + ENABLE_INTERRUPTS(CLBR_NONE)
2852 xorl %esi,%esi /* arg2: oldset */
2853 movq %rsp,%rdi /* arg1: &pt_regs */
2854 call do_notify_resume
2856 + DISABLE_INTERRUPTS(CLBR_NONE)
2860 @@ -780,9 +779,9 @@ paranoid_schedule\trace:
2865 + ENABLE_INTERRUPTS(CLBR_ANY)
2868 + DISABLE_INTERRUPTS(CLBR_ANY)
2872 @@ -846,8 +845,7 @@ error_call_handler:
2877 - XEN_BLOCK_EVENTS(%rsi)
2878 + DISABLE_INTERRUPTS(CLBR_NONE)
2880 GET_THREAD_INFO(%rcx)
2881 testb $3,CS-ARGOFFSET(%rsp)
2882 @@ -875,7 +873,7 @@ error_kernelspace:
2883 iret run with kernel gs again, so don't set the user space flag.
2884 B stepping K8s sometimes report an truncated RIP for IRET
2885 exceptions returning to compat mode. Check for these here too. */
2886 - leaq iret_label(%rip),%rbp
2887 + leaq irq_return(%rip),%rbp
2890 movl %ebp,%ebp /* zero extend */
2891 @@ -930,19 +928,17 @@ END(do_hypervisor_callback)
2892 restore_all_enable_events:
2893 CFI_DEFAULT_STACK adj=1
2895 - XEN_UNBLOCK_EVENTS(%rsi) # %rsi is already set up...
2896 + __ENABLE_INTERRUPTS
2898 scrit: /**** START OF CRITICAL REGION ****/
2899 - XEN_TEST_PENDING(%rsi)
2902 jnz 14f # process more events if necessary...
2903 - XEN_PUT_VCPU_INFO(%rsi)
2908 -14: XEN_LOCKED_BLOCK_EVENTS(%rsi)
2909 - XEN_PUT_VCPU_INFO(%rsi)
2910 +14: __DISABLE_INTERRUPTS
2912 movq %rsp,%rdi # set the argument again
2914 @@ -1086,15 +1082,16 @@ ENDPROC(child_rip)
2915 * rdi: name, rsi: argv, rdx: envp
2917 * We want to fallback into:
2918 - * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
2919 + * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs)
2921 * do_sys_execve asm fallback arguments:
2922 - * rdi: name, rsi: argv, rdx: envp, fake frame on the stack
2923 + * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
2925 ENTRY(kernel_execve)
2931 movq %rax, RAX(%rsp)
2933 @@ -1144,7 +1141,7 @@ do_nmi_callback:
2935 orl $NMI_MASK,EFLAGS(%rsp)
2937 - XEN_BLOCK_EVENTS(%rsi)
2938 + DISABLE_INTERRUPTS(CLBR_NONE)
2940 GET_THREAD_INFO(%rcx)
2941 jmp retint_restore_args
2942 --- sle11-2009-10-16.orig/arch/x86/kernel/fixup.c 2009-10-28 14:55:04.000000000 +0100
2943 +++ sle11-2009-10-16/arch/x86/kernel/fixup.c 2009-03-16 16:33:40.000000000 +0100
2946 #define DP(_f, _args...) printk(KERN_ALERT " " _f "\n" , ## _args )
2948 -fastcall void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
2949 +void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
2951 static unsigned long printed = 0;
2953 --- sle11-2009-10-16.orig/arch/x86/kernel/genapic_64-xen.c 2009-02-16 16:18:36.000000000 +0100
2954 +++ sle11-2009-10-16/arch/x86/kernel/genapic_64-xen.c 2009-03-16 16:33:40.000000000 +0100
2956 #include <acpi/acpi_bus.h>
2960 - * which logical CPU number maps to which CPU (physical APIC ID)
2962 - * The following static array is used during kernel startup
2963 - * and the x86_cpu_to_apicid_ptr contains the address of the
2964 - * array during this time. Is it zeroed when the per_cpu
2965 - * data area is removed.
2967 +/* which logical CPU number maps to which CPU (physical APIC ID) */
2969 -u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata
2970 +u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata
2971 = { [0 ... NR_CPUS-1] = BAD_APICID };
2972 -void *x86_cpu_to_apicid_ptr;
2973 +void *x86_cpu_to_apicid_early_ptr;
2975 -DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID;
2976 +DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
2977 EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
2980 --- sle11-2009-10-16.orig/arch/x86/kernel/head64-xen.c 2009-02-16 16:18:36.000000000 +0100
2981 +++ sle11-2009-10-16/arch/x86/kernel/head64-xen.c 2009-03-16 16:33:40.000000000 +0100
2983 #include <linux/kernel.h>
2984 #include <linux/string.h>
2985 #include <linux/percpu.h>
2986 +#include <linux/start_kernel.h>
2987 #include <linux/module.h>
2989 #include <asm/processor.h>
2991 #include <asm/pgtable.h>
2992 #include <asm/tlbflush.h>
2993 #include <asm/sections.h>
2994 +#include <asm/kdebug.h>
2995 +#include <asm/e820.h>
2997 unsigned long start_pfn;
2999 @@ -34,7 +37,7 @@ static void __init zap_identity_mappings
3001 pgd_t *pgd = pgd_offset_k(0UL);
3004 + __flush_tlb_all();
3007 /* Don't add a printk in there. printk relies on the PDA which is not initialized
3008 @@ -72,6 +75,37 @@ EXPORT_SYMBOL(machine_to_phys_mapping);
3009 unsigned int machine_to_phys_order;
3010 EXPORT_SYMBOL(machine_to_phys_order);
3012 +#define EBDA_ADDR_POINTER 0x40E
3014 +static __init void reserve_ebda(void)
3017 + unsigned ebda_addr, ebda_size;
3020 + * there is a real-mode segmented pointer pointing to the
3021 + * 4K EBDA area at 0x40E
3023 + ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
3029 + ebda_size = *(unsigned short *)__va(ebda_addr);
3031 + /* Round EBDA up to pages */
3032 + if (ebda_size == 0)
3035 + ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
3036 + if (ebda_size > 64*1024)
3037 + ebda_size = 64*1024;
3039 + reserve_early(ebda_addr, ebda_addr + ebda_size, "EBDA");
3043 void __init x86_64_start_kernel(char * real_mode_data)
3045 struct xen_machphys_mapping mapping;
3046 @@ -103,8 +137,16 @@ void __init x86_64_start_kernel(char * r
3047 /* Make NULL pointers segfault */
3048 zap_identity_mappings();
3050 - for (i = 0; i < IDT_ENTRIES; i++)
3051 + /* Cleanup the over mapped high alias */
3052 + cleanup_highmap();
3054 + for (i = 0; i < IDT_ENTRIES; i++) {
3055 +#ifdef CONFIG_EARLY_PRINTK
3056 + set_intr_gate(i, &early_idt_handlers[i]);
3058 set_intr_gate(i, early_idt_handler);
3061 load_idt((const struct desc_ptr *)&idt_descr);
3064 @@ -115,8 +157,19 @@ void __init x86_64_start_kernel(char * r
3067 copy_bootdata(__va(real_mode_data));
3069 - cpu_set(0, cpu_online_map);
3072 + reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
3074 + reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE),
3075 + start_pfn << PAGE_SHIFT, "Xen provided");
3080 + * At this point everything still needed from the boot loader
3081 + * or BIOS or kernel text should be early reserved or marked not
3082 + * RAM in e820. All other memory is free game.
3087 --- sle11-2009-10-16.orig/arch/x86/kernel/head_32-xen.S 2009-02-16 16:17:21.000000000 +0100
3088 +++ sle11-2009-10-16/arch/x86/kernel/head_32-xen.S 2009-03-16 16:33:40.000000000 +0100
3091 #include <linux/elfnote.h>
3092 #include <linux/threads.h>
3093 +#include <linux/init.h>
3094 #include <linux/linkage.h>
3095 #include <asm/segment.h>
3096 #include <asm/page.h>
3097 @@ -88,7 +89,7 @@ ENTRY(_stext)
3099 .section ".bss.page_aligned","wa"
3100 .align PAGE_SIZE_asm
3101 -ENTRY(swapper_pg_pmd)
3102 +ENTRY(swapper_pg_fixmap)
3104 ENTRY(empty_zero_page)
3106 --- sle11-2009-10-16.orig/arch/x86/kernel/init_task-xen.c 2009-02-16 16:18:36.000000000 +0100
3107 +++ sle11-2009-10-16/arch/x86/kernel/init_task-xen.c 2009-03-16 16:33:40.000000000 +0100
3108 @@ -19,7 +19,7 @@ static struct sighand_struct init_sighan
3110 struct mm_struct init_mm = INIT_MM(init_mm);
3111 #undef swapper_pg_dir
3112 -EXPORT_SYMBOL(init_mm);
3113 +EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */
3116 * Initial thread structure.
3117 --- sle11-2009-10-16.orig/arch/x86/kernel/io_apic_32-xen.c 2009-02-16 16:18:36.000000000 +0100
3118 +++ sle11-2009-10-16/arch/x86/kernel/io_apic_32-xen.c 2009-03-16 16:33:40.000000000 +0100
3120 #include <linux/htirq.h>
3121 #include <linux/freezer.h>
3122 #include <linux/kthread.h>
3123 +#include <linux/jiffies.h> /* time_after() */
3126 #include <asm/smp.h>
3128 #include <mach_apic.h>
3129 #include <mach_apicdef.h>
3131 -#include "io_ports.h"
3134 #include <xen/interface/xen.h>
3135 #include <xen/interface/physdev.h>
3136 @@ -400,7 +399,7 @@ static void set_ioapic_affinity_irq(unsi
3137 # include <asm/processor.h> /* kernel_thread() */
3138 # include <linux/kernel_stat.h> /* kstat */
3139 # include <linux/slab.h> /* kmalloc() */
3140 -# include <linux/timer.h> /* time_after() */
3141 +# include <linux/timer.h>
3143 #define IRQBALANCE_CHECK_ARCH -999
3144 #define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
3145 @@ -777,7 +776,7 @@ late_initcall(balanced_irq_init);
3149 -void fastcall send_IPI_self(int vector)
3150 +void send_IPI_self(int vector)
3154 @@ -1959,7 +1958,7 @@ static int __init timer_irq_works(void)
3155 * might have cached one ExtINT interrupt. Finally, at
3156 * least one tick may be lost due to delays.
3158 - if (jiffies - t1 > 4)
3159 + if (time_after(jiffies, t1 + 4))
3163 @@ -2142,7 +2141,7 @@ static struct irq_chip lapic_chip __read
3167 -static void setup_nmi (void)
3168 +static void __init setup_nmi(void)
3171 * Dirty trick to enable the NMI watchdog ...
3172 @@ -2155,7 +2154,7 @@ static void setup_nmi (void)
3174 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
3176 - on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1);
3177 + enable_NMI_through_LVT0();
3179 apic_printk(APIC_VERBOSE, " done.\n");
3181 @@ -2479,7 +2478,7 @@ static int ioapic_resume(struct sys_devi
3184 static struct sysdev_class ioapic_sysdev_class = {
3185 - set_kset_name("ioapic"),
3187 .suspend = ioapic_suspend,
3188 .resume = ioapic_resume,
3190 --- sle11-2009-10-16.orig/arch/x86/kernel/io_apic_64-xen.c 2009-02-16 16:18:36.000000000 +0100
3191 +++ sle11-2009-10-16/arch/x86/kernel/io_apic_64-xen.c 2009-03-16 16:33:40.000000000 +0100
3193 #include <linux/msi.h>
3194 #include <linux/htirq.h>
3195 #include <linux/dmar.h>
3196 +#include <linux/jiffies.h>
3198 #include <acpi/acpi_bus.h>
3200 +#include <linux/bootmem.h>
3202 #include <asm/idle.h>
3204 @@ -1064,7 +1066,7 @@ void __apicdebuginit print_local_APIC(vo
3205 v = apic_read(APIC_LVR);
3206 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
3207 ver = GET_APIC_VERSION(v);
3208 - maxlvt = get_maxlvt();
3209 + maxlvt = lapic_get_maxlvt();
3211 v = apic_read(APIC_TASKPRI);
3212 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
3213 @@ -1165,7 +1167,7 @@ void __apicdebuginit print_PIC(void)
3215 #endif /* !CONFIG_XEN */
3217 -static void __init enable_IO_APIC(void)
3218 +void __init enable_IO_APIC(void)
3220 union IO_APIC_reg_01 reg_01;
3222 @@ -1299,7 +1301,7 @@ static int __init timer_irq_works(void)
3226 - if (jiffies - t1 > 4)
3227 + if (time_after(jiffies, t1 + 4))
3231 @@ -1412,7 +1414,7 @@ static void irq_complete_move(unsigned i
3232 if (likely(!cfg->move_in_progress))
3235 - vector = ~get_irq_regs()->orig_rax;
3236 + vector = ~get_irq_regs()->orig_ax;
3237 me = smp_processor_id();
3238 if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
3239 cpumask_t cleanup_mask;
3240 @@ -1439,7 +1441,7 @@ static void ack_apic_level(unsigned int
3241 int do_unmask_irq = 0;
3243 irq_complete_move(irq);
3244 -#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
3245 +#ifdef CONFIG_GENERIC_PENDING_IRQ
3246 /* If we are moving the irq we need to mask it */
3247 if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
3249 @@ -1570,7 +1572,7 @@ static struct hw_interrupt_type lapic_ir
3250 .end = end_lapic_irq,
3253 -static void setup_nmi (void)
3254 +static void __init setup_nmi(void)
3257 * Dirty trick to enable the NMI watchdog ...
3258 @@ -1583,7 +1585,7 @@ static void setup_nmi (void)
3260 printk(KERN_INFO "activating NMI Watchdog ...");
3262 - enable_NMI_through_LVT0(NULL);
3263 + enable_NMI_through_LVT0();
3267 @@ -1659,7 +1661,7 @@ static inline void unlock_ExtINT_logic(v
3269 * FIXME: really need to revamp this for modern platforms only.
3271 -static inline void check_timer(void)
3272 +static inline void __init check_timer(void)
3274 struct irq_cfg *cfg = irq_cfg + 0;
3275 int apic1, pin1, apic2, pin2;
3276 @@ -1863,7 +1865,7 @@ static int ioapic_resume(struct sys_devi
3279 static struct sysdev_class ioapic_sysdev_class = {
3280 - set_kset_name("ioapic"),
3282 .suspend = ioapic_suspend,
3283 .resume = ioapic_resume,
3285 @@ -2303,5 +2305,93 @@ void __init setup_ioapic_dest(void)
3289 -#endif /* !CONFIG_XEN */
3291 +#define IOAPIC_RESOURCE_NAME_SIZE 11
3293 +static struct resource *ioapic_resources;
3295 +static struct resource * __init ioapic_setup_resources(void)
3298 + struct resource *res;
3302 + if (nr_ioapics <= 0)
3305 + n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
3308 + mem = alloc_bootmem(n);
3309 + res = (void *)mem;
3311 + if (mem != NULL) {
3312 + memset(mem, 0, n);
3313 + mem += sizeof(struct resource) * nr_ioapics;
3315 + for (i = 0; i < nr_ioapics; i++) {
3316 + res[i].name = mem;
3317 + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
3318 + sprintf(mem, "IOAPIC %u", i);
3319 + mem += IOAPIC_RESOURCE_NAME_SIZE;
3323 + ioapic_resources = res;
3328 +void __init ioapic_init_mappings(void)
3330 + unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
3331 + struct resource *ioapic_res;
3334 + ioapic_res = ioapic_setup_resources();
3335 + for (i = 0; i < nr_ioapics; i++) {
3336 + if (smp_found_config) {
3337 + ioapic_phys = mp_ioapics[i].mpc_apicaddr;
3339 + ioapic_phys = (unsigned long)
3340 + alloc_bootmem_pages(PAGE_SIZE);
3341 + ioapic_phys = __pa(ioapic_phys);
3343 + set_fixmap_nocache(idx, ioapic_phys);
3344 + apic_printk(APIC_VERBOSE,
3345 + "mapped IOAPIC to %016lx (%016lx)\n",
3346 + __fix_to_virt(idx), ioapic_phys);
3349 + if (ioapic_res != NULL) {
3350 + ioapic_res->start = ioapic_phys;
3351 + ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
3357 +static int __init ioapic_insert_resources(void)
3360 + struct resource *r = ioapic_resources;
3364 + "IO APIC resources could be not be allocated.\n");
3368 + for (i = 0; i < nr_ioapics; i++) {
3369 + insert_resource(&iomem_resource, r);
3376 +/* Insert the IO APIC resources after PCI initialization has occurred to handle
3377 + * IO APICS that are mapped in on a BAR in PCI space. */
3378 +late_initcall(ioapic_insert_resources);
3379 +#endif /* !CONFIG_XEN */
3380 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
3381 +++ sle11-2009-10-16/arch/x86/kernel/ioport-xen.c 2009-03-16 16:33:40.000000000 +0100
3384 + * This contains the io-permission bitmap code - written by obz, with changes
3385 + * by Linus. 32/64 bits code unification by Miguel Botón.
3388 +#include <linux/sched.h>
3389 +#include <linux/kernel.h>
3390 +#include <linux/capability.h>
3391 +#include <linux/errno.h>
3392 +#include <linux/types.h>
3393 +#include <linux/ioport.h>
3394 +#include <linux/smp.h>
3395 +#include <linux/stddef.h>
3396 +#include <linux/slab.h>
3397 +#include <linux/thread_info.h>
3398 +#include <linux/syscalls.h>
3399 +#include <xen/interface/physdev.h>
3401 +/* Set EXTENT bits starting at BASE in BITMAP to value NEW_VALUE. */
3402 +static void set_bitmap(unsigned long *bitmap, unsigned int base,
3403 + unsigned int extent, int new_value)
3407 + for (i = base; i < base + extent; i++) {
3409 + __set_bit(i, bitmap);
3411 + __clear_bit(i, bitmap);
3416 + * this changes the io permissions bitmap in the current task.
3418 +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
3420 + struct thread_struct * t = &current->thread;
3421 + struct physdev_set_iobitmap set_iobitmap;
3423 + if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
3425 + if (turn_on && !capable(CAP_SYS_RAWIO))
3429 + * If it's the first ioperm() call in this thread's lifetime, set the
3430 + * IO bitmap up. ioperm() is much less timing critical than clone(),
3431 + * this is why we delay this operation until now:
3433 + if (!t->io_bitmap_ptr) {
3434 + unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
3439 + memset(bitmap, 0xff, IO_BITMAP_BYTES);
3440 + t->io_bitmap_ptr = bitmap;
3441 + set_thread_flag(TIF_IO_BITMAP);
3443 + set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
3444 + set_iobitmap.nr_ports = IO_BITMAP_BITS;
3445 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
3449 + set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
3455 + * sys_iopl has to be used when you want to access the IO ports
3456 + * beyond the 0x3ff range: to get the full 65536 ports bitmapped
3457 + * you'd need 8kB of bitmaps/process, which is a bit excessive.
3459 +static int do_iopl(unsigned int level, struct thread_struct *t)
3461 + unsigned int old = t->iopl >> 12;
3465 + /* Trying to gain more privileges? */
3466 + if (level > old) {
3467 + if (!capable(CAP_SYS_RAWIO))
3474 +#ifdef CONFIG_X86_32
3475 +asmlinkage long sys_iopl(unsigned long regsp)
3477 + struct pt_regs *regs = (struct pt_regs *)&regsp;
3478 + unsigned int level = regs->bx;
3480 +asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
3483 + struct thread_struct *t = &current->thread;
3486 + rc = do_iopl(level, t);
3490 + t->iopl = level << 12;
3491 + set_iopl_mask(t->iopl);
3495 --- sle11-2009-10-16.orig/arch/x86/kernel/ioport_32-xen.c 2009-02-16 16:18:36.000000000 +0100
3496 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
3499 - * This contains the io-permission bitmap code - written by obz, with changes
3503 -#include <linux/sched.h>
3504 -#include <linux/kernel.h>
3505 -#include <linux/capability.h>
3506 -#include <linux/errno.h>
3507 -#include <linux/types.h>
3508 -#include <linux/ioport.h>
3509 -#include <linux/smp.h>
3510 -#include <linux/stddef.h>
3511 -#include <linux/slab.h>
3512 -#include <linux/thread_info.h>
3513 -#include <linux/syscalls.h>
3514 -#include <xen/interface/physdev.h>
3516 -/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
3517 -static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
3519 - unsigned long mask;
3520 - unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
3521 - unsigned int low_index = base & (BITS_PER_LONG-1);
3522 - int length = low_index + extent;
3524 - if (low_index != 0) {
3525 - mask = (~0UL << low_index);
3526 - if (length < BITS_PER_LONG)
3527 - mask &= ~(~0UL << length);
3529 - *bitmap_base++ |= mask;
3531 - *bitmap_base++ &= ~mask;
3532 - length -= BITS_PER_LONG;
3535 - mask = (new_value ? ~0UL : 0UL);
3536 - while (length >= BITS_PER_LONG) {
3537 - *bitmap_base++ = mask;
3538 - length -= BITS_PER_LONG;
3542 - mask = ~(~0UL << length);
3544 - *bitmap_base++ |= mask;
3546 - *bitmap_base++ &= ~mask;
3552 - * this changes the io permissions bitmap in the current task.
3554 -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
3556 - struct thread_struct * t = &current->thread;
3557 - unsigned long *bitmap;
3558 - struct physdev_set_iobitmap set_iobitmap;
3560 - if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
3562 - if (turn_on && !capable(CAP_SYS_RAWIO))
3566 - * If it's the first ioperm() call in this thread's lifetime, set the
3567 - * IO bitmap up. ioperm() is much less timing critical than clone(),
3568 - * this is why we delay this operation until now:
3570 - if (!t->io_bitmap_ptr) {
3571 - bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
3575 - memset(bitmap, 0xff, IO_BITMAP_BYTES);
3576 - t->io_bitmap_ptr = bitmap;
3577 - set_thread_flag(TIF_IO_BITMAP);
3579 - set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
3580 - set_iobitmap.nr_ports = IO_BITMAP_BITS;
3581 - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
3585 - set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
3591 - * sys_iopl has to be used when you want to access the IO ports
3592 - * beyond the 0x3ff range: to get the full 65536 ports bitmapped
3593 - * you'd need 8kB of bitmaps/process, which is a bit excessive.
3595 - * Here we just change the eflags value on the stack: we allow
3596 - * only the super-user to do it. This depends on the stack-layout
3597 - * on system-call entry - see also fork() and the signal handling
3601 -asmlinkage long sys_iopl(unsigned long unused)
3603 - volatile struct pt_regs * regs = (struct pt_regs *) &unused;
3604 - unsigned int level = regs->ebx;
3605 - struct thread_struct *t = &current->thread;
3606 - unsigned int old = (t->iopl >> 12) & 3;
3610 - /* Trying to gain more privileges? */
3611 - if (level > old) {
3612 - if (!capable(CAP_SYS_RAWIO))
3615 - t->iopl = level << 12;
3616 - set_iopl_mask(t->iopl);
3619 --- sle11-2009-10-16.orig/arch/x86/kernel/ioport_64-xen.c 2009-02-16 16:18:36.000000000 +0100
3620 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
3623 - * This contains the io-permission bitmap code - written by obz, with changes
3627 -#include <linux/sched.h>
3628 -#include <linux/kernel.h>
3629 -#include <linux/capability.h>
3630 -#include <linux/errno.h>
3631 -#include <linux/types.h>
3632 -#include <linux/ioport.h>
3633 -#include <linux/mm.h>
3634 -#include <linux/smp.h>
3635 -#include <linux/stddef.h>
3636 -#include <linux/slab.h>
3637 -#include <linux/thread_info.h>
3638 -#include <linux/syscalls.h>
3639 -#include <xen/interface/physdev.h>
3641 -/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
3642 -static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
3647 - for (i = base; i < base + extent; i++)
3648 - __set_bit(i, bitmap);
3650 - for (i = base; i < base + extent; i++)
3651 - clear_bit(i, bitmap);
3655 - * this changes the io permissions bitmap in the current task.
3657 -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
3659 - struct thread_struct * t = &current->thread;
3660 - unsigned long *bitmap;
3661 - struct physdev_set_iobitmap set_iobitmap;
3663 - if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
3665 - if (turn_on && !capable(CAP_SYS_RAWIO))
3669 - * If it's the first ioperm() call in this thread's lifetime, set the
3670 - * IO bitmap up. ioperm() is much less timing critical than clone(),
3671 - * this is why we delay this operation until now:
3673 - if (!t->io_bitmap_ptr) {
3674 - bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
3678 - memset(bitmap, 0xff, IO_BITMAP_BYTES);
3679 - t->io_bitmap_ptr = bitmap;
3680 - set_thread_flag(TIF_IO_BITMAP);
3682 - set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
3683 - set_iobitmap.nr_ports = IO_BITMAP_BITS;
3684 - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
3688 - set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
3694 - * sys_iopl has to be used when you want to access the IO ports
3695 - * beyond the 0x3ff range: to get the full 65536 ports bitmapped
3696 - * you'd need 8kB of bitmaps/process, which is a bit excessive.
3700 -asmlinkage long sys_iopl(unsigned int new_iopl, struct pt_regs *regs)
3702 - unsigned int old_iopl = current->thread.iopl;
3703 - struct physdev_set_iopl set_iopl;
3708 - /* Need "raw I/O" privileges for direct port access. */
3709 - if ((new_iopl > old_iopl) && !capable(CAP_SYS_RAWIO))
3712 - /* Change our version of the privilege levels. */
3713 - current->thread.iopl = new_iopl;
3715 - /* Force the change at ring 0. */
3716 - set_iopl.iopl = (new_iopl == 0) ? 1 : new_iopl;
3717 - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
3721 --- sle11-2009-10-16.orig/arch/x86/kernel/irq_32-xen.c 2009-02-16 16:18:36.000000000 +0100
3722 +++ sle11-2009-10-16/arch/x86/kernel/irq_32-xen.c 2009-03-16 16:33:40.000000000 +0100
3723 @@ -66,11 +66,11 @@ static union irq_ctx *softirq_ctx[NR_CPU
3724 * SMP cross-CPU interrupts have their own specific
3727 -fastcall unsigned int do_IRQ(struct pt_regs *regs)
3728 +unsigned int do_IRQ(struct pt_regs *regs)
3730 struct pt_regs *old_regs;
3731 /* high bit used in ret_from_ code */
3732 - int irq = ~regs->orig_eax;
3733 + int irq = ~regs->orig_ax;
3734 struct irq_desc *desc = irq_desc + irq;
3735 #ifdef CONFIG_4KSTACKS
3736 union irq_ctx *curctx, *irqctx;
3737 @@ -88,13 +88,13 @@ fastcall unsigned int do_IRQ(struct pt_r
3738 #ifdef CONFIG_DEBUG_STACKOVERFLOW
3739 /* Debugging check for stack overflow: is there less than 1KB free? */
3744 __asm__ __volatile__("andl %%esp,%0" :
3745 - "=r" (esp) : "0" (THREAD_SIZE - 1));
3746 - if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
3747 + "=r" (sp) : "0" (THREAD_SIZE - 1));
3748 + if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) {
3749 printk("do_IRQ: stack overflow: %ld\n",
3750 - esp - sizeof(struct thread_info));
3751 + sp - sizeof(struct thread_info));
3755 @@ -112,7 +112,7 @@ fastcall unsigned int do_IRQ(struct pt_r
3756 * current stack (which is the irq stack already after all)
3758 if (curctx != irqctx) {
3759 - int arg1, arg2, ebx;
3760 + int arg1, arg2, bx;
3762 /* build the stack frame on the IRQ stack */
3763 isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
3764 @@ -128,10 +128,10 @@ fastcall unsigned int do_IRQ(struct pt_r
3765 (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
3768 - " xchgl %%ebx,%%esp \n"
3770 - " movl %%ebx,%%esp \n"
3771 - : "=a" (arg1), "=d" (arg2), "=b" (ebx)
3772 + " xchgl %%ebx,%%esp \n"
3774 + " movl %%ebx,%%esp \n"
3775 + : "=a" (arg1), "=d" (arg2), "=b" (bx)
3776 : "0" (irq), "1" (desc), "2" (isp),
3777 "D" (desc->handle_irq)
3779 --- sle11-2009-10-16.orig/arch/x86/kernel/irq_64-xen.c 2009-02-16 16:18:36.000000000 +0100
3780 +++ sle11-2009-10-16/arch/x86/kernel/irq_64-xen.c 2009-03-16 16:33:40.000000000 +0100
3783 atomic_t irq_err_count;
3786 + * 'what should we do if we get a hw irq event on an illegal vector'.
3787 + * each architecture has to answer this themselves.
3789 +void ack_bad_irq(unsigned int irq)
3791 + printk(KERN_WARNING "unexpected IRQ trap at irq %02x\n", irq);
3792 +#ifdef CONFIG_X86_LOCAL_APIC
3794 + * Currently unexpected vectors happen only on SMP and APIC.
3795 + * We _must_ ack these because every local APIC has only N
3796 + * irq slots per priority level, and a 'hanging, unacked' IRQ
3797 + * holds up an irq slot - in excessive cases (when multiple
3798 + * unexpected vectors occur) that might lock up the APIC
3800 + * But don't ack when the APIC is disabled. -AK
3802 + if (!disable_apic)
3807 #ifdef CONFIG_DEBUG_STACKOVERFLOW
3809 * Probabilistic stack overflow check:
3810 @@ -33,11 +55,11 @@ static inline void stack_overflow_check(
3811 u64 curbase = (u64)task_stack_page(current);
3812 static unsigned long warned = -60*HZ;
3814 - if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE &&
3815 - regs->rsp < curbase + sizeof(struct thread_info) + 128 &&
3816 + if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE &&
3817 + regs->sp < curbase + sizeof(struct thread_info) + 128 &&
3818 time_after(jiffies, warned + 60*HZ)) {
3819 - printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n",
3820 - current->comm, curbase, regs->rsp);
3821 + printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
3822 + current->comm, curbase, regs->sp);
3823 show_stack(NULL,NULL);
3826 @@ -150,7 +172,7 @@ asmlinkage unsigned int do_IRQ(struct pt
3827 struct pt_regs *old_regs = set_irq_regs(regs);
3829 /* high bit used in ret_from_ code */
3830 - unsigned irq = ~regs->orig_rax;
3831 + unsigned irq = ~regs->orig_ax;
3835 @@ -251,14 +273,3 @@ asmlinkage void do_softirq(void)
3837 local_irq_restore(flags);
3840 -#ifndef CONFIG_X86_LOCAL_APIC
3842 - * 'what should we do if we get a hw irq event on an illegal vector'.
3843 - * each architecture has to answer this themselves.
3845 -void ack_bad_irq(unsigned int irq)
3847 - printk("unexpected IRQ trap at irq %02x\n", irq);
3850 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
3851 +++ sle11-2009-10-16/arch/x86/kernel/ldt-xen.c 2009-03-16 16:33:40.000000000 +0100
3854 + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
3855 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
3856 + * Copyright (C) 2002 Andi Kleen
3858 + * This handles calls from both 32bit and 64bit mode.
3861 +#include <linux/errno.h>
3862 +#include <linux/sched.h>
3863 +#include <linux/string.h>
3864 +#include <linux/mm.h>
3865 +#include <linux/smp.h>
3866 +#include <linux/vmalloc.h>
3868 +#include <asm/uaccess.h>
3869 +#include <asm/system.h>
3870 +#include <asm/ldt.h>
3871 +#include <asm/desc.h>
3872 +#include <asm/mmu_context.h>
3875 +static void flush_ldt(void *null)
3877 + if (current->active_mm)
3878 + load_LDT(&current->active_mm->context);
3882 +static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
3884 + void *oldldt, *newldt;
3887 + if (mincount <= pc->size)
3889 + oldsize = pc->size;
3890 + mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
3891 + (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
3892 + if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
3893 + newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
3895 + newldt = (void *)__get_free_page(GFP_KERNEL);
3901 + memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE);
3903 + memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
3904 + (mincount - oldsize) * LDT_ENTRY_SIZE);
3906 +#ifdef CONFIG_X86_64
3907 + /* CHECKME: Do we really need this ? */
3912 + pc->size = mincount;
3919 + preempt_disable();
3921 + make_pages_readonly(newldt,
3922 + (mincount * LDT_ENTRY_SIZE) / PAGE_SIZE,
3923 + XENFEAT_writable_descriptor_tables);
3926 + mask = cpumask_of_cpu(smp_processor_id());
3927 + if (!cpus_equal(current->mm->cpu_vm_mask, mask))
3928 + smp_call_function(flush_ldt, NULL, 1, 1);
3933 + make_pages_writable(oldldt,
3934 + (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
3935 + XENFEAT_writable_descriptor_tables);
3936 + if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
3939 + put_page(virt_to_page(oldldt));
3944 +static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
3946 + int err = alloc_ldt(new, old->size, 0);
3950 + memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE);
3951 + make_pages_readonly(new->ldt,
3952 + (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
3953 + XENFEAT_writable_descriptor_tables);
3958 + * we do not have to muck with descriptors here, that is
3959 + * done in switch_mm() as needed.
3961 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
3963 + struct mm_struct *old_mm;
3966 + memset(&mm->context, 0, sizeof(mm->context));
3967 + mutex_init(&mm->context.lock);
3968 + old_mm = current->mm;
3970 + mm->context.vdso = old_mm->context.vdso;
3971 + if (old_mm && old_mm->context.size > 0) {
3972 + mutex_lock(&old_mm->context.lock);
3973 + retval = copy_ldt(&mm->context, &old_mm->context);
3974 + mutex_unlock(&old_mm->context.lock);
3980 + * No need to lock the MM as we are the last user
3982 + * 64bit: Don't touch the LDT register - we're already in the next thread.
3984 +void destroy_context(struct mm_struct *mm)
3986 + if (mm->context.size) {
3987 + /* CHECKME: Can this ever happen ? */
3988 + if (mm == current->active_mm)
3990 + make_pages_writable(mm->context.ldt,
3991 + (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
3992 + XENFEAT_writable_descriptor_tables);
3993 + if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
3994 + vfree(mm->context.ldt);
3996 + put_page(virt_to_page(mm->context.ldt));
3997 + mm->context.size = 0;
4001 +static int read_ldt(void __user *ptr, unsigned long bytecount)
4004 + unsigned long size;
4005 + struct mm_struct *mm = current->mm;
4007 + if (!mm->context.size)
4009 + if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
4010 + bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
4012 + mutex_lock(&mm->context.lock);
4013 + size = mm->context.size * LDT_ENTRY_SIZE;
4014 + if (size > bytecount)
4018 + if (copy_to_user(ptr, mm->context.ldt, size))
4020 + mutex_unlock(&mm->context.lock);
4022 + goto error_return;
4023 + if (size != bytecount) {
4024 + /* zero-fill the rest */
4025 + if (clear_user(ptr + size, bytecount - size) != 0) {
4027 + goto error_return;
4035 +static int read_default_ldt(void __user *ptr, unsigned long bytecount)
4037 + /* CHECKME: Can we use _one_ random number ? */
4038 +#ifdef CONFIG_X86_32
4039 + unsigned long size = 5 * sizeof(struct desc_struct);
4041 + unsigned long size = 128;
4043 + if (bytecount > size)
4045 + if (clear_user(ptr, bytecount))
4050 +static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
4052 + struct mm_struct *mm = current->mm;
4053 + struct desc_struct ldt;
4055 + struct user_desc ldt_info;
4058 + if (bytecount != sizeof(ldt_info))
4061 + if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
4065 + if (ldt_info.entry_number >= LDT_ENTRIES)
4067 + if (ldt_info.contents == 3) {
4070 + if (ldt_info.seg_not_present == 0)
4074 + mutex_lock(&mm->context.lock);
4075 + if (ldt_info.entry_number >= mm->context.size) {
4076 + error = alloc_ldt(&current->mm->context,
4077 + ldt_info.entry_number + 1, 1);
4082 + /* Allow LDTs to be cleared by the user. */
4083 + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
4084 + if (oldmode || LDT_empty(&ldt_info)) {
4085 + memset(&ldt, 0, sizeof(ldt));
4090 + fill_ldt(&ldt, &ldt_info);
4094 + /* Install the new entry ... */
4096 + error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt);
4099 + mutex_unlock(&mm->context.lock);
4104 +asmlinkage int sys_modify_ldt(int func, void __user *ptr,
4105 + unsigned long bytecount)
4107 + int ret = -ENOSYS;
4111 + ret = read_ldt(ptr, bytecount);
4114 + ret = write_ldt(ptr, bytecount, 1);
4117 + ret = read_default_ldt(ptr, bytecount);
4120 + ret = write_ldt(ptr, bytecount, 0);
4125 --- sle11-2009-10-16.orig/arch/x86/kernel/ldt_32-xen.c 2009-02-16 16:18:36.000000000 +0100
4126 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
4129 - * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
4130 - * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
4133 -#include <linux/errno.h>
4134 -#include <linux/sched.h>
4135 -#include <linux/string.h>
4136 -#include <linux/mm.h>
4137 -#include <linux/smp.h>
4138 -#include <linux/vmalloc.h>
4139 -#include <linux/slab.h>
4141 -#include <asm/uaccess.h>
4142 -#include <asm/system.h>
4143 -#include <asm/ldt.h>
4144 -#include <asm/desc.h>
4145 -#include <asm/mmu_context.h>
4147 -#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */
4148 -static void flush_ldt(void *null)
4150 - if (current->active_mm)
4151 - load_LDT(&current->active_mm->context);
4155 -static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
4161 - if (mincount <= pc->size)
4163 - oldsize = pc->size;
4164 - mincount = (mincount+511)&(~511);
4165 - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
4166 - newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
4168 - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
4174 - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
4176 - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
4179 - pc->size = mincount;
4185 - preempt_disable();
4187 - make_pages_readonly(
4189 - (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4190 - XENFEAT_writable_descriptor_tables);
4193 - mask = cpumask_of_cpu(smp_processor_id());
4194 - if (!cpus_equal(current->mm->cpu_vm_mask, mask))
4195 - smp_call_function(flush_ldt, NULL, 1, 1);
4200 - make_pages_writable(
4202 - (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
4203 - XENFEAT_writable_descriptor_tables);
4204 - if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
4212 -static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
4214 - int err = alloc_ldt(new, old->size, 0);
4217 - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
4218 - make_pages_readonly(
4220 - (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4221 - XENFEAT_writable_descriptor_tables);
4226 - * we do not have to muck with descriptors here, that is
4227 - * done in switch_mm() as needed.
4229 -int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
4231 - struct mm_struct * old_mm;
4234 - mutex_init(&mm->context.lock);
4235 - mm->context.size = 0;
4236 - mm->context.has_foreign_mappings = 0;
4237 - old_mm = current->mm;
4238 - if (old_mm && old_mm->context.size > 0) {
4239 - mutex_lock(&old_mm->context.lock);
4240 - retval = copy_ldt(&mm->context, &old_mm->context);
4241 - mutex_unlock(&old_mm->context.lock);
4247 - * No need to lock the MM as we are the last user
4249 -void destroy_context(struct mm_struct *mm)
4251 - if (mm->context.size) {
4252 - if (mm == current->active_mm)
4254 - make_pages_writable(
4256 - (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4257 - XENFEAT_writable_descriptor_tables);
4258 - if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
4259 - vfree(mm->context.ldt);
4261 - kfree(mm->context.ldt);
4262 - mm->context.size = 0;
4266 -static int read_ldt(void __user * ptr, unsigned long bytecount)
4269 - unsigned long size;
4270 - struct mm_struct * mm = current->mm;
4272 - if (!mm->context.size)
4274 - if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
4275 - bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
4277 - mutex_lock(&mm->context.lock);
4278 - size = mm->context.size*LDT_ENTRY_SIZE;
4279 - if (size > bytecount)
4283 - if (copy_to_user(ptr, mm->context.ldt, size))
4285 - mutex_unlock(&mm->context.lock);
4287 - goto error_return;
4288 - if (size != bytecount) {
4289 - /* zero-fill the rest */
4290 - if (clear_user(ptr+size, bytecount-size) != 0) {
4292 - goto error_return;
4300 -static int read_default_ldt(void __user * ptr, unsigned long bytecount)
4303 - unsigned long size;
4306 - size = 5*sizeof(struct desc_struct);
4307 - if (size > bytecount)
4311 - if (clear_user(ptr, size))
4317 -static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
4319 - struct mm_struct * mm = current->mm;
4320 - __u32 entry_1, entry_2;
4322 - struct user_desc ldt_info;
4325 - if (bytecount != sizeof(ldt_info))
4328 - if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
4332 - if (ldt_info.entry_number >= LDT_ENTRIES)
4334 - if (ldt_info.contents == 3) {
4337 - if (ldt_info.seg_not_present == 0)
4341 - mutex_lock(&mm->context.lock);
4342 - if (ldt_info.entry_number >= mm->context.size) {
4343 - error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
4348 - /* Allow LDTs to be cleared by the user. */
4349 - if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
4350 - if (oldmode || LDT_empty(&ldt_info)) {
4357 - entry_1 = LDT_entry_a(&ldt_info);
4358 - entry_2 = LDT_entry_b(&ldt_info);
4360 - entry_2 &= ~(1 << 20);
4362 - /* Install the new entry ... */
4364 - error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number,
4365 - entry_1, entry_2);
4368 - mutex_unlock(&mm->context.lock);
4373 -asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
4375 - int ret = -ENOSYS;
4379 - ret = read_ldt(ptr, bytecount);
4382 - ret = write_ldt(ptr, bytecount, 1);
4385 - ret = read_default_ldt(ptr, bytecount);
4388 - ret = write_ldt(ptr, bytecount, 0);
4393 --- sle11-2009-10-16.orig/arch/x86/kernel/ldt_64-xen.c 2009-02-16 16:18:36.000000000 +0100
4394 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
4397 - * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
4398 - * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
4399 - * Copyright (C) 2002 Andi Kleen
4401 - * This handles calls from both 32bit and 64bit mode.
4404 -#include <linux/errno.h>
4405 -#include <linux/sched.h>
4406 -#include <linux/string.h>
4407 -#include <linux/mm.h>
4408 -#include <linux/smp.h>
4409 -#include <linux/vmalloc.h>
4410 -#include <linux/slab.h>
4412 -#include <asm/uaccess.h>
4413 -#include <asm/system.h>
4414 -#include <asm/ldt.h>
4415 -#include <asm/desc.h>
4416 -#include <asm/proto.h>
4417 -#include <asm/pgalloc.h>
4419 -#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */
4420 -static void flush_ldt(void *null)
4422 - if (current->active_mm)
4423 - load_LDT(&current->active_mm->context);
4427 -static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
4433 - if (mincount <= (unsigned)pc->size)
4435 - oldsize = pc->size;
4436 - mincount = (mincount+511)&(~511);
4437 - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
4438 - newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
4440 - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
4446 - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
4448 - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
4452 - pc->size = mincount;
4458 - preempt_disable();
4460 - make_pages_readonly(
4462 - (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4463 - XENFEAT_writable_descriptor_tables);
4466 - mask = cpumask_of_cpu(smp_processor_id());
4467 - if (!cpus_equal(current->mm->cpu_vm_mask, mask))
4468 - smp_call_function(flush_ldt, NULL, 1, 1);
4473 - make_pages_writable(
4475 - (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
4476 - XENFEAT_writable_descriptor_tables);
4477 - if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
4485 -static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
4487 - int err = alloc_ldt(new, old->size, 0);
4490 - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
4491 - make_pages_readonly(
4493 - (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4494 - XENFEAT_writable_descriptor_tables);
4499 - * we do not have to muck with descriptors here, that is
4500 - * done in switch_mm() as needed.
4502 -int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
4504 - struct mm_struct * old_mm;
4507 - memset(&mm->context, 0, sizeof(mm->context));
4508 - mutex_init(&mm->context.lock);
4509 - old_mm = current->mm;
4511 - mm->context.vdso = old_mm->context.vdso;
4512 - if (old_mm && old_mm->context.size > 0) {
4513 - mutex_lock(&old_mm->context.lock);
4514 - retval = copy_ldt(&mm->context, &old_mm->context);
4515 - mutex_unlock(&old_mm->context.lock);
4522 - * Don't touch the LDT register - we're already in the next thread.
4524 -void destroy_context(struct mm_struct *mm)
4526 - if (mm->context.size) {
4527 - if (mm == current->active_mm)
4529 - make_pages_writable(
4531 - (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
4532 - XENFEAT_writable_descriptor_tables);
4533 - if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
4534 - vfree(mm->context.ldt);
4536 - kfree(mm->context.ldt);
4537 - mm->context.size = 0;
4541 -static int read_ldt(void __user * ptr, unsigned long bytecount)
4544 - unsigned long size;
4545 - struct mm_struct * mm = current->mm;
4547 - if (!mm->context.size)
4549 - if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
4550 - bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
4552 - mutex_lock(&mm->context.lock);
4553 - size = mm->context.size*LDT_ENTRY_SIZE;
4554 - if (size > bytecount)
4558 - if (copy_to_user(ptr, mm->context.ldt, size))
4560 - mutex_unlock(&mm->context.lock);
4562 - goto error_return;
4563 - if (size != bytecount) {
4564 - /* zero-fill the rest */
4565 - if (clear_user(ptr+size, bytecount-size) != 0) {
4567 - goto error_return;
4575 -static int read_default_ldt(void __user * ptr, unsigned long bytecount)
4577 - /* Arbitrary number */
4578 - /* x86-64 default LDT is all zeros */
4579 - if (bytecount > 128)
4581 - if (clear_user(ptr, bytecount))
4586 -static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
4588 - struct task_struct *me = current;
4589 - struct mm_struct * mm = me->mm;
4590 - __u32 entry_1, entry_2, *lp;
4591 - unsigned long mach_lp;
4593 - struct user_desc ldt_info;
4597 - if (bytecount != sizeof(ldt_info))
4600 - if (copy_from_user(&ldt_info, ptr, bytecount))
4604 - if (ldt_info.entry_number >= LDT_ENTRIES)
4606 - if (ldt_info.contents == 3) {
4609 - if (ldt_info.seg_not_present == 0)
4613 - mutex_lock(&mm->context.lock);
4614 - if (ldt_info.entry_number >= (unsigned)mm->context.size) {
4615 - error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
4620 - lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
4621 - mach_lp = arbitrary_virt_to_machine(lp);
4623 - /* Allow LDTs to be cleared by the user. */
4624 - if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
4625 - if (oldmode || LDT_empty(&ldt_info)) {
4632 - entry_1 = LDT_entry_a(&ldt_info);
4633 - entry_2 = LDT_entry_b(&ldt_info);
4635 - entry_2 &= ~(1 << 20);
4637 - /* Install the new entry ... */
4639 - error = HYPERVISOR_update_descriptor(mach_lp, (unsigned long)((entry_1 | (unsigned long) entry_2 << 32)));
4642 - mutex_unlock(&mm->context.lock);
4647 -asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
4649 - int ret = -ENOSYS;
4653 - ret = read_ldt(ptr, bytecount);
4656 - ret = write_ldt(ptr, bytecount, 1);
4659 - ret = read_default_ldt(ptr, bytecount);
4662 - ret = write_ldt(ptr, bytecount, 0);
4667 --- sle11-2009-10-16.orig/arch/x86/kernel/machine_kexec_64.c 2008-11-25 12:35:54.000000000 +0100
4668 +++ sle11-2009-10-16/arch/x86/kernel/machine_kexec_64.c 2009-03-16 16:33:40.000000000 +0100
4669 @@ -300,7 +300,9 @@ void machine_kexec(struct kimage *image)
4671 void arch_crash_save_vmcoreinfo(void)
4673 +#ifndef CONFIG_XEN /* could really be CONFIG_RELOCATABLE */
4674 VMCOREINFO_SYMBOL(phys_base);
4676 VMCOREINFO_SYMBOL(init_level4_pgt);
4679 --- sle11-2009-10-16.orig/arch/x86/kernel/microcode-xen.c 2009-02-16 16:17:21.000000000 +0100
4680 +++ sle11-2009-10-16/arch/x86/kernel/microcode-xen.c 2009-03-16 16:33:40.000000000 +0100
4681 @@ -167,7 +167,7 @@ static int request_microcode(void)
4684 op.cmd = XENPF_microcode_update;
4685 - set_xen_guest_handle(op.u.microcode.data, (void *)firmware->data);
4686 + set_xen_guest_handle(op.u.microcode.data, firmware->data);
4687 op.u.microcode.length = firmware->size;
4688 error = HYPERVISOR_platform_op(&op);
4690 --- sle11-2009-10-16.orig/arch/x86/kernel/mpparse_32-xen.c 2009-02-16 16:18:36.000000000 +0100
4691 +++ sle11-2009-10-16/arch/x86/kernel/mpparse_32-xen.c 2009-03-16 16:33:40.000000000 +0100
4692 @@ -68,7 +68,7 @@ unsigned int def_to_bigsmp = 0;
4693 /* Processor that is doing the boot up */
4694 unsigned int boot_cpu_physical_apicid = -1U;
4695 /* Internal processor count */
4696 -unsigned int __cpuinitdata num_processors;
4697 +unsigned int num_processors;
4699 /* Bitmask of physically existing CPUs */
4700 physid_mask_t phys_cpu_present_map;
4701 @@ -265,7 +265,7 @@ static void __init MP_ioapic_info (struc
4702 if (!(m->mpc_flags & MPC_APIC_USABLE))
4705 - printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n",
4706 + printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
4707 m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
4708 if (nr_ioapics >= MAX_IO_APICS) {
4709 printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
4710 @@ -412,9 +412,9 @@ static int __init smp_read_mpc(struct mp
4712 mps_oem_check(mpc, oem, str);
4714 - printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
4715 + printk("APIC at: 0x%X\n", mpc->mpc_lapic);
4719 * Save the local APIC address (it might be non-default) -- but only
4720 * if we're not using ACPI.
4722 @@ -728,7 +728,7 @@ static int __init smp_scan_config (unsig
4723 unsigned long *bp = isa_bus_to_virt(base);
4724 struct intel_mp_floating *mpf;
4726 - Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
4727 + printk(KERN_INFO "Scan SMP from %p for %ld bytes.\n", bp,length);
4728 if (sizeof(*mpf) != 16)
4729 printk("Error: MPF size\n");
4731 @@ -742,9 +742,10 @@ static int __init smp_scan_config (unsig
4733 smp_found_config = 1;
4735 - printk(KERN_INFO "found SMP MP-table at %08lx\n",
4736 - virt_to_phys(mpf));
4737 - reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
4738 + printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
4739 + mpf, virt_to_phys(mpf));
4740 + reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
4742 if (mpf->mpf_physptr) {
4744 * We cannot access to MPC table to compute
4745 @@ -759,11 +760,12 @@ static int __init smp_scan_config (unsig
4746 unsigned long end = max_low_pfn * PAGE_SIZE;
4747 if (mpf->mpf_physptr + size > end)
4748 size = end - mpf->mpf_physptr;
4749 - reserve_bootmem(mpf->mpf_physptr, size);
4750 + reserve_bootmem(mpf->mpf_physptr, size,
4754 - printk(KERN_INFO "found SMP MP-table at %08lx\n",
4755 - ((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base);
4756 + printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
4757 + mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
4761 @@ -940,14 +942,14 @@ void __init mp_register_ioapic(u8 id, u3
4763 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
4764 mp_ioapic_routing[idx].gsi_base = gsi_base;
4765 - mp_ioapic_routing[idx].gsi_end = gsi_base +
4766 + mp_ioapic_routing[idx].gsi_end = gsi_base +
4767 io_apic_get_redir_entries(idx);
4769 - printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
4770 - "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
4771 - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
4772 - mp_ioapic_routing[idx].gsi_base,
4773 - mp_ioapic_routing[idx].gsi_end);
4774 + printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
4775 + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
4776 + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
4777 + mp_ioapic_routing[idx].gsi_base,
4778 + mp_ioapic_routing[idx].gsi_end);
4782 @@ -1063,15 +1065,16 @@ void __init mp_config_acpi_legacy_irqs (
4785 #define MAX_GSI_NUM 4096
4786 +#define IRQ_COMPRESSION_START 64
4788 int mp_register_gsi(u32 gsi, int triggering, int polarity)
4793 - static int pci_irq = 16;
4794 + static int pci_irq = IRQ_COMPRESSION_START;
4796 - * Mapping between Global System Interrups, which
4797 + * Mapping between Global System Interrupts, which
4798 * represent all possible interrupts, and IRQs
4799 * assigned to actual devices.
4801 @@ -1108,12 +1111,16 @@ int mp_register_gsi(u32 gsi, int trigger
4802 if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
4803 Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
4804 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
4805 - return gsi_to_irq[gsi];
4806 + return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
4809 mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
4811 - if (triggering == ACPI_LEVEL_SENSITIVE) {
4813 + * For GSI >= 64, use IRQ compression
4815 + if ((gsi >= IRQ_COMPRESSION_START)
4816 + && (triggering == ACPI_LEVEL_SENSITIVE)) {
4818 * For PCI devices assign IRQs in order, avoiding gaps
4819 * due to unused I/O APIC pins.
4820 --- sle11-2009-10-16.orig/arch/x86/kernel/mpparse_64-xen.c 2009-02-16 16:18:36.000000000 +0100
4821 +++ sle11-2009-10-16/arch/x86/kernel/mpparse_64-xen.c 2009-03-16 16:33:40.000000000 +0100
4822 @@ -60,14 +60,20 @@ unsigned int boot_cpu_id = -1U;
4823 EXPORT_SYMBOL(boot_cpu_id);
4825 /* Internal processor count */
4826 -unsigned int num_processors __cpuinitdata = 0;
4827 +unsigned int num_processors;
4829 unsigned disabled_cpus __cpuinitdata;
4831 /* Bitmask of physically existing CPUs */
4832 physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
4834 -u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
4836 +u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata
4837 + = { [0 ... NR_CPUS-1] = BAD_APICID };
4838 +void *x86_bios_cpu_apicid_early_ptr;
4840 +DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
4841 +EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
4845 @@ -119,24 +125,22 @@ static void __cpuinit MP_processor_info(
4846 physid_set(m->mpc_apicid, phys_cpu_present_map);
4847 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
4849 - * bios_cpu_apicid is required to have processors listed
4850 + * x86_bios_cpu_apicid is required to have processors listed
4851 * in same order as logical cpu numbers. Hence the first
4852 * entry is BSP, and so on.
4856 - bios_cpu_apicid[cpu] = m->mpc_apicid;
4858 - * We get called early in the the start_kernel initialization
4859 - * process when the per_cpu data area is not yet setup, so we
4860 - * use a static array that is removed after the per_cpu data
4861 - * area is created.
4863 - if (x86_cpu_to_apicid_ptr) {
4864 - u8 *x86_cpu_to_apicid = (u8 *)x86_cpu_to_apicid_ptr;
4865 - x86_cpu_to_apicid[cpu] = m->mpc_apicid;
4866 + /* are we being called early in kernel startup? */
4867 + if (x86_cpu_to_apicid_early_ptr) {
4868 + u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr;
4869 + u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
4871 + cpu_to_apicid[cpu] = m->mpc_apicid;
4872 + bios_cpu_apicid[cpu] = m->mpc_apicid;
4874 per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid;
4875 + per_cpu(x86_bios_cpu_apicid, cpu) = m->mpc_apicid;
4878 cpu_set(cpu, cpu_possible_map);
4879 --- sle11-2009-10-16.orig/arch/x86/kernel/pci-dma-xen.c 2009-02-16 16:18:36.000000000 +0100
4880 +++ sle11-2009-10-16/arch/x86/kernel/pci-dma-xen.c 2009-03-16 16:33:40.000000000 +0100
4881 @@ -434,3 +434,23 @@ dma_sync_single_for_device(struct device
4882 swiotlb_sync_single_for_device(dev, dma_handle, size, direction);
4884 EXPORT_SYMBOL(dma_sync_single_for_device);
4887 +dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
4888 + enum dma_data_direction direction)
4891 + swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
4892 + flush_write_buffers();
4894 +EXPORT_SYMBOL(dma_sync_sg_for_cpu);
4897 +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
4898 + enum dma_data_direction direction)
4901 + swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
4902 + flush_write_buffers();
4904 +EXPORT_SYMBOL(dma_sync_sg_for_device);
4905 --- sle11-2009-10-16.orig/arch/x86/kernel/process_32-xen.c 2009-02-16 16:18:36.000000000 +0100
4906 +++ sle11-2009-10-16/arch/x86/kernel/process_32-xen.c 2009-03-16 16:33:40.000000000 +0100
4908 #include <linux/slab.h>
4909 #include <linux/vmalloc.h>
4910 #include <linux/user.h>
4911 -#include <linux/a.out.h>
4912 #include <linux/interrupt.h>
4913 #include <linux/utsname.h>
4914 #include <linux/delay.h>
4917 #include <asm/tlbflush.h>
4918 #include <asm/cpu.h>
4919 +#include <asm/kdebug.h>
4921 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
4922 +asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork");
4924 static int hlt_counter;
4926 @@ -78,7 +79,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number);
4928 unsigned long thread_saved_pc(struct task_struct *tsk)
4930 - return ((unsigned long *)tsk->thread.esp)[3];
4931 + return ((unsigned long *)tsk->thread.sp)[3];
4935 @@ -86,7 +87,6 @@ unsigned long thread_saved_pc(struct tas
4937 void (*pm_idle)(void);
4938 EXPORT_SYMBOL(pm_idle);
4939 -static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
4941 void disable_hlt(void)
4943 @@ -107,7 +107,7 @@ EXPORT_SYMBOL(enable_hlt);
4944 * to poll the ->work.need_resched flag instead of waiting for the
4945 * cross-CPU IPI to arrive. Use this option with caution.
4947 -static void poll_idle (void)
4948 +static void poll_idle(void)
4952 @@ -122,10 +122,19 @@ static void xen_idle(void)
4955 local_irq_disable();
4956 - if (!need_resched())
4957 + if (!need_resched()) {
4962 + t0n = ktime_to_ns(t0);
4963 safe_halt(); /* enables interrupts racelessly */
4965 - local_irq_enable();
4966 + local_irq_disable();
4968 + t1n = ktime_to_ns(t1);
4969 + sched_clock_idle_wakeup_event(t1n - t0n);
4971 + local_irq_enable();
4972 current_thread_info()->status |= TS_POLLING;
4974 #ifdef CONFIG_APM_MODULE
4975 @@ -168,13 +177,13 @@ void cpu_idle(void)
4976 while (!need_resched()) {
4979 - if (__get_cpu_var(cpu_idle_state))
4980 - __get_cpu_var(cpu_idle_state) = 0;
4984 idle = xen_idle; /* no alternatives */
4986 + if (rcu_pending(cpu))
4987 + rcu_check_callbacks(cpu, 0);
4989 if (cpu_is_offline(cpu))
4992 @@ -192,40 +201,19 @@ static void do_nothing(void *unused)
4997 + * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
4998 + * pm_idle and update to new pm_idle value. Required while changing pm_idle
4999 + * handler on SMP systems.
5001 + * Caller must have changed pm_idle to the new value before the call. Old
5002 + * pm_idle value will not be used by any CPU after the return of this function.
5004 void cpu_idle_wait(void)
5006 - unsigned int cpu, this_cpu = get_cpu();
5007 - cpumask_t map, tmp = current->cpus_allowed;
5009 - set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
5013 - for_each_online_cpu(cpu) {
5014 - per_cpu(cpu_idle_state, cpu) = 1;
5015 - cpu_set(cpu, map);
5018 - __get_cpu_var(cpu_idle_state) = 0;
5023 - for_each_online_cpu(cpu) {
5024 - if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
5025 - cpu_clear(cpu, map);
5027 - cpus_and(map, map, cpu_online_map);
5029 - * We waited 1 sec, if a CPU still did not call idle
5030 - * it may be because it is in idle and not waking up
5031 - * because it has nothing to do.
5032 - * Give all the remaining CPUS a kick.
5034 - smp_call_function_mask(map, do_nothing, 0, 0);
5035 - } while (!cpus_empty(map));
5037 - set_cpus_allowed(current, tmp);
5039 + /* kick all the CPUs so that they exit out of pm_idle */
5040 + smp_call_function(do_nothing, NULL, 0, 1);
5042 EXPORT_SYMBOL_GPL(cpu_idle_wait);
5044 @@ -251,15 +239,15 @@ void __show_registers(struct pt_regs *re
5046 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
5047 unsigned long d0, d1, d2, d3, d6, d7;
5048 - unsigned long esp;
5050 unsigned short ss, gs;
5052 if (user_mode_vm(regs)) {
5054 - ss = regs->xss & 0xffff;
5056 + ss = regs->ss & 0xffff;
5057 savesegment(gs, gs);
5059 - esp = (unsigned long) (&regs->esp);
5060 + sp = (unsigned long) (&regs->sp);
5061 savesegment(ss, ss);
5062 savesegment(gs, gs);
5064 @@ -272,17 +260,17 @@ void __show_registers(struct pt_regs *re
5065 init_utsname()->version);
5067 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
5068 - 0xffff & regs->xcs, regs->eip, regs->eflags,
5069 + 0xffff & regs->cs, regs->ip, regs->flags,
5070 smp_processor_id());
5071 - print_symbol("EIP is at %s\n", regs->eip);
5072 + print_symbol("EIP is at %s\n", regs->ip);
5074 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
5075 - regs->eax, regs->ebx, regs->ecx, regs->edx);
5076 + regs->ax, regs->bx, regs->cx, regs->dx);
5077 printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
5078 - regs->esi, regs->edi, regs->ebp, esp);
5079 + regs->si, regs->di, regs->bp, sp);
5080 printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
5081 - regs->xds & 0xffff, regs->xes & 0xffff,
5082 - regs->xfs & 0xffff, gs, ss);
5083 + regs->ds & 0xffff, regs->es & 0xffff,
5084 + regs->fs & 0xffff, gs, ss);
5088 @@ -310,12 +298,12 @@ void __show_registers(struct pt_regs *re
5089 void show_regs(struct pt_regs *regs)
5091 __show_registers(regs, 1);
5092 - show_trace(NULL, regs, &regs->esp);
5093 + show_trace(NULL, regs, &regs->sp, regs->bp);
5097 - * This gets run with %ebx containing the
5098 - * function to call, and %edx containing
5099 + * This gets run with %bx containing the
5100 + * function to call, and %dx containing
5103 extern void kernel_thread_helper(void);
5104 @@ -329,16 +317,16 @@ int kernel_thread(int (*fn)(void *), voi
5106 memset(&regs, 0, sizeof(regs));
5108 - regs.ebx = (unsigned long) fn;
5109 - regs.edx = (unsigned long) arg;
5110 + regs.bx = (unsigned long) fn;
5111 + regs.dx = (unsigned long) arg;
5113 - regs.xds = __USER_DS;
5114 - regs.xes = __USER_DS;
5115 - regs.xfs = __KERNEL_PERCPU;
5116 - regs.orig_eax = -1;
5117 - regs.eip = (unsigned long) kernel_thread_helper;
5118 - regs.xcs = __KERNEL_CS | get_kernel_rpl();
5119 - regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
5120 + regs.ds = __USER_DS;
5121 + regs.es = __USER_DS;
5122 + regs.fs = __KERNEL_PERCPU;
5123 + regs.orig_ax = -1;
5124 + regs.ip = (unsigned long) kernel_thread_helper;
5125 + regs.cs = __KERNEL_CS | get_kernel_rpl();
5126 + regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
5128 /* Ok, create the new process.. */
5129 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
5130 @@ -368,7 +356,12 @@ void flush_thread(void)
5132 struct task_struct *tsk = current;
5134 - memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
5135 + tsk->thread.debugreg0 = 0;
5136 + tsk->thread.debugreg1 = 0;
5137 + tsk->thread.debugreg2 = 0;
5138 + tsk->thread.debugreg3 = 0;
5139 + tsk->thread.debugreg6 = 0;
5140 + tsk->thread.debugreg7 = 0;
5141 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
5142 clear_tsk_thread_flag(tsk, TIF_DEBUG);
5144 @@ -393,7 +386,7 @@ void prepare_to_copy(struct task_struct
5148 -int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
5149 +int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
5150 unsigned long unused,
5151 struct task_struct * p, struct pt_regs * regs)
5153 @@ -403,17 +396,19 @@ int copy_thread(int nr, unsigned long cl
5155 childregs = task_pt_regs(p);
5157 - childregs->eax = 0;
5158 - childregs->esp = esp;
5159 + childregs->ax = 0;
5160 + childregs->sp = sp;
5162 - p->thread.esp = (unsigned long) childregs;
5163 - p->thread.esp0 = (unsigned long) (childregs+1);
5164 + p->thread.sp = (unsigned long) childregs;
5165 + p->thread.sp0 = (unsigned long) (childregs+1);
5167 - p->thread.eip = (unsigned long) ret_from_fork;
5168 + p->thread.ip = (unsigned long) ret_from_fork;
5170 - savesegment(gs,p->thread.gs);
5171 + savesegment(gs, p->thread.gs);
5174 + if (test_tsk_thread_flag(tsk, TIF_CSTAR))
5175 + p->thread.ip = (unsigned long) cstar_ret_from_fork;
5176 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
5177 p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
5178 IO_BITMAP_BYTES, GFP_KERNEL);
5179 @@ -424,34 +419,17 @@ int copy_thread(int nr, unsigned long cl
5180 set_tsk_thread_flag(p, TIF_IO_BITMAP);
5186 * Set a new TLS for the child thread?
5188 - if (clone_flags & CLONE_SETTLS) {
5189 - struct desc_struct *desc;
5190 - struct user_desc info;
5194 - if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
5197 - if (LDT_empty(&info))
5200 - idx = info.entry_number;
5201 - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
5204 - desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
5205 - desc->a = LDT_entry_a(&info);
5206 - desc->b = LDT_entry_b(&info);
5208 + if (clone_flags & CLONE_SETTLS)
5209 + err = do_set_thread_area(p, -1,
5210 + (struct user_desc __user *)childregs->si, 0);
5212 p->thread.iopl = current->thread.iopl;
5216 if (err && p->thread.io_bitmap_ptr) {
5217 kfree(p->thread.io_bitmap_ptr);
5218 p->thread.io_bitmap_max = 0;
5219 @@ -459,67 +437,8 @@ int copy_thread(int nr, unsigned long cl
5224 - * fill in the user structure for a core dump..
5226 -void dump_thread(struct pt_regs * regs, struct user * dump)
5230 -/* changed the size calculations - should hopefully work better. lbt */
5231 - dump->magic = CMAGIC;
5232 - dump->start_code = 0;
5233 - dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
5234 - dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
5235 - dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
5236 - dump->u_dsize -= dump->u_tsize;
5237 - dump->u_ssize = 0;
5238 - for (i = 0; i < 8; i++)
5239 - dump->u_debugreg[i] = current->thread.debugreg[i];
5241 - if (dump->start_stack < TASK_SIZE)
5242 - dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
5244 - dump->regs.ebx = regs->ebx;
5245 - dump->regs.ecx = regs->ecx;
5246 - dump->regs.edx = regs->edx;
5247 - dump->regs.esi = regs->esi;
5248 - dump->regs.edi = regs->edi;
5249 - dump->regs.ebp = regs->ebp;
5250 - dump->regs.eax = regs->eax;
5251 - dump->regs.ds = regs->xds;
5252 - dump->regs.es = regs->xes;
5253 - dump->regs.fs = regs->xfs;
5254 - savesegment(gs,dump->regs.gs);
5255 - dump->regs.orig_eax = regs->orig_eax;
5256 - dump->regs.eip = regs->eip;
5257 - dump->regs.cs = regs->xcs;
5258 - dump->regs.eflags = regs->eflags;
5259 - dump->regs.esp = regs->esp;
5260 - dump->regs.ss = regs->xss;
5262 - dump->u_fpvalid = dump_fpu (regs, &dump->i387);
5264 -EXPORT_SYMBOL(dump_thread);
5267 - * Capture the user space registers if the task is not running (in user space)
5269 -int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
5271 - struct pt_regs ptregs = *task_pt_regs(tsk);
5272 - ptregs.xcs &= 0xffff;
5273 - ptregs.xds &= 0xffff;
5274 - ptregs.xes &= 0xffff;
5275 - ptregs.xss &= 0xffff;
5277 - elf_core_copy_regs(regs, &ptregs);
5282 #ifdef CONFIG_SECCOMP
5283 -void hard_disable_TSC(void)
5284 +static void hard_disable_TSC(void)
5286 write_cr4(read_cr4() | X86_CR4_TSD);
5288 @@ -534,7 +453,7 @@ void disable_TSC(void)
5292 -void hard_enable_TSC(void)
5293 +static void hard_enable_TSC(void)
5295 write_cr4(read_cr4() & ~X86_CR4_TSD);
5297 @@ -543,18 +462,32 @@ void hard_enable_TSC(void)
5298 static noinline void
5299 __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
5301 - struct thread_struct *next;
5302 + struct thread_struct *prev, *next;
5303 + unsigned long debugctl;
5305 + prev = &prev_p->thread;
5306 next = &next_p->thread;
5308 + debugctl = prev->debugctlmsr;
5309 + if (next->ds_area_msr != prev->ds_area_msr) {
5310 + /* we clear debugctl to make sure DS
5311 + * is not in use when we change it */
5313 + wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
5314 + wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
5317 + if (next->debugctlmsr != debugctl)
5318 + wrmsr(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr, 0);
5320 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
5321 - set_debugreg(next->debugreg[0], 0);
5322 - set_debugreg(next->debugreg[1], 1);
5323 - set_debugreg(next->debugreg[2], 2);
5324 - set_debugreg(next->debugreg[3], 3);
5325 + set_debugreg(next->debugreg0, 0);
5326 + set_debugreg(next->debugreg1, 1);
5327 + set_debugreg(next->debugreg2, 2);
5328 + set_debugreg(next->debugreg3, 3);
5330 - set_debugreg(next->debugreg[6], 6);
5331 - set_debugreg(next->debugreg[7], 7);
5332 + set_debugreg(next->debugreg6, 6);
5333 + set_debugreg(next->debugreg7, 7);
5336 #ifdef CONFIG_SECCOMP
5337 @@ -567,6 +500,14 @@ __switch_to_xtra(struct task_struct *pre
5343 + if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
5344 + ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
5346 + if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
5347 + ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
5352 @@ -592,11 +533,11 @@ __switch_to_xtra(struct task_struct *pre
5353 * More important, however, is the fact that this allows us much
5356 - * The return value (in %eax) will be the "prev" task after
5357 + * The return value (in %ax) will be the "prev" task after
5358 * the task-switch, and shows up in ret_from_fork in entry.S,
5361 -struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
5362 +struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
5364 struct thread_struct *prev = &prev_p->thread,
5365 *next = &next_p->thread;
5366 @@ -632,12 +573,12 @@ struct task_struct fastcall * __switch_t
5371 - * This is load_esp0(tss, next) with a multicall.
5373 + * This is load_sp0(tss, next) with a multicall.
5375 mcl->op = __HYPERVISOR_stack_switch;
5376 mcl->args[0] = __KERNEL_DS;
5377 - mcl->args[1] = next->esp0;
5378 + mcl->args[1] = next->sp0;
5382 @@ -734,7 +675,7 @@ struct task_struct fastcall * __switch_t
5384 asmlinkage int sys_fork(struct pt_regs regs)
5386 - return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
5387 + return do_fork(SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
5390 asmlinkage int sys_clone(struct pt_regs regs)
5391 @@ -743,12 +684,12 @@ asmlinkage int sys_clone(struct pt_regs
5392 unsigned long newsp;
5393 int __user *parent_tidptr, *child_tidptr;
5395 - clone_flags = regs.ebx;
5397 - parent_tidptr = (int __user *)regs.edx;
5398 - child_tidptr = (int __user *)regs.edi;
5399 + clone_flags = regs.bx;
5401 + parent_tidptr = (int __user *)regs.dx;
5402 + child_tidptr = (int __user *)regs.di;
5406 return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
5409 @@ -764,7 +705,7 @@ asmlinkage int sys_clone(struct pt_regs
5411 asmlinkage int sys_vfork(struct pt_regs regs)
5413 - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
5414 + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
5418 @@ -775,18 +716,15 @@ asmlinkage int sys_execve(struct pt_regs
5422 - filename = getname((char __user *) regs.ebx);
5423 + filename = getname((char __user *) regs.bx);
5424 error = PTR_ERR(filename);
5425 if (IS_ERR(filename))
5427 error = do_execve(filename,
5428 - (char __user * __user *) regs.ecx,
5429 - (char __user * __user *) regs.edx,
5430 + (char __user * __user *) regs.cx,
5431 + (char __user * __user *) regs.dx,
5434 - task_lock(current);
5435 - current->ptrace &= ~PT_DTRACE;
5436 - task_unlock(current);
5437 /* Make sure we don't return using sysenter.. */
5438 set_thread_flag(TIF_IRET);
5440 @@ -800,145 +738,37 @@ out:
5442 unsigned long get_wchan(struct task_struct *p)
5444 - unsigned long ebp, esp, eip;
5445 + unsigned long bp, sp, ip;
5446 unsigned long stack_page;
5448 if (!p || p == current || p->state == TASK_RUNNING)
5450 stack_page = (unsigned long)task_stack_page(p);
5451 - esp = p->thread.esp;
5452 - if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
5453 + sp = p->thread.sp;
5454 + if (!stack_page || sp < stack_page || sp > top_esp+stack_page)
5456 - /* include/asm-i386/system.h:switch_to() pushes ebp last. */
5457 - ebp = *(unsigned long *) esp;
5458 + /* include/asm-i386/system.h:switch_to() pushes bp last. */
5459 + bp = *(unsigned long *) sp;
5461 - if (ebp < stack_page || ebp > top_ebp+stack_page)
5462 + if (bp < stack_page || bp > top_ebp+stack_page)
5464 - eip = *(unsigned long *) (ebp+4);
5465 - if (!in_sched_functions(eip))
5467 - ebp = *(unsigned long *) ebp;
5468 + ip = *(unsigned long *) (bp+4);
5469 + if (!in_sched_functions(ip))
5471 + bp = *(unsigned long *) bp;
5472 } while (count++ < 16);
5477 - * sys_alloc_thread_area: get a yet unused TLS descriptor index.
5479 -static int get_free_idx(void)
5481 - struct thread_struct *t = &current->thread;
5484 - for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
5485 - if (desc_empty(t->tls_array + idx))
5486 - return idx + GDT_ENTRY_TLS_MIN;
5491 - * Set a given TLS descriptor:
5493 -asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
5495 - struct thread_struct *t = &current->thread;
5496 - struct user_desc info;
5497 - struct desc_struct *desc;
5500 - if (copy_from_user(&info, u_info, sizeof(info)))
5502 - idx = info.entry_number;
5505 - * index -1 means the kernel should try to find and
5506 - * allocate an empty descriptor:
5509 - idx = get_free_idx();
5512 - if (put_user(idx, &u_info->entry_number))
5516 - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
5519 - desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
5522 - * We must not get preempted while modifying the TLS.
5526 - if (LDT_empty(&info)) {
5530 - desc->a = LDT_entry_a(&info);
5531 - desc->b = LDT_entry_b(&info);
5541 - * Get the current Thread-Local Storage area:
5544 -#define GET_BASE(desc) ( \
5545 - (((desc)->a >> 16) & 0x0000ffff) | \
5546 - (((desc)->b << 16) & 0x00ff0000) | \
5547 - ( (desc)->b & 0xff000000) )
5549 -#define GET_LIMIT(desc) ( \
5550 - ((desc)->a & 0x0ffff) | \
5551 - ((desc)->b & 0xf0000) )
5553 -#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
5554 -#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
5555 -#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
5556 -#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
5557 -#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
5558 -#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
5560 -asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
5562 - struct user_desc info;
5563 - struct desc_struct *desc;
5566 - if (get_user(idx, &u_info->entry_number))
5568 - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
5571 - memset(&info, 0, sizeof(info));
5573 - desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
5575 - info.entry_number = idx;
5576 - info.base_addr = GET_BASE(desc);
5577 - info.limit = GET_LIMIT(desc);
5578 - info.seg_32bit = GET_32BIT(desc);
5579 - info.contents = GET_CONTENTS(desc);
5580 - info.read_exec_only = !GET_WRITABLE(desc);
5581 - info.limit_in_pages = GET_LIMIT_PAGES(desc);
5582 - info.seg_not_present = !GET_PRESENT(desc);
5583 - info.useable = GET_USEABLE(desc);
5585 - if (copy_to_user(u_info, &info, sizeof(info)))
5590 unsigned long arch_align_stack(unsigned long sp)
5592 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
5593 sp -= get_random_int() % 8192;
5597 +unsigned long arch_randomize_brk(struct mm_struct *mm)
5599 + unsigned long range_end = mm->brk + 0x02000000;
5600 + return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
5602 --- sle11-2009-10-16.orig/arch/x86/kernel/process_64-xen.c 2009-02-16 16:18:36.000000000 +0100
5603 +++ sle11-2009-10-16/arch/x86/kernel/process_64-xen.c 2009-03-16 16:33:40.000000000 +0100
5606 * Pentium III FXSR, SSE support
5607 * Gareth Hughes <gareth@valinux.com>, May 2000
5614 #include <linux/cpu.h>
5615 #include <linux/errno.h>
5616 #include <linux/sched.h>
5617 +#include <linux/fs.h>
5618 #include <linux/kernel.h>
5619 #include <linux/mm.h>
5620 -#include <linux/fs.h>
5621 #include <linux/elfcore.h>
5622 #include <linux/smp.h>
5623 #include <linux/slab.h>
5624 #include <linux/user.h>
5625 -#include <linux/module.h>
5626 -#include <linux/a.out.h>
5627 #include <linux/interrupt.h>
5628 +#include <linux/utsname.h>
5629 #include <linux/delay.h>
5630 +#include <linux/module.h>
5631 #include <linux/ptrace.h>
5632 -#include <linux/utsname.h>
5633 #include <linux/random.h>
5634 #include <linux/notifier.h>
5635 #include <linux/kprobes.h>
5636 @@ -73,7 +72,6 @@ EXPORT_SYMBOL(boot_option_idle_override)
5638 void (*pm_idle)(void);
5639 EXPORT_SYMBOL(pm_idle);
5640 -static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
5642 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
5644 @@ -81,13 +79,6 @@ void idle_notifier_register(struct notif
5646 atomic_notifier_chain_register(&idle_notifier, n);
5648 -EXPORT_SYMBOL_GPL(idle_notifier_register);
5650 -void idle_notifier_unregister(struct notifier_block *n)
5652 - atomic_notifier_chain_unregister(&idle_notifier, n);
5654 -EXPORT_SYMBOL(idle_notifier_unregister);
5656 void enter_idle(void)
5658 @@ -116,7 +107,7 @@ void exit_idle(void)
5659 * to poll the ->need_resched flag instead of waiting for the
5660 * cross-CPU IPI to arrive. Use this option with caution.
5662 -static void poll_idle (void)
5663 +static void poll_idle(void)
5667 @@ -131,10 +122,19 @@ static void xen_idle(void)
5670 local_irq_disable();
5671 - if (!need_resched())
5674 - local_irq_enable();
5675 + if (!need_resched()) {
5680 + t0n = ktime_to_ns(t0);
5681 + safe_halt(); /* enables interrupts racelessly */
5682 + local_irq_disable();
5684 + t1n = ktime_to_ns(t1);
5685 + sched_clock_idle_wakeup_event(t1n - t0n);
5687 + local_irq_enable();
5688 current_thread_info()->status |= TS_POLLING;
5691 @@ -161,19 +161,15 @@ static inline void play_dead(void)
5692 * low exit latency (ie sit in a loop waiting for
5693 * somebody to say that they'd like to reschedule)
5695 -void cpu_idle (void)
5696 +void cpu_idle(void)
5698 current_thread_info()->status |= TS_POLLING;
5699 /* endless idle loop with no priority at all */
5701 + tick_nohz_stop_sched_tick();
5702 while (!need_resched()) {
5705 - if (__get_cpu_var(cpu_idle_state))
5706 - __get_cpu_var(cpu_idle_state) = 0;
5708 - tick_nohz_stop_sched_tick();
5711 idle = xen_idle; /* no alternatives */
5712 if (cpu_is_offline(smp_processor_id()))
5713 @@ -203,49 +199,27 @@ static void do_nothing(void *unused)
5718 + * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
5719 + * pm_idle and update to new pm_idle value. Required while changing pm_idle
5720 + * handler on SMP systems.
5722 + * Caller must have changed pm_idle to the new value before the call. Old
5723 + * pm_idle value will not be used by any CPU after the return of this function.
5725 void cpu_idle_wait(void)
5727 - unsigned int cpu, this_cpu = get_cpu();
5728 - cpumask_t map, tmp = current->cpus_allowed;
5730 - set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
5734 - for_each_online_cpu(cpu) {
5735 - per_cpu(cpu_idle_state, cpu) = 1;
5736 - cpu_set(cpu, map);
5739 - __get_cpu_var(cpu_idle_state) = 0;
5744 - for_each_online_cpu(cpu) {
5745 - if (cpu_isset(cpu, map) &&
5746 - !per_cpu(cpu_idle_state, cpu))
5747 - cpu_clear(cpu, map);
5749 - cpus_and(map, map, cpu_online_map);
5751 - * We waited 1 sec, if a CPU still did not call idle
5752 - * it may be because it is in idle and not waking up
5753 - * because it has nothing to do.
5754 - * Give all the remaining CPUS a kick.
5756 - smp_call_function_mask(map, do_nothing, 0, 0);
5757 - } while (!cpus_empty(map));
5759 - set_cpus_allowed(current, tmp);
5761 + /* kick all the CPUs so that they exit out of pm_idle */
5762 + smp_call_function(do_nothing, NULL, 0, 1);
5764 EXPORT_SYMBOL_GPL(cpu_idle_wait);
5766 -void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
5767 +void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
5771 -static int __init idle_setup (char *str)
5772 +static int __init idle_setup(char *str)
5774 if (!strcmp(str, "poll")) {
5775 printk("using polling idle threads.\n");
5776 @@ -260,13 +234,13 @@ static int __init idle_setup (char *str)
5778 early_param("idle", idle_setup);
5780 -/* Prints also some state that isn't saved in the pt_regs */
5781 +/* Prints also some state that isn't saved in the pt_regs */
5782 void __show_regs(struct pt_regs * regs)
5784 unsigned long fs, gs, shadowgs;
5785 unsigned long d0, d1, d2, d3, d6, d7;
5786 - unsigned int fsindex,gsindex;
5787 - unsigned int ds,cs,es;
5788 + unsigned int fsindex, gsindex;
5789 + unsigned int ds, cs, es;
5793 @@ -275,16 +249,16 @@ void __show_regs(struct pt_regs * regs)
5794 init_utsname()->release,
5795 (int)strcspn(init_utsname()->version, " "),
5796 init_utsname()->version);
5797 - printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
5798 - printk_address(regs->rip);
5799 - printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
5801 + printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
5802 + printk_address(regs->ip, 1);
5803 + printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp,
5805 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
5806 - regs->rax, regs->rbx, regs->rcx);
5807 + regs->ax, regs->bx, regs->cx);
5808 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
5809 - regs->rdx, regs->rsi, regs->rdi);
5810 + regs->dx, regs->si, regs->di);
5811 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
5812 - regs->rbp, regs->r8, regs->r9);
5813 + regs->bp, regs->r8, regs->r9);
5814 printk("R10: %016lx R11: %016lx R12: %016lx\n",
5815 regs->r10, regs->r11, regs->r12);
5816 printk("R13: %016lx R14: %016lx R15: %016lx\n",
5817 @@ -318,7 +292,7 @@ void show_regs(struct pt_regs *regs)
5819 printk("CPU %d:", smp_processor_id());
5821 - show_trace(NULL, regs, (void *)(regs + 1));
5822 + show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
5826 @@ -329,7 +303,7 @@ void exit_thread(void)
5827 struct task_struct *me = current;
5828 struct thread_struct *t = &me->thread;
5830 - if (me->thread.io_bitmap_ptr) {
5831 + if (me->thread.io_bitmap_ptr) {
5832 #ifndef CONFIG_X86_NO_TSS
5833 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
5835 @@ -382,7 +356,7 @@ void flush_thread(void)
5836 tsk->thread.debugreg3 = 0;
5837 tsk->thread.debugreg6 = 0;
5838 tsk->thread.debugreg7 = 0;
5839 - memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
5840 + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
5842 * Forget coprocessor state..
5844 @@ -405,26 +379,21 @@ void release_thread(struct task_struct *
5846 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
5848 - struct user_desc ud = {
5849 + struct user_desc ud = {
5853 .limit_in_pages = 1,
5856 - struct n_desc_struct *desc = (void *)t->thread.tls_array;
5857 + struct desc_struct *desc = t->thread.tls_array;
5859 - desc->a = LDT_entry_a(&ud);
5860 - desc->b = LDT_entry_b(&ud);
5861 + fill_ldt(desc, &ud);
5864 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
5866 - struct desc_struct *desc = (void *)t->thread.tls_array;
5868 - return desc->base0 |
5869 - (((u32)desc->base1) << 16) |
5870 - (((u32)desc->base2) << 24);
5871 + return get_desc_base(&t->thread.tls_array[tls]);
5875 @@ -436,7 +405,7 @@ void prepare_to_copy(struct task_struct
5879 -int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
5880 +int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
5881 unsigned long unused,
5882 struct task_struct * p, struct pt_regs * regs)
5884 @@ -448,14 +417,14 @@ int copy_thread(int nr, unsigned long cl
5885 (THREAD_SIZE + task_stack_page(p))) - 1;
5888 - childregs->rax = 0;
5889 - childregs->rsp = rsp;
5891 - childregs->rsp = (unsigned long)childregs;
5893 - p->thread.rsp = (unsigned long) childregs;
5894 - p->thread.rsp0 = (unsigned long) (childregs+1);
5895 - p->thread.userrsp = me->thread.userrsp;
5896 + childregs->ax = 0;
5897 + childregs->sp = sp;
5899 + childregs->sp = (unsigned long)childregs;
5901 + p->thread.sp = (unsigned long) childregs;
5902 + p->thread.sp0 = (unsigned long) (childregs+1);
5903 + p->thread.usersp = me->thread.usersp;
5905 set_tsk_thread_flag(p, TIF_FORK);
5907 @@ -476,7 +445,7 @@ int copy_thread(int nr, unsigned long cl
5908 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
5910 set_tsk_thread_flag(p, TIF_IO_BITMAP);
5915 * Set a new TLS for the child thread?
5916 @@ -484,7 +453,8 @@ int copy_thread(int nr, unsigned long cl
5917 if (clone_flags & CLONE_SETTLS) {
5918 #ifdef CONFIG_IA32_EMULATION
5919 if (test_thread_flag(TIF_IA32))
5920 - err = ia32_child_tls(p, childregs);
5921 + err = do_set_thread_area(p, -1,
5922 + (struct user_desc __user *)childregs->si, 0);
5925 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
5926 @@ -502,26 +472,32 @@ out:
5930 -static inline void __save_init_fpu( struct task_struct *tsk )
5932 - asm volatile( "rex64 ; fxsave %0 ; fnclex"
5933 - : "=m" (tsk->thread.i387.fxsave));
5934 - tsk->thread_info->status &= ~TS_USEDFPU;
5938 * This special macro can be used to load a debugging register
5940 -#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
5941 +#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
5943 static inline void __switch_to_xtra(struct task_struct *prev_p,
5944 - struct task_struct *next_p)
5945 + struct task_struct *next_p)
5947 struct thread_struct *prev, *next;
5948 + unsigned long debugctl;
5950 prev = &prev_p->thread,
5951 next = &next_p->thread;
5953 + debugctl = prev->debugctlmsr;
5954 + if (next->ds_area_msr != prev->ds_area_msr) {
5955 + /* we clear debugctl to make sure DS
5956 + * is not in use when we change it */
5958 + wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
5959 + wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
5962 + if (next->debugctlmsr != debugctl)
5963 + wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr);
5965 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
5968 @@ -531,12 +507,20 @@ static inline void __switch_to_xtra(stru
5974 + if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
5975 + ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
5977 + if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
5978 + ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
5983 * switch_to(x,y) should switch tasks from x to y.
5985 - * This could still be optimized:
5986 + * This could still be optimized:
5987 * - fold all the options into a flag word and test it with a single test.
5988 * - could test fs/gs bitsliced
5990 @@ -547,7 +531,7 @@ __switch_to(struct task_struct *prev_p,
5992 struct thread_struct *prev = &prev_p->thread,
5993 *next = &next_p->thread;
5994 - int cpu = smp_processor_id();
5995 + int cpu = smp_processor_id();
5996 #ifndef CONFIG_X86_NO_TSS
5997 struct tss_struct *tss = &per_cpu(init_tss, cpu);
5999 @@ -581,11 +565,12 @@ __switch_to(struct task_struct *prev_p,
6000 prev_p->fpu_counter = 0;
6003 - * Reload esp0, LDT and the page table pointer:
6005 + * This is load_sp0(tss, next) with a multicall.
6007 mcl->op = __HYPERVISOR_stack_switch;
6008 mcl->args[0] = __KERNEL_DS;
6009 - mcl->args[1] = next->rsp0;
6010 + mcl->args[1] = next->sp0;
6014 @@ -593,11 +578,12 @@ __switch_to(struct task_struct *prev_p,
6015 * This is load_TLS(next, cpu) with multicalls.
6018 - if (unlikely(next->tls_array[i] != prev->tls_array[i])) { \
6019 + if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \
6020 + next->tls_array[i].b != prev->tls_array[i].b)) { \
6021 mcl->op = __HYPERVISOR_update_descriptor; \
6022 mcl->args[0] = virt_to_machine( \
6023 - &cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]); \
6024 - mcl->args[1] = next->tls_array[i]; \
6025 + &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
6026 + mcl->args[1] = *(u64 *)&next->tls_array[i]; \
6030 @@ -605,7 +591,7 @@ __switch_to(struct task_struct *prev_p,
6033 if (unlikely(prev->iopl != next->iopl)) {
6034 - iopl_op.iopl = (next->iopl == 0) ? 1 : next->iopl;
6035 + iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3;
6036 #if CONFIG_XEN_COMPAT > 0x030002
6037 mcl->op = __HYPERVISOR_physdev_op;
6038 mcl->args[0] = PHYSDEVOP_set_iopl;
6039 @@ -669,8 +655,8 @@ __switch_to(struct task_struct *prev_p,
6041 * Switch the PDA context.
6043 - prev->userrsp = read_pda(oldrsp);
6044 - write_pda(oldrsp, next->userrsp);
6045 + prev->usersp = read_pda(oldrsp);
6046 + write_pda(oldrsp, next->usersp);
6047 write_pda(pcurrent, next_p);
6048 write_pda(kernelstack,
6049 (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
6050 @@ -687,7 +673,8 @@ __switch_to(struct task_struct *prev_p,
6052 * Now maybe reload the debug registers
6054 - if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
6055 + if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
6056 + task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
6057 __switch_to_xtra(prev_p, next_p);
6059 /* If the task has used fpu the last 5 timeslices, just do a full
6060 @@ -702,23 +689,18 @@ __switch_to(struct task_struct *prev_p,
6062 * sys_execve() executes a new program.
6066 long sys_execve(char __user *name, char __user * __user *argv,
6067 - char __user * __user *envp, struct pt_regs regs)
6068 + char __user * __user *envp, struct pt_regs *regs)
6073 filename = getname(name);
6074 error = PTR_ERR(filename);
6075 - if (IS_ERR(filename))
6076 + if (IS_ERR(filename))
6078 - error = do_execve(filename, argv, envp, &regs);
6080 - task_lock(current);
6081 - current->ptrace &= ~PT_DTRACE;
6082 - task_unlock(current);
6084 + error = do_execve(filename, argv, envp, regs);
6088 @@ -728,18 +710,18 @@ void set_personality_64bit(void)
6089 /* inherit personality from parent */
6091 /* Make sure to be in 64bit mode */
6092 - clear_thread_flag(TIF_IA32);
6093 + clear_thread_flag(TIF_IA32);
6095 /* TBD: overwrites user setup. Should have two bits.
6096 But 64bit processes have always behaved this way,
6097 so it's not too bad. The main problem is just that
6098 - 32bit childs are affected again. */
6099 + 32bit childs are affected again. */
6100 current->personality &= ~READ_IMPLIES_EXEC;
6103 asmlinkage long sys_fork(struct pt_regs *regs)
6105 - return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
6106 + return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
6110 @@ -747,7 +729,7 @@ sys_clone(unsigned long clone_flags, uns
6111 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
6114 - newsp = regs->rsp;
6116 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
6119 @@ -763,29 +745,29 @@ sys_clone(unsigned long clone_flags, uns
6121 asmlinkage long sys_vfork(struct pt_regs *regs)
6123 - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
6124 + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
6128 unsigned long get_wchan(struct task_struct *p)
6130 unsigned long stack;
6135 if (!p || p == current || p->state==TASK_RUNNING)
6137 stack = (unsigned long)task_stack_page(p);
6138 - if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
6139 + if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
6141 - fp = *(u64 *)(p->thread.rsp);
6142 + fp = *(u64 *)(p->thread.sp);
6144 if (fp < (unsigned long)stack ||
6145 fp > (unsigned long)stack+THREAD_SIZE)
6147 - rip = *(u64 *)(fp+8);
6148 - if (!in_sched_functions(rip))
6150 + ip = *(u64 *)(fp+8);
6151 + if (!in_sched_functions(ip))
6154 } while (count++ < 16);
6156 @@ -827,19 +809,19 @@ long do_arch_prctl(struct task_struct *t
6157 /* Not strictly needed for fs, but do it for symmetry
6159 if (addr >= TASK_SIZE_OF(task))
6163 - /* handle small bases via the GDT because that's faster to
6164 + /* handle small bases via the GDT because that's faster to
6166 - if (addr <= 0xffffffff) {
6167 + if (addr <= 0xffffffff) {
6168 set_32bit_tls(task, FS_TLS, addr);
6170 - load_TLS(&task->thread, cpu);
6172 + load_TLS(&task->thread, cpu);
6173 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
6175 task->thread.fsindex = FS_TLS_SEL;
6176 task->thread.fs = 0;
6179 task->thread.fsindex = 0;
6180 task->thread.fs = addr;
6182 @@ -852,24 +834,24 @@ long do_arch_prctl(struct task_struct *t
6186 - case ARCH_GET_FS: {
6187 - unsigned long base;
6188 + case ARCH_GET_FS: {
6189 + unsigned long base;
6190 if (task->thread.fsindex == FS_TLS_SEL)
6191 base = read_32bit_tls(task, FS_TLS);
6193 rdmsrl(MSR_FS_BASE, base);
6195 base = task->thread.fs;
6196 - ret = put_user(base, (unsigned long __user *)addr);
6198 + ret = put_user(base, (unsigned long __user *)addr);
6201 - case ARCH_GET_GS: {
6202 + case ARCH_GET_GS: {
6205 if (task->thread.gsindex == GS_TLS_SEL)
6206 base = read_32bit_tls(task, GS_TLS);
6208 - asm("movl %%gs,%0" : "=r" (gsindex));
6209 + asm("movl %%gs,%0" : "=r" (gsindex));
6211 rdmsrl(MSR_KERNEL_GS_BASE, base);
6213 @@ -877,40 +859,21 @@ long do_arch_prctl(struct task_struct *t
6216 base = task->thread.gs;
6217 - ret = put_user(base, (unsigned long __user *)addr);
6218 + ret = put_user(base, (unsigned long __user *)addr);
6233 long sys_arch_prctl(int code, unsigned long addr)
6235 return do_arch_prctl(current, code, addr);
6239 - * Capture the user space registers if the task is not running (in user space)
6241 -int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
6243 - struct pt_regs *pp, ptregs;
6245 - pp = task_pt_regs(tsk);
6248 - ptregs.cs &= 0xffff;
6249 - ptregs.ss &= 0xffff;
6251 - elf_core_copy_regs(regs, &ptregs);
6253 - boot_option_idle_override = 1;
6257 unsigned long arch_align_stack(unsigned long sp)
6258 @@ -919,3 +882,9 @@ unsigned long arch_align_stack(unsigned
6259 sp -= get_random_int() % 8192;
6263 +unsigned long arch_randomize_brk(struct mm_struct *mm)
6265 + unsigned long range_end = mm->brk + 0x02000000;
6266 + return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
6268 --- sle11-2009-10-16.orig/arch/x86/kernel/quirks-xen.c 2009-02-16 16:18:36.000000000 +0100
6269 +++ sle11-2009-10-16/arch/x86/kernel/quirks-xen.c 2009-03-16 16:33:40.000000000 +0100
6271 static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
6277 /* BIOS may enable hardware IRQ balancing for
6278 * E7520/E7320/E7525(revision ID 0x9 and below)
6279 @@ -24,14 +24,17 @@ static void __devinit quirk_intel_irqbal
6280 pci_read_config_byte(dev, 0xf4, &config);
6281 pci_write_config_byte(dev, 0xf4, config|0x2);
6283 - /* read xTPR register */
6284 - raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
6286 + * read xTPR register. We may not have a pci_dev for device 8
6287 + * because it might be hidden until the above write.
6289 + pci_bus_read_config_word(dev->bus, PCI_DEVFN(8, 0), 0x4c, &word);
6291 if (!(word & (1 << 13))) {
6292 struct xen_platform_op op;
6294 - printk(KERN_INFO "Intel E7520/7320/7525 detected. "
6295 - "Disabling irq balancing and affinity\n");
6296 + dev_info(&dev->dev, "Intel E7520/7320/7525 detected; "
6297 + "disabling irq balancing and affinity\n");
6298 op.cmd = XENPF_platform_quirk;
6299 op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING;
6300 WARN_ON(HYPERVISOR_platform_op(&op));
6301 @@ -102,14 +105,16 @@ static void ich_force_enable_hpet(struct
6302 pci_read_config_dword(dev, 0xF0, &rcba);
6305 - printk(KERN_DEBUG "RCBA disabled. Cannot force enable HPET\n");
6306 + dev_printk(KERN_DEBUG, &dev->dev, "RCBA disabled; "
6307 + "cannot force enable HPET\n");
6311 /* use bits 31:14, 16 kB aligned */
6312 rcba_base = ioremap_nocache(rcba, 0x4000);
6313 if (rcba_base == NULL) {
6314 - printk(KERN_DEBUG "ioremap failed. Cannot force enable HPET\n");
6315 + dev_printk(KERN_DEBUG, &dev->dev, "ioremap failed; "
6316 + "cannot force enable HPET\n");
6320 @@ -120,8 +125,8 @@ static void ich_force_enable_hpet(struct
6321 /* HPET is enabled in HPTC. Just not reported by BIOS */
6323 force_hpet_address = 0xFED00000 | (val << 12);
6324 - printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
6325 - force_hpet_address);
6326 + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
6327 + "0x%lx\n", force_hpet_address);
6331 @@ -140,11 +145,12 @@ static void ich_force_enable_hpet(struct
6333 force_hpet_address = 0;
6335 - printk(KERN_DEBUG "Failed to force enable HPET\n");
6336 + dev_printk(KERN_DEBUG, &dev->dev,
6337 + "Failed to force enable HPET\n");
6339 force_hpet_resume_type = ICH_FORCE_HPET_RESUME;
6340 - printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
6341 - force_hpet_address);
6342 + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
6343 + "0x%lx\n", force_hpet_address);
6347 @@ -160,6 +166,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_I
6348 ich_force_enable_hpet);
6349 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1,
6350 ich_force_enable_hpet);
6351 +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
6352 + ich_force_enable_hpet);
6355 static struct pci_dev *cached_dev;
6356 @@ -204,8 +212,8 @@ static void old_ich_force_enable_hpet(st
6359 force_hpet_address = 0xFED00000 | (val << 12);
6360 - printk(KERN_DEBUG "HPET at base address 0x%lx\n",
6361 - force_hpet_address);
6362 + dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n",
6363 + force_hpet_address);
6367 @@ -225,14 +233,14 @@ static void old_ich_force_enable_hpet(st
6368 /* HPET is enabled in HPTC. Just not reported by BIOS */
6370 force_hpet_address = 0xFED00000 | (val << 12);
6371 - printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
6372 - force_hpet_address);
6373 + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
6374 + "0x%lx\n", force_hpet_address);
6376 force_hpet_resume_type = OLD_ICH_FORCE_HPET_RESUME;
6380 - printk(KERN_DEBUG "Failed to force enable HPET\n");
6381 + dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n");
6385 @@ -290,8 +298,8 @@ static void vt8237_force_enable_hpet(str
6388 force_hpet_address = (val & ~0x3ff);
6389 - printk(KERN_DEBUG "HPET at base address 0x%lx\n",
6390 - force_hpet_address);
6391 + dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n",
6392 + force_hpet_address);
6396 @@ -305,14 +313,14 @@ static void vt8237_force_enable_hpet(str
6397 pci_read_config_dword(dev, 0x68, &val);
6399 force_hpet_address = (val & ~0x3ff);
6400 - printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
6401 - force_hpet_address);
6402 + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
6403 + "0x%lx\n", force_hpet_address);
6405 force_hpet_resume_type = VT8237_FORCE_HPET_RESUME;
6409 - printk(KERN_DEBUG "Failed to force enable HPET\n");
6410 + dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n");
6413 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235,
6414 @@ -340,7 +348,7 @@ static void nvidia_force_enable_hpet(str
6415 pci_read_config_dword(dev, 0x44, &val);
6416 force_hpet_address = val & 0xfffffffe;
6417 force_hpet_resume_type = NVIDIA_FORCE_HPET_RESUME;
6418 - printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
6419 + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
6420 force_hpet_address);
6423 @@ -353,6 +361,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_N
6424 nvidia_force_enable_hpet);
6427 +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0260,
6428 + nvidia_force_enable_hpet);
6429 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0360,
6430 nvidia_force_enable_hpet);
6431 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0361,
6432 @@ -373,19 +383,19 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_N
6433 void force_hpet_resume(void)
6435 switch (force_hpet_resume_type) {
6436 - case ICH_FORCE_HPET_RESUME:
6437 - return ich_force_hpet_resume();
6439 - case OLD_ICH_FORCE_HPET_RESUME:
6440 - return old_ich_force_hpet_resume();
6442 - case VT8237_FORCE_HPET_RESUME:
6443 - return vt8237_force_hpet_resume();
6445 - case NVIDIA_FORCE_HPET_RESUME:
6446 - return nvidia_force_hpet_resume();
6449 + case ICH_FORCE_HPET_RESUME:
6450 + ich_force_hpet_resume();
6452 + case OLD_ICH_FORCE_HPET_RESUME:
6453 + old_ich_force_hpet_resume();
6455 + case VT8237_FORCE_HPET_RESUME:
6456 + vt8237_force_hpet_resume();
6458 + case NVIDIA_FORCE_HPET_RESUME:
6459 + nvidia_force_hpet_resume();
6465 --- sle11-2009-10-16.orig/arch/x86/kernel/rtc.c 2009-10-28 14:55:04.000000000 +0100
6466 +++ sle11-2009-10-16/arch/x86/kernel/rtc.c 2009-03-16 16:33:40.000000000 +0100
6467 @@ -181,6 +181,10 @@ unsigned long read_persistent_clock(void
6469 unsigned long retval, flags;
6472 + if (!is_initial_xendomain())
6473 + return xen_read_persistent_clock();
6475 spin_lock_irqsave(&rtc_lock, flags);
6476 retval = get_wallclock();
6477 spin_unlock_irqrestore(&rtc_lock, flags);
6478 @@ -190,6 +194,10 @@ unsigned long read_persistent_clock(void
6480 int update_persistent_clock(struct timespec now)
6483 + if (xen_update_persistent_clock() < 0 || xen_independent_wallclock())
6486 return set_rtc_mmss(now.tv_sec);
6489 --- sle11-2009-10-16.orig/arch/x86/kernel/setup64-xen.c 2009-02-16 16:18:36.000000000 +0100
6490 +++ sle11-2009-10-16/arch/x86/kernel/setup64-xen.c 2009-03-16 16:33:40.000000000 +0100
6492 #include <asm/hypervisor.h>
6495 +#ifndef CONFIG_DEBUG_BOOT_PARAMS
6496 struct boot_params __initdata boot_params;
6498 +struct boot_params boot_params;
6501 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
6503 @@ -47,6 +51,7 @@ char boot_cpu_stack[IRQSTACKSIZE] __attr
6505 unsigned long __supported_pte_mask __read_mostly = ~0UL;
6506 EXPORT_SYMBOL(__supported_pte_mask);
6508 static int do_not_nx __cpuinitdata = 0;
6511 @@ -90,6 +95,45 @@ static int __init nonx32_setup(char *str
6512 __setup("noexec32=", nonx32_setup);
6515 + * Copy data used in early init routines from the initial arrays to the
6516 + * per cpu data areas. These arrays then become expendable and the
6517 + * *_early_ptr's are zeroed indicating that the static arrays are gone.
6519 +static void __init setup_per_cpu_maps(void)
6524 + for_each_possible_cpu(cpu) {
6526 + if (per_cpu_offset(cpu)) {
6528 + per_cpu(x86_cpu_to_apicid, cpu) =
6529 + x86_cpu_to_apicid_init[cpu];
6530 + per_cpu(x86_bios_cpu_apicid, cpu) =
6531 + x86_bios_cpu_apicid_init[cpu];
6533 + per_cpu(x86_cpu_to_node_map, cpu) =
6534 + x86_cpu_to_node_map_init[cpu];
6539 + printk(KERN_NOTICE "per_cpu_offset zero for cpu %d\n",
6544 + /* indicate the early static arrays will soon be gone */
6545 + x86_cpu_to_apicid_early_ptr = NULL;
6546 + x86_bios_cpu_apicid_early_ptr = NULL;
6548 + x86_cpu_to_node_map_early_ptr = NULL;
6554 * Great future plan:
6555 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
6556 * Always point %gs to its beginning
6557 @@ -109,19 +153,24 @@ void __init setup_per_cpu_areas(void)
6558 printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size);
6559 for_each_cpu_mask (i, cpu_possible_map) {
6561 +#ifndef CONFIG_NEED_MULTIPLE_NODES
6562 + ptr = alloc_bootmem_pages(size);
6564 + int node = early_cpu_to_node(i);
6566 - if (!NODE_DATA(cpu_to_node(i))) {
6567 - printk("cpu with no node %d, num_online_nodes %d\n",
6568 - i, num_online_nodes());
6569 + if (!node_online(node) || !NODE_DATA(node))
6570 ptr = alloc_bootmem_pages(size);
6572 - ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size);
6575 + ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
6578 panic("Cannot allocate cpu data for CPU %d\n", i);
6579 cpu_pda(i)->data_offset = ptr - __per_cpu_start;
6580 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
6583 + /* setup percpu data maps early */
6584 + setup_per_cpu_maps();
6588 @@ -224,7 +273,8 @@ void syscall_init(void)
6589 wrmsrl(MSR_CSTAR, ignore_sysret);
6591 /* Flags to clear on syscall */
6592 - wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
6593 + wrmsrl(MSR_SYSCALL_MASK,
6594 + X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
6596 #ifdef CONFIG_IA32_EMULATION
6597 syscall32_cpu_init ();
6598 @@ -303,7 +353,7 @@ void __cpuinit cpu_init (void)
6602 - memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE);
6603 + memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE);
6606 cpu_gdt_descr[cpu].size = GDT_SIZE;
6607 @@ -334,10 +384,10 @@ void __cpuinit cpu_init (void)
6610 estacks += PAGE_SIZE << order[v];
6611 - orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
6612 + orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
6615 - t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
6616 + t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
6618 * <= is required because the CPU will access up to
6619 * 8 bits beyond the end of the IO permission bitmap.
6620 --- sle11-2009-10-16.orig/arch/x86/kernel/setup_32-xen.c 2009-02-16 16:18:36.000000000 +0100
6621 +++ sle11-2009-10-16/arch/x86/kernel/setup_32-xen.c 2009-03-16 16:33:40.000000000 +0100
6623 #include <linux/crash_dump.h>
6624 #include <linux/dmi.h>
6625 #include <linux/pfn.h>
6626 +#include <linux/pci.h>
6627 +#include <linux/init_ohci1394_dma.h>
6629 #include <video/edid.h>
6631 +#include <asm/mtrr.h>
6632 #include <asm/apic.h>
6633 #include <asm/e820.h>
6634 #include <asm/mpspec.h>
6635 @@ -79,14 +82,83 @@ static struct notifier_block xen_panic_b
6636 xen_panic_event, NULL, 0 /* try to go last */
6639 -int disable_pse __cpuinitdata = 0;
6644 -extern struct resource code_resource;
6645 -extern struct resource data_resource;
6646 -extern struct resource bss_resource;
6647 +static struct resource data_resource = {
6648 + .name = "Kernel data",
6651 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
6654 +static struct resource code_resource = {
6655 + .name = "Kernel code",
6658 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
6661 +static struct resource bss_resource = {
6662 + .name = "Kernel bss",
6665 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
6668 +static struct resource video_ram_resource = {
6669 + .name = "Video RAM area",
6672 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
6675 +static struct resource standard_io_resources[] = { {
6679 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
6684 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
6689 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
6694 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
6696 + .name = "keyboard",
6699 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
6701 + .name = "dma page reg",
6704 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
6709 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
6714 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
6719 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
6722 /* cpu data as detected by the assembly code in head.S */
6723 struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
6724 @@ -94,13 +166,16 @@ struct cpuinfo_x86 new_cpu_data __cpuini
6725 struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
6726 EXPORT_SYMBOL(boot_cpu_data);
6728 +#ifndef CONFIG_X86_PAE
6729 unsigned long mmu_cr4_features;
6731 +unsigned long mmu_cr4_features = X86_CR4_PAE;
6734 /* for MCA, but anyone else can use it if they want */
6735 unsigned int machine_id;
6736 unsigned int machine_submodel_id;
6737 unsigned int BIOS_revision;
6738 -unsigned int mca_pentium_flag;
6740 /* Boot loader ID as an integer, for the benefit of proc_dointvec */
6741 int bootloader_type;
6742 @@ -131,13 +206,17 @@ extern int root_mountflags;
6744 unsigned long saved_videomode;
6746 -#define RAMDISK_IMAGE_START_MASK 0x07FF
6747 +#define RAMDISK_IMAGE_START_MASK 0x07FF
6748 #define RAMDISK_PROMPT_FLAG 0x8000
6749 -#define RAMDISK_LOAD_FLAG 0x4000
6750 +#define RAMDISK_LOAD_FLAG 0x4000
6752 static char __initdata command_line[COMMAND_LINE_SIZE];
6754 +#ifndef CONFIG_DEBUG_BOOT_PARAMS
6755 struct boot_params __initdata boot_params;
6757 +struct boot_params boot_params;
6761 * Point at the empty zero page to start with. We map the real shared_info
6762 @@ -198,8 +277,7 @@ static int __init parse_mem(char *arg)
6765 if (strcmp(arg, "nopentium") == 0) {
6766 - clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
6768 + setup_clear_cpu_cap(X86_FEATURE_PSE);
6770 /* If the user specifies memory size, we
6771 * limit the BIOS-provided memory map to
6772 @@ -208,7 +286,7 @@ static int __init parse_mem(char *arg)
6773 * trim the existing memory map.
6775 unsigned long long mem_size;
6778 mem_size = memparse(arg, &arg);
6779 limit_regions(mem_size);
6780 user_defined_memmap = 1;
6781 @@ -350,7 +428,7 @@ static void __init reserve_ebda_region(v
6783 addr = get_bios_ebda();
6785 - reserve_bootmem(addr, PAGE_SIZE);
6786 + reserve_bootmem(addr, PAGE_SIZE, BOOTMEM_DEFAULT);
6790 @@ -365,8 +443,6 @@ static unsigned long __init setup_memory
6791 min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) +
6792 xen_start_info->nr_pt_frames;
6796 max_low_pfn = find_max_low_pfn();
6798 #ifdef CONFIG_HIGHMEM
6799 @@ -447,7 +523,8 @@ static void __init reserve_crashkernel(v
6800 (unsigned long)(total_mem >> 20));
6801 crashk_res.start = crash_base;
6802 crashk_res.end = crash_base + crash_size - 1;
6803 - reserve_bootmem(crash_base, crash_size);
6804 + reserve_bootmem(crash_base, crash_size,
6807 printk(KERN_INFO "crashkernel reservation failed - "
6808 "you have to specify a base address\n");
6809 @@ -461,6 +538,99 @@ static inline void __init reserve_crashk
6813 +#ifdef CONFIG_BLK_DEV_INITRD
6815 +static bool do_relocate_initrd = false;
6817 +static void __init reserve_initrd(void)
6819 + unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
6820 + unsigned long ramdisk_size = xen_start_info->mod_len;
6821 + unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
6822 + unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
6823 + unsigned long ramdisk_here;
6827 + if (!xen_start_info->mod_start || !ramdisk_size)
6828 + return; /* No initrd provided by bootloader */
6830 + if (ramdisk_end < ramdisk_image) {
6831 + printk(KERN_ERR "initrd wraps around end of memory, "
6832 + "disabling initrd\n");
6835 + if (ramdisk_size >= end_of_lowmem/2) {
6836 + printk(KERN_ERR "initrd too large to handle, "
6837 + "disabling initrd\n");
6840 + if (ramdisk_end <= end_of_lowmem) {
6841 + /* All in lowmem, easy case */
6842 + reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT);
6843 + initrd_start = ramdisk_image + PAGE_OFFSET;
6844 + initrd_end = initrd_start+ramdisk_size;
6848 + /* We need to move the initrd down into lowmem */
6849 + ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK;
6851 + /* Note: this includes all the lowmem currently occupied by
6852 + the initrd, we rely on that fact to keep the data intact. */
6853 + reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT);
6854 + initrd_start = ramdisk_here + PAGE_OFFSET;
6855 + initrd_end = initrd_start + ramdisk_size;
6857 + do_relocate_initrd = true;
6860 +#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
6862 +static void __init relocate_initrd(void)
6864 + unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
6865 + unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
6866 + unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
6867 + unsigned long ramdisk_here;
6868 + unsigned long slop, clen, mapaddr;
6871 + if (!do_relocate_initrd)
6874 + ramdisk_here = initrd_start - PAGE_OFFSET;
6876 + q = (char *)initrd_start;
6878 + /* Copy any lowmem portion of the initrd */
6879 + if (ramdisk_image < end_of_lowmem) {
6880 + clen = end_of_lowmem - ramdisk_image;
6881 + p = (char *)__va(ramdisk_image);
6882 + memcpy(q, p, clen);
6884 + ramdisk_image += clen;
6885 + ramdisk_size -= clen;
6888 + /* Copy the highmem portion of the initrd */
6889 + while (ramdisk_size) {
6890 + slop = ramdisk_image & ~PAGE_MASK;
6891 + clen = ramdisk_size;
6892 + if (clen > MAX_MAP_CHUNK-slop)
6893 + clen = MAX_MAP_CHUNK-slop;
6894 + mapaddr = ramdisk_image & PAGE_MASK;
6895 + p = early_ioremap(mapaddr, clen+slop);
6896 + memcpy(q, p+slop, clen);
6897 + early_iounmap(p, clen+slop);
6899 + ramdisk_image += clen;
6900 + ramdisk_size -= clen;
6904 +#endif /* CONFIG_BLK_DEV_INITRD */
6906 void __init setup_bootmem_allocator(void)
6908 unsigned long bootmap_size;
6909 @@ -478,14 +648,15 @@ void __init setup_bootmem_allocator(void
6910 * bootmem allocator with an invalid RAM area.
6912 reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
6913 - bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text));
6914 + bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text),
6919 * reserve physical page 0 - it's a special BIOS page on many boxes,
6920 * enabling clean reboots, SMP operation, laptop functions.
6922 - reserve_bootmem(0, PAGE_SIZE);
6923 + reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
6925 /* reserve EBDA region, it's a 4K region */
6926 reserve_ebda_region();
6927 @@ -495,7 +666,7 @@ void __init setup_bootmem_allocator(void
6928 unless you have no PS/2 mouse plugged in. */
6929 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
6930 boot_cpu_data.x86 == 6)
6931 - reserve_bootmem(0xa0000 - 4096, 4096);
6932 + reserve_bootmem(0xa0000 - 4096, 4096, BOOTMEM_DEFAULT);
6936 @@ -503,7 +674,7 @@ void __init setup_bootmem_allocator(void
6937 * FIXME: Don't need the extra page at 4K, but need to fix
6938 * trampoline before removing it. (see the GDT stuff)
6940 - reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
6941 + reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT);
6943 #ifdef CONFIG_ACPI_SLEEP
6945 @@ -511,29 +682,12 @@ void __init setup_bootmem_allocator(void
6947 acpi_reserve_bootmem();
6949 - numa_kva_reserve();
6950 #endif /* !CONFIG_XEN */
6952 #ifdef CONFIG_BLK_DEV_INITRD
6953 - if (xen_start_info->mod_start) {
6954 - unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
6955 - unsigned long ramdisk_size = xen_start_info->mod_len;
6956 - unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
6957 - unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
6959 - if (ramdisk_end <= end_of_lowmem) {
6960 - /*reserve_bootmem(ramdisk_image, ramdisk_size);*/
6961 - initrd_start = ramdisk_image + PAGE_OFFSET;
6962 - initrd_end = initrd_start+ramdisk_size;
6963 - initrd_below_start_ok = 1;
6965 - printk(KERN_ERR "initrd extends beyond end of memory "
6966 - "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
6967 - ramdisk_end, end_of_lowmem);
6973 + numa_kva_reserve();
6974 reserve_crashkernel();
6977 @@ -600,20 +754,14 @@ void __init setup_arch(char **cmdline_p)
6978 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
6979 pre_setup_arch_hook();
6981 + early_ioremap_init();
6983 prefill_possible_map();
6987 - * FIXME: This isn't an official loader_type right
6988 - * now but does currently work with elilo.
6989 - * If we were configured as an EFI kernel, check to make
6990 - * sure that we were loaded correctly from elilo and that
6991 - * the system table is valid. If not, then initialize normally.
6994 - if ((boot_params.hdr.type_of_loader == 0x50) &&
6995 - boot_params.efi_info.efi_systab)
6996 + if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
7001 @@ -653,12 +801,9 @@ void __init setup_arch(char **cmdline_p)
7008 - printk(KERN_INFO "BIOS-provided physical RAM map:\n");
7009 - print_memory_map(memory_setup());
7012 + printk(KERN_INFO "BIOS-provided physical RAM map:\n");
7013 + print_memory_map(memory_setup());
7017 @@ -691,6 +836,17 @@ void __init setup_arch(char **cmdline_p)
7018 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
7019 *cmdline_p = command_line;
7024 + /* update e820 for memory not covered by WB MTRRs */
7028 + if (mtrr_trim_uncached_memory(max_pfn))
7032 max_low_pfn = setup_memory();
7035 @@ -715,6 +871,16 @@ void __init setup_arch(char **cmdline_p)
7036 smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
7041 + * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
7044 +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
7045 + if (init_ohci1394_dma_early)
7046 + init_ohci1394_dma_on_all_controllers();
7049 remapped_pgdat_init();
7052 @@ -800,16 +966,20 @@ void __init setup_arch(char **cmdline_p)
7053 * NOTE: at this point the bootmem allocator is fully available.
7056 +#ifdef CONFIG_BLK_DEV_INITRD
7057 + relocate_initrd();
7060 paravirt_post_allocator_init();
7062 if (is_initial_xendomain())
7067 #ifdef CONFIG_X86_GENERICARCH
7068 generic_apic_probe();
7075 WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
7076 @@ -827,7 +997,7 @@ void __init setup_arch(char **cmdline_p)
7077 acpi_boot_table_init();
7080 -#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
7085 @@ -873,3 +1043,30 @@ xen_panic_event(struct notifier_block *t
7086 /* we're never actually going to get here... */
7091 + * Request address space for all standard resources
7093 + * This is called just before pcibios_init(), which is also a
7094 + * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
7096 +static int __init request_standard_resources(void)
7100 + /* Nothing to do if not running in dom0. */
7101 + if (!is_initial_xendomain())
7104 + printk(KERN_INFO "Setting up standard PCI resources\n");
7105 + init_iomem_resources(&code_resource, &data_resource, &bss_resource);
7107 + request_resource(&iomem_resource, &video_ram_resource);
7109 + /* request I/O space for devices used on all i[345]86 PCs */
7110 + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
7111 + request_resource(&ioport_resource, &standard_io_resources[i]);
7115 +subsys_initcall(request_standard_resources);
7116 --- sle11-2009-10-16.orig/arch/x86/kernel/setup_64-xen.c 2009-02-16 16:18:36.000000000 +0100
7117 +++ sle11-2009-10-16/arch/x86/kernel/setup_64-xen.c 2009-03-16 16:33:40.000000000 +0100
7119 #include <linux/ptrace.h>
7120 #include <linux/slab.h>
7121 #include <linux/user.h>
7122 -#include <linux/a.out.h>
7123 #include <linux/screen_info.h>
7124 #include <linux/ioport.h>
7125 #include <linux/delay.h>
7127 #include <linux/crash_dump.h>
7128 #include <linux/root_dev.h>
7129 #include <linux/pci.h>
7130 +#include <linux/efi.h>
7131 #include <linux/acpi.h>
7132 #include <linux/kallsyms.h>
7133 #include <linux/edd.h>
7135 #include <linux/dmi.h>
7136 #include <linux/dma-mapping.h>
7137 #include <linux/ctype.h>
7138 +#include <linux/uaccess.h>
7139 +#include <linux/init_ohci1394_dma.h>
7141 #include <asm/mtrr.h>
7142 #include <asm/uaccess.h>
7143 #include <asm/system.h>
7144 +#include <asm/vsyscall.h>
7146 #include <asm/smp.h>
7147 #include <asm/msr.h>
7149 #include <video/edid.h>
7150 #include <asm/e820.h>
7151 #include <asm/dma.h>
7152 +#include <asm/gart.h>
7153 #include <asm/mpspec.h>
7154 #include <asm/mmu_context.h>
7155 #include <asm/proto.h>
7157 #include <asm/sections.h>
7158 #include <asm/dmi.h>
7159 #include <asm/cacheflush.h>
7160 +#include <asm/mce.h>
7161 +#include <asm/ds.h>
7162 +#include <asm/topology.h>
7164 #include <linux/percpu.h>
7165 #include <xen/interface/physdev.h>
7166 @@ -108,6 +115,8 @@ EXPORT_SYMBOL(xen_start_info);
7167 struct cpuinfo_x86 boot_cpu_data __read_mostly;
7168 EXPORT_SYMBOL(boot_cpu_data);
7170 +__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
7172 unsigned long mmu_cr4_features;
7174 /* Boot loader ID as an integer, for the benefit of proc_dointvec */
7175 @@ -117,7 +126,7 @@ unsigned long saved_video_mode;
7177 int force_mwait __cpuinitdata;
7183 int dmi_alloc_index;
7184 @@ -163,25 +172,27 @@ struct resource standard_io_resources[]
7186 #define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
7188 -struct resource data_resource = {
7189 +static struct resource data_resource = {
7190 .name = "Kernel data",
7193 .flags = IORESOURCE_RAM,
7195 -struct resource code_resource = {
7196 +static struct resource code_resource = {
7197 .name = "Kernel code",
7200 .flags = IORESOURCE_RAM,
7202 -struct resource bss_resource = {
7203 +static struct resource bss_resource = {
7204 .name = "Kernel bss",
7207 .flags = IORESOURCE_RAM,
7210 +static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
7212 #ifdef CONFIG_PROC_VMCORE
7213 /* elfcorehdr= specifies the location of elf core header
7214 * stored by the crashed kernel. This option will be passed
7215 @@ -205,9 +216,10 @@ contig_initmem_init(unsigned long start_
7216 unsigned long bootmap_size, bootmap;
7218 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
7219 - bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
7220 + bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
7223 - panic("Cannot find bootmem map of size %ld\n",bootmap_size);
7224 + panic("Cannot find bootmem map of size %ld\n", bootmap_size);
7225 bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
7226 e820_register_active_regions(0, start_pfn, end_pfn);
7228 @@ -215,8 +227,8 @@ contig_initmem_init(unsigned long start_
7230 free_bootmem_with_active_regions(0, end_pfn);
7232 - reserve_bootmem(bootmap, bootmap_size);
7234 + reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
7238 #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
7239 @@ -249,27 +261,35 @@ static inline void copy_edd(void)
7241 static void __init reserve_crashkernel(void)
7243 - unsigned long long free_mem;
7244 + unsigned long long total_mem;
7245 unsigned long long crash_size, crash_base;
7248 - free_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
7249 + total_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
7251 - ret = parse_crashkernel(boot_command_line, free_mem,
7252 + ret = parse_crashkernel(boot_command_line, total_mem,
7253 &crash_size, &crash_base);
7254 if (ret == 0 && crash_size) {
7255 - if (crash_base > 0) {
7256 - printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
7257 - "for crashkernel (System RAM: %ldMB)\n",
7258 - (unsigned long)(crash_size >> 20),
7259 - (unsigned long)(crash_base >> 20),
7260 - (unsigned long)(free_mem >> 20));
7261 - crashk_res.start = crash_base;
7262 - crashk_res.end = crash_base + crash_size - 1;
7263 - reserve_bootmem(crash_base, crash_size);
7265 + if (crash_base <= 0) {
7266 printk(KERN_INFO "crashkernel reservation failed - "
7267 "you have to specify a base address\n");
7271 + if (reserve_bootmem(crash_base, crash_size,
7272 + BOOTMEM_EXCLUSIVE) < 0) {
7273 + printk(KERN_INFO "crashkernel reservation failed - "
7274 + "memory is in use\n");
7278 + printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
7279 + "for crashkernel (System RAM: %ldMB)\n",
7280 + (unsigned long)(crash_size >> 20),
7281 + (unsigned long)(crash_base >> 20),
7282 + (unsigned long)(total_mem >> 20));
7283 + crashk_res.start = crash_base;
7284 + crashk_res.end = crash_base + crash_size - 1;
7288 @@ -280,37 +300,21 @@ static inline void __init reserve_crashk
7293 -#define EBDA_ADDR_POINTER 0x40E
7295 -unsigned __initdata ebda_addr;
7296 -unsigned __initdata ebda_size;
7298 -static void discover_ebda(void)
7299 +/* Overridden in paravirt.c if CONFIG_PARAVIRT */
7300 +void __attribute__((weak)) __init memory_setup(void)
7303 - * there is a real-mode segmented pointer pointing to the
7304 - * 4K EBDA area at 0x40E
7306 - ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
7309 - ebda_size = *(unsigned short *)__va(ebda_addr);
7311 - /* Round EBDA up to pages */
7312 - if (ebda_size == 0)
7315 - ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
7316 - if (ebda_size > 64*1024)
7317 - ebda_size = 64*1024;
7318 + machine_specific_memory_setup();
7321 -#define discover_ebda() ((void)0)
7325 + * setup_arch - architecture-specific boot-time initializations
7327 + * Note: On x86_64, fixmaps are ready for use even before this is called.
7329 void __init setup_arch(char **cmdline_p)
7334 extern struct e820map machine_e820;
7336 @@ -319,6 +323,11 @@ void __init setup_arch(char **cmdline_p)
7337 /* Register a call for panic conditions. */
7338 atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
7340 + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
7341 + VMASST_TYPE_writable_pagetables));
7343 + early_ioremap_init();
7345 ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
7346 screen_info = boot_params.screen_info;
7348 @@ -335,11 +344,6 @@ void __init setup_arch(char **cmdline_p)
7349 screen_info.orig_video_isVGA = 0;
7353 - WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
7354 - VMASST_TYPE_writable_pagetables));
7358 printk(KERN_INFO "Command line: %s\n", boot_command_line);
7360 @@ -355,7 +359,15 @@ void __init setup_arch(char **cmdline_p)
7361 rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
7362 rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
7364 - setup_memory_region();
7366 + if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
7376 if (!boot_params.hdr.root_flags)
7377 @@ -379,28 +391,51 @@ void __init setup_arch(char **cmdline_p)
7379 parse_early_param();
7381 +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
7382 + if (init_ohci1394_dma_early)
7383 + init_ohci1394_dma_on_all_controllers();
7386 finish_e820_parsing();
7388 + early_gart_iommu_check();
7390 e820_register_active_regions(0, 0, -1UL);
7392 * partially used pages are not usable - thus
7393 * we are rounding upwards:
7395 end_pfn = e820_end_of_ram();
7396 + /* update e820 for memory not covered by WB MTRRs */
7399 + if (mtrr_trim_uncached_memory(end_pfn)) {
7400 + e820_register_active_regions(0, 0, -1UL);
7401 + end_pfn = e820_end_of_ram();
7405 num_physpages = end_pfn;
7406 + max_mapnr = end_pfn;
7412 init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
7416 if (is_initial_xendomain())
7421 #if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
7422 - /* setup to use the static apicid table during kernel startup */
7423 - x86_cpu_to_apicid_ptr = (void *)&x86_cpu_to_apicid_init;
7424 + /* setup to use the early static init tables during kernel startup */
7425 + x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
7426 + x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
7428 + x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
7432 /* How many end-of-memory variables you have, grandma! */
7433 @@ -419,54 +454,25 @@ void __init setup_arch(char **cmdline_p)
7437 - numa_initmem_init(0, end_pfn);
7438 + numa_initmem_init(0, end_pfn);
7440 contig_initmem_init(0, end_pfn);
7445 - * Reserve kernel, physmap, start info, initial page tables, and
7448 - reserve_bootmem_generic(__pa_symbol(&_text),
7449 - (table_end << PAGE_SHIFT) - __pa_symbol(&_text));
7451 - /* Reserve direct mapping */
7452 - reserve_bootmem_generic(table_start << PAGE_SHIFT,
7453 - (table_end - table_start) << PAGE_SHIFT);
7455 - /* reserve kernel */
7456 - reserve_bootmem_generic(__pa_symbol(&_text),
7457 - __pa_symbol(&_end) - __pa_symbol(&_text));
7458 + early_res_to_bootmem();
7461 +#ifdef CONFIG_ACPI_SLEEP
7463 - * reserve physical page 0 - it's a special BIOS page on many boxes,
7464 - * enabling clean reboots, SMP operation, laptop functions.
7465 + * Reserve low memory region for sleep support.
7467 - reserve_bootmem_generic(0, PAGE_SIZE);
7469 - /* reserve ebda region */
7471 - reserve_bootmem_generic(ebda_addr, ebda_size);
7473 - /* reserve nodemap region */
7475 - reserve_bootmem_generic(nodemap_addr, nodemap_size);
7476 + acpi_reserve_bootmem();
7480 - /* Reserve SMP trampoline */
7481 - reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE);
7484 + efi_reserve_bootmem();
7487 -#ifdef CONFIG_ACPI_SLEEP
7489 - * Reserve low memory region for sleep support.
7491 - acpi_reserve_bootmem();
7493 #ifdef CONFIG_BLK_DEV_INITRD
7495 if (xen_start_info->mod_start) {
7496 @@ -490,6 +496,8 @@ void __init setup_arch(char **cmdline_p)
7497 initrd_below_start_ok = 1;
7500 + /* Assumes everything on node 0 */
7501 + free_bootmem(ramdisk_image, ramdisk_size);
7502 printk(KERN_ERR "initrd extends beyond end of memory "
7503 "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
7504 ramdisk_end, end_of_mem);
7505 @@ -499,10 +507,11 @@ void __init setup_arch(char **cmdline_p)
7507 reserve_crashkernel();
7510 #ifdef CONFIG_X86_LOCAL_APIC
7512 - * Find and reserve possible boot-time SMP configuration:
7514 + * Find and reserve possible boot-time SMP configuration:
7519 @@ -590,16 +599,10 @@ void __init setup_arch(char **cmdline_p)
7523 -#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
7529 - * set this early, so we dont allocate cpu0
7530 - * if MADT list doesnt list BSP first
7531 - * mpparse.c/MP_processor_info() allocates logical cpu numbers.
7533 - cpu_set(0, cpu_present_map);
7536 * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
7537 @@ -623,6 +626,7 @@ void __init setup_arch(char **cmdline_p)
7540 init_apic_mappings();
7541 + ioapic_init_mappings();
7544 #if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
7545 @@ -634,18 +638,17 @@ void __init setup_arch(char **cmdline_p)
7548 if (is_initial_xendomain())
7549 - e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
7550 + e820_reserve_resources(machine_e820.map, machine_e820.nr_map,
7551 + &code_resource, &data_resource, &bss_resource);
7553 - e820_reserve_resources(e820.map, e820.nr_map);
7554 + e820_reserve_resources(e820.map, e820.nr_map,
7555 + &code_resource, &data_resource, &bss_resource);
7556 e820_mark_nosave_regions();
7561 /* request I/O space for devices used on all i[345]86 PCs */
7562 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
7563 request_resource(&ioport_resource, &standard_io_resources[i]);
7567 if (is_initial_xendomain())
7568 @@ -679,7 +682,8 @@ void __init setup_arch(char **cmdline_p)
7571 #if defined(CONFIG_VGA_CONSOLE)
7572 - conswitchp = &vga_con;
7573 + if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
7574 + conswitchp = &vga_con;
7575 #elif defined(CONFIG_DUMMY_CONSOLE)
7576 conswitchp = &dummy_con;
7578 @@ -723,9 +727,10 @@ static void __cpuinit display_cacheinfo(
7580 if (n >= 0x80000005) {
7581 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
7582 - printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
7583 - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
7584 - c->x86_cache_size=(ecx>>24)+(edx>>24);
7585 + printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
7586 + "D cache %dK (%d bytes/line)\n",
7587 + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
7588 + c->x86_cache_size = (ecx>>24) + (edx>>24);
7589 /* On K8 L1 TLB is inclusive, so don't count it */
7592 @@ -739,27 +744,25 @@ static void __cpuinit display_cacheinfo(
7593 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
7594 c->x86_cache_size, ecx & 0xFF);
7597 - if (n >= 0x80000007)
7598 - cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power);
7599 if (n >= 0x80000008) {
7600 - cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
7601 + cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
7602 c->x86_virt_bits = (eax >> 8) & 0xff;
7603 c->x86_phys_bits = eax & 0xff;
7608 -static int nearby_node(int apicid)
7609 +static int __cpuinit nearby_node(int apicid)
7614 for (i = apicid - 1; i >= 0; i--) {
7615 - int node = apicid_to_node[i];
7616 + node = apicid_to_node[i];
7617 if (node != NUMA_NO_NODE && node_online(node))
7620 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
7621 - int node = apicid_to_node[i];
7622 + node = apicid_to_node[i];
7623 if (node != NUMA_NO_NODE && node_online(node))
7626 @@ -771,7 +774,7 @@ static int nearby_node(int apicid)
7627 * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
7628 * Assumes number of cores is a power of two.
7630 -static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
7631 +static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
7635 @@ -780,7 +783,54 @@ static void __init amd_detect_cmp(struct
7637 unsigned apicid = hard_smp_processor_id();
7639 - unsigned ecx = cpuid_ecx(0x80000008);
7640 + bits = c->x86_coreid_bits;
7642 + /* Low order bits define the core id (index of core in socket) */
7643 + c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
7644 + /* Convert the APIC ID into the socket ID */
7645 + c->phys_proc_id = phys_pkg_id(bits);
7648 + node = c->phys_proc_id;
7649 + if (apicid_to_node[apicid] != NUMA_NO_NODE)
7650 + node = apicid_to_node[apicid];
7651 + if (!node_online(node)) {
7652 + /* Two possibilities here:
7653 + - The CPU is missing memory and no node was created.
7654 + In that case try picking one from a nearby CPU
7655 + - The APIC IDs differ from the HyperTransport node IDs
7656 + which the K8 northbridge parsing fills in.
7657 + Assume they are all increased by a constant offset,
7658 + but in the same order as the HT nodeids.
7659 + If that doesn't result in a usable node fall back to the
7660 + path for the previous case. */
7662 + int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
7664 + if (ht_nodeid >= 0 &&
7665 + apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
7666 + node = apicid_to_node[ht_nodeid];
7667 + /* Pick a nearby node */
7668 + if (!node_online(node))
7669 + node = nearby_node(apicid);
7671 + numa_set_node(cpu, node);
7673 + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
7678 +static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
7681 + unsigned bits, ecx;
7683 + /* Multi core CPU? */
7684 + if (c->extended_cpuid_level < 0x80000008)
7687 + ecx = cpuid_ecx(0x80000008);
7689 c->x86_max_cores = (ecx & 0xff) + 1;
7691 @@ -793,37 +843,8 @@ static void __init amd_detect_cmp(struct
7695 - /* Low order bits define the core id (index of core in socket) */
7696 - c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
7697 - /* Convert the APIC ID into the socket ID */
7698 - c->phys_proc_id = phys_pkg_id(bits);
7701 - node = c->phys_proc_id;
7702 - if (apicid_to_node[apicid] != NUMA_NO_NODE)
7703 - node = apicid_to_node[apicid];
7704 - if (!node_online(node)) {
7705 - /* Two possibilities here:
7706 - - The CPU is missing memory and no node was created.
7707 - In that case try picking one from a nearby CPU
7708 - - The APIC IDs differ from the HyperTransport node IDs
7709 - which the K8 northbridge parsing fills in.
7710 - Assume they are all increased by a constant offset,
7711 - but in the same order as the HT nodeids.
7712 - If that doesn't result in a usable node fall back to the
7713 - path for the previous case. */
7714 - int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
7715 - if (ht_nodeid >= 0 &&
7716 - apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
7717 - node = apicid_to_node[ht_nodeid];
7718 - /* Pick a nearby node */
7719 - if (!node_online(node))
7720 - node = nearby_node(apicid);
7722 - numa_set_node(cpu, node);
7723 + c->x86_coreid_bits = bits;
7725 - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
7730 @@ -840,8 +861,8 @@ static void __init amd_detect_cmp(struct
7731 /* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
7732 static __cpuinit int amd_apic_timer_broken(void)
7735 - u32 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
7736 + u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
7738 switch (eax & CPUID_XFAM) {
7740 if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
7741 @@ -860,6 +881,15 @@ static __cpuinit int amd_apic_timer_brok
7745 +static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
7747 + early_init_amd_mc(c);
7749 + /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
7750 + if (c->x86_power & (1<<8))
7751 + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
7754 static void __cpuinit init_amd(struct cpuinfo_x86 *c)
7757 @@ -870,7 +900,7 @@ static void __cpuinit init_amd(struct cp
7759 * Disable TLB flush filter by setting HWCR.FFDIS on K8
7760 * bit 6 of msr C001_0015
7763 * Errata 63 for SH-B3 steppings
7764 * Errata 122 for all steppings (F+ have it disabled by default)
7766 @@ -883,35 +913,32 @@ static void __cpuinit init_amd(struct cp
7768 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
7769 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
7770 - clear_bit(0*32+31, &c->x86_capability);
7772 + clear_bit(0*32+31, (unsigned long *)&c->x86_capability);
7774 /* On C+ stepping K8 rep microcode works well for copy/memset */
7775 level = cpuid_eax(1);
7776 - if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
7777 - set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
7778 + if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
7780 + set_cpu_cap(c, X86_FEATURE_REP_GOOD);
7781 if (c->x86 == 0x10 || c->x86 == 0x11)
7782 - set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
7783 + set_cpu_cap(c, X86_FEATURE_REP_GOOD);
7785 /* Enable workaround for FXSAVE leak */
7787 - set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability);
7788 + set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
7790 level = get_model_name(c);
7795 /* Should distinguish Models here, but this is only
7796 a fallback anyways. */
7797 strcpy(c->x86_model_id, "Hammer");
7804 display_cacheinfo(c);
7806 - /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
7807 - if (c->x86_power & (1<<8))
7808 - set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
7810 /* Multi core CPU? */
7811 if (c->extended_cpuid_level >= 0x80000008)
7813 @@ -923,14 +950,10 @@ static void __cpuinit init_amd(struct cp
7814 num_cache_leaves = 3;
7816 if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
7817 - set_bit(X86_FEATURE_K8, &c->x86_capability);
7819 - /* RDTSC can be speculated around */
7820 - clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
7821 + set_cpu_cap(c, X86_FEATURE_K8);
7823 - /* Family 10 doesn't support C states in MWAIT so don't use it */
7824 - if (c->x86 == 0x10 && !force_mwait)
7825 - clear_bit(X86_FEATURE_MWAIT, &c->x86_capability);
7826 + /* MFENCE stops RDTSC speculation */
7827 + set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
7830 if (amd_apic_timer_broken())
7831 @@ -938,28 +961,29 @@ static void __cpuinit init_amd(struct cp
7835 -static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
7836 +void __cpuinit detect_ht(struct cpuinfo_x86 *c)
7839 - u32 eax, ebx, ecx, edx;
7840 - int index_msb, core_bits;
7841 + u32 eax, ebx, ecx, edx;
7842 + int index_msb, core_bits;
7844 cpuid(1, &eax, &ebx, &ecx, &edx);
7847 if (!cpu_has(c, X86_FEATURE_HT))
7849 - if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
7850 + if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
7853 smp_num_siblings = (ebx & 0xff0000) >> 16;
7855 if (smp_num_siblings == 1) {
7856 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
7857 - } else if (smp_num_siblings > 1 ) {
7858 + } else if (smp_num_siblings > 1) {
7860 if (smp_num_siblings > NR_CPUS) {
7861 - printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings);
7862 + printk(KERN_WARNING "CPU: Unsupported number of "
7863 + "siblings %d", smp_num_siblings);
7864 smp_num_siblings = 1;
7867 @@ -969,7 +993,7 @@ static void __cpuinit detect_ht(struct c
7869 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
7871 - index_msb = get_count_order(smp_num_siblings) ;
7872 + index_msb = get_count_order(smp_num_siblings);
7874 core_bits = get_count_order(c->x86_max_cores);
7876 @@ -978,8 +1002,10 @@ static void __cpuinit detect_ht(struct c
7879 if ((c->x86_max_cores * smp_num_siblings) > 1) {
7880 - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id);
7881 - printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id);
7882 + printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
7884 + printk(KERN_INFO "CPU: Processor Core ID: %d\n",
7889 @@ -1003,7 +1029,7 @@ static int __cpuinit intel_num_cpu_cores
7893 -static void srat_detect_node(void)
7894 +static void __cpuinit srat_detect_node(void)
7898 @@ -1013,7 +1039,7 @@ static void srat_detect_node(void)
7899 /* Don't do the funky fallback heuristics the AMD version employs
7901 node = apicid_to_node[apicid];
7902 - if (node == NUMA_NO_NODE)
7903 + if (node == NUMA_NO_NODE || !node_online(node))
7904 node = first_node(node_online_map);
7905 numa_set_node(cpu, node);
7907 @@ -1021,28 +1047,39 @@ static void srat_detect_node(void)
7911 +static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
7913 + if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
7914 + (c->x86 == 0x6 && c->x86_model >= 0x0e))
7915 + set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
7918 static void __cpuinit init_intel(struct cpuinfo_x86 *c)
7923 init_intel_cacheinfo(c);
7924 - if (c->cpuid_level > 9 ) {
7925 + if (c->cpuid_level > 9) {
7926 unsigned eax = cpuid_eax(10);
7927 /* Check for version and the number of counters */
7928 if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
7929 - set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability);
7930 + set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
7934 unsigned int l1, l2;
7935 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
7936 if (!(l1 & (1<<11)))
7937 - set_bit(X86_FEATURE_BTS, c->x86_capability);
7938 + set_cpu_cap(c, X86_FEATURE_BTS);
7939 if (!(l1 & (1<<12)))
7940 - set_bit(X86_FEATURE_PEBS, c->x86_capability);
7941 + set_cpu_cap(c, X86_FEATURE_PEBS);
7948 n = c->extended_cpuid_level;
7949 if (n >= 0x80000008) {
7950 unsigned eax = cpuid_eax(0x80000008);
7951 @@ -1059,14 +1096,11 @@ static void __cpuinit init_intel(struct
7952 c->x86_cache_alignment = c->x86_clflush_size * 2;
7953 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
7954 (c->x86 == 0x6 && c->x86_model >= 0x0e))
7955 - set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
7956 + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
7958 - set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
7960 - set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
7962 - clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
7963 - c->x86_max_cores = intel_num_cpu_cores(c);
7964 + set_cpu_cap(c, X86_FEATURE_REP_GOOD);
7965 + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
7966 + c->x86_max_cores = intel_num_cpu_cores(c);
7970 @@ -1083,18 +1117,12 @@ static void __cpuinit get_cpu_vendor(str
7971 c->x86_vendor = X86_VENDOR_UNKNOWN;
7974 -struct cpu_model_info {
7977 - char *model_names[16];
7980 /* Do some early cpuid on the boot CPU to get some parameter that are
7981 needed before check_bugs. Everything advanced is in identify_cpu
7983 -void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
7984 +static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
7989 c->loops_per_jiffy = loops_per_jiffy;
7990 c->x86_cache_size = -1;
7991 @@ -1105,6 +1133,7 @@ void __cpuinit early_identify_cpu(struct
7992 c->x86_clflush_size = 64;
7993 c->x86_cache_alignment = c->x86_clflush_size;
7994 c->x86_max_cores = 1;
7995 + c->x86_coreid_bits = 0;
7996 c->extended_cpuid_level = 0;
7997 memset(&c->x86_capability, 0, sizeof c->x86_capability);
7999 @@ -1113,7 +1142,7 @@ void __cpuinit early_identify_cpu(struct
8000 (unsigned int *)&c->x86_vendor_id[0],
8001 (unsigned int *)&c->x86_vendor_id[8],
8002 (unsigned int *)&c->x86_vendor_id[4]);
8007 /* Initialize the standard set of capabilities */
8008 @@ -1131,7 +1160,7 @@ void __cpuinit early_identify_cpu(struct
8009 c->x86 += (tfms >> 20) & 0xff;
8011 c->x86_model += ((tfms >> 16) & 0xF) << 4;
8012 - if (c->x86_capability[0] & (1<<19))
8013 + if (c->x86_capability[0] & (1<<19))
8014 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
8016 /* Have CPUID level 0 only - unheard of */
8017 @@ -1141,18 +1170,6 @@ void __cpuinit early_identify_cpu(struct
8019 c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
8024 - * This does the hard work of actually picking apart the CPU stuff...
8026 -void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
8031 - early_identify_cpu(c);
8033 /* AMD-defined flags: level 0x80000001 */
8034 xlvl = cpuid_eax(0x80000000);
8035 c->extended_cpuid_level = xlvl;
8036 @@ -1173,6 +1190,30 @@ void __cpuinit identify_cpu(struct cpuin
8037 c->x86_capability[2] = cpuid_edx(0x80860001);
8040 + c->extended_cpuid_level = cpuid_eax(0x80000000);
8041 + if (c->extended_cpuid_level >= 0x80000007)
8042 + c->x86_power = cpuid_edx(0x80000007);
8044 + switch (c->x86_vendor) {
8045 + case X86_VENDOR_AMD:
8046 + early_init_amd(c);
8048 + case X86_VENDOR_INTEL:
8049 + early_init_intel(c);
8056 + * This does the hard work of actually picking apart the CPU stuff...
8058 +void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
8062 + early_identify_cpu(c);
8064 init_scattered_cpuid_features(c);
8066 c->apicid = phys_pkg_id(0);
8067 @@ -1202,8 +1243,7 @@ void __cpuinit identify_cpu(struct cpuin
8071 - select_idle_routine(c);
8076 * On SMP, boot_cpu_data holds the common feature set between
8077 @@ -1213,31 +1253,55 @@ void __cpuinit identify_cpu(struct cpuin
8079 if (c != &boot_cpu_data) {
8080 /* AND the already accumulated flags with these */
8081 - for (i = 0 ; i < NCAPINTS ; i++)
8082 + for (i = 0; i < NCAPINTS; i++)
8083 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
8086 + /* Clear all flags overridden by options */
8087 + for (i = 0; i < NCAPINTS; i++)
8088 + c->x86_capability[i] &= ~cleared_cpu_caps[i];
8090 #ifdef CONFIG_X86_MCE
8093 + select_idle_routine(c);
8095 if (c != &boot_cpu_data)
8098 numa_add_cpu(smp_processor_id());
8104 +static __init int setup_noclflush(char *arg)
8106 + setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
8109 +__setup("noclflush", setup_noclflush);
8111 void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
8113 if (c->x86_model_id[0])
8114 - printk("%s", c->x86_model_id);
8115 + printk(KERN_CONT "%s", c->x86_model_id);
8117 + if (c->x86_mask || c->cpuid_level >= 0)
8118 + printk(KERN_CONT " stepping %02x\n", c->x86_mask);
8120 + printk(KERN_CONT "\n");
8123 - if (c->x86_mask || c->cpuid_level >= 0)
8124 - printk(" stepping %02x\n", c->x86_mask);
8125 +static __init int setup_disablecpuid(char *arg)
8128 + if (get_option(&arg, &bit) && bit < NCAPINTS*32)
8129 + setup_clear_cpu_cap(bit);
8135 +__setup("clearcpuid=", setup_disablecpuid);
8138 * Get CPU information for use by the procfs.
8139 @@ -1246,116 +1310,41 @@ void __cpuinit print_cpu_info(struct cpu
8140 static int show_cpuinfo(struct seq_file *m, void *v)
8142 struct cpuinfo_x86 *c = v;
8146 - * These flag bits must match the definitions in <asm/cpufeature.h>.
8147 - * NULL means this bit is undefined or reserved; either way it doesn't
8148 - * have meaning as far as Linux is concerned. Note that it's important
8149 - * to realize there is a difference between this table and CPUID -- if
8150 - * applications want to get the raw CPUID data, they should access
8151 - * /dev/cpu/<cpu_nr>/cpuid instead.
8153 - static const char *const x86_cap_flags[] = {
8154 - /* Intel-defined */
8155 - "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
8156 - "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
8157 - "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
8158 - "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
8161 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8162 - NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
8163 - NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
8164 - NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
8165 - "3dnowext", "3dnow",
8167 - /* Transmeta-defined */
8168 - "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
8169 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8170 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8171 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8173 - /* Other (Linux-defined) */
8174 - "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
8175 - NULL, NULL, NULL, NULL,
8176 - "constant_tsc", "up", NULL, "arch_perfmon",
8177 - "pebs", "bts", NULL, "sync_rdtsc",
8178 - "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8179 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8181 - /* Intel-defined (#2) */
8182 - "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
8183 - "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
8184 - NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt",
8185 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8187 - /* VIA/Cyrix/Centaur-defined */
8188 - NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
8189 - "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
8190 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8191 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8193 - /* AMD-defined (#2) */
8194 - "lahf_lm", "cmp_legacy", "svm", "extapic",
8195 - "cr8_legacy", "abm", "sse4a", "misalignsse",
8196 - "3dnowprefetch", "osvw", "ibs", "sse5",
8197 - "skinit", "wdt", NULL, NULL,
8198 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8199 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8201 - /* Auxiliary (Linux-defined) */
8202 - "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8203 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8204 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8205 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
8207 - static const char *const x86_power_flags[] = {
8208 - "ts", /* temperature sensor */
8209 - "fid", /* frequency id control */
8210 - "vid", /* voltage id control */
8211 - "ttp", /* thermal trip */
8216 - "", /* tsc invariant mapped to constant_tsc */
8226 - seq_printf(m,"processor\t: %u\n"
8227 - "vendor_id\t: %s\n"
8228 - "cpu family\t: %d\n"
8230 - "model name\t: %s\n",
8232 - c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
8234 - (int)c->x86_model,
8235 - c->x86_model_id[0] ? c->x86_model_id : "unknown");
8237 + seq_printf(m, "processor\t: %u\n"
8238 + "vendor_id\t: %s\n"
8239 + "cpu family\t: %d\n"
8241 + "model name\t: %s\n",
8243 + c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
8245 + (int)c->x86_model,
8246 + c->x86_model_id[0] ? c->x86_model_id : "unknown");
8248 if (c->x86_mask || c->cpuid_level >= 0)
8249 seq_printf(m, "stepping\t: %d\n", c->x86_mask);
8251 seq_printf(m, "stepping\t: unknown\n");
8253 - if (cpu_has(c,X86_FEATURE_TSC)) {
8255 + if (cpu_has(c, X86_FEATURE_TSC)) {
8256 unsigned int freq = cpufreq_quick_get((unsigned)cpu);
8260 seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
8261 - freq / 1000, (freq % 1000));
8262 + freq / 1000, (freq % 1000));
8266 - if (c->x86_cache_size >= 0)
8267 + if (c->x86_cache_size >= 0)
8268 seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
8272 if (smp_num_siblings * c->x86_max_cores > 1) {
8273 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
8274 @@ -1364,48 +1353,43 @@ static int show_cpuinfo(struct seq_file
8275 seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
8276 seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
8283 - "fpu_exception\t: yes\n"
8284 - "cpuid level\t: %d\n"
8288 + "fpu_exception\t: yes\n"
8289 + "cpuid level\t: %d\n"
8296 - for ( i = 0 ; i < 32*NCAPINTS ; i++ )
8297 - if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
8298 - seq_printf(m, " %s", x86_cap_flags[i]);
8301 + for (i = 0; i < 32*NCAPINTS; i++)
8302 + if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
8303 + seq_printf(m, " %s", x86_cap_flags[i]);
8305 seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
8306 c->loops_per_jiffy/(500000/HZ),
8307 (c->loops_per_jiffy/(5000/HZ)) % 100);
8309 - if (c->x86_tlbsize > 0)
8310 + if (c->x86_tlbsize > 0)
8311 seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
8312 seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
8313 seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
8315 - seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
8316 + seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
8317 c->x86_phys_bits, c->x86_virt_bits);
8319 seq_printf(m, "power management:");
8322 - for (i = 0; i < 32; i++)
8323 - if (c->x86_power & (1 << i)) {
8324 - if (i < ARRAY_SIZE(x86_power_flags) &&
8325 - x86_power_flags[i])
8326 - seq_printf(m, "%s%s",
8327 - x86_power_flags[i][0]?" ":"",
8328 - x86_power_flags[i]);
8330 - seq_printf(m, " [%d]", i);
8332 + for (i = 0; i < 32; i++) {
8333 + if (c->x86_power & (1 << i)) {
8334 + if (i < ARRAY_SIZE(x86_power_flags) &&
8335 + x86_power_flags[i])
8336 + seq_printf(m, "%s%s",
8337 + x86_power_flags[i][0]?" ":"",
8338 + x86_power_flags[i]);
8340 + seq_printf(m, " [%d]", i);
8344 seq_printf(m, "\n\n");
8345 @@ -1432,8 +1416,8 @@ static void c_stop(struct seq_file *m, v
8349 -struct seq_operations cpuinfo_op = {
8351 +const struct seq_operations cpuinfo_op = {
8355 .show = show_cpuinfo,
8356 --- sle11-2009-10-16.orig/arch/x86/kernel/smp_32-xen.c 2009-02-16 16:18:36.000000000 +0100
8357 +++ sle11-2009-10-16/arch/x86/kernel/smp_32-xen.c 2009-03-16 16:33:40.000000000 +0100
8358 @@ -168,7 +168,7 @@ void __send_IPI_shortcut(unsigned int sh
8362 -void fastcall send_IPI_self(int vector)
8363 +void send_IPI_self(int vector)
8365 __send_IPI_shortcut(APIC_DEST_SELF, vector);
8367 @@ -224,13 +224,14 @@ static DEFINE_SPINLOCK(tlbstate_lock);
8368 * We need to reload %cr3 since the page tables may be going
8369 * away from under us..
8371 -void leave_mm(unsigned long cpu)
8372 +void leave_mm(int cpu)
8374 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
8376 cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
8377 load_cr3(swapper_pg_dir);
8379 +EXPORT_SYMBOL_GPL(leave_mm);
8383 --- sle11-2009-10-16.orig/arch/x86/kernel/smp_64-xen.c 2009-02-16 16:18:36.000000000 +0100
8384 +++ sle11-2009-10-16/arch/x86/kernel/smp_64-xen.c 2009-03-16 16:33:40.000000000 +0100
8389 - * Smarter SMP flushing macros.
8390 + * Smarter SMP flushing macros.
8391 * c/o Linus Torvalds.
8393 * These mean you can really definitely utterly forget about
8396 * Optimizations Manfred Spraul <manfred@colorfullife.com>
8398 - * More scalable flush, from Andi Kleen
8399 + * More scalable flush, from Andi Kleen
8401 - * To avoid global state use 8 different call vectors.
8402 - * Each CPU uses a specific vector to trigger flushes on other
8403 - * CPUs. Depending on the received vector the target CPUs look into
8404 + * To avoid global state use 8 different call vectors.
8405 + * Each CPU uses a specific vector to trigger flushes on other
8406 + * CPUs. Depending on the received vector the target CPUs look into
8407 * the right per cpu variable for the flush data.
8409 - * With more than 8 CPUs they are hashed to the 8 available
8410 - * vectors. The limited global vector space forces us to this right now.
8411 + * With more than 8 CPUs they are hashed to the 8 available
8412 + * vectors. The limited global vector space forces us to this right now.
8413 * In future when interrupts are split into per CPU domains this could be
8414 * fixed, at the cost of triggering multiple IPIs in some cases.
8416 @@ -59,7 +59,6 @@ union smp_flush_state {
8417 cpumask_t flush_cpumask;
8418 struct mm_struct *flush_mm;
8419 unsigned long flush_va;
8420 -#define FLUSH_ALL -1ULL
8421 spinlock_t tlbstate_lock;
8423 char pad[SMP_CACHE_BYTES];
8424 @@ -71,16 +70,17 @@ union smp_flush_state {
8425 static DEFINE_PER_CPU(union smp_flush_state, flush_state);
8428 - * We cannot call mmdrop() because we are in interrupt context,
8429 + * We cannot call mmdrop() because we are in interrupt context,
8430 * instead update mm->cpu_vm_mask.
8432 -static inline void leave_mm(unsigned long cpu)
8433 +void leave_mm(int cpu)
8435 if (read_pda(mmu_state) == TLBSTATE_OK)
8437 cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
8438 load_cr3(swapper_pg_dir);
8440 +EXPORT_SYMBOL_GPL(leave_mm);
8444 @@ -89,25 +89,25 @@ static inline void leave_mm(unsigned lon
8445 * 1) switch_mm() either 1a) or 1b)
8446 * 1a) thread switch to a different mm
8447 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
8448 - * Stop ipi delivery for the old mm. This is not synchronized with
8449 - * the other cpus, but smp_invalidate_interrupt ignore flush ipis
8450 - * for the wrong mm, and in the worst case we perform a superfluous
8452 + * Stop ipi delivery for the old mm. This is not synchronized with
8453 + * the other cpus, but smp_invalidate_interrupt ignore flush ipis
8454 + * for the wrong mm, and in the worst case we perform a superfluous
8456 * 1a2) set cpu mmu_state to TLBSTATE_OK
8457 - * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
8458 + * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
8459 * was in lazy tlb mode.
8460 * 1a3) update cpu active_mm
8461 - * Now cpu0 accepts tlb flushes for the new mm.
8462 + * Now cpu0 accepts tlb flushes for the new mm.
8463 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
8464 - * Now the other cpus will send tlb flush ipis.
8465 + * Now the other cpus will send tlb flush ipis.
8467 * 1b) thread switch without mm change
8468 * cpu active_mm is correct, cpu0 already handles
8470 * 1b1) set cpu mmu_state to TLBSTATE_OK
8471 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
8472 - * Atomically set the bit [other cpus will start sending flush ipis],
8473 - * and test the bit.
8474 + * Atomically set the bit [other cpus will start sending flush ipis],
8475 + * and test the bit.
8476 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
8477 * 2) switch %%esp, ie current
8479 @@ -141,12 +141,12 @@ asmlinkage void smp_invalidate_interrupt
8480 * orig_rax contains the negated interrupt vector.
8481 * Use that to determine where the sender put the data.
8483 - sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START;
8484 + sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
8485 f = &per_cpu(flush_state, sender);
8487 if (!cpu_isset(cpu, f->flush_cpumask))
8491 * This was a BUG() but until someone can quote me the
8492 * line from the intel manual that guarantees an IPI to
8493 * multiple CPUs is retried _only_ on the erroring CPUs
8494 @@ -154,10 +154,10 @@ asmlinkage void smp_invalidate_interrupt
8500 if (f->flush_mm == read_pda(active_mm)) {
8501 if (read_pda(mmu_state) == TLBSTATE_OK) {
8502 - if (f->flush_va == FLUSH_ALL)
8503 + if (f->flush_va == TLB_FLUSH_ALL)
8506 __flush_tlb_one(f->flush_va);
8507 @@ -170,19 +170,22 @@ out:
8508 add_pda(irq_tlb_count, 1);
8511 -static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
8513 +void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
8517 union smp_flush_state *f;
8518 + cpumask_t cpumask = *cpumaskp;
8520 /* Caller has disabled preemption */
8521 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
8522 f = &per_cpu(flush_state, sender);
8524 - /* Could avoid this lock when
8525 - num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
8526 - probably not worth checking this for a cache-hot lock. */
8528 + * Could avoid this lock when
8529 + * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
8530 + * probably not worth checking this for a cache-hot lock.
8532 spin_lock(&f->tlbstate_lock);
8535 @@ -206,14 +209,14 @@ static void flush_tlb_others(cpumask_t c
8536 int __cpuinit init_smp_flush(void)
8540 for_each_cpu_mask(i, cpu_possible_map) {
8541 spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
8546 core_initcall(init_smp_flush);
8549 void flush_tlb_current_task(void)
8551 struct mm_struct *mm = current->mm;
8552 @@ -225,10 +228,9 @@ void flush_tlb_current_task(void)
8555 if (!cpus_empty(cpu_mask))
8556 - flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
8557 + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
8560 -EXPORT_SYMBOL(flush_tlb_current_task);
8562 void flush_tlb_mm (struct mm_struct * mm)
8564 @@ -245,11 +247,10 @@ void flush_tlb_mm (struct mm_struct * mm
8565 leave_mm(smp_processor_id());
8567 if (!cpus_empty(cpu_mask))
8568 - flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
8569 + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
8573 -EXPORT_SYMBOL(flush_tlb_mm);
8575 void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
8577 @@ -263,8 +264,8 @@ void flush_tlb_page(struct vm_area_struc
8578 if (current->active_mm == mm) {
8580 __flush_tlb_one(va);
8582 - leave_mm(smp_processor_id());
8584 + leave_mm(smp_processor_id());
8587 if (!cpus_empty(cpu_mask))
8588 @@ -272,7 +273,6 @@ void flush_tlb_page(struct vm_area_struc
8592 -EXPORT_SYMBOL(flush_tlb_page);
8594 static void do_flush_tlb_all(void* info)
8596 @@ -330,11 +330,9 @@ void unlock_ipi_call_lock(void)
8597 * this function sends a 'generic call function' IPI to all other CPU
8598 * of the system defined in the mask.
8602 -__smp_call_function_mask(cpumask_t mask,
8603 - void (*func)(void *), void *info,
8605 +static int __smp_call_function_mask(cpumask_t mask,
8606 + void (*func)(void *), void *info,
8609 struct call_data_struct data;
8610 cpumask_t allbutself;
8611 @@ -422,11 +420,10 @@ EXPORT_SYMBOL(smp_call_function_mask);
8614 int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
8615 - int nonatomic, int wait)
8616 + int nonatomic, int wait)
8618 /* prevent preemption and reschedule on another processor */
8620 - int me = get_cpu();
8621 + int ret, me = get_cpu();
8623 /* Can deadlock when called with interrupts disabled */
8624 WARN_ON(irqs_disabled());
8625 @@ -476,9 +473,9 @@ static void stop_this_cpu(void *dummy)
8627 cpu_clear(smp_processor_id(), cpu_online_map);
8628 disable_all_local_evtchn();
8635 void smp_send_stop(void)
8637 --- sle11-2009-10-16.orig/arch/x86/kernel/time_32-xen.c 2009-10-28 14:57:59.000000000 +0100
8638 +++ sle11-2009-10-16/arch/x86/kernel/time_32-xen.c 2009-10-28 14:58:05.000000000 +0100
8640 * serialize accesses to xtime/lost_ticks).
8643 -#include <linux/errno.h>
8644 -#include <linux/sched.h>
8645 -#include <linux/kernel.h>
8646 -#include <linux/param.h>
8647 -#include <linux/string.h>
8648 -#include <linux/mm.h>
8649 +#include <linux/init.h>
8650 #include <linux/interrupt.h>
8651 #include <linux/time.h>
8652 -#include <linux/delay.h>
8653 -#include <linux/init.h>
8654 -#include <linux/smp.h>
8655 -#include <linux/module.h>
8656 -#include <linux/sysdev.h>
8657 -#include <linux/bcd.h>
8658 -#include <linux/efi.h>
8659 #include <linux/mca.h>
8660 #include <linux/sysctl.h>
8661 #include <linux/percpu.h>
8663 #include <linux/posix-timers.h>
8664 #include <linux/cpufreq.h>
8665 #include <linux/clocksource.h>
8666 +#include <linux/sysdev.h>
8668 -#include <asm/io.h>
8669 -#include <asm/smp.h>
8670 -#include <asm/irq.h>
8671 -#include <asm/msr.h>
8672 #include <asm/delay.h>
8673 -#include <asm/mpspec.h>
8674 -#include <asm/uaccess.h>
8675 -#include <asm/processor.h>
8676 -#include <asm/timer.h>
8677 #include <asm/time.h>
8678 -#include <asm/sections.h>
8680 -#include "mach_time.h"
8682 -#include <linux/timex.h>
8684 -#include <asm/hpet.h>
8686 -#include <asm/arch_hooks.h>
8688 #include <xen/evtchn.h>
8689 #include <xen/sysctl.h>
8690 @@ -89,9 +61,6 @@ volatile unsigned long __jiffies __secti
8691 unsigned int cpu_khz; /* Detected as we calibrate the TSC */
8692 EXPORT_SYMBOL(cpu_khz);
8694 -DEFINE_SPINLOCK(rtc_lock);
8695 -EXPORT_SYMBOL(rtc_lock);
8697 /* These are peridically updated in shared_info, and then copied here. */
8698 struct shadow_time_info {
8699 u64 tsc_timestamp; /* TSC at last update of time vals. */
8700 @@ -154,6 +123,11 @@ static int __init __independent_wallcloc
8702 __setup("independent_wallclock", __independent_wallclock);
8704 +int xen_independent_wallclock(void)
8706 + return independent_wallclock;
8709 /* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
8710 static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
8711 static int __init __permitted_clock_jitter(char *str)
8712 @@ -223,7 +197,6 @@ static inline u64 get64(volatile u64 *pt
8713 return cmpxchg64(ptr, 0, 0);
8716 -#define cmpxchg64 cmpxchg
8720 @@ -233,7 +206,6 @@ static inline u64 get64_local(volatile u
8721 return cmpxchg64_local(ptr, 0, 0);
8724 -#define cmpxchg64_local cmpxchg_local
8728 @@ -339,35 +311,6 @@ static inline int time_values_up_to_date
8729 return (dst->version == src->version);
8733 - * This is a special lock that is owned by the CPU and holds the index
8734 - * register we are working with. It is required for NMI access to the
8735 - * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
8737 -volatile unsigned long cmos_lock = 0;
8738 -EXPORT_SYMBOL(cmos_lock);
8740 -/* Routines for accessing the CMOS RAM/RTC. */
8741 -unsigned char rtc_cmos_read(unsigned char addr)
8743 - unsigned char val;
8744 - lock_cmos_prefix(addr);
8745 - outb_p(addr, RTC_PORT(0));
8746 - val = inb_p(RTC_PORT(1));
8747 - lock_cmos_suffix(addr);
8750 -EXPORT_SYMBOL(rtc_cmos_read);
8752 -void rtc_cmos_write(unsigned char val, unsigned char addr)
8754 - lock_cmos_prefix(addr);
8755 - outb_p(addr, RTC_PORT(0));
8756 - outb_p(val, RTC_PORT(1));
8757 - lock_cmos_suffix(addr);
8759 -EXPORT_SYMBOL(rtc_cmos_write);
8761 static void sync_xen_wallclock(unsigned long dummy);
8762 static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
8763 static void sync_xen_wallclock(unsigned long dummy)
8764 @@ -376,7 +319,8 @@ static void sync_xen_wallclock(unsigned
8766 struct xen_platform_op op;
8768 - if (!ntp_synced() || independent_wallclock || !is_initial_xendomain())
8769 + BUG_ON(!is_initial_xendomain());
8770 + if (!ntp_synced() || independent_wallclock)
8773 write_seqlock_irq(&xtime_lock);
8774 @@ -399,23 +343,6 @@ static void sync_xen_wallclock(unsigned
8775 mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
8778 -static int set_rtc_mmss(unsigned long nowtime)
8781 - unsigned long flags;
8783 - if (independent_wallclock || !is_initial_xendomain())
8786 - /* gets recalled with irq locally disabled */
8787 - /* XXX - does irqsave resolve this? -johnstul */
8788 - spin_lock_irqsave(&rtc_lock, flags);
8789 - retval = set_wallclock(nowtime);
8790 - spin_unlock_irqrestore(&rtc_lock, flags);
8795 static unsigned long long local_clock(void)
8797 unsigned int cpu = get_cpu();
8798 @@ -498,28 +425,24 @@ unsigned long profile_pc(struct pt_regs
8800 #if defined(CONFIG_SMP) || defined(__x86_64__)
8802 - if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs)
8803 + if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs)
8805 if (!user_mode(regs)
8807 && in_lock_functions(pc)) {
8808 # ifdef CONFIG_FRAME_POINTER
8810 - return ((unsigned long *)regs->ebp)[1];
8812 - return ((unsigned long *)regs->rbp)[1];
8814 + return ((unsigned long *)regs->bp)[1];
8817 - unsigned long *sp = (unsigned long *)&regs->esp;
8818 + unsigned long *sp = (unsigned long *)&regs->sp;
8820 - unsigned long *sp = (unsigned long *)regs->rsp;
8821 + unsigned long *sp = (unsigned long *)regs->sp;
8824 /* Return address is either directly at stack pointer
8825 - or above a saved eflags. Eflags has bits 22-31 zero,
8826 + or above a saved flags word. Eflags has bits 22-31 zero,
8827 kernel addresses don't. */
8833 @@ -749,25 +672,32 @@ static void init_missing_ticks_accountin
8834 runstate->time[RUNSTATE_offline];
8837 -/* not static: needed by APM */
8838 -unsigned long read_persistent_clock(void)
8839 +unsigned long xen_read_persistent_clock(void)
8841 - unsigned long retval;
8842 - unsigned long flags;
8844 - spin_lock_irqsave(&rtc_lock, flags);
8845 + const shared_info_t *s = HYPERVISOR_shared_info;
8846 + u32 version, sec, nsec;
8849 - retval = get_wallclock();
8851 + version = s->wc_version;
8854 + nsec = s->wc_nsec;
8856 + } while ((s->wc_version & 1) | (version ^ s->wc_version));
8858 - spin_unlock_irqrestore(&rtc_lock, flags);
8859 + delta = local_clock() + (u64)sec * NSEC_PER_SEC + nsec;
8860 + do_div(delta, NSEC_PER_SEC);
8866 -int update_persistent_clock(struct timespec now)
8867 +int xen_update_persistent_clock(void)
8869 + if (!is_initial_xendomain())
8871 mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
8872 - return set_rtc_mmss(now.tv_sec);
8876 extern void (*late_time_init)(void);
8877 --- sle11-2009-10-16.orig/arch/x86/kernel/traps_32-xen.c 2009-02-16 16:18:36.000000000 +0100
8878 +++ sle11-2009-10-16/arch/x86/kernel/traps_32-xen.c 2009-03-16 16:33:40.000000000 +0100
8879 @@ -79,7 +79,8 @@ char ignore_fpu_irq = 0;
8880 * F0 0F bug workaround.. We have a special link segment
8883 -struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
8884 +gate_desc idt_table[256]
8885 + __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
8888 asmlinkage void divide_error(void);
8889 @@ -109,6 +110,34 @@ asmlinkage void machine_check(void);
8890 int kstack_depth_to_print = 24;
8891 static unsigned int code_bytes = 64;
8893 +void printk_address(unsigned long address, int reliable)
8895 +#ifdef CONFIG_KALLSYMS
8896 + unsigned long offset = 0, symsize;
8897 + const char *symname;
8899 + char *delim = ":";
8900 + char namebuf[128];
8901 + char reliab[4] = "";
8903 + symname = kallsyms_lookup(address, &symsize, &offset,
8904 + &modname, namebuf);
8906 + printk(" [<%08lx>]\n", address);
8910 + strcpy(reliab, "? ");
8913 + modname = delim = "";
8914 + printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
8915 + address, reliab, delim, modname, delim, symname, offset, symsize);
8917 + printk(" [<%08lx>]\n", address);
8921 static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
8923 return p > (void *)tinfo &&
8924 @@ -122,48 +151,35 @@ struct stack_frame {
8927 static inline unsigned long print_context_stack(struct thread_info *tinfo,
8928 - unsigned long *stack, unsigned long ebp,
8929 + unsigned long *stack, unsigned long bp,
8930 const struct stacktrace_ops *ops, void *data)
8932 -#ifdef CONFIG_FRAME_POINTER
8933 - struct stack_frame *frame = (struct stack_frame *)ebp;
8934 - while (valid_stack_ptr(tinfo, frame, sizeof(*frame))) {
8935 - struct stack_frame *next;
8936 - unsigned long addr;
8937 + struct stack_frame *frame = (struct stack_frame *)bp;
8939 - addr = frame->return_address;
8940 - ops->address(data, addr);
8942 - * break out of recursive entries (such as
8943 - * end_of_stack_stop_unwind_function). Also,
8944 - * we can never allow a frame pointer to
8947 - next = frame->next_frame;
8948 - if (next <= frame)
8953 while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) {
8957 - if (__kernel_text_address(addr))
8958 - ops->address(data, addr);
8960 + if (__kernel_text_address(addr)) {
8961 + if ((unsigned long) stack == bp + 4) {
8962 + ops->address(data, addr, 1);
8963 + frame = frame->next_frame;
8964 + bp = (unsigned long) frame;
8966 + ops->address(data, addr, bp == 0);
8976 #define MSG(msg) ops->warning(data, msg)
8978 void dump_trace(struct task_struct *task, struct pt_regs *regs,
8979 - unsigned long *stack,
8980 + unsigned long *stack, unsigned long bp,
8981 const struct stacktrace_ops *ops, void *data)
8983 - unsigned long ebp = 0;
8988 @@ -171,17 +187,17 @@ void dump_trace(struct task_struct *task
8989 unsigned long dummy;
8991 if (task != current)
8992 - stack = (unsigned long *)task->thread.esp;
8993 + stack = (unsigned long *)task->thread.sp;
8996 #ifdef CONFIG_FRAME_POINTER
8999 if (task == current) {
9000 - /* Grab ebp right from our regs */
9001 - asm ("movl %%ebp, %0" : "=r" (ebp) : );
9002 + /* Grab bp right from our regs */
9003 + asm ("movl %%ebp, %0" : "=r" (bp) : );
9005 - /* ebp is the last reg pushed by switch_to */
9006 - ebp = *(unsigned long *) task->thread.esp;
9007 + /* bp is the last reg pushed by switch_to */
9008 + bp = *(unsigned long *) task->thread.sp;
9012 @@ -190,7 +206,7 @@ void dump_trace(struct task_struct *task
9013 struct thread_info *context;
9014 context = (struct thread_info *)
9015 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
9016 - ebp = print_context_stack(context, stack, ebp, ops, data);
9017 + bp = print_context_stack(context, stack, bp, ops, data);
9018 /* Should be after the line below, but somewhere
9019 in early boot context comes out corrupted and we
9020 can't reference it -AK */
9021 @@ -225,9 +241,11 @@ static int print_trace_stack(void *data,
9023 * Print one address/symbol entries per line.
9025 -static void print_trace_address(void *data, unsigned long addr)
9026 +static void print_trace_address(void *data, unsigned long addr, int reliable)
9028 printk("%s [<%08lx>] ", (char *)data, addr);
9031 print_symbol("%s\n", addr);
9032 touch_nmi_watchdog();
9034 @@ -241,32 +259,32 @@ static const struct stacktrace_ops print
9037 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
9038 - unsigned long * stack, char *log_lvl)
9039 + unsigned long *stack, unsigned long bp, char *log_lvl)
9041 - dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
9042 + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
9043 printk("%s =======================\n", log_lvl);
9046 void show_trace(struct task_struct *task, struct pt_regs *regs,
9047 - unsigned long * stack)
9048 + unsigned long *stack, unsigned long bp)
9050 - show_trace_log_lvl(task, regs, stack, "");
9051 + show_trace_log_lvl(task, regs, stack, bp, "");
9054 static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
9055 - unsigned long *esp, char *log_lvl)
9056 + unsigned long *sp, unsigned long bp, char *log_lvl)
9058 unsigned long *stack;
9061 - if (esp == NULL) {
9064 - esp = (unsigned long*)task->thread.esp;
9065 + sp = (unsigned long*)task->thread.sp;
9067 - esp = (unsigned long *)&esp;
9068 + sp = (unsigned long *)&sp;
9073 for(i = 0; i < kstack_depth_to_print; i++) {
9074 if (kstack_end(stack))
9076 @@ -275,13 +293,13 @@ static void show_stack_log_lvl(struct ta
9077 printk("%08lx ", *stack++);
9079 printk("\n%sCall Trace:\n", log_lvl);
9080 - show_trace_log_lvl(task, regs, esp, log_lvl);
9081 + show_trace_log_lvl(task, regs, sp, bp, log_lvl);
9084 -void show_stack(struct task_struct *task, unsigned long *esp)
9085 +void show_stack(struct task_struct *task, unsigned long *sp)
9088 - show_stack_log_lvl(task, NULL, esp, "");
9089 + show_stack_log_lvl(task, NULL, sp, 0, "");
9093 @@ -290,13 +308,19 @@ void show_stack(struct task_struct *task
9094 void dump_stack(void)
9096 unsigned long stack;
9097 + unsigned long bp = 0;
9099 +#ifdef CONFIG_FRAME_POINTER
9101 + asm("movl %%ebp, %0" : "=r" (bp):);
9104 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
9105 current->pid, current->comm, print_tainted(),
9106 init_utsname()->release,
9107 (int)strcspn(init_utsname()->version, " "),
9108 init_utsname()->version);
9109 - show_trace(current, NULL, &stack);
9110 + show_trace(current, NULL, &stack, bp);
9113 EXPORT_SYMBOL(dump_stack);
9114 @@ -315,30 +339,30 @@ void show_registers(struct pt_regs *regs
9115 * time of the fault..
9117 if (!user_mode_vm(regs)) {
9120 unsigned int code_prologue = code_bytes * 43 / 64;
9121 unsigned int code_len = code_bytes;
9124 printk("\n" KERN_EMERG "Stack: ");
9125 - show_stack_log_lvl(NULL, regs, &regs->esp, KERN_EMERG);
9126 + show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
9128 printk(KERN_EMERG "Code: ");
9130 - eip = (u8 *)regs->eip - code_prologue;
9131 - if (eip < (u8 *)PAGE_OFFSET ||
9132 - probe_kernel_address(eip, c)) {
9133 + ip = (u8 *)regs->ip - code_prologue;
9134 + if (ip < (u8 *)PAGE_OFFSET ||
9135 + probe_kernel_address(ip, c)) {
9136 /* try starting at EIP */
9137 - eip = (u8 *)regs->eip;
9138 + ip = (u8 *)regs->ip;
9139 code_len = code_len - code_prologue + 1;
9141 - for (i = 0; i < code_len; i++, eip++) {
9142 - if (eip < (u8 *)PAGE_OFFSET ||
9143 - probe_kernel_address(eip, c)) {
9144 + for (i = 0; i < code_len; i++, ip++) {
9145 + if (ip < (u8 *)PAGE_OFFSET ||
9146 + probe_kernel_address(ip, c)) {
9147 printk(" Bad EIP value.");
9150 - if (eip == (u8 *)regs->eip)
9151 + if (ip == (u8 *)regs->ip)
9152 printk("<%02x> ", c);
9155 @@ -347,18 +371,57 @@ void show_registers(struct pt_regs *regs
9159 -int is_valid_bugaddr(unsigned long eip)
9160 +int is_valid_bugaddr(unsigned long ip)
9164 - if (eip < PAGE_OFFSET)
9165 + if (ip < PAGE_OFFSET)
9167 - if (probe_kernel_address((unsigned short *)eip, ud2))
9168 + if (probe_kernel_address((unsigned short *)ip, ud2))
9171 return ud2 == 0x0b0f;
9174 +static int die_counter;
9176 +int __kprobes __die(const char * str, struct pt_regs * regs, long err)
9179 + unsigned short ss;
9181 + printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
9182 +#ifdef CONFIG_PREEMPT
9183 + printk("PREEMPT ");
9188 +#ifdef CONFIG_DEBUG_PAGEALLOC
9189 + printk("DEBUG_PAGEALLOC");
9193 + if (notify_die(DIE_OOPS, str, regs, err,
9194 + current->thread.trap_no, SIGSEGV) !=
9196 + show_registers(regs);
9197 + /* Executive summary in case the oops scrolled away */
9198 + sp = (unsigned long) (&regs->sp);
9199 + savesegment(ss, ss);
9200 + if (user_mode(regs)) {
9202 + ss = regs->ss & 0xffff;
9204 + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
9205 + print_symbol("%s", regs->ip);
9206 + printk(" SS:ESP %04x:%08lx\n", ss, sp);
9214 * This is gone through when something in the kernel has done something bad and
9215 * is about to be terminated.
9216 @@ -374,7 +437,6 @@ void die(const char * str, struct pt_reg
9218 .lock_owner_depth = 0
9220 - static int die_counter;
9221 unsigned long flags;
9224 @@ -390,43 +452,13 @@ void die(const char * str, struct pt_reg
9225 raw_local_irq_save(flags);
9227 if (++die.lock_owner_depth < 3) {
9228 - unsigned long esp;
9229 - unsigned short ss;
9231 - report_bug(regs->eip, regs);
9233 - printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff,
9235 -#ifdef CONFIG_PREEMPT
9236 - printk("PREEMPT ");
9241 -#ifdef CONFIG_DEBUG_PAGEALLOC
9242 - printk("DEBUG_PAGEALLOC");
9245 + report_bug(regs->ip, regs);
9247 - if (notify_die(DIE_OOPS, str, regs, err,
9248 - current->thread.trap_no, SIGSEGV) !=
9250 - show_registers(regs);
9251 - /* Executive summary in case the oops scrolled away */
9252 - esp = (unsigned long) (&regs->esp);
9253 - savesegment(ss, ss);
9254 - if (user_mode(regs)) {
9256 - ss = regs->xss & 0xffff;
9258 - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip);
9259 - print_symbol("%s", regs->eip);
9260 - printk(" SS:ESP %04x:%08lx\n", ss, esp);
9263 + if (__die(str, regs, err))
9267 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
9271 die.lock_owner = -1;
9272 @@ -462,7 +494,7 @@ static void __kprobes do_trap(int trapnr
9274 struct task_struct *tsk = current;
9276 - if (regs->eflags & VM_MASK) {
9277 + if (regs->flags & VM_MASK) {
9281 @@ -508,7 +540,7 @@ static void __kprobes do_trap(int trapnr
9284 #define DO_ERROR(trapnr, signr, str, name) \
9285 -fastcall void do_##name(struct pt_regs * regs, long error_code) \
9286 +void do_##name(struct pt_regs * regs, long error_code) \
9288 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
9290 @@ -517,7 +549,7 @@ fastcall void do_##name(struct pt_regs *
9293 #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
9294 -fastcall void do_##name(struct pt_regs * regs, long error_code) \
9295 +void do_##name(struct pt_regs * regs, long error_code) \
9299 @@ -533,7 +565,7 @@ fastcall void do_##name(struct pt_regs *
9302 #define DO_VM86_ERROR(trapnr, signr, str, name) \
9303 -fastcall void do_##name(struct pt_regs * regs, long error_code) \
9304 +void do_##name(struct pt_regs * regs, long error_code) \
9306 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
9308 @@ -542,7 +574,7 @@ fastcall void do_##name(struct pt_regs *
9311 #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
9312 -fastcall void do_##name(struct pt_regs * regs, long error_code) \
9313 +void do_##name(struct pt_regs * regs, long error_code) \
9316 info.si_signo = signr; \
9317 @@ -556,13 +588,13 @@ fastcall void do_##name(struct pt_regs *
9318 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
9321 -DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip)
9322 +DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
9323 #ifndef CONFIG_KPROBES
9324 DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
9326 DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
9327 DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
9328 -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip, 0)
9329 +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
9330 DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
9331 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
9332 DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
9333 @@ -570,10 +602,10 @@ DO_ERROR(12, SIGBUS, "stack segment", s
9334 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
9335 DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)
9337 -fastcall void __kprobes do_general_protection(struct pt_regs * regs,
9338 +void __kprobes do_general_protection(struct pt_regs * regs,
9341 - if (regs->eflags & VM_MASK)
9342 + if (regs->flags & VM_MASK)
9345 if (!user_mode(regs))
9346 @@ -582,11 +614,14 @@ fastcall void __kprobes do_general_prote
9347 current->thread.error_code = error_code;
9348 current->thread.trap_no = 13;
9349 if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
9350 - printk_ratelimit())
9351 + printk_ratelimit()) {
9353 - "%s[%d] general protection eip:%lx esp:%lx error:%lx\n",
9354 + "%s[%d] general protection ip:%lx sp:%lx error:%lx",
9355 current->comm, task_pid_nr(current),
9356 - regs->eip, regs->esp, error_code);
9357 + regs->ip, regs->sp, error_code);
9358 + print_vma_addr(" in ", regs->ip);
9362 force_sig(SIGSEGV, current);
9364 @@ -675,8 +710,8 @@ void __kprobes die_nmi(struct pt_regs *r
9367 printk(KERN_EMERG "%s", msg);
9368 - printk(" on CPU%d, eip %08lx, registers:\n",
9369 - smp_processor_id(), regs->eip);
9370 + printk(" on CPU%d, ip %08lx, registers:\n",
9371 + smp_processor_id(), regs->ip);
9372 show_registers(regs);
9374 spin_unlock(&nmi_print_lock);
9375 @@ -733,7 +768,7 @@ static __kprobes void default_do_nmi(str
9377 static int ignore_nmis;
9379 -fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code)
9380 +__kprobes void do_nmi(struct pt_regs * regs, long error_code)
9384 @@ -762,7 +797,7 @@ void restart_nmi(void)
9387 #ifdef CONFIG_KPROBES
9388 -fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
9389 +void __kprobes do_int3(struct pt_regs *regs, long error_code)
9391 trace_hardirqs_fixup();
9393 @@ -798,7 +833,7 @@ fastcall void __kprobes do_int3(struct p
9394 * find every occurrence of the TF bit that could be saved away even
9397 -fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code)
9398 +void __kprobes do_debug(struct pt_regs * regs, long error_code)
9400 unsigned int condition;
9401 struct task_struct *tsk = current;
9402 @@ -807,24 +842,30 @@ fastcall void __kprobes do_debug(struct
9404 get_debugreg(condition, 6);
9407 + * The processor cleared BTF, so don't mark that we need it set.
9409 + clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
9410 + tsk->thread.debugctlmsr = 0;
9412 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
9413 SIGTRAP) == NOTIFY_STOP)
9415 /* It's safe to allow irq's after DR6 has been saved */
9416 - if (regs->eflags & X86_EFLAGS_IF)
9417 + if (regs->flags & X86_EFLAGS_IF)
9420 /* Mask out spurious debug traps due to lazy DR7 setting */
9421 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
9422 - if (!tsk->thread.debugreg[7])
9423 + if (!tsk->thread.debugreg7)
9427 - if (regs->eflags & VM_MASK)
9428 + if (regs->flags & VM_MASK)
9431 /* Save debug status register where ptrace can see it */
9432 - tsk->thread.debugreg[6] = condition;
9433 + tsk->thread.debugreg6 = condition;
9436 * Single-stepping through TF: make sure we ignore any events in
9437 @@ -856,7 +897,7 @@ debug_vm86:
9440 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
9441 - regs->eflags &= ~TF_MASK;
9442 + regs->flags &= ~TF_MASK;
9446 @@ -865,7 +906,7 @@ clear_TF_reenable:
9447 * the correct behaviour even in the presence of the asynchronous
9450 -void math_error(void __user *eip)
9451 +void math_error(void __user *ip)
9453 struct task_struct * task;
9455 @@ -881,7 +922,7 @@ void math_error(void __user *eip)
9456 info.si_signo = SIGFPE;
9458 info.si_code = __SI_FAULT;
9459 - info.si_addr = eip;
9460 + info.si_addr = ip;
9462 * (~cwd & swd) will mask out exceptions that are not set to unmasked
9463 * status. 0x3f is the exception bits in these regs, 0x200 is the
9464 @@ -924,13 +965,13 @@ void math_error(void __user *eip)
9465 force_sig_info(SIGFPE, &info, task);
9468 -fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code)
9469 +void do_coprocessor_error(struct pt_regs * regs, long error_code)
9472 - math_error((void __user *)regs->eip);
9473 + math_error((void __user *)regs->ip);
9476 -static void simd_math_error(void __user *eip)
9477 +static void simd_math_error(void __user *ip)
9479 struct task_struct * task;
9481 @@ -946,7 +987,7 @@ static void simd_math_error(void __user
9482 info.si_signo = SIGFPE;
9484 info.si_code = __SI_FAULT;
9485 - info.si_addr = eip;
9486 + info.si_addr = ip;
9488 * The SIMD FPU exceptions are handled a little differently, as there
9489 * is only a single status/control register. Thus, to determine which
9490 @@ -978,19 +1019,19 @@ static void simd_math_error(void __user
9491 force_sig_info(SIGFPE, &info, task);
9494 -fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
9495 +void do_simd_coprocessor_error(struct pt_regs * regs,
9499 /* Handle SIMD FPU exceptions on PIII+ processors. */
9501 - simd_math_error((void __user *)regs->eip);
9502 + simd_math_error((void __user *)regs->ip);
9505 * Handle strange cache flush from user space exception
9506 * in all other cases. This is undocumented behaviour.
9508 - if (regs->eflags & VM_MASK) {
9509 + if (regs->flags & VM_MASK) {
9510 handle_vm86_fault((struct kernel_vm86_regs *)regs,
9513 @@ -1003,7 +1044,7 @@ fastcall void do_simd_coprocessor_error(
9517 -fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
9518 +void do_spurious_interrupt_bug(struct pt_regs * regs,
9522 @@ -1012,7 +1053,7 @@ fastcall void do_spurious_interrupt_bug(
9526 -fastcall unsigned long patch_espfix_desc(unsigned long uesp,
9527 +unsigned long patch_espfix_desc(unsigned long uesp,
9530 struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
9531 @@ -1072,7 +1113,7 @@ asmlinkage void math_emulate(long arg)
9532 * NB. All these are "trap gates" (i.e. events_mask isn't set) except
9533 * for those that specify <dpl>|4 in the second field.
9535 -static trap_info_t __cpuinitdata trap_table[] = {
9536 +static const trap_info_t __cpuinitconst trap_table[] = {
9537 { 0, 0, __KERNEL_CS, (unsigned long)divide_error },
9538 { 1, 0|4, __KERNEL_CS, (unsigned long)debug },
9539 { 3, 3|4, __KERNEL_CS, (unsigned long)int3 },
9540 @@ -1105,17 +1146,12 @@ void __init trap_init(void)
9542 printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
9545 + * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
9546 + * Generate a build-time error if the alignment is wrong.
9548 + BUILD_BUG_ON(offsetof(struct task_struct, thread.i387.fxsave) & 15);
9551 - * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
9552 - * Generates a compile-time "error: zero width for bit-field" if
9553 - * the alignment is wrong.
9555 - struct fxsrAlignAssert {
9556 - int _:!(offsetof(struct task_struct,
9557 - thread.i387.fxsave) & 15);
9560 printk(KERN_INFO "Enabling fast FPU save and restore... ");
9561 set_in_cr4(X86_CR4_OSFXSR);
9563 --- sle11-2009-10-16.orig/arch/x86/kernel/traps_64-xen.c 2009-02-16 16:18:36.000000000 +0100
9564 +++ sle11-2009-10-16/arch/x86/kernel/traps_64-xen.c 2009-03-16 16:33:40.000000000 +0100
9565 @@ -74,38 +74,41 @@ asmlinkage void alignment_check(void);
9566 asmlinkage void machine_check(void);
9567 asmlinkage void spurious_interrupt_bug(void);
9569 +static unsigned int code_bytes = 64;
9571 static inline void conditional_sti(struct pt_regs *regs)
9573 - if (regs->eflags & X86_EFLAGS_IF)
9574 + if (regs->flags & X86_EFLAGS_IF)
9578 static inline void preempt_conditional_sti(struct pt_regs *regs)
9580 - preempt_disable();
9581 - if (regs->eflags & X86_EFLAGS_IF)
9582 + inc_preempt_count();
9583 + if (regs->flags & X86_EFLAGS_IF)
9587 static inline void preempt_conditional_cli(struct pt_regs *regs)
9589 - if (regs->eflags & X86_EFLAGS_IF)
9590 + if (regs->flags & X86_EFLAGS_IF)
9591 local_irq_disable();
9592 /* Make sure to not schedule here because we could be running
9593 on an exception stack. */
9594 - preempt_enable_no_resched();
9595 + dec_preempt_count();
9598 int kstack_depth_to_print = 12;
9600 -#ifdef CONFIG_KALLSYMS
9601 -void printk_address(unsigned long address)
9602 +void printk_address(unsigned long address, int reliable)
9604 +#ifdef CONFIG_KALLSYMS
9605 unsigned long offset = 0, symsize;
9606 const char *symname;
9609 - char namebuf[128];
9610 + char namebuf[KSYM_NAME_LEN];
9611 + char reliab[4] = "";
9613 symname = kallsyms_lookup(address, &symsize, &offset,
9615 @@ -113,17 +116,17 @@ void printk_address(unsigned long addres
9616 printk(" [<%016lx>]\n", address);
9620 + strcpy(reliab, "? ");
9623 - modname = delim = "";
9624 - printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n",
9625 - address, delim, modname, delim, symname, offset, symsize);
9627 + modname = delim = "";
9628 + printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
9629 + address, reliab, delim, modname, delim, symname, offset, symsize);
9631 -void printk_address(unsigned long address)
9633 printk(" [<%016lx>]\n", address);
9638 static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
9639 unsigned *usedp, char **idp)
9640 @@ -210,14 +213,53 @@ static unsigned long *in_exception_stack
9641 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
9644 -static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
9645 +static inline int valid_stack_ptr(struct thread_info *tinfo,
9646 + void *p, unsigned int size, void *end)
9648 - void *t = (void *)tinfo;
9649 - return p > t && p < t + THREAD_SIZE - 3;
9652 + if (p < end && p >= (end-THREAD_SIZE))
9657 + return p > t && p < t + THREAD_SIZE - size;
9660 +/* The form of the top of the frame on the stack */
9661 +struct stack_frame {
9662 + struct stack_frame *next_frame;
9663 + unsigned long return_address;
9667 +static inline unsigned long print_context_stack(struct thread_info *tinfo,
9668 + unsigned long *stack, unsigned long bp,
9669 + const struct stacktrace_ops *ops, void *data,
9670 + unsigned long *end)
9672 + struct stack_frame *frame = (struct stack_frame *)bp;
9674 + while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
9675 + unsigned long addr;
9678 + if (__kernel_text_address(addr)) {
9679 + if ((unsigned long) stack == bp + 8) {
9680 + ops->address(data, addr, 1);
9681 + frame = frame->next_frame;
9682 + bp = (unsigned long) frame;
9684 + ops->address(data, addr, bp == 0);
9692 void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
9693 - unsigned long *stack,
9694 + unsigned long *stack, unsigned long bp,
9695 const struct stacktrace_ops *ops, void *data)
9697 const unsigned cpu = get_cpu();
9698 @@ -227,36 +269,28 @@ void dump_trace(struct task_struct *tsk,
9702 + tinfo = task_thread_info(tsk);
9705 unsigned long dummy;
9707 if (tsk && tsk != current)
9708 - stack = (unsigned long *)tsk->thread.rsp;
9709 + stack = (unsigned long *)tsk->thread.sp;
9713 - * Print function call entries within a stack. 'cond' is the
9714 - * "end of stackframe" condition, that the 'stack++'
9715 - * iteration will eventually trigger.
9717 -#define HANDLE_STACK(cond) \
9718 - do while (cond) { \
9719 - unsigned long addr = *stack++; \
9720 - /* Use unlocked access here because except for NMIs \
9721 - we should be already protected against module unloads */ \
9722 - if (__kernel_text_address(addr)) { \
9724 - * If the address is either in the text segment of the \
9725 - * kernel, or in the region which contains vmalloc'ed \
9726 - * memory, it *may* be the address of a calling \
9727 - * routine; if so, print it so that someone tracing \
9728 - * down the cause of the crash will be able to figure \
9729 - * out the call path that was taken. \
9731 - ops->address(data, addr); \
9734 +#ifdef CONFIG_FRAME_POINTER
9736 + if (tsk == current) {
9737 + /* Grab bp right from our regs */
9738 + asm("movq %%rbp, %0" : "=r" (bp):);
9740 + /* bp is the last reg pushed by switch_to */
9741 + bp = *(unsigned long *) tsk->thread.sp;
9749 * Print function call entries in all stacks, starting at the
9750 @@ -272,7 +306,9 @@ void dump_trace(struct task_struct *tsk,
9752 if (ops->stack(data, id) < 0)
9754 - HANDLE_STACK (stack < estack_end);
9756 + bp = print_context_stack(tinfo, stack, bp, ops,
9757 + data, estack_end);
9758 ops->stack(data, "<EOE>");
9760 * We link to the next stack via the
9761 @@ -290,7 +326,8 @@ void dump_trace(struct task_struct *tsk,
9762 if (stack >= irqstack && stack < irqstack_end) {
9763 if (ops->stack(data, "IRQ") < 0)
9765 - HANDLE_STACK (stack < irqstack_end);
9766 + bp = print_context_stack(tinfo, stack, bp,
9767 + ops, data, irqstack_end);
9769 * We link to the next stack (which would be
9770 * the process stack normally) the last
9771 @@ -308,9 +345,7 @@ void dump_trace(struct task_struct *tsk,
9773 * This handles the process stack:
9775 - tinfo = task_thread_info(tsk);
9776 - HANDLE_STACK (valid_stack_ptr(tinfo, stack));
9777 -#undef HANDLE_STACK
9778 + bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
9781 EXPORT_SYMBOL(dump_trace);
9782 @@ -333,10 +368,10 @@ static int print_trace_stack(void *data,
9786 -static void print_trace_address(void *data, unsigned long addr)
9787 +static void print_trace_address(void *data, unsigned long addr, int reliable)
9789 touch_nmi_watchdog();
9790 - printk_address(addr);
9791 + printk_address(addr, reliable);
9794 static const struct stacktrace_ops print_trace_ops = {
9795 @@ -347,15 +382,17 @@ static const struct stacktrace_ops print
9799 -show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack)
9800 +show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack,
9803 printk("\nCall Trace:\n");
9804 - dump_trace(tsk, regs, stack, &print_trace_ops, NULL);
9805 + dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL);
9810 -_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
9811 +_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
9814 unsigned long *stack;
9816 @@ -366,14 +403,14 @@ _show_stack(struct task_struct *tsk, str
9817 // debugging aid: "show_stack(NULL, NULL);" prints the
9818 // back trace for this cpu.
9820 - if (rsp == NULL) {
9823 - rsp = (unsigned long *)tsk->thread.rsp;
9824 + sp = (unsigned long *)tsk->thread.sp;
9826 - rsp = (unsigned long *)&rsp;
9827 + sp = (unsigned long *)&sp;
9832 for(i=0; i < kstack_depth_to_print; i++) {
9833 if (stack >= irqstack && stack <= irqstack_end) {
9834 if (stack == irqstack_end) {
9835 @@ -389,12 +426,12 @@ _show_stack(struct task_struct *tsk, str
9836 printk(" %016lx", *stack++);
9837 touch_nmi_watchdog();
9839 - show_trace(tsk, regs, rsp);
9840 + show_trace(tsk, regs, sp, bp);
9843 -void show_stack(struct task_struct *tsk, unsigned long * rsp)
9844 +void show_stack(struct task_struct *tsk, unsigned long * sp)
9846 - _show_stack(tsk, NULL, rsp);
9847 + _show_stack(tsk, NULL, sp, 0);
9851 @@ -403,13 +440,19 @@ void show_stack(struct task_struct *tsk,
9852 void dump_stack(void)
9854 unsigned long dummy;
9855 + unsigned long bp = 0;
9857 +#ifdef CONFIG_FRAME_POINTER
9859 + asm("movq %%rbp, %0" : "=r" (bp):);
9862 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
9863 current->pid, current->comm, print_tainted(),
9864 init_utsname()->release,
9865 (int)strcspn(init_utsname()->version, " "),
9866 init_utsname()->version);
9867 - show_trace(NULL, NULL, &dummy);
9868 + show_trace(NULL, NULL, &dummy, bp);
9871 EXPORT_SYMBOL(dump_stack);
9872 @@ -417,12 +460,15 @@ EXPORT_SYMBOL(dump_stack);
9873 void show_registers(struct pt_regs *regs)
9876 - int in_kernel = !user_mode(regs);
9877 - unsigned long rsp;
9879 const int cpu = smp_processor_id();
9880 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
9882 + unsigned int code_prologue = code_bytes * 43 / 64;
9883 + unsigned int code_len = code_bytes;
9887 + ip = (u8 *) regs->ip - code_prologue;
9888 printk("CPU %d ", cpu);
9890 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
9891 @@ -432,45 +478,43 @@ void show_registers(struct pt_regs *regs
9892 * When in-kernel, we also print out the stack and code at the
9893 * time of the fault..
9896 + if (!user_mode(regs)) {
9899 - _show_stack(NULL, regs, (unsigned long*)rsp);
9900 + _show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
9903 - printk("\nCode: ");
9904 - if (regs->rip < PAGE_OFFSET)
9907 - for (i=0; i<20; i++) {
9909 - if (__get_user(c, &((unsigned char*)regs->rip)[i])) {
9911 + printk(KERN_EMERG "Code: ");
9912 + if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
9913 + /* try starting at RIP */
9914 + ip = (u8 *) regs->ip;
9915 + code_len = code_len - code_prologue + 1;
9917 + for (i = 0; i < code_len; i++, ip++) {
9918 + if (ip < (u8 *)PAGE_OFFSET ||
9919 + probe_kernel_address(ip, c)) {
9920 printk(" Bad RIP value.");
9923 - printk("%02x ", c);
9924 + if (ip == (u8 *)regs->ip)
9925 + printk("<%02x> ", c);
9927 + printk("%02x ", c);
9933 -int is_valid_bugaddr(unsigned long rip)
9934 +int is_valid_bugaddr(unsigned long ip)
9938 - if (__copy_from_user(&ud2, (const void __user *) rip, sizeof(ud2)))
9939 + if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
9942 return ud2 == 0x0b0f;
9946 -void out_of_line_bug(void)
9950 -EXPORT_SYMBOL(out_of_line_bug);
9953 static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
9954 static int die_owner = -1;
9955 static unsigned int die_nest_count;
9956 @@ -498,7 +542,7 @@ unsigned __kprobes long oops_begin(void)
9960 -void __kprobes oops_end(unsigned long flags)
9961 +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
9965 @@ -507,12 +551,17 @@ void __kprobes oops_end(unsigned long fl
9966 /* Nest count reaches zero, release the lock. */
9967 __raw_spin_unlock(&die_lock);
9968 raw_local_irq_restore(flags);
9974 panic("Fatal exception");
9979 -void __kprobes __die(const char * str, struct pt_regs * regs, long err)
9980 +int __kprobes __die(const char * str, struct pt_regs * regs, long err)
9982 static int die_counter;
9983 printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
9984 @@ -526,15 +575,17 @@ void __kprobes __die(const char * str, s
9985 printk("DEBUG_PAGEALLOC");
9988 - notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV);
9989 + if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
9991 show_registers(regs);
9992 add_taint(TAINT_DIE);
9993 /* Executive summary in case the oops scrolled away */
9994 printk(KERN_ALERT "RIP ");
9995 - printk_address(regs->rip);
9996 - printk(" RSP <%016lx>\n", regs->rsp);
9997 + printk_address(regs->ip, 1);
9998 + printk(" RSP <%016lx>\n", regs->sp);
9999 if (kexec_should_crash(current))
10004 void die(const char * str, struct pt_regs * regs, long err)
10005 @@ -542,11 +593,11 @@ void die(const char * str, struct pt_reg
10006 unsigned long flags = oops_begin();
10008 if (!user_mode(regs))
10009 - report_bug(regs->rip, regs);
10010 + report_bug(regs->ip, regs);
10012 - __die(str, regs, err);
10014 - do_exit(SIGSEGV);
10015 + if (__die(str, regs, err))
10017 + oops_end(flags, regs, SIGSEGV);
10020 #if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_SYSCTL)
10021 @@ -564,10 +615,10 @@ void __kprobes die_nmi(char *str, struct
10023 if (do_panic || panic_on_oops)
10024 panic("Non maskable interrupt");
10026 + oops_end(flags, NULL, SIGBUS);
10028 local_irq_enable();
10029 - do_exit(SIGSEGV);
10034 @@ -592,11 +643,14 @@ static void __kprobes do_trap(int trapnr
10035 tsk->thread.trap_no = trapnr;
10037 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
10038 - printk_ratelimit())
10039 + printk_ratelimit()) {
10041 - "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
10042 + "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
10043 tsk->comm, tsk->pid, str,
10044 - regs->rip, regs->rsp, error_code);
10045 + regs->ip, regs->sp, error_code);
10046 + print_vma_addr(" in ", regs->ip);
10051 force_sig_info(signr, info, tsk);
10052 @@ -606,19 +660,12 @@ static void __kprobes do_trap(int trapnr
10056 - /* kernel trap */
10058 - const struct exception_table_entry *fixup;
10059 - fixup = search_exception_tables(regs->rip);
10061 - regs->rip = fixup->fixup;
10063 - tsk->thread.error_code = error_code;
10064 - tsk->thread.trap_no = trapnr;
10065 - die(str, regs, error_code);
10068 + if (!fixup_exception(regs)) {
10069 + tsk->thread.error_code = error_code;
10070 + tsk->thread.trap_no = trapnr;
10071 + die(str, regs, error_code);
10076 #define DO_ERROR(trapnr, signr, str, name) \
10077 @@ -647,10 +694,10 @@ asmlinkage void do_##name(struct pt_regs
10078 do_trap(trapnr, signr, str, regs, error_code, &info); \
10081 -DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip)
10082 +DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
10083 DO_ERROR( 4, SIGSEGV, "overflow", overflow)
10084 DO_ERROR( 5, SIGSEGV, "bounds", bounds)
10085 -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip)
10086 +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
10087 DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
10088 DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
10089 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
10090 @@ -698,32 +745,28 @@ asmlinkage void __kprobes do_general_pro
10091 tsk->thread.trap_no = 13;
10093 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
10094 - printk_ratelimit())
10095 + printk_ratelimit()) {
10097 - "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
10098 + "%s[%d] general protection ip:%lx sp:%lx error:%lx",
10099 tsk->comm, tsk->pid,
10100 - regs->rip, regs->rsp, error_code);
10101 + regs->ip, regs->sp, error_code);
10102 + print_vma_addr(" in ", regs->ip);
10106 force_sig(SIGSEGV, tsk);
10112 - const struct exception_table_entry *fixup;
10113 - fixup = search_exception_tables(regs->rip);
10115 - regs->rip = fixup->fixup;
10118 + if (fixup_exception(regs))
10121 - tsk->thread.error_code = error_code;
10122 - tsk->thread.trap_no = 13;
10123 - if (notify_die(DIE_GPF, "general protection fault", regs,
10124 - error_code, 13, SIGSEGV) == NOTIFY_STOP)
10126 - die("general protection fault", regs, error_code);
10128 + tsk->thread.error_code = error_code;
10129 + tsk->thread.trap_no = 13;
10130 + if (notify_die(DIE_GPF, "general protection fault", regs,
10131 + error_code, 13, SIGSEGV) == NOTIFY_STOP)
10133 + die("general protection fault", regs, error_code);
10136 static __kprobes void
10137 @@ -833,15 +876,15 @@ asmlinkage __kprobes struct pt_regs *syn
10139 struct pt_regs *regs = eregs;
10140 /* Did already sync */
10141 - if (eregs == (struct pt_regs *)eregs->rsp)
10142 + if (eregs == (struct pt_regs *)eregs->sp)
10144 /* Exception from user space */
10145 else if (user_mode(eregs))
10146 regs = task_pt_regs(current);
10147 /* Exception from kernel and interrupts are enabled. Move to
10148 kernel process stack. */
10149 - else if (eregs->eflags & X86_EFLAGS_IF)
10150 - regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs));
10151 + else if (eregs->flags & X86_EFLAGS_IF)
10152 + regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
10156 @@ -859,6 +902,12 @@ asmlinkage void __kprobes do_debug(struc
10158 get_debugreg(condition, 6);
10161 + * The processor cleared BTF, so don't mark that we need it set.
10163 + clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
10164 + tsk->thread.debugctlmsr = 0;
10166 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
10167 SIGTRAP) == NOTIFY_STOP)
10169 @@ -874,27 +923,14 @@ asmlinkage void __kprobes do_debug(struc
10171 tsk->thread.debugreg6 = condition;
10173 - /* Mask out spurious TF errors due to lazy TF clearing */
10176 + * Single-stepping through TF: make sure we ignore any events in
10177 + * kernel space (but re-enable TF when returning to user mode).
10179 if (condition & DR_STEP) {
10181 - * The TF error should be masked out only if the current
10182 - * process is not traced and if the TRAP flag has been set
10183 - * previously by a tracing process (condition detected by
10184 - * the PT_DTRACE flag); remember that the i386 TRAP flag
10185 - * can be modified by the process itself in user mode,
10186 - * allowing programs to debug themselves without the ptrace()
10189 if (!user_mode(regs))
10190 goto clear_TF_reenable;
10192 - * Was the TF flag set by a debugger? If so, clear it now,
10193 - * so that register information is correct.
10195 - if (tsk->ptrace & PT_DTRACE) {
10196 - regs->eflags &= ~TF_MASK;
10197 - tsk->ptrace &= ~PT_DTRACE;
10201 /* Ok, finally something we can handle */
10202 @@ -903,7 +939,7 @@ asmlinkage void __kprobes do_debug(struc
10203 info.si_signo = SIGTRAP;
10205 info.si_code = TRAP_BRKPT;
10206 - info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL;
10207 + info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
10208 force_sig_info(SIGTRAP, &info, tsk);
10211 @@ -913,18 +949,15 @@ clear_dr7:
10214 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
10215 - regs->eflags &= ~TF_MASK;
10216 + regs->flags &= ~X86_EFLAGS_TF;
10217 preempt_conditional_cli(regs);
10220 static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
10222 - const struct exception_table_entry *fixup;
10223 - fixup = search_exception_tables(regs->rip);
10225 - regs->rip = fixup->fixup;
10226 + if (fixup_exception(regs))
10230 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
10231 /* Illegal floating point operation in the kernel */
10232 current->thread.trap_no = trapnr;
10233 @@ -939,7 +972,7 @@ static int kernel_math_error(struct pt_r
10235 asmlinkage void do_coprocessor_error(struct pt_regs *regs)
10237 - void __user *rip = (void __user *)(regs->rip);
10238 + void __user *ip = (void __user *)(regs->ip);
10239 struct task_struct * task;
10241 unsigned short cwd, swd;
10242 @@ -959,7 +992,7 @@ asmlinkage void do_coprocessor_error(str
10243 info.si_signo = SIGFPE;
10245 info.si_code = __SI_FAULT;
10246 - info.si_addr = rip;
10247 + info.si_addr = ip;
10249 * (~cwd & swd) will mask out exceptions that are not set to unmasked
10250 * status. 0x3f is the exception bits in these regs, 0x200 is the
10251 @@ -1008,7 +1041,7 @@ asmlinkage void bad_intr(void)
10253 asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
10255 - void __user *rip = (void __user *)(regs->rip);
10256 + void __user *ip = (void __user *)(regs->ip);
10257 struct task_struct * task;
10259 unsigned short mxcsr;
10260 @@ -1028,7 +1061,7 @@ asmlinkage void do_simd_coprocessor_erro
10261 info.si_signo = SIGFPE;
10263 info.si_code = __SI_FAULT;
10264 - info.si_addr = rip;
10265 + info.si_addr = ip;
10267 * The SIMD FPU exceptions are handled a little differently, as there
10268 * is only a single status/control register. Thus, to determine which
10269 @@ -1092,13 +1125,14 @@ asmlinkage void math_state_restore(void)
10270 task_thread_info(me)->status |= TS_USEDFPU;
10273 +EXPORT_SYMBOL_GPL(math_state_restore);
10277 * NB. All these are "interrupt gates" (i.e. events_mask is set) because we
10278 * specify <dpl>|4 in the second field.
10280 -static trap_info_t __cpuinitdata trap_table[] = {
10281 +static const trap_info_t __cpuinitconst trap_table[] = {
10282 { 0, 0|4, __KERNEL_CS, (unsigned long)divide_error },
10283 { 1, 0|4, __KERNEL_CS, (unsigned long)debug },
10284 { 3, 3|4, __KERNEL_CS, (unsigned long)int3 },
10285 @@ -1169,3 +1203,14 @@ static int __init kstack_setup(char *s)
10288 early_param("kstack", kstack_setup);
10291 +static int __init code_bytes_setup(char *s)
10293 + code_bytes = simple_strtoul(s, NULL, 0);
10294 + if (code_bytes > 8192)
10295 + code_bytes = 8192;
10299 +__setup("code_bytes=", code_bytes_setup);
10300 --- sle11-2009-10-16.orig/arch/x86/kernel/vsyscall_64-xen.c 2009-02-16 16:18:36.000000000 +0100
10301 +++ sle11-2009-10-16/arch/x86/kernel/vsyscall_64-xen.c 2009-03-16 16:33:40.000000000 +0100
10303 #include <asm/vgtod.h>
10305 #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
10306 -#define __syscall_clobber "r11","rcx","memory"
10307 -#define __pa_vsymbol(x) \
10308 - ({unsigned long v; \
10309 - extern char __vsyscall_0; \
10310 - asm("" : "=r" (v) : "0" (x)); \
10311 - ((v - VSYSCALL_START) + __pa_symbol(&__vsyscall_0)); })
10312 +#define __syscall_clobber "r11","cx","memory"
10315 * vsyscall_gtod_data contains data that is :
10316 @@ -102,7 +97,7 @@ static __always_inline void do_get_tz(st
10317 static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
10320 - asm volatile("vsysc2: syscall"
10321 + asm volatile("syscall"
10323 : "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
10324 : __syscall_clobber );
10325 @@ -112,7 +107,7 @@ static __always_inline int gettimeofday(
10326 static __always_inline long time_syscall(long *t)
10329 - asm volatile("vsysc1: syscall"
10330 + asm volatile("syscall"
10332 : "0" (__NR_time),"D" (t) : __syscall_clobber);
10334 @@ -190,7 +185,7 @@ time_t __vsyscall(1) vtime(time_t *t)
10336 vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
10338 - unsigned int dummy, p;
10340 unsigned long j = 0;
10342 /* Fast cache - only recompute value once per jiffies and avoid
10343 @@ -205,7 +200,7 @@ vgetcpu(unsigned *cpu, unsigned *node, s
10344 p = tcache->blob[1];
10345 } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
10346 /* Load per CPU data from RDTSCP */
10347 - rdtscp(dummy, dummy, p);
10348 + native_read_tscp(&p);
10350 /* Load per CPU data from GDT */
10351 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
10352 @@ -228,42 +223,11 @@ long __vsyscall(3) venosys_1(void)
10354 #ifdef CONFIG_SYSCTL
10356 -#define SYSCALL 0x050f
10357 -#define NOP2 0x9090
10360 - * NOP out syscall in vsyscall page when not needed.
10362 -static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
10363 - void __user *buffer, size_t *lenp, loff_t *ppos)
10365 +vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
10366 + void __user *buffer, size_t *lenp, loff_t *ppos)
10368 - extern u16 vsysc1, vsysc2;
10369 - u16 __iomem *map1;
10370 - u16 __iomem *map2;
10371 - int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
10374 - /* gcc has some trouble with __va(__pa()), so just do it this
10376 - map1 = ioremap(__pa_vsymbol(&vsysc1), 2);
10379 - map2 = ioremap(__pa_vsymbol(&vsysc2), 2);
10384 - if (!vsyscall_gtod_data.sysctl_enabled) {
10385 - writew(SYSCALL, map1);
10386 - writew(SYSCALL, map2);
10388 - writew(NOP2, map1);
10389 - writew(NOP2, map2);
10395 + return proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
10398 static ctl_table kernel_table2[] = {
10399 @@ -279,7 +243,6 @@ static ctl_table kernel_root_table2[] =
10400 .child = kernel_table2 },
10406 /* Assume __initcall executes before all user space. Hopefully kmod
10407 @@ -301,7 +264,7 @@ static void __cpuinit vsyscall_set_cpu(i
10409 d |= (node & 0xf) << 12;
10410 d |= (node >> 4) << 48;
10411 - if (HYPERVISOR_update_descriptor(virt_to_machine(cpu_gdt(cpu)
10412 + if (HYPERVISOR_update_descriptor(virt_to_machine(get_cpu_gdt_table(cpu)
10413 + GDT_ENTRY_PER_CPU),
10416 @@ -322,7 +285,7 @@ cpu_vsyscall_notifier(struct notifier_bl
10417 return NOTIFY_DONE;
10420 -static void __init map_vsyscall(void)
10421 +void __init map_vsyscall(void)
10423 extern char __vsyscall_0;
10424 unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
10425 @@ -338,7 +301,6 @@ static int __init vsyscall_init(void)
10426 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
10427 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
10428 BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
10431 vsyscall_gtod_data.sysctl_enabled = 0; /* disable vgettimeofay() */
10432 if (boot_cpu_has(X86_FEATURE_RDTSCP))
10433 --- sle11-2009-10-16.orig/arch/x86/kernel/xen_entry_64.S 2009-10-28 14:55:04.000000000 +0100
10434 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
10437 - * Copied from arch/xen/i386/kernel/entry.S
10439 -/* Offsets into shared_info_t. */
10440 -#define evtchn_upcall_pending /* 0 */
10441 -#define evtchn_upcall_mask 1
10443 -#define sizeof_vcpu_shift 6
10446 -//#define preempt_disable(reg) incl threadinfo_preempt_count(reg)
10447 -//#define preempt_enable(reg) decl threadinfo_preempt_count(reg)
10448 -#define preempt_disable(reg)
10449 -#define preempt_enable(reg)
10450 -#define XEN_GET_VCPU_INFO(reg) preempt_disable(%rbp) ; \
10451 - movq %gs:pda_cpunumber,reg ; \
10453 - shr $32-sizeof_vcpu_shift,reg ; \
10454 - addq HYPERVISOR_shared_info,reg
10455 -#define XEN_PUT_VCPU_INFO(reg) preempt_enable(%rbp) ; \
10456 -#define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff
10458 -#define XEN_GET_VCPU_INFO(reg) movq HYPERVISOR_shared_info,reg
10459 -#define XEN_PUT_VCPU_INFO(reg)
10460 -#define XEN_PUT_VCPU_INFO_fixup
10463 -#define XEN_LOCKED_BLOCK_EVENTS(reg) movb $1,evtchn_upcall_mask(reg)
10464 -#define XEN_LOCKED_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg)
10465 -#define XEN_BLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \
10466 - XEN_LOCKED_BLOCK_EVENTS(reg) ; \
10467 - XEN_PUT_VCPU_INFO(reg)
10468 -#define XEN_UNBLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \
10469 - XEN_LOCKED_UNBLOCK_EVENTS(reg) ; \
10470 - XEN_PUT_VCPU_INFO(reg)
10471 -#define XEN_TEST_PENDING(reg) testb $0xFF,evtchn_upcall_pending(reg)
10472 --- sle11-2009-10-16.orig/arch/x86/mach-xen/setup.c 2009-02-16 16:17:21.000000000 +0100
10473 +++ sle11-2009-10-16/arch/x86/mach-xen/setup.c 2009-03-16 16:33:40.000000000 +0100
10474 @@ -161,15 +161,12 @@ void __init machine_specific_arch_setup(
10476 /* Do an early initialization of the fixmap area */
10478 - extern pte_t swapper_pg_pmd[PTRS_PER_PTE];
10479 + extern pte_t swapper_pg_fixmap[PTRS_PER_PTE];
10480 unsigned long addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
10481 - pgd_t *pgd = (pgd_t *)xen_start_info->pt_base;
10482 - pud_t *pud = pud_offset(pgd + pgd_index(addr), addr);
10483 + pud_t *pud = pud_offset(swapper_pg_dir + pgd_index(addr), addr);
10484 pmd_t *pmd = pmd_offset(pud, addr);
10486 - swapper_pg_dir = pgd;
10487 - init_mm.pgd = pgd;
10488 - make_lowmem_page_readonly(swapper_pg_pmd, XENFEAT_writable_page_tables);
10489 - set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_pmd) | _PAGE_TABLE));
10490 + make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables);
10491 + set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE));
10494 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
10495 +++ sle11-2009-10-16/arch/x86/mm/fault-xen.c 2009-03-16 16:33:40.000000000 +0100
10498 + * Copyright (C) 1995 Linus Torvalds
10499 + * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
10502 +#include <linux/signal.h>
10503 +#include <linux/sched.h>
10504 +#include <linux/kernel.h>
10505 +#include <linux/errno.h>
10506 +#include <linux/string.h>
10507 +#include <linux/types.h>
10508 +#include <linux/ptrace.h>
10509 +#include <linux/mman.h>
10510 +#include <linux/mm.h>
10511 +#include <linux/smp.h>
10512 +#include <linux/interrupt.h>
10513 +#include <linux/init.h>
10514 +#include <linux/tty.h>
10515 +#include <linux/vt_kern.h> /* For unblank_screen() */
10516 +#include <linux/compiler.h>
10517 +#include <linux/highmem.h>
10518 +#include <linux/bootmem.h> /* for max_low_pfn */
10519 +#include <linux/vmalloc.h>
10520 +#include <linux/module.h>
10521 +#include <linux/kprobes.h>
10522 +#include <linux/uaccess.h>
10523 +#include <linux/kdebug.h>
10525 +#include <asm/system.h>
10526 +#include <asm/desc.h>
10527 +#include <asm/segment.h>
10528 +#include <asm/pgalloc.h>
10529 +#include <asm/smp.h>
10530 +#include <asm/tlbflush.h>
10531 +#include <asm/proto.h>
10532 +#include <asm-generic/sections.h>
10535 + * Page fault error code bits
10536 + * bit 0 == 0 means no page found, 1 means protection fault
10537 + * bit 1 == 0 means read, 1 means write
10538 + * bit 2 == 0 means kernel, 1 means user-mode
10539 + * bit 3 == 1 means use of reserved bit detected
10540 + * bit 4 == 1 means fault was an instruction fetch
10542 +#define PF_PROT (1<<0)
10543 +#define PF_WRITE (1<<1)
10544 +#define PF_USER (1<<2)
10545 +#define PF_RSVD (1<<3)
10546 +#define PF_INSTR (1<<4)
10548 +static inline int notify_page_fault(struct pt_regs *regs)
10550 +#ifdef CONFIG_KPROBES
10553 + /* kprobe_running() needs smp_processor_id() */
10554 +#ifdef CONFIG_X86_32
10555 + if (!user_mode_vm(regs)) {
10557 + if (!user_mode(regs)) {
10559 + preempt_disable();
10560 + if (kprobe_running() && kprobe_fault_handler(regs, 14))
10562 + preempt_enable();
10573 + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
10574 + * Check that here and ignore it.
10577 + * Sometimes the CPU reports invalid exceptions on prefetch.
10578 + * Check that here and ignore it.
10580 + * Opcode checker based on code by Richard Brunner
10582 +static int is_prefetch(struct pt_regs *regs, unsigned long addr,
10583 + unsigned long error_code)
10585 + unsigned char *instr;
10586 + int scan_more = 1;
10587 + int prefetch = 0;
10588 + unsigned char *max_instr;
10591 + * If it was a exec (instruction fetch) fault on NX page, then
10592 + * do not ignore the fault:
10594 + if (error_code & PF_INSTR)
10597 + instr = (unsigned char *)convert_ip_to_linear(current, regs);
10598 + max_instr = instr + 15;
10600 + if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
10603 + while (scan_more && instr < max_instr) {
10604 + unsigned char opcode;
10605 + unsigned char instr_hi;
10606 + unsigned char instr_lo;
10608 + if (probe_kernel_address(instr, opcode))
10611 + instr_hi = opcode & 0xf0;
10612 + instr_lo = opcode & 0x0f;
10615 + switch (instr_hi) {
10619 + * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
10620 + * In X86_64 long mode, the CPU will signal invalid
10621 + * opcode if some of these prefixes are present so
10622 + * X86_64 will never get here anyway
10624 + scan_more = ((instr_lo & 7) == 0x6);
10626 +#ifdef CONFIG_X86_64
10629 + * In AMD64 long mode 0x40..0x4F are valid REX prefixes
10630 + * Need to figure out under what instruction mode the
10631 + * instruction was issued. Could check the LDT for lm,
10632 + * but for now it's good enough to assume that long
10633 + * mode only uses well known segments or kernel.
10635 + scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
10639 + /* 0x64 thru 0x67 are valid prefixes in all modes. */
10640 + scan_more = (instr_lo & 0xC) == 0x4;
10643 + /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
10644 + scan_more = !instr_lo || (instr_lo>>1) == 1;
10647 + /* Prefetch instruction is 0x0F0D or 0x0F18 */
10650 + if (probe_kernel_address(instr, opcode))
10652 + prefetch = (instr_lo == 0xF) &&
10653 + (opcode == 0x0D || opcode == 0x18);
10663 +static void force_sig_info_fault(int si_signo, int si_code,
10664 + unsigned long address, struct task_struct *tsk)
10668 + info.si_signo = si_signo;
10669 + info.si_errno = 0;
10670 + info.si_code = si_code;
10671 + info.si_addr = (void __user *)address;
10672 + force_sig_info(si_signo, &info, tsk);
10675 +#ifdef CONFIG_X86_64
10676 +static int bad_address(void *p)
10678 + unsigned long dummy;
10679 + return probe_kernel_address((unsigned long *)p, dummy);
10683 +static void dump_pagetable(unsigned long address)
10685 +#ifdef CONFIG_X86_32
10686 + __typeof__(pte_val(__pte(0))) page;
10688 + page = read_cr3();
10689 + page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
10690 +#ifdef CONFIG_X86_PAE
10691 + printk("*pdpt = %016Lx ", page);
10692 + if ((page & _PAGE_PRESENT)
10693 + && mfn_to_local_pfn(page >> PAGE_SHIFT) < max_low_pfn) {
10694 + page = mfn_to_pfn(page >> PAGE_SHIFT);
10695 + page <<= PAGE_SHIFT;
10696 + page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
10697 + & (PTRS_PER_PMD - 1)];
10698 + printk(KERN_CONT "*pde = %016Lx ", page);
10699 + page &= ~_PAGE_NX;
10702 + printk("*pde = %08lx ", page);
10706 + * We must not directly access the pte in the highpte
10707 + * case if the page table is located in highmem.
10708 + * And let's rather not kmap-atomic the pte, just in case
10709 + * it's allocated already.
10711 + if ((page & _PAGE_PRESENT)
10712 + && mfn_to_local_pfn(page >> PAGE_SHIFT) < max_low_pfn
10713 + && !(page & _PAGE_PSE)) {
10714 + page = mfn_to_pfn(page >> PAGE_SHIFT);
10715 + page <<= PAGE_SHIFT;
10716 + page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
10717 + & (PTRS_PER_PTE - 1)];
10718 + printk(KERN_CONT "*pte = %0*Lx ", sizeof(page)*2, (u64)page);
10721 + printk(KERN_CONT "\n");
10722 +#else /* CONFIG_X86_64 */
10728 + pgd = (pgd_t *)read_cr3();
10730 + pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
10731 + pgd += pgd_index(address);
10732 + if (bad_address(pgd)) goto bad;
10733 + printk("PGD %lx ", pgd_val(*pgd));
10734 + if (!pgd_present(*pgd)) goto ret;
10736 + pud = pud_offset(pgd, address);
10737 + if (bad_address(pud)) goto bad;
10738 + printk(KERN_CONT "PUD %lx ", pud_val(*pud));
10739 + if (!pud_present(*pud) || pud_large(*pud))
10742 + pmd = pmd_offset(pud, address);
10743 + if (bad_address(pmd)) goto bad;
10744 + printk(KERN_CONT "PMD %lx ", pmd_val(*pmd));
10745 + if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
10747 + pte = pte_offset_kernel(pmd, address);
10748 + if (bad_address(pte)) goto bad;
10749 + printk(KERN_CONT "PTE %lx", pte_val(*pte));
10751 + printk(KERN_CONT "\n");
10758 +#ifdef CONFIG_X86_32
10759 +static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
10761 + unsigned index = pgd_index(address);
10763 + pud_t *pud, *pud_k;
10764 + pmd_t *pmd, *pmd_k;
10767 + pgd_k = init_mm.pgd + index;
10769 + if (!pgd_present(*pgd_k))
10773 + * set_pgd(pgd, *pgd_k); here would be useless on PAE
10774 + * and redundant with the set_pmd() on non-PAE. As would
10778 + pud = pud_offset(pgd, address);
10779 + pud_k = pud_offset(pgd_k, address);
10780 + if (!pud_present(*pud_k))
10783 + pmd = pmd_offset(pud, address);
10784 + pmd_k = pmd_offset(pud_k, address);
10785 + if (!pmd_present(*pmd_k))
10787 + if (!pmd_present(*pmd)) {
10788 + bool lazy = x86_read_percpu(xen_lazy_mmu);
10790 + x86_write_percpu(xen_lazy_mmu, false);
10791 +#if CONFIG_XEN_COMPAT > 0x030002
10792 + set_pmd(pmd, *pmd_k);
10795 + * When running on older Xen we must launder *pmd_k through
10796 + * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
10798 + set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
10800 + x86_write_percpu(xen_lazy_mmu, lazy);
10802 + BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
10807 +#ifdef CONFIG_X86_64
10808 +static const char errata93_warning[] =
10809 +KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
10810 +KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
10811 +KERN_ERR "******* Please consider a BIOS update.\n"
10812 +KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
10815 +/* Workaround for K8 erratum #93 & buggy BIOS.
10816 + BIOS SMM functions are required to use a specific workaround
10817 + to avoid corruption of the 64bit RIP register on C stepping K8.
10818 + A lot of BIOS that didn't get tested properly miss this.
10819 + The OS sees this as a page fault with the upper 32bits of RIP cleared.
10820 + Try to work around it here.
10821 + Note we only handle faults in kernel here.
10822 + Does nothing for X86_32
10824 +static int is_errata93(struct pt_regs *regs, unsigned long address)
10826 +#ifdef CONFIG_X86_64
10827 + static int warned;
10828 + if (address != regs->ip)
10830 + if ((address >> 32) != 0)
10832 + address |= 0xffffffffUL << 32;
10833 + if ((address >= (u64)_stext && address <= (u64)_etext) ||
10834 + (address >= MODULES_VADDR && address <= MODULES_END)) {
10836 + printk(errata93_warning);
10839 + regs->ip = address;
10847 + * Work around K8 erratum #100: K8 in compat mode occasionally jumps to illegal
10848 + * addresses >4GB. We catch this in the page fault handler because these
10849 + * addresses are not reachable. Just detect this case and return. Any code
10850 + * segment in LDT is compatibility mode.
10852 +static int is_errata100(struct pt_regs *regs, unsigned long address)
10854 +#ifdef CONFIG_X86_64
10855 + if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
10862 +void do_invalid_op(struct pt_regs *, unsigned long);
10864 +static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
10866 +#ifdef CONFIG_X86_F00F_BUG
10867 + unsigned long nr;
10869 + * Pentium F0 0F C7 C8 bug workaround.
10871 + if (boot_cpu_data.f00f_bug) {
10872 + nr = (address - idt_descr.address) >> 3;
10875 + do_invalid_op(regs, 0);
10883 +static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
10884 + unsigned long address)
10886 +#ifdef CONFIG_X86_32
10887 + if (!oops_may_print())
10891 +#ifdef CONFIG_X86_PAE
10892 + if (error_code & PF_INSTR) {
10893 + unsigned int level;
10894 + pte_t *pte = lookup_address(address, &level);
10896 + if (pte && pte_present(*pte) && !pte_exec(*pte))
10897 + printk(KERN_CRIT "kernel tried to execute "
10898 + "NX-protected page - exploit attempt? "
10899 + "(uid: %d)\n", current->uid);
10903 + printk(KERN_ALERT "BUG: unable to handle kernel ");
10904 + if (address < PAGE_SIZE)
10905 + printk(KERN_CONT "NULL pointer dereference");
10907 + printk(KERN_CONT "paging request");
10908 +#ifdef CONFIG_X86_32
10909 + printk(KERN_CONT " at %08lx\n", address);
10911 + printk(KERN_CONT " at %016lx\n", address);
10913 + printk(KERN_ALERT "IP:");
10914 + printk_address(regs->ip, 1);
10915 + dump_pagetable(address);
10918 +#ifdef CONFIG_X86_64
10919 +static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
10920 + unsigned long error_code)
10922 + unsigned long flags = oops_begin();
10923 + struct task_struct *tsk;
10925 + printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
10926 + current->comm, address);
10927 + dump_pagetable(address);
10929 + tsk->thread.cr2 = address;
10930 + tsk->thread.trap_no = 14;
10931 + tsk->thread.error_code = error_code;
10932 + if (__die("Bad pagetable", regs, error_code))
10934 + oops_end(flags, regs, SIGKILL);
10938 +static int spurious_fault_check(unsigned long error_code, pte_t *pte)
10940 + if ((error_code & PF_WRITE) && !pte_write(*pte))
10942 + if ((error_code & PF_INSTR) && !pte_exec(*pte))
10949 + * Handle a spurious fault caused by a stale TLB entry. This allows
10950 + * us to lazily refresh the TLB when increasing the permissions of a
10951 + * kernel page (RO -> RW or NX -> X). Doing it eagerly is very
10952 + * expensive since that implies doing a full cross-processor TLB
10953 + * flush, even if no stale TLB entries exist on other processors.
10954 + * There are no security implications to leaving a stale TLB when
10955 + * increasing the permissions on a page.
10957 +static int spurious_fault(unsigned long address,
10958 + unsigned long error_code)
10965 + /* Reserved-bit violation or user access to kernel space? */
10966 + if (error_code & (PF_USER | PF_RSVD))
10969 + pgd = init_mm.pgd + pgd_index(address);
10970 + if (!pgd_present(*pgd))
10973 + pud = pud_offset(pgd, address);
10974 + if (!pud_present(*pud))
10977 + if (pud_large(*pud))
10978 + return spurious_fault_check(error_code, (pte_t *) pud);
10980 + pmd = pmd_offset(pud, address);
10981 + if (!pmd_present(*pmd))
10984 + if (pmd_large(*pmd))
10985 + return spurious_fault_check(error_code, (pte_t *) pmd);
10987 + pte = pte_offset_kernel(pmd, address);
10988 + if (!pte_present(*pte))
10991 + return spurious_fault_check(error_code, pte);
10996 + * Handle a fault on the vmalloc or module mapping area
10999 + * Handle a fault on the vmalloc area
11001 + * This assumes no large pages in there.
11003 +static int vmalloc_fault(unsigned long address)
11005 +#ifdef CONFIG_X86_32
11006 + unsigned long pgd_paddr;
11010 + * Synchronize this task's top level page-table
11011 + * with the 'reference' page table.
11013 + * Do _not_ use "current" here. We might be inside
11014 + * an interrupt in the middle of a task switch..
11016 + pgd_paddr = read_cr3();
11017 + pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
11020 + pte_k = pte_offset_kernel(pmd_k, address);
11021 + if (!pte_present(*pte_k))
11025 + pgd_t *pgd, *pgd_ref;
11026 + pud_t *pud, *pud_ref;
11027 + pmd_t *pmd, *pmd_ref;
11028 + pte_t *pte, *pte_ref;
11030 + /* Make sure we are in vmalloc area */
11031 + if (!(address >= VMALLOC_START && address < VMALLOC_END))
11034 + /* Copy kernel mappings over when needed. This can also
11035 + happen within a race in page table update. In the latter
11036 + case just flush. */
11038 + /* On Xen the line below does not always work. Needs investigating! */
11039 + /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
11040 + pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
11041 + pgd += pgd_index(address);
11042 + pgd_ref = pgd_offset_k(address);
11043 + if (pgd_none(*pgd_ref))
11045 + if (pgd_none(*pgd))
11046 + set_pgd(pgd, *pgd_ref);
11048 + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
11050 + /* Below here mismatches are bugs because these lower tables
11053 + pud = pud_offset(pgd, address);
11054 + pud_ref = pud_offset(pgd_ref, address);
11055 + if (pud_none(*pud_ref))
11057 + if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
11059 + pmd = pmd_offset(pud, address);
11060 + pmd_ref = pmd_offset(pud_ref, address);
11061 + if (pmd_none(*pmd_ref))
11063 + if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
11065 + pte_ref = pte_offset_kernel(pmd_ref, address);
11066 + if (!pte_present(*pte_ref))
11068 + pte = pte_offset_kernel(pmd, address);
11069 + /* Don't use pte_page here, because the mappings can point
11070 + outside mem_map, and the NUMA hash lookup cannot handle
11072 + if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
11078 +int show_unhandled_signals = 1;
11081 + * This routine handles page faults. It determines the address,
11082 + * and the problem, and then passes it off to one of the appropriate
11085 +#ifdef CONFIG_X86_64
11088 +void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
11090 + struct task_struct *tsk;
11091 + struct mm_struct *mm;
11092 + struct vm_area_struct *vma;
11093 + unsigned long address;
11094 + int write, si_code;
11096 +#ifdef CONFIG_X86_64
11097 + unsigned long flags;
11101 + * We can fault from pretty much anywhere, with unknown IRQ state.
11103 + trace_hardirqs_fixup();
11105 + /* Set the "privileged fault" bit to something sane. */
11106 + if (user_mode_vm(regs))
11107 + error_code |= PF_USER;
11109 + error_code &= ~PF_USER;
11113 + prefetchw(&mm->mmap_sem);
11115 + /* get the address */
11116 + address = read_cr2();
11118 + si_code = SEGV_MAPERR;
11120 + if (notify_page_fault(regs))
11124 + * We fault-in kernel-space virtual memory on-demand. The
11125 + * 'reference' page table is init_mm.pgd.
11127 + * NOTE! We MUST NOT take any locks for this case. We may
11128 + * be in an interrupt or a critical region, and should
11129 + * only copy the information from the master page table,
11132 + * This verifies that the fault happens in kernel space
11133 + * (error_code & 4) == 0, and that the fault was not a
11134 + * protection error (error_code & 9) == 0.
11136 +#ifdef CONFIG_X86_32
11137 + if (unlikely(address >= TASK_SIZE)) {
11139 + if (unlikely(address >= TASK_SIZE64)) {
11141 + /* Faults in hypervisor area can never be patched up. */
11142 +#if defined(CONFIG_X86_XEN)
11143 + if (address >= hypervisor_virt_start)
11144 + goto bad_area_nosemaphore;
11145 +#elif defined(CONFIG_X86_64_XEN)
11146 + if (address >= HYPERVISOR_VIRT_START
11147 + && address < HYPERVISOR_VIRT_END)
11148 + goto bad_area_nosemaphore;
11150 + if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
11151 + vmalloc_fault(address) >= 0)
11154 + /* Can handle a stale RO->RW TLB */
11155 + if (spurious_fault(address, error_code))
11159 + * Don't take the mm semaphore here. If we fixup a prefetch
11160 + * fault we could otherwise deadlock.
11162 + goto bad_area_nosemaphore;
11166 +#ifdef CONFIG_X86_32
11167 + /* It's safe to allow irqs after cr2 has been saved and the vmalloc
11168 + fault has been handled. */
11169 + if (regs->flags & (X86_EFLAGS_IF|VM_MASK))
11170 + local_irq_enable();
11173 + * If we're in an interrupt, have no user context or are running in an
11174 + * atomic region then we must not take the fault.
11176 + if (in_atomic() || !mm)
11177 + goto bad_area_nosemaphore;
11178 +#else /* CONFIG_X86_64 */
11179 + if (likely(regs->flags & X86_EFLAGS_IF))
11180 + local_irq_enable();
11182 + if (unlikely(error_code & PF_RSVD))
11183 + pgtable_bad(address, regs, error_code);
11186 + * If we're in an interrupt, have no user context or are running in an
11187 + * atomic region then we must not take the fault.
11189 + if (unlikely(in_atomic() || !mm))
11190 + goto bad_area_nosemaphore;
11193 + * User-mode registers count as a user access even for any
11194 + * potential system fault or CPU buglet.
11196 + if (user_mode_vm(regs))
11197 + error_code |= PF_USER;
11200 + /* When running in the kernel we expect faults to occur only to
11201 + * addresses in user space. All other faults represent errors in the
11202 + * kernel and should generate an OOPS. Unfortunately, in the case of an
11203 + * erroneous fault occurring in a code path which already holds mmap_sem
11204 + * we will deadlock attempting to validate the fault against the
11205 + * address space. Luckily the kernel only validly references user
11206 + * space from well defined areas of code, which are listed in the
11207 + * exceptions table.
11209 + * As the vast majority of faults will be valid we will only perform
11210 + * the source reference check when there is a possibility of a deadlock.
11211 + * Attempt to lock the address space, if we cannot we then validate the
11212 + * source. If this is invalid we can skip the address space check,
11213 + * thus avoiding the deadlock.
11215 + if (!down_read_trylock(&mm->mmap_sem)) {
11216 + if ((error_code & PF_USER) == 0 &&
11217 + !search_exception_tables(regs->ip))
11218 + goto bad_area_nosemaphore;
11219 + down_read(&mm->mmap_sem);
11222 + vma = find_vma(mm, address);
11225 + if (vma->vm_start <= address)
11227 + if (!(vma->vm_flags & VM_GROWSDOWN))
11229 + if (error_code & PF_USER) {
11231 + * Accessing the stack below %sp is always a bug.
11232 + * The large cushion allows instructions like enter
11233 + * and pusha to work. ("enter $65535,$31" pushes
11234 + * 32 pointers and then decrements %sp by 65535.)
11236 + if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
11239 + if (expand_stack(vma, address))
11242 + * Ok, we have a good vm_area for this memory access, so
11243 + * we can handle it..
11246 + si_code = SEGV_ACCERR;
11248 + switch (error_code & (PF_PROT|PF_WRITE)) {
11249 + default: /* 3: write, present */
11250 + /* fall through */
11251 + case PF_WRITE: /* write, not present */
11252 + if (!(vma->vm_flags & VM_WRITE))
11256 + case PF_PROT: /* read, present */
11258 + case 0: /* read, not present */
11259 + if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
11263 +#ifdef CONFIG_X86_32
11267 + * If for any reason at all we couldn't handle the fault,
11268 + * make sure we exit gracefully rather than endlessly redo
11271 + fault = handle_mm_fault(mm, vma, address, write);
11272 + if (unlikely(fault & VM_FAULT_ERROR)) {
11273 + if (fault & VM_FAULT_OOM)
11274 + goto out_of_memory;
11275 + else if (fault & VM_FAULT_SIGBUS)
11279 + if (fault & VM_FAULT_MAJOR)
11284 +#ifdef CONFIG_X86_32
11286 + * Did it hit the DOS screen memory VA from vm86 mode?
11288 + if (v8086_mode(regs)) {
11289 + unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
11291 + tsk->thread.screen_bitmap |= 1 << bit;
11294 + up_read(&mm->mmap_sem);
11298 + * Something tried to access memory that isn't in our memory map..
11299 + * Fix it, but check if it's kernel or user first..
11302 + up_read(&mm->mmap_sem);
11304 +bad_area_nosemaphore:
11305 + /* User mode accesses just cause a SIGSEGV */
11306 + if (error_code & PF_USER) {
11308 + * It's possible to have interrupts off here.
11310 + local_irq_enable();
11313 + * Valid to do another page fault here because this one came
11314 + * from user space.
11316 + if (is_prefetch(regs, address, error_code))
11319 + if (is_errata100(regs, address))
11322 + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
11323 + printk_ratelimit()) {
11325 +#ifdef CONFIG_X86_32
11326 + "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
11328 + "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
11330 + task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
11331 + tsk->comm, task_pid_nr(tsk), address, regs->ip,
11332 + regs->sp, error_code);
11333 + print_vma_addr(" in ", regs->ip);
11337 + tsk->thread.cr2 = address;
11338 + /* Kernel addresses are always protection faults */
11339 + tsk->thread.error_code = error_code | (address >= TASK_SIZE);
11340 + tsk->thread.trap_no = 14;
11341 + force_sig_info_fault(SIGSEGV, si_code, address, tsk);
11345 + if (is_f00f_bug(regs, address))
11349 + /* Are we prepared to handle this kernel fault? */
11350 + if (fixup_exception(regs))
11355 + * Valid to do another page fault here, because if this fault
11356 + * had been triggered by is_prefetch fixup_exception would have
11360 + * Hall of shame of CPU/BIOS bugs.
11362 + if (is_prefetch(regs, address, error_code))
11365 + if (is_errata93(regs, address))
11369 + * Oops. The kernel tried to access some bad page. We'll have to
11370 + * terminate things with extreme prejudice.
11372 +#ifdef CONFIG_X86_32
11373 + bust_spinlocks(1);
11375 + flags = oops_begin();
11378 + show_fault_oops(regs, error_code, address);
11380 + tsk->thread.cr2 = address;
11381 + tsk->thread.trap_no = 14;
11382 + tsk->thread.error_code = error_code;
11384 +#ifdef CONFIG_X86_32
11385 + die("Oops", regs, error_code);
11386 + bust_spinlocks(0);
11387 + do_exit(SIGKILL);
11389 + if (__die("Oops", regs, error_code))
11391 + /* Executive summary in case the body of the oops scrolled away */
11392 + printk(KERN_EMERG "CR2: %016lx\n", address);
11393 + oops_end(flags, regs, SIGKILL);
11397 + * We ran out of memory, or some other thing happened to us that made
11398 + * us unable to handle the page fault gracefully.
11401 + up_read(&mm->mmap_sem);
11402 + if (is_global_init(tsk)) {
11404 +#ifdef CONFIG_X86_32
11405 + down_read(&mm->mmap_sem);
11412 + printk("VM: killing process %s\n", tsk->comm);
11413 + if (error_code & PF_USER)
11414 + do_group_exit(SIGKILL);
11418 + up_read(&mm->mmap_sem);
11420 + /* Kernel mode? Handle exceptions or die */
11421 + if (!(error_code & PF_USER))
11423 +#ifdef CONFIG_X86_32
11424 + /* User space => ok to do another page fault */
11425 + if (is_prefetch(regs, address, error_code))
11428 + tsk->thread.cr2 = address;
11429 + tsk->thread.error_code = error_code;
11430 + tsk->thread.trap_no = 14;
11431 + force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
11434 +DEFINE_SPINLOCK(pgd_lock);
11435 +LIST_HEAD(pgd_list);
11437 +void vmalloc_sync_all(void)
11439 +#ifdef CONFIG_X86_32
11441 + * Note that races in the updates of insync and start aren't
11442 + * problematic: insync can only get set bits added, and updates to
11443 + * start are only improving performance (without affecting correctness
11445 + * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs.
11446 + * This change works just fine with 2-level paging too.
11448 +#define sync_index(a) ((a) >> PMD_SHIFT)
11449 + static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD);
11450 + static unsigned long start = TASK_SIZE;
11451 + unsigned long address;
11453 + if (SHARED_KERNEL_PMD)
11456 + BUILD_BUG_ON(TASK_SIZE & ~PMD_MASK);
11457 + for (address = start;
11458 + address < hypervisor_virt_start;
11459 + address += PMD_SIZE) {
11460 + if (!test_bit(sync_index(address), insync)) {
11461 + unsigned long flags;
11462 + struct page *page;
11464 + spin_lock_irqsave(&pgd_lock, flags);
11465 + /* XEN: failure path assumes non-empty pgd_list. */
11466 + if (unlikely(list_empty(&pgd_list))) {
11467 + spin_unlock_irqrestore(&pgd_lock, flags);
11470 + list_for_each_entry(page, &pgd_list, lru) {
11471 + if (!vmalloc_sync_one(page_address(page),
11475 + spin_unlock_irqrestore(&pgd_lock, flags);
11477 + set_bit(sync_index(address), insync);
11479 + if (address == start && test_bit(sync_index(address), insync))
11480 + start = address + PMD_SIZE;
11482 +#else /* CONFIG_X86_64 */
11484 + * Note that races in the updates of insync and start aren't
11485 + * problematic: insync can only get set bits added, and updates to
11486 + * start are only improving performance (without affecting correctness
11489 + static DECLARE_BITMAP(insync, PTRS_PER_PGD);
11490 + static unsigned long start = VMALLOC_START & PGDIR_MASK;
11491 + unsigned long address;
11493 + for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
11494 + if (!test_bit(pgd_index(address), insync)) {
11495 + const pgd_t *pgd_ref = pgd_offset_k(address);
11496 + unsigned long flags;
11497 + struct page *page;
11499 + if (pgd_none(*pgd_ref))
11501 + spin_lock_irqsave(&pgd_lock, flags);
11502 + list_for_each_entry(page, &pgd_list, lru) {
11504 + pgd = (pgd_t *)page_address(page) + pgd_index(address);
11505 + if (pgd_none(*pgd))
11506 + set_pgd(pgd, *pgd_ref);
11508 + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
11510 + spin_unlock_irqrestore(&pgd_lock, flags);
11511 + set_bit(pgd_index(address), insync);
11513 + if (address == start)
11514 + start = address + PGDIR_SIZE;
11516 + /* Check that there is no need to do the same for the modules area. */
11517 + BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
11518 + BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
11519 + (__START_KERNEL & PGDIR_MASK)));
11522 --- sle11-2009-10-16.orig/arch/x86/mm/fault_32-xen.c 2009-02-16 16:18:36.000000000 +0100
11523 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
11526 - * linux/arch/i386/mm/fault.c
11528 - * Copyright (C) 1995 Linus Torvalds
11531 -#include <linux/signal.h>
11532 -#include <linux/sched.h>
11533 -#include <linux/kernel.h>
11534 -#include <linux/errno.h>
11535 -#include <linux/string.h>
11536 -#include <linux/types.h>
11537 -#include <linux/ptrace.h>
11538 -#include <linux/mman.h>
11539 -#include <linux/mm.h>
11540 -#include <linux/smp.h>
11541 -#include <linux/interrupt.h>
11542 -#include <linux/init.h>
11543 -#include <linux/tty.h>
11544 -#include <linux/vt_kern.h> /* For unblank_screen() */
11545 -#include <linux/highmem.h>
11546 -#include <linux/bootmem.h> /* for max_low_pfn */
11547 -#include <linux/vmalloc.h>
11548 -#include <linux/module.h>
11549 -#include <linux/kprobes.h>
11550 -#include <linux/uaccess.h>
11551 -#include <linux/kdebug.h>
11552 -#include <linux/kprobes.h>
11554 -#include <asm/system.h>
11555 -#include <asm/desc.h>
11556 -#include <asm/segment.h>
11558 -extern void die(const char *,struct pt_regs *,long);
11560 -#ifdef CONFIG_KPROBES
11561 -static inline int notify_page_fault(struct pt_regs *regs)
11565 - /* kprobe_running() needs smp_processor_id() */
11566 - if (!user_mode_vm(regs)) {
11567 - preempt_disable();
11568 - if (kprobe_running() && kprobe_fault_handler(regs, 14))
11570 - preempt_enable();
11576 -static inline int notify_page_fault(struct pt_regs *regs)
11583 - * Return EIP plus the CS segment base. The segment limit is also
11584 - * adjusted, clamped to the kernel/user address space (whichever is
11585 - * appropriate), and returned in *eip_limit.
11587 - * The segment is checked, because it might have been changed by another
11588 - * task between the original faulting instruction and here.
11590 - * If CS is no longer a valid code segment, or if EIP is beyond the
11591 - * limit, or if it is a kernel address when CS is not a kernel segment,
11592 - * then the returned value will be greater than *eip_limit.
11594 - * This is slow, but is very rarely executed.
11596 -static inline unsigned long get_segment_eip(struct pt_regs *regs,
11597 - unsigned long *eip_limit)
11599 - unsigned long eip = regs->eip;
11600 - unsigned seg = regs->xcs & 0xffff;
11601 - u32 seg_ar, seg_limit, base, *desc;
11603 - /* Unlikely, but must come before segment checks. */
11604 - if (unlikely(regs->eflags & VM_MASK)) {
11606 - *eip_limit = base + 0xffff;
11607 - return base + (eip & 0xffff);
11610 - /* The standard kernel/user address space limit. */
11611 - *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;
11613 - /* By far the most common cases. */
11614 - if (likely(SEGMENT_IS_FLAT_CODE(seg)))
11617 - /* Check the segment exists, is within the current LDT/GDT size,
11618 - that kernel/user (ring 0..3) has the appropriate privilege,
11619 - that it's a code segment, and get the limit. */
11620 - __asm__ ("larl %3,%0; lsll %3,%1"
11621 - : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
11622 - if ((~seg_ar & 0x9800) || eip > seg_limit) {
11624 - return 1; /* So that returned eip > *eip_limit. */
11627 - /* Get the GDT/LDT descriptor base.
11628 - When you look for races in this code remember that
11629 - LDT and other horrors are only used in user space. */
11630 - if (seg & (1<<2)) {
11631 - /* Must lock the LDT while reading it. */
11632 - mutex_lock(¤t->mm->context.lock);
11633 - desc = current->mm->context.ldt;
11634 - desc = (void *)desc + (seg & ~7);
11636 - /* Must disable preemption while reading the GDT. */
11637 - desc = (u32 *)get_cpu_gdt_table(get_cpu());
11638 - desc = (void *)desc + (seg & ~7);
11641 - /* Decode the code segment base from the descriptor */
11642 - base = get_desc_base((unsigned long *)desc);
11644 - if (seg & (1<<2)) {
11645 - mutex_unlock(¤t->mm->context.lock);
11649 - /* Adjust EIP and segment limit, and clamp at the kernel limit.
11650 - It's legitimate for segments to wrap at 0xffffffff. */
11651 - seg_limit += base;
11652 - if (seg_limit < *eip_limit && seg_limit >= base)
11653 - *eip_limit = seg_limit;
11654 - return eip + base;
11658 - * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
11659 - * Check that here and ignore it.
11661 -static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
11663 - unsigned long limit;
11664 - unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit);
11665 - int scan_more = 1;
11666 - int prefetch = 0;
11669 - for (i = 0; scan_more && i < 15; i++) {
11670 - unsigned char opcode;
11671 - unsigned char instr_hi;
11672 - unsigned char instr_lo;
11674 - if (instr > (unsigned char *)limit)
11676 - if (probe_kernel_address(instr, opcode))
11679 - instr_hi = opcode & 0xf0;
11680 - instr_lo = opcode & 0x0f;
11683 - switch (instr_hi) {
11686 - /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
11687 - scan_more = ((instr_lo & 7) == 0x6);
11691 - /* 0x64 thru 0x67 are valid prefixes in all modes. */
11692 - scan_more = (instr_lo & 0xC) == 0x4;
11695 - /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
11696 - scan_more = !instr_lo || (instr_lo>>1) == 1;
11699 - /* Prefetch instruction is 0x0F0D or 0x0F18 */
11701 - if (instr > (unsigned char *)limit)
11703 - if (probe_kernel_address(instr, opcode))
11705 - prefetch = (instr_lo == 0xF) &&
11706 - (opcode == 0x0D || opcode == 0x18);
11716 -static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
11717 - unsigned long error_code)
11719 - if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
11720 - boot_cpu_data.x86 >= 6)) {
11721 - /* Catch an obscure case of prefetch inside an NX page. */
11722 - if (nx_enabled && (error_code & 16))
11724 - return __is_prefetch(regs, addr);
11729 -static noinline void force_sig_info_fault(int si_signo, int si_code,
11730 - unsigned long address, struct task_struct *tsk)
11734 - info.si_signo = si_signo;
11735 - info.si_errno = 0;
11736 - info.si_code = si_code;
11737 - info.si_addr = (void __user *)address;
11738 - force_sig_info(si_signo, &info, tsk);
11741 -fastcall void do_invalid_op(struct pt_regs *, unsigned long);
11743 -#ifdef CONFIG_X86_PAE
11744 -static void dump_fault_path(unsigned long address)
11746 - unsigned long *p, page;
11747 - unsigned long mfn;
11749 - page = read_cr3();
11750 - p = (unsigned long *)__va(page);
11751 - p += (address >> 30) * 2;
11752 - printk(KERN_ALERT "%08lx -> *pde = %08lx:%08lx\n", page, p[1], p[0]);
11753 - if (p[0] & _PAGE_PRESENT) {
11754 - mfn = (p[0] >> PAGE_SHIFT) | (p[1] << 20);
11755 - page = mfn_to_pfn(mfn) << PAGE_SHIFT;
11756 - p = (unsigned long *)__va(page);
11757 - address &= 0x3fffffff;
11758 - p += (address >> 21) * 2;
11759 - printk(KERN_ALERT "%08lx -> *pme = %08lx:%08lx\n",
11760 - page, p[1], p[0]);
11761 - mfn = (p[0] >> PAGE_SHIFT) | (p[1] << 20);
11762 -#ifdef CONFIG_HIGHPTE
11763 - if (mfn_to_pfn(mfn) >= highstart_pfn)
11766 - if ((p[0] & _PAGE_PRESENT) && !(p[0] & _PAGE_PSE)) {
11767 - page = mfn_to_pfn(mfn) << PAGE_SHIFT;
11768 - p = (unsigned long *) __va(page);
11769 - address &= 0x001fffff;
11770 - p += (address >> 12) * 2;
11771 - printk(KERN_ALERT "%08lx -> *pte = %08lx:%08lx\n",
11772 - page, p[1], p[0]);
11777 -static void dump_fault_path(unsigned long address)
11779 - unsigned long page;
11781 - page = read_cr3();
11782 - page = ((unsigned long *) __va(page))[address >> PGDIR_SHIFT];
11783 - printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
11784 - machine_to_phys(page));
11786 - * We must not directly access the pte in the highpte
11787 - * case if the page table is located in highmem.
11788 - * And lets rather not kmap-atomic the pte, just in case
11789 - * it's allocated already.
11791 - if ((machine_to_phys(page) >> PAGE_SHIFT) < max_low_pfn
11792 - && (page & _PAGE_PRESENT)
11793 - && !(page & _PAGE_PSE)) {
11794 - page = machine_to_phys(page & PAGE_MASK);
11795 - page = ((unsigned long *) __va(page))[(address >> PAGE_SHIFT)
11796 - & (PTRS_PER_PTE - 1)];
11797 - printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page,
11798 - machine_to_phys(page));
11803 -static int spurious_fault(struct pt_regs *regs,
11804 - unsigned long address,
11805 - unsigned long error_code)
11812 - /* Reserved-bit violation or user access to kernel space? */
11813 - if (error_code & 0x0c)
11816 - pgd = init_mm.pgd + pgd_index(address);
11817 - if (!pgd_present(*pgd))
11820 - pud = pud_offset(pgd, address);
11821 - if (!pud_present(*pud))
11824 - pmd = pmd_offset(pud, address);
11825 - if (!pmd_present(*pmd))
11828 - pte = pte_offset_kernel(pmd, address);
11829 - if (!pte_present(*pte))
11831 - if ((error_code & 0x02) && !pte_write(*pte))
11833 -#ifdef CONFIG_X86_PAE
11834 - if ((error_code & 0x10) && (__pte_val(*pte) & _PAGE_NX))
11841 -static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
11843 - unsigned index = pgd_index(address);
11845 - pud_t *pud, *pud_k;
11846 - pmd_t *pmd, *pmd_k;
11849 - pgd_k = init_mm.pgd + index;
11851 - if (!pgd_present(*pgd_k))
11855 - * set_pgd(pgd, *pgd_k); here would be useless on PAE
11856 - * and redundant with the set_pmd() on non-PAE. As would
11860 - pud = pud_offset(pgd, address);
11861 - pud_k = pud_offset(pgd_k, address);
11862 - if (!pud_present(*pud_k))
11865 - pmd = pmd_offset(pud, address);
11866 - pmd_k = pmd_offset(pud_k, address);
11867 - if (!pmd_present(*pmd_k))
11869 - if (!pmd_present(*pmd)) {
11870 - bool lazy = x86_read_percpu(xen_lazy_mmu);
11872 - x86_write_percpu(xen_lazy_mmu, false);
11873 -#if CONFIG_XEN_COMPAT > 0x030002
11874 - set_pmd(pmd, *pmd_k);
11877 - * When running on older Xen we must launder *pmd_k through
11878 - * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
11880 - set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
11882 - x86_write_percpu(xen_lazy_mmu, lazy);
11884 - BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
11889 - * Handle a fault on the vmalloc or module mapping area
11891 - * This assumes no large pages in there.
11893 -static inline int vmalloc_fault(unsigned long address)
11895 - unsigned long pgd_paddr;
11899 - * Synchronize this task's top level page-table
11900 - * with the 'reference' page table.
11902 - * Do _not_ use "current" here. We might be inside
11903 - * an interrupt in the middle of a task switch..
11905 - pgd_paddr = read_cr3();
11906 - pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
11909 - pte_k = pte_offset_kernel(pmd_k, address);
11910 - if (!pte_present(*pte_k))
11915 -int show_unhandled_signals = 1;
11918 - * This routine handles page faults. It determines the address,
11919 - * and the problem, and then passes it off to one of the appropriate
11923 - * bit 0 == 0 means no page found, 1 means protection fault
11924 - * bit 1 == 0 means read, 1 means write
11925 - * bit 2 == 0 means kernel, 1 means user-mode
11926 - * bit 3 == 1 means use of reserved bit detected
11927 - * bit 4 == 1 means fault was an instruction fetch
11929 -fastcall void __kprobes do_page_fault(struct pt_regs *regs,
11930 - unsigned long error_code)
11932 - struct task_struct *tsk;
11933 - struct mm_struct *mm;
11934 - struct vm_area_struct * vma;
11935 - unsigned long address;
11936 - int write, si_code;
11940 - * We can fault from pretty much anywhere, with unknown IRQ state.
11942 - trace_hardirqs_fixup();
11944 - /* get the address */
11945 - address = read_cr2();
11947 - /* Set the "privileged fault" bit to something sane. */
11948 - error_code &= ~4;
11949 - error_code |= (regs->xcs & 2) << 1;
11950 - if (regs->eflags & X86_EFLAGS_VM)
11955 - si_code = SEGV_MAPERR;
11958 - * We fault-in kernel-space virtual memory on-demand. The
11959 - * 'reference' page table is init_mm.pgd.
11961 - * NOTE! We MUST NOT take any locks for this case. We may
11962 - * be in an interrupt or a critical region, and should
11963 - * only copy the information from the master page table,
11966 - * This verifies that the fault happens in kernel space
11967 - * (error_code & 4) == 0, and that the fault was not a
11968 - * protection error (error_code & 9) == 0.
11970 - if (unlikely(address >= TASK_SIZE)) {
11972 - /* Faults in hypervisor area can never be patched up. */
11973 - if (address >= hypervisor_virt_start)
11974 - goto bad_area_nosemaphore;
11976 - if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
11978 - /* Can take a spurious fault if mapping changes R/O -> R/W. */
11979 - if (spurious_fault(regs, address, error_code))
11981 - if (notify_page_fault(regs))
11984 - * Don't take the mm semaphore here. If we fixup a prefetch
11985 - * fault we could otherwise deadlock.
11987 - goto bad_area_nosemaphore;
11990 - if (notify_page_fault(regs))
11993 - /* It's safe to allow irq's after cr2 has been saved and the vmalloc
11994 - fault has been handled. */
11995 - if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
11996 - local_irq_enable();
12001 - * If we're in an interrupt, have no user context or are running in an
12002 - * atomic region then we must not take the fault..
12004 - if (in_atomic() || !mm)
12005 - goto bad_area_nosemaphore;
12007 - /* When running in the kernel we expect faults to occur only to
12008 - * addresses in user space. All other faults represent errors in the
12009 - * kernel and should generate an OOPS. Unfortunately, in the case of an
12010 - * erroneous fault occurring in a code path which already holds mmap_sem
12011 - * we will deadlock attempting to validate the fault against the
12012 - * address space. Luckily the kernel only validly references user
12013 - * space from well defined areas of code, which are listed in the
12014 - * exceptions table.
12016 - * As the vast majority of faults will be valid we will only perform
12017 - * the source reference check when there is a possibility of a deadlock.
12018 - * Attempt to lock the address space, if we cannot we then validate the
12019 - * source. If this is invalid we can skip the address space check,
12020 - * thus avoiding the deadlock.
12022 - if (!down_read_trylock(&mm->mmap_sem)) {
12023 - if ((error_code & 4) == 0 &&
12024 - !search_exception_tables(regs->eip))
12025 - goto bad_area_nosemaphore;
12026 - down_read(&mm->mmap_sem);
12029 - vma = find_vma(mm, address);
12032 - if (vma->vm_start <= address)
12034 - if (!(vma->vm_flags & VM_GROWSDOWN))
12036 - if (error_code & 4) {
12038 - * Accessing the stack below %esp is always a bug.
12039 - * The large cushion allows instructions like enter
12040 - * and pusha to work. ("enter $65535,$31" pushes
12041 - * 32 pointers and then decrements %esp by 65535.)
12043 - if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
12046 - if (expand_stack(vma, address))
12049 - * Ok, we have a good vm_area for this memory access, so
12050 - * we can handle it..
12053 - si_code = SEGV_ACCERR;
12055 - switch (error_code & 3) {
12056 - default: /* 3: write, present */
12057 - /* fall through */
12058 - case 2: /* write, not present */
12059 - if (!(vma->vm_flags & VM_WRITE))
12063 - case 1: /* read, present */
12065 - case 0: /* read, not present */
12066 - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
12072 - * If for any reason at all we couldn't handle the fault,
12073 - * make sure we exit gracefully rather than endlessly redo
12076 - fault = handle_mm_fault(mm, vma, address, write);
12077 - if (unlikely(fault & VM_FAULT_ERROR)) {
12078 - if (fault & VM_FAULT_OOM)
12079 - goto out_of_memory;
12080 - else if (fault & VM_FAULT_SIGBUS)
12084 - if (fault & VM_FAULT_MAJOR)
12090 - * Did it hit the DOS screen memory VA from vm86 mode?
12092 - if (regs->eflags & VM_MASK) {
12093 - unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
12095 - tsk->thread.screen_bitmap |= 1 << bit;
12097 - up_read(&mm->mmap_sem);
12101 - * Something tried to access memory that isn't in our memory map..
12102 - * Fix it, but check if it's kernel or user first..
12105 - up_read(&mm->mmap_sem);
12107 -bad_area_nosemaphore:
12108 - /* User mode accesses just cause a SIGSEGV */
12109 - if (error_code & 4) {
12111 - * It's possible to have interrupts off here.
12113 - local_irq_enable();
12116 - * Valid to do another page fault here because this one came
12117 - * from user space.
12119 - if (is_prefetch(regs, address, error_code))
12122 - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
12123 - printk_ratelimit()) {
12124 - printk("%s%s[%d]: segfault at %08lx eip %08lx "
12125 - "esp %08lx error %lx\n",
12126 - task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
12127 - tsk->comm, task_pid_nr(tsk), address, regs->eip,
12128 - regs->esp, error_code);
12130 - tsk->thread.cr2 = address;
12131 - /* Kernel addresses are always protection faults */
12132 - tsk->thread.error_code = error_code | (address >= TASK_SIZE);
12133 - tsk->thread.trap_no = 14;
12134 - force_sig_info_fault(SIGSEGV, si_code, address, tsk);
12138 -#ifdef CONFIG_X86_F00F_BUG
12140 - * Pentium F0 0F C7 C8 bug workaround.
12142 - if (boot_cpu_data.f00f_bug) {
12143 - unsigned long nr;
12145 - nr = (address - idt_descr.address) >> 3;
12148 - do_invalid_op(regs, 0);
12155 - /* Are we prepared to handle this kernel fault? */
12156 - if (fixup_exception(regs))
12160 - * Valid to do another page fault here, because if this fault
12161 - * had been triggered by is_prefetch fixup_exception would have
12164 - if (is_prefetch(regs, address, error_code))
12168 - * Oops. The kernel tried to access some bad page. We'll have to
12169 - * terminate things with extreme prejudice.
12172 - bust_spinlocks(1);
12174 - if (oops_may_print()) {
12175 -#ifdef CONFIG_X86_PAE
12176 - if (error_code & 16) {
12177 - pte_t *pte = lookup_address(address);
12179 - if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
12180 - printk(KERN_CRIT "kernel tried to execute "
12181 - "NX-protected page - exploit attempt? "
12182 - "(uid: %d)\n", current->uid);
12185 - if (address < PAGE_SIZE)
12186 - printk(KERN_ALERT "BUG: unable to handle kernel NULL "
12187 - "pointer dereference");
12189 - printk(KERN_ALERT "BUG: unable to handle kernel paging"
12191 - printk(" at virtual address %08lx\n",address);
12192 - printk(KERN_ALERT "printing eip: %08lx\n", regs->eip);
12193 - dump_fault_path(address);
12195 - tsk->thread.cr2 = address;
12196 - tsk->thread.trap_no = 14;
12197 - tsk->thread.error_code = error_code;
12198 - die("Oops", regs, error_code);
12199 - bust_spinlocks(0);
12200 - do_exit(SIGKILL);
12203 - * We ran out of memory, or some other thing happened to us that made
12204 - * us unable to handle the page fault gracefully.
12207 - up_read(&mm->mmap_sem);
12208 - if (is_global_init(tsk)) {
12210 - down_read(&mm->mmap_sem);
12213 - printk("VM: killing process %s\n", tsk->comm);
12214 - if (error_code & 4)
12215 - do_group_exit(SIGKILL);
12219 - up_read(&mm->mmap_sem);
12221 - /* Kernel mode? Handle exceptions or die */
12222 - if (!(error_code & 4))
12225 - /* User space => ok to do another page fault */
12226 - if (is_prefetch(regs, address, error_code))
12229 - tsk->thread.cr2 = address;
12230 - tsk->thread.error_code = error_code;
12231 - tsk->thread.trap_no = 14;
12232 - force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
12235 -void vmalloc_sync_all(void)
12238 - * Note that races in the updates of insync and start aren't
12239 - * problematic: insync can only get set bits added, and updates to
12240 - * start are only improving performance (without affecting correctness
12242 - * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs.
12243 - * This change works just fine with 2-level paging too.
12245 -#define sync_index(a) ((a) >> PMD_SHIFT)
12246 - static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD);
12247 - static unsigned long start = TASK_SIZE;
12248 - unsigned long address;
12250 - if (SHARED_KERNEL_PMD)
12253 - BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
12254 - for (address = start;
12255 - address >= TASK_SIZE && address < hypervisor_virt_start;
12256 - address += 1UL << PMD_SHIFT) {
12257 - if (!test_bit(sync_index(address), insync)) {
12258 - unsigned long flags;
12259 - struct page *page;
12261 - spin_lock_irqsave(&pgd_lock, flags);
12262 - /* XEN: failure path assumes non-empty pgd_list. */
12263 - if (unlikely(!pgd_list)) {
12264 - spin_unlock_irqrestore(&pgd_lock, flags);
12267 - for (page = pgd_list; page; page =
12268 - (struct page *)page->index)
12269 - if (!vmalloc_sync_one(page_address(page),
12271 - BUG_ON(page != pgd_list);
12274 - spin_unlock_irqrestore(&pgd_lock, flags);
12276 - set_bit(sync_index(address), insync);
12278 - if (address == start && test_bit(sync_index(address), insync))
12279 - start = address + (1UL << PMD_SHIFT);
12282 --- sle11-2009-10-16.orig/arch/x86/mm/fault_64-xen.c 2009-02-16 16:18:36.000000000 +0100
12283 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
12286 - * linux/arch/x86-64/mm/fault.c
12288 - * Copyright (C) 1995 Linus Torvalds
12289 - * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
12292 -#include <linux/signal.h>
12293 -#include <linux/sched.h>
12294 -#include <linux/kernel.h>
12295 -#include <linux/errno.h>
12296 -#include <linux/string.h>
12297 -#include <linux/types.h>
12298 -#include <linux/ptrace.h>
12299 -#include <linux/mman.h>
12300 -#include <linux/mm.h>
12301 -#include <linux/smp.h>
12302 -#include <linux/interrupt.h>
12303 -#include <linux/init.h>
12304 -#include <linux/tty.h>
12305 -#include <linux/vt_kern.h> /* For unblank_screen() */
12306 -#include <linux/compiler.h>
12307 -#include <linux/vmalloc.h>
12308 -#include <linux/module.h>
12309 -#include <linux/kprobes.h>
12310 -#include <linux/uaccess.h>
12311 -#include <linux/kdebug.h>
12312 -#include <linux/kprobes.h>
12314 -#include <asm/system.h>
12315 -#include <asm/pgalloc.h>
12316 -#include <asm/smp.h>
12317 -#include <asm/tlbflush.h>
12318 -#include <asm/proto.h>
12319 -#include <asm-generic/sections.h>
12321 -/* Page fault error code bits */
12322 -#define PF_PROT (1<<0) /* or no page found */
12323 -#define PF_WRITE (1<<1)
12324 -#define PF_USER (1<<2)
12325 -#define PF_RSVD (1<<3)
12326 -#define PF_INSTR (1<<4)
12328 -#ifdef CONFIG_KPROBES
12329 -static inline int notify_page_fault(struct pt_regs *regs)
12333 - /* kprobe_running() needs smp_processor_id() */
12334 - if (!user_mode(regs)) {
12335 - preempt_disable();
12336 - if (kprobe_running() && kprobe_fault_handler(regs, 14))
12338 - preempt_enable();
12344 -static inline int notify_page_fault(struct pt_regs *regs)
12350 -/* Sometimes the CPU reports invalid exceptions on prefetch.
12351 - Check that here and ignore.
12352 - Opcode checker based on code by Richard Brunner */
12353 -static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
12354 - unsigned long error_code)
12356 - unsigned char *instr;
12357 - int scan_more = 1;
12358 - int prefetch = 0;
12359 - unsigned char *max_instr;
12361 - /* If it was a exec fault ignore */
12362 - if (error_code & PF_INSTR)
12365 - instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
12366 - max_instr = instr + 15;
12368 - if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
12371 - while (scan_more && instr < max_instr) {
12372 - unsigned char opcode;
12373 - unsigned char instr_hi;
12374 - unsigned char instr_lo;
12376 - if (probe_kernel_address(instr, opcode))
12379 - instr_hi = opcode & 0xf0;
12380 - instr_lo = opcode & 0x0f;
12383 - switch (instr_hi) {
12386 - /* Values 0x26,0x2E,0x36,0x3E are valid x86
12387 - prefixes. In long mode, the CPU will signal
12388 - invalid opcode if some of these prefixes are
12389 - present so we will never get here anyway */
12390 - scan_more = ((instr_lo & 7) == 0x6);
12394 - /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
12395 - Need to figure out under what instruction mode the
12396 - instruction was issued ... */
12397 - /* Could check the LDT for lm, but for now it's good
12398 - enough to assume that long mode only uses well known
12399 - segments or kernel. */
12400 - scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
12404 - /* 0x64 thru 0x67 are valid prefixes in all modes. */
12405 - scan_more = (instr_lo & 0xC) == 0x4;
12408 - /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
12409 - scan_more = !instr_lo || (instr_lo>>1) == 1;
12412 - /* Prefetch instruction is 0x0F0D or 0x0F18 */
12414 - if (probe_kernel_address(instr, opcode))
12416 - prefetch = (instr_lo == 0xF) &&
12417 - (opcode == 0x0D || opcode == 0x18);
12427 -static int bad_address(void *p)
12429 - unsigned long dummy;
12430 - return probe_kernel_address((unsigned long *)p, dummy);
12433 -void dump_pagetable(unsigned long address)
12440 - pgd = (pgd_t *)read_cr3();
12442 - pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
12443 - pgd += pgd_index(address);
12444 - if (bad_address(pgd)) goto bad;
12445 - printk("PGD %lx ", pgd_val(*pgd));
12446 - if (!pgd_present(*pgd)) goto ret;
12448 - pud = pud_offset(pgd, address);
12449 - if (bad_address(pud)) goto bad;
12450 - printk("PUD %lx ", pud_val(*pud));
12451 - if (!pud_present(*pud)) goto ret;
12453 - pmd = pmd_offset(pud, address);
12454 - if (bad_address(pmd)) goto bad;
12455 - printk("PMD %lx ", pmd_val(*pmd));
12456 - if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
12458 - pte = pte_offset_kernel(pmd, address);
12459 - if (bad_address(pte)) goto bad;
12460 - printk("PTE %lx", pte_val(*pte));
12468 -static const char errata93_warning[] =
12469 -KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
12470 -KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
12471 -KERN_ERR "******* Please consider a BIOS update.\n"
12472 -KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
12474 -/* Workaround for K8 erratum #93 & buggy BIOS.
12475 - BIOS SMM functions are required to use a specific workaround
12476 - to avoid corruption of the 64bit RIP register on C stepping K8.
12477 - A lot of BIOS that didn't get tested properly miss this.
12478 - The OS sees this as a page fault with the upper 32bits of RIP cleared.
12479 - Try to work around it here.
12480 - Note we only handle faults in kernel here. */
12482 -static int is_errata93(struct pt_regs *regs, unsigned long address)
12484 - static int warned;
12485 - if (address != regs->rip)
12487 - if ((address >> 32) != 0)
12489 - address |= 0xffffffffUL << 32;
12490 - if ((address >= (u64)_stext && address <= (u64)_etext) ||
12491 - (address >= MODULES_VADDR && address <= MODULES_END)) {
12493 - printk(errata93_warning);
12496 - regs->rip = address;
12502 -static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
12503 - unsigned long error_code)
12505 - unsigned long flags = oops_begin();
12506 - struct task_struct *tsk;
12508 - printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
12509 - current->comm, address);
12510 - dump_pagetable(address);
12512 - tsk->thread.cr2 = address;
12513 - tsk->thread.trap_no = 14;
12514 - tsk->thread.error_code = error_code;
12515 - __die("Bad pagetable", regs, error_code);
12517 - do_exit(SIGKILL);
12521 - * Handle a fault on the vmalloc area
12523 - * This assumes no large pages in there.
12525 -static int vmalloc_fault(unsigned long address)
12527 - pgd_t *pgd, *pgd_ref;
12528 - pud_t *pud, *pud_ref;
12529 - pmd_t *pmd, *pmd_ref;
12530 - pte_t *pte, *pte_ref;
12532 - /* Copy kernel mappings over when needed. This can also
12533 - happen within a race in page table update. In the later
12534 - case just flush. */
12536 - /* On Xen the line below does not always work. Needs investigating! */
12537 - /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
12538 - pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
12539 - pgd += pgd_index(address);
12540 - pgd_ref = pgd_offset_k(address);
12541 - if (pgd_none(*pgd_ref))
12543 - if (pgd_none(*pgd))
12544 - set_pgd(pgd, *pgd_ref);
12546 - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
12548 - /* Below here mismatches are bugs because these lower tables
12551 - pud = pud_offset(pgd, address);
12552 - pud_ref = pud_offset(pgd_ref, address);
12553 - if (pud_none(*pud_ref))
12555 - if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
12557 - pmd = pmd_offset(pud, address);
12558 - pmd_ref = pmd_offset(pud_ref, address);
12559 - if (pmd_none(*pmd_ref))
12561 - if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
12563 - pte_ref = pte_offset_kernel(pmd_ref, address);
12564 - if (!pte_present(*pte_ref))
12566 - pte = pte_offset_kernel(pmd, address);
12567 - /* Don't use pte_page here, because the mappings can point
12568 - outside mem_map, and the NUMA hash lookup cannot handle
12570 - if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
12575 -int show_unhandled_signals = 1;
12578 -#define MEM_VERBOSE 1
12580 -#ifdef MEM_VERBOSE
12581 -#define MEM_LOG(_f, _a...) \
12582 - printk("fault.c:[%d]-> " _f "\n", \
12583 - __LINE__ , ## _a )
12585 -#define MEM_LOG(_f, _a...) ((void)0)
12588 -static int spurious_fault(struct pt_regs *regs,
12589 - unsigned long address,
12590 - unsigned long error_code)
12598 - /* Faults in hypervisor area are never spurious. */
12599 - if ((address >= HYPERVISOR_VIRT_START) &&
12600 - (address < HYPERVISOR_VIRT_END))
12604 - /* Reserved-bit violation or user access to kernel space? */
12605 - if (error_code & (PF_RSVD|PF_USER))
12608 - pgd = init_mm.pgd + pgd_index(address);
12609 - if (!pgd_present(*pgd))
12612 - pud = pud_offset(pgd, address);
12613 - if (!pud_present(*pud))
12616 - pmd = pmd_offset(pud, address);
12617 - if (!pmd_present(*pmd))
12620 - pte = pte_offset_kernel(pmd, address);
12621 - if (!pte_present(*pte))
12623 - if ((error_code & PF_WRITE) && !pte_write(*pte))
12625 - if ((error_code & PF_INSTR) && (__pte_val(*pte) & _PAGE_NX))
12632 - * This routine handles page faults. It determines the address,
12633 - * and the problem, and then passes it off to one of the appropriate
12636 -asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
12637 - unsigned long error_code)
12639 - struct task_struct *tsk;
12640 - struct mm_struct *mm;
12641 - struct vm_area_struct * vma;
12642 - unsigned long address;
12643 - const struct exception_table_entry *fixup;
12644 - int write, fault;
12645 - unsigned long flags;
12648 - if (!user_mode(regs))
12649 - error_code &= ~PF_USER; /* means kernel */
12652 - * We can fault from pretty much anywhere, with unknown IRQ state.
12654 - trace_hardirqs_fixup();
12658 - prefetchw(&mm->mmap_sem);
12660 - /* get the address */
12661 - address = read_cr2();
12663 - info.si_code = SEGV_MAPERR;
12667 - * We fault-in kernel-space virtual memory on-demand. The
12668 - * 'reference' page table is init_mm.pgd.
12670 - * NOTE! We MUST NOT take any locks for this case. We may
12671 - * be in an interrupt or a critical region, and should
12672 - * only copy the information from the master page table,
12675 - * This verifies that the fault happens in kernel space
12676 - * (error_code & 4) == 0, and that the fault was not a
12677 - * protection error (error_code & 9) == 0.
12679 - if (unlikely(address >= TASK_SIZE64)) {
12681 - * Don't check for the module range here: its PML4
12682 - * is always initialized because it's shared with the main
12683 - * kernel text. Only vmalloc may need PML4 syncups.
12685 - if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
12686 - ((address >= VMALLOC_START && address < VMALLOC_END))) {
12687 - if (vmalloc_fault(address) >= 0)
12690 - /* Can take a spurious fault if mapping changes R/O -> R/W. */
12691 - if (spurious_fault(regs, address, error_code))
12693 - if (notify_page_fault(regs))
12696 - * Don't take the mm semaphore here. If we fixup a prefetch
12697 - * fault we could otherwise deadlock.
12699 - goto bad_area_nosemaphore;
12702 - if (notify_page_fault(regs))
12705 - if (likely(regs->eflags & X86_EFLAGS_IF))
12706 - local_irq_enable();
12708 - if (unlikely(error_code & PF_RSVD))
12709 - pgtable_bad(address, regs, error_code);
12712 - * If we're in an interrupt or have no user
12713 - * context, we must not take the fault..
12715 - if (unlikely(in_atomic() || !mm))
12716 - goto bad_area_nosemaphore;
12719 - * User-mode registers count as a user access even for any
12720 - * potential system fault or CPU buglet.
12722 - if (user_mode_vm(regs))
12723 - error_code |= PF_USER;
12726 - /* When running in the kernel we expect faults to occur only to
12727 - * addresses in user space. All other faults represent errors in the
12728 - * kernel and should generate an OOPS. Unfortunately, in the case of an
12729 - * erroneous fault occurring in a code path which already holds mmap_sem
12730 - * we will deadlock attempting to validate the fault against the
12731 - * address space. Luckily the kernel only validly references user
12732 - * space from well defined areas of code, which are listed in the
12733 - * exceptions table.
12735 - * As the vast majority of faults will be valid we will only perform
12736 - * the source reference check when there is a possibility of a deadlock.
12737 - * Attempt to lock the address space, if we cannot we then validate the
12738 - * source. If this is invalid we can skip the address space check,
12739 - * thus avoiding the deadlock.
12741 - if (!down_read_trylock(&mm->mmap_sem)) {
12742 - if ((error_code & PF_USER) == 0 &&
12743 - !search_exception_tables(regs->rip))
12744 - goto bad_area_nosemaphore;
12745 - down_read(&mm->mmap_sem);
12748 - vma = find_vma(mm, address);
12751 - if (likely(vma->vm_start <= address))
12753 - if (!(vma->vm_flags & VM_GROWSDOWN))
12755 - if (error_code & 4) {
12756 - /* Allow userspace just enough access below the stack pointer
12757 - * to let the 'enter' instruction work.
12759 - if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
12762 - if (expand_stack(vma, address))
12765 - * Ok, we have a good vm_area for this memory access, so
12766 - * we can handle it..
12769 - info.si_code = SEGV_ACCERR;
12771 - switch (error_code & (PF_PROT|PF_WRITE)) {
12772 - default: /* 3: write, present */
12773 - /* fall through */
12774 - case PF_WRITE: /* write, not present */
12775 - if (!(vma->vm_flags & VM_WRITE))
12779 - case PF_PROT: /* read, present */
12781 - case 0: /* read, not present */
12782 - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
12787 - * If for any reason at all we couldn't handle the fault,
12788 - * make sure we exit gracefully rather than endlessly redo
12791 - fault = handle_mm_fault(mm, vma, address, write);
12792 - if (unlikely(fault & VM_FAULT_ERROR)) {
12793 - if (fault & VM_FAULT_OOM)
12794 - goto out_of_memory;
12795 - else if (fault & VM_FAULT_SIGBUS)
12799 - if (fault & VM_FAULT_MAJOR)
12803 - up_read(&mm->mmap_sem);
12807 - * Something tried to access memory that isn't in our memory map..
12808 - * Fix it, but check if it's kernel or user first..
12811 - up_read(&mm->mmap_sem);
12813 -bad_area_nosemaphore:
12814 - /* User mode accesses just cause a SIGSEGV */
12815 - if (error_code & PF_USER) {
12818 - * It's possible to have interrupts off here.
12820 - local_irq_enable();
12822 - if (is_prefetch(regs, address, error_code))
12825 - /* Work around K8 erratum #100 K8 in compat mode
12826 - occasionally jumps to illegal addresses >4GB. We
12827 - catch this here in the page fault handler because
12828 - these addresses are not reachable. Just detect this
12829 - case and return. Any code segment in LDT is
12830 - compatibility mode. */
12831 - if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
12835 - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
12836 - printk_ratelimit()) {
12838 - "%s%s[%d]: segfault at %lx rip %lx rsp %lx error %lx\n",
12839 - tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
12840 - tsk->comm, tsk->pid, address, regs->rip,
12841 - regs->rsp, error_code);
12844 - tsk->thread.cr2 = address;
12845 - /* Kernel addresses are always protection faults */
12846 - tsk->thread.error_code = error_code | (address >= TASK_SIZE);
12847 - tsk->thread.trap_no = 14;
12848 - info.si_signo = SIGSEGV;
12849 - info.si_errno = 0;
12850 - /* info.si_code has been set above */
12851 - info.si_addr = (void __user *)address;
12852 - force_sig_info(SIGSEGV, &info, tsk);
12858 - /* Are we prepared to handle this kernel fault? */
12859 - fixup = search_exception_tables(regs->rip);
12861 - regs->rip = fixup->fixup;
12866 - * Hall of shame of CPU/BIOS bugs.
12869 - if (is_prefetch(regs, address, error_code))
12872 - if (is_errata93(regs, address))
12876 - * Oops. The kernel tried to access some bad page. We'll have to
12877 - * terminate things with extreme prejudice.
12880 - flags = oops_begin();
12882 - if (address < PAGE_SIZE)
12883 - printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
12885 - printk(KERN_ALERT "Unable to handle kernel paging request");
12886 - printk(" at %016lx RIP: \n" KERN_ALERT,address);
12887 - printk_address(regs->rip);
12888 - dump_pagetable(address);
12889 - tsk->thread.cr2 = address;
12890 - tsk->thread.trap_no = 14;
12891 - tsk->thread.error_code = error_code;
12892 - __die("Oops", regs, error_code);
12893 - /* Executive summary in case the body of the oops scrolled away */
12894 - printk(KERN_EMERG "CR2: %016lx\n", address);
12896 - do_exit(SIGKILL);
12899 - * We ran out of memory, or some other thing happened to us that made
12900 - * us unable to handle the page fault gracefully.
12903 - up_read(&mm->mmap_sem);
12904 - if (is_global_init(current)) {
12908 - printk("VM: killing process %s\n", tsk->comm);
12909 - if (error_code & 4)
12910 - do_group_exit(SIGKILL);
12914 - up_read(&mm->mmap_sem);
12916 - /* Kernel mode? Handle exceptions or die */
12917 - if (!(error_code & PF_USER))
12920 - tsk->thread.cr2 = address;
12921 - tsk->thread.error_code = error_code;
12922 - tsk->thread.trap_no = 14;
12923 - info.si_signo = SIGBUS;
12924 - info.si_errno = 0;
12925 - info.si_code = BUS_ADRERR;
12926 - info.si_addr = (void __user *)address;
12927 - force_sig_info(SIGBUS, &info, tsk);
12931 -DEFINE_SPINLOCK(pgd_lock);
12932 -LIST_HEAD(pgd_list);
12934 -void vmalloc_sync_all(void)
12936 - /* Note that races in the updates of insync and start aren't
12938 - insync can only get set bits added, and updates to start are only
12939 - improving performance (without affecting correctness if undone). */
12940 - static DECLARE_BITMAP(insync, PTRS_PER_PGD);
12941 - static unsigned long start = VMALLOC_START & PGDIR_MASK;
12942 - unsigned long address;
12944 - for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
12945 - if (!test_bit(pgd_index(address), insync)) {
12946 - const pgd_t *pgd_ref = pgd_offset_k(address);
12947 - struct page *page;
12949 - if (pgd_none(*pgd_ref))
12951 - spin_lock(&pgd_lock);
12952 - list_for_each_entry(page, &pgd_list, lru) {
12954 - pgd = (pgd_t *)page_address(page) + pgd_index(address);
12955 - if (pgd_none(*pgd))
12956 - set_pgd(pgd, *pgd_ref);
12958 - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
12960 - spin_unlock(&pgd_lock);
12961 - set_bit(pgd_index(address), insync);
12963 - if (address == start)
12964 - start = address + PGDIR_SIZE;
12966 - /* Check that there is no need to do the same for the modules area. */
12967 - BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
12968 - BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
12969 - (__START_KERNEL & PGDIR_MASK)));
12971 --- sle11-2009-10-16.orig/arch/x86/mm/highmem_32-xen.c 2009-02-16 16:17:21.000000000 +0100
12972 +++ sle11-2009-10-16/arch/x86/mm/highmem_32-xen.c 2009-03-16 16:33:40.000000000 +0100
12973 @@ -18,6 +18,49 @@ void kunmap(struct page *page)
12977 +static void debug_kmap_atomic_prot(enum km_type type)
12979 +#ifdef CONFIG_DEBUG_HIGHMEM
12980 + static unsigned warn_count = 10;
12982 + if (unlikely(warn_count == 0))
12985 + if (unlikely(in_interrupt())) {
12987 + if (type != KM_IRQ0 && type != KM_IRQ1 &&
12988 + type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
12989 + type != KM_BOUNCE_READ) {
12993 + } else if (!irqs_disabled()) { /* softirq */
12994 + if (type != KM_IRQ0 && type != KM_IRQ1 &&
12995 + type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
12996 + type != KM_SKB_SUNRPC_DATA &&
12997 + type != KM_SKB_DATA_SOFTIRQ &&
12998 + type != KM_BOUNCE_READ) {
13005 + if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
13006 + type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
13007 + if (!irqs_disabled()) {
13011 + } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
13012 + if (irq_count() == 0 && !irqs_disabled()) {
13021 * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
13022 * no global lock is needed and because the kmap code must perform a global TLB
13023 @@ -37,6 +80,8 @@ void *kmap_atomic_prot(struct page *page
13024 if (!PageHighMem(page))
13025 return page_address(page);
13027 + debug_kmap_atomic_prot(type);
13029 idx = type + KM_TYPE_NR*smp_processor_id();
13030 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
13031 BUG_ON(!pte_none(*(kmap_pte-idx)));
13032 --- sle11-2009-10-16.orig/arch/x86/mm/hypervisor.c 2009-05-06 10:23:43.000000000 +0200
13033 +++ sle11-2009-10-16/arch/x86/mm/hypervisor.c 2009-05-14 11:18:39.000000000 +0200
13034 @@ -869,15 +869,11 @@ int xen_limit_pages_to_max_mfn(
13036 EXPORT_SYMBOL_GPL(xen_limit_pages_to_max_mfn);
13039 -int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b)
13040 +int write_ldt_entry(struct desc_struct *ldt, int entry, const void *desc)
13042 - __u32 *lp = (__u32 *)((char *)ldt + entry * 8);
13043 - maddr_t mach_lp = arbitrary_virt_to_machine(lp);
13044 - return HYPERVISOR_update_descriptor(
13045 - mach_lp, (u64)entry_a | ((u64)entry_b<<32));
13046 + maddr_t mach_lp = arbitrary_virt_to_machine(ldt + entry);
13047 + return HYPERVISOR_update_descriptor(mach_lp, *(const u64*)desc);
13051 #define MAX_BATCHED_FULL_PTES 32
13053 --- sle11-2009-10-16.orig/arch/x86/mm/init_32-xen.c 2009-02-16 16:18:36.000000000 +0100
13054 +++ sle11-2009-10-16/arch/x86/mm/init_32-xen.c 2009-03-16 16:33:40.000000000 +0100
13055 @@ -27,13 +27,13 @@
13056 #include <linux/bootmem.h>
13057 #include <linux/slab.h>
13058 #include <linux/proc_fs.h>
13059 -#include <linux/efi.h>
13060 #include <linux/memory_hotplug.h>
13061 #include <linux/initrd.h>
13062 #include <linux/cpumask.h>
13063 #include <linux/dma-mapping.h>
13064 #include <linux/scatterlist.h>
13066 +#include <asm/asm.h>
13067 #include <asm/processor.h>
13068 #include <asm/system.h>
13069 #include <asm/uaccess.h>
13070 @@ -42,18 +42,22 @@
13071 #include <asm/fixmap.h>
13072 #include <asm/e820.h>
13073 #include <asm/apic.h>
13074 +#include <asm/bugs.h>
13075 #include <asm/tlb.h>
13076 #include <asm/tlbflush.h>
13077 +#include <asm/pgalloc.h>
13078 #include <asm/sections.h>
13079 #include <asm/hypervisor.h>
13080 #include <asm/swiotlb.h>
13081 +#include <asm/setup.h>
13082 +#include <asm/cacheflush.h>
13084 unsigned int __VMALLOC_RESERVE = 128 << 20;
13086 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
13087 unsigned long highstart_pfn, highend_pfn;
13089 -static int noinline do_test_wp_bit(void);
13090 +static noinline int do_test_wp_bit(void);
13093 * Creates a middle page table and puts a pointer to it in the
13094 @@ -64,17 +68,16 @@ static pmd_t * __init one_md_table_init(
13100 #ifdef CONFIG_X86_PAE
13101 if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
13102 pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
13104 - paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
13105 + paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
13106 make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
13107 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
13108 pud = pud_offset(pgd, 0);
13109 - if (pmd_table != pmd_offset(pud, 0))
13111 + BUG_ON(pmd_table != pmd_offset(pud, 0));
13114 pud = pud_offset(pgd, 0);
13115 @@ -85,7 +88,7 @@ static pmd_t * __init one_md_table_init(
13118 * Create a page table and place a pointer to it in a middle page
13119 - * directory entry.
13120 + * directory entry:
13122 static pte_t * __init one_page_table_init(pmd_t *pmd)
13124 @@ -99,9 +102,10 @@ static pte_t * __init one_page_table_ini
13125 #ifdef CONFIG_DEBUG_PAGEALLOC
13126 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
13129 + if (!page_table) {
13131 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
13134 paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
13135 make_lowmem_page_readonly(page_table,
13136 @@ -114,22 +118,21 @@ static pte_t * __init one_page_table_ini
13140 - * This function initializes a certain range of kernel virtual memory
13141 + * This function initializes a certain range of kernel virtual memory
13142 * with new bootmem page tables, everywhere page tables are missing in
13147 - * NOTE: The pagetables are allocated contiguous on the physical space
13148 - * so we can cache the place of the first one and move around without
13150 + * NOTE: The pagetables are allocated contiguous on the physical space
13151 + * so we can cache the place of the first one and move around without
13152 * checking the pgd every time.
13154 -static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
13155 +static void __init
13156 +page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
13160 int pgd_idx, pmd_idx;
13161 unsigned long vaddr;
13166 pgd_idx = pgd_index(vaddr);
13167 @@ -139,7 +142,8 @@ static void __init page_table_range_init
13168 for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
13169 pmd = one_md_table_init(pgd);
13170 pmd = pmd + pmd_index(vaddr);
13171 - for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
13172 + for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
13173 + pmd++, pmd_idx++) {
13174 if (vaddr < hypervisor_virt_start)
13175 one_page_table_init(pmd);
13177 @@ -157,17 +161,17 @@ static inline int is_kernel_text(unsigne
13181 - * This maps the physical memory to kernel virtual address space, a total
13182 - * of max_low_pfn pages, by creating page tables starting from address
13184 + * This maps the physical memory to kernel virtual address space, a total
13185 + * of max_low_pfn pages, by creating page tables starting from address
13188 static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
13190 + int pgd_idx, pmd_idx, pte_ofs;
13195 - int pgd_idx, pmd_idx, pte_ofs;
13197 unsigned long max_ram_pfn = xen_start_info->nr_pages;
13198 if (max_ram_pfn > max_low_pfn)
13199 @@ -195,36 +199,49 @@ static void __init kernel_physical_mappi
13200 if (pfn >= max_low_pfn)
13203 - for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
13204 - unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
13205 - if (address >= hypervisor_virt_start)
13206 + for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn;
13207 + pmd++, pmd_idx++) {
13208 + unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
13210 + if (addr >= hypervisor_virt_start)
13213 - /* Map with big pages if possible, otherwise create normal page tables. */
13215 + * Map with big pages if possible, otherwise
13216 + * create normal page tables:
13219 - unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
13220 - if (is_kernel_text(address) || is_kernel_text(address2))
13221 - set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
13223 - set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
13224 + unsigned int addr2;
13225 + pgprot_t prot = PAGE_KERNEL_LARGE;
13227 + addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
13228 + PAGE_OFFSET + PAGE_SIZE-1;
13230 + if (is_kernel_text(addr) ||
13231 + is_kernel_text(addr2))
13232 + prot = PAGE_KERNEL_LARGE_EXEC;
13234 + set_pmd(pmd, pfn_pmd(pfn, prot));
13236 pfn += PTRS_PER_PTE;
13238 - pte = one_page_table_init(pmd);
13241 + pte = one_page_table_init(pmd);
13243 + for (pte += pte_ofs;
13244 + pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
13245 + pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
13246 + pgprot_t prot = PAGE_KERNEL;
13248 + /* XEN: Only map initial RAM allocation. */
13249 + if ((pfn >= max_ram_pfn) || pte_present(*pte))
13251 + if (is_kernel_text(addr))
13252 + prot = PAGE_KERNEL_EXEC;
13254 - for (pte += pte_ofs;
13255 - pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
13256 - pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
13257 - /* XEN: Only map initial RAM allocation. */
13258 - if ((pfn >= max_ram_pfn) || pte_present(*pte))
13260 - if (is_kernel_text(address))
13261 - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
13263 - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
13266 + set_pte(pte, pfn_pte(pfn, prot));
13272 @@ -245,57 +262,23 @@ static inline int page_kills_ppro(unsign
13276 -int page_is_ram(unsigned long pagenr)
13279 - unsigned long addr, end;
13281 - if (efi_enabled) {
13282 - efi_memory_desc_t *md;
13285 - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
13287 - if (!is_available_memory(md))
13289 - addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
13290 - end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
13292 - if ((pagenr >= addr) && (pagenr < end))
13298 - for (i = 0; i < e820.nr_map; i++) {
13300 - if (e820.map[i].type != E820_RAM) /* not usable memory */
13303 - * !!!FIXME!!! Some BIOSen report areas as RAM that
13304 - * are not. Notably the 640->1Mb area. We need a sanity
13307 - addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
13308 - end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
13309 - if ((pagenr >= addr) && (pagenr < end))
13315 #ifdef CONFIG_HIGHMEM
13317 pgprot_t kmap_prot;
13319 -#define kmap_get_fixmap_pte(vaddr) \
13320 - pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))
13321 +static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
13323 + return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
13324 + vaddr), vaddr), vaddr);
13327 static void __init kmap_init(void)
13329 unsigned long kmap_vstart;
13331 - /* cache the first kmap pte */
13333 + * Cache the first kmap pte:
13335 kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
13336 kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
13338 @@ -304,11 +287,11 @@ static void __init kmap_init(void)
13340 static void __init permanent_kmaps_init(pgd_t *pgd_base)
13342 + unsigned long vaddr;
13347 - unsigned long vaddr;
13349 vaddr = PKMAP_BASE;
13350 page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
13351 @@ -317,7 +300,7 @@ static void __init permanent_kmaps_init(
13352 pud = pud_offset(pgd, vaddr);
13353 pmd = pmd_offset(pud, vaddr);
13354 pte = pte_offset_kernel(pmd, vaddr);
13355 - pkmap_page_table = pte;
13356 + pkmap_page_table = pte;
13359 static void __meminit free_new_highpage(struct page *page, int pfn)
13360 @@ -337,7 +320,8 @@ void __init add_one_highpage_init(struct
13361 SetPageReserved(page);
13364 -static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long pfn)
13365 +static int __meminit
13366 +add_one_highpage_hotplug(struct page *page, unsigned long pfn)
13368 free_new_highpage(page, pfn);
13370 @@ -345,6 +329,7 @@ static int __meminit add_one_highpage_ho
13371 max_mapnr = max(pfn, max_mapnr);
13378 @@ -352,7 +337,7 @@ static int __meminit add_one_highpage_ho
13379 * Not currently handling the NUMA case.
13380 * Assuming single node and all memory that
13381 * has been added dynamically that would be
13382 - * onlined here is in HIGHMEM
13383 + * onlined here is in HIGHMEM.
13385 void __meminit online_page(struct page *page)
13387 @@ -360,13 +345,11 @@ void __meminit online_page(struct page *
13388 add_one_highpage_hotplug(page, page_to_pfn(page));
13392 -#ifdef CONFIG_NUMA
13393 -extern void set_highmem_pages_init(int);
13395 +#ifndef CONFIG_NUMA
13396 static void __init set_highmem_pages_init(int bad_ppro)
13400 for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) {
13402 * Holes under sparsemem might not have no mem_map[]:
13403 @@ -376,23 +359,18 @@ static void __init set_highmem_pages_ini
13405 totalram_pages += totalhigh_pages;
13407 -#endif /* CONFIG_FLATMEM */
13408 +#endif /* !CONFIG_NUMA */
13411 -#define kmap_init() do { } while (0)
13412 -#define permanent_kmaps_init(pgd_base) do { } while (0)
13413 -#define set_highmem_pages_init(bad_ppro) do { } while (0)
13414 +# define kmap_init() do { } while (0)
13415 +# define permanent_kmaps_init(pgd_base) do { } while (0)
13416 +# define set_highmem_pages_init(bad_ppro) do { } while (0)
13417 #endif /* CONFIG_HIGHMEM */
13419 -unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
13420 +pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
13421 EXPORT_SYMBOL(__PAGE_KERNEL);
13422 -unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
13424 -#ifdef CONFIG_NUMA
13425 -extern void __init remap_numa_kva(void);
13427 -#define remap_numa_kva() do {} while (0)
13429 +pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
13431 pgd_t *swapper_pg_dir;
13433 @@ -410,9 +388,8 @@ static void __init xen_pagetable_setup_d
13434 * the boot process.
13436 * If we're booting on native hardware, this will be a pagetable
13437 - * constructed in arch/i386/kernel/head.S, and not running in PAE mode
13438 - * (even if we'll end up running in PAE). The root of the pagetable
13439 - * will be swapper_pg_dir.
13440 + * constructed in arch/x86/kernel/head_32.S. The root of the
13441 + * pagetable will be swapper_pg_dir.
13443 * If we're booting paravirtualized under a hypervisor, then there are
13444 * more options: we may already be running PAE, and the pagetable may
13445 @@ -424,10 +401,10 @@ static void __init xen_pagetable_setup_d
13446 * be partially populated, and so it avoids stomping on any existing
13449 -static void __init pagetable_init (void)
13450 +static void __init pagetable_init(void)
13452 - unsigned long vaddr, end;
13453 pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
13454 + unsigned long vaddr, end;
13456 xen_pagetable_setup_start(pgd_base);
13458 @@ -449,34 +426,36 @@ static void __init pagetable_init (void)
13459 * Fixed mappings, only the page table structure has to be
13460 * created - mappings will be set by set_fixmap():
13462 + early_ioremap_clear();
13463 vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
13464 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
13465 page_table_range_init(vaddr, end, pgd_base);
13466 + early_ioremap_reset();
13468 permanent_kmaps_init(pgd_base);
13470 xen_pagetable_setup_done(pgd_base);
13473 -#if defined(CONFIG_HIBERNATION) || defined(CONFIG_ACPI)
13474 +#if defined(CONFIG_ACPI_SLEEP) && !defined(CONFIG_XEN)
13476 - * Swap suspend & friends need this for resume because things like the intel-agp
13477 + * ACPI suspend needs this for resume, because things like the intel-agp
13478 * driver might have split up a kernel 4MB mapping.
13480 -char __nosavedata swsusp_pg_dir[PAGE_SIZE]
13481 - __attribute__ ((aligned (PAGE_SIZE)));
13482 +char swsusp_pg_dir[PAGE_SIZE]
13483 + __attribute__ ((aligned(PAGE_SIZE)));
13485 static inline void save_pg_dir(void)
13487 memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
13490 +#else /* !CONFIG_ACPI_SLEEP || CONFIG_XEN */
13491 static inline void save_pg_dir(void)
13495 +#endif /* !CONFIG_ACPI_SLEEP || CONFIG_XEN */
13497 -void zap_low_mappings (void)
13498 +void zap_low_mappings(void)
13502 @@ -488,22 +467,24 @@ void zap_low_mappings (void)
13503 * Note that "pgd_clear()" doesn't do it for
13504 * us, because pgd_clear() is a no-op on i386.
13506 - for (i = 0; i < USER_PTRS_PER_PGD; i++)
13507 + for (i = 0; i < USER_PTRS_PER_PGD; i++) {
13508 #if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
13509 set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
13511 set_pgd(swapper_pg_dir+i, __pgd(0));
13517 -int nx_enabled = 0;
13520 +pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX;
13521 +EXPORT_SYMBOL_GPL(__supported_pte_mask);
13523 #ifdef CONFIG_X86_PAE
13525 -static int disable_nx __initdata = 0;
13526 -u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
13527 -EXPORT_SYMBOL_GPL(__supported_pte_mask);
13528 +static int disable_nx __initdata;
13532 @@ -520,11 +501,14 @@ static int __init noexec_setup(char *str
13533 __supported_pte_mask |= _PAGE_NX;
13536 - } else if (!strcmp(str,"off")) {
13538 - __supported_pte_mask &= ~_PAGE_NX;
13542 + if (!strcmp(str, "off")) {
13544 + __supported_pte_mask &= ~_PAGE_NX;
13552 @@ -536,6 +520,7 @@ static void __init set_nx(void)
13554 if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
13555 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
13557 if ((v[3] & (1 << 20)) && !disable_nx) {
13558 rdmsr(MSR_EFER, l, h);
13560 @@ -545,35 +530,6 @@ static void __init set_nx(void)
13566 - * Enables/disables executability of a given kernel page and
13567 - * returns the previous setting.
13569 -int __init set_kernel_exec(unsigned long vaddr, int enable)
13577 - pte = lookup_address(vaddr);
13580 - if (!pte_exec_kernel(*pte))
13584 - pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
13586 - pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
13587 - pte_update_defer(&init_mm, vaddr, pte);
13588 - __flush_tlb_all();
13596 @@ -590,21 +546,10 @@ void __init paging_init(void)
13597 #ifdef CONFIG_X86_PAE
13600 - printk("NX (Execute Disable) protection: active\n");
13601 + printk(KERN_INFO "NX (Execute Disable) protection: active\n");
13606 -#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
13608 - * We will bail out later - printk doesn't work right now so
13609 - * the user would just see a hanging kernel.
13610 - * when running as xen domain we are already in PAE mode at
13614 - set_in_cr4(X86_CR4_PAE);
13619 @@ -631,10 +576,10 @@ void __init paging_init(void)
13620 * used to involve black magic jumps to work around some nasty CPU bugs,
13621 * but fortunately the switch to using exceptions got rid of all that.
13624 static void __init test_wp_bit(void)
13626 - printk("Checking if this processor honours the WP bit even in supervisor mode... ");
13628 + "Checking if this processor honours the WP bit even in supervisor mode...");
13630 /* Any page-aligned address will do, the test is non-destructive */
13631 __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
13632 @@ -642,23 +587,22 @@ static void __init test_wp_bit(void)
13633 clear_fixmap(FIX_WP_TEST);
13635 if (!boot_cpu_data.wp_works_ok) {
13637 + printk(KERN_CONT "No.\n");
13638 #ifdef CONFIG_X86_WP_WORKS_OK
13639 - panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
13641 + "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
13645 + printk(KERN_CONT "Ok.\n");
13649 -static struct kcore_list kcore_mem, kcore_vmalloc;
13650 +static struct kcore_list kcore_mem, kcore_vmalloc;
13652 void __init mem_init(void)
13654 - extern int ppro_with_ram_bug(void);
13655 int codesize, reservedpages, datasize, initsize;
13658 + int tmp, bad_ppro;
13661 #if defined(CONFIG_SWIOTLB)
13662 @@ -668,19 +612,19 @@ void __init mem_init(void)
13663 #ifdef CONFIG_FLATMEM
13667 bad_ppro = ppro_with_ram_bug();
13669 #ifdef CONFIG_HIGHMEM
13670 /* check that fixmap and pkmap do not overlap */
13671 - if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
13672 - printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
13673 + if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
13675 + "fixmap and kmap areas overlap - this will crash\n");
13676 printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
13677 - PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
13678 + PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
13684 /* this will put all low memory onto the freelists */
13685 totalram_pages += free_all_bootmem();
13686 /* XEN: init and count low-mem pages outside initial allocation. */
13687 @@ -693,7 +637,7 @@ void __init mem_init(void)
13689 for (tmp = 0; tmp < max_low_pfn; tmp++)
13691 - * Only count reserved RAM pages
13692 + * Only count reserved RAM pages:
13694 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
13696 @@ -704,11 +648,12 @@ void __init mem_init(void)
13697 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
13698 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
13700 - kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
13701 - kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
13702 + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
13703 + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
13704 VMALLOC_END-VMALLOC_START);
13706 - printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
13707 + printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
13708 + "%dk reserved, %dk data, %dk init, %ldk highmem)\n",
13709 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
13710 num_physpages << (PAGE_SHIFT-10),
13712 @@ -719,54 +664,53 @@ void __init mem_init(void)
13715 #if 1 /* double-sanity-check paranoia */
13716 - printk("virtual kernel memory layout:\n"
13717 - " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
13718 + printk(KERN_INFO "virtual kernel memory layout:\n"
13719 + " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
13720 #ifdef CONFIG_HIGHMEM
13721 - " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
13722 + " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
13724 - " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
13725 - " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
13726 - " .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
13727 - " .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
13728 - " .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
13729 - FIXADDR_START, FIXADDR_TOP,
13730 - (FIXADDR_TOP - FIXADDR_START) >> 10,
13731 + " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
13732 + " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
13733 + " .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
13734 + " .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
13735 + " .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
13736 + FIXADDR_START, FIXADDR_TOP,
13737 + (FIXADDR_TOP - FIXADDR_START) >> 10,
13739 #ifdef CONFIG_HIGHMEM
13740 - PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
13741 - (LAST_PKMAP*PAGE_SIZE) >> 10,
13742 + PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
13743 + (LAST_PKMAP*PAGE_SIZE) >> 10,
13746 - VMALLOC_START, VMALLOC_END,
13747 - (VMALLOC_END - VMALLOC_START) >> 20,
13748 + VMALLOC_START, VMALLOC_END,
13749 + (VMALLOC_END - VMALLOC_START) >> 20,
13751 - (unsigned long)__va(0), (unsigned long)high_memory,
13752 - ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
13753 + (unsigned long)__va(0), (unsigned long)high_memory,
13754 + ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
13756 - (unsigned long)&__init_begin, (unsigned long)&__init_end,
13757 - ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10,
13758 + (unsigned long)&__init_begin, (unsigned long)&__init_end,
13759 + ((unsigned long)&__init_end -
13760 + (unsigned long)&__init_begin) >> 10,
13762 - (unsigned long)&_etext, (unsigned long)&_edata,
13763 - ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
13764 + (unsigned long)&_etext, (unsigned long)&_edata,
13765 + ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
13767 - (unsigned long)&_text, (unsigned long)&_etext,
13768 - ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
13769 + (unsigned long)&_text, (unsigned long)&_etext,
13770 + ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
13772 #ifdef CONFIG_HIGHMEM
13773 - BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
13774 - BUG_ON(VMALLOC_END > PKMAP_BASE);
13775 + BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
13776 + BUG_ON(VMALLOC_END > PKMAP_BASE);
13778 - BUG_ON(VMALLOC_START > VMALLOC_END);
13779 - BUG_ON((unsigned long)high_memory > VMALLOC_START);
13780 + BUG_ON(VMALLOC_START > VMALLOC_END);
13781 + BUG_ON((unsigned long)high_memory > VMALLOC_START);
13782 #endif /* double-sanity-check paranoia */
13784 -#ifdef CONFIG_X86_PAE
13785 - if (!cpu_has_pae)
13786 - panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
13788 if (boot_cpu_data.wp_works_ok < 0)
13794 * Subtle. SMP is doing it's boot stuff late (because it has to
13795 * fork idle threads) - but it also needs low mappings for the
13796 @@ -790,49 +734,35 @@ int arch_add_memory(int nid, u64 start,
13798 return __add_pages(zone, start_pfn, nr_pages);
13803 -struct kmem_cache *pmd_cache;
13805 -void __init pgtable_cache_init(void)
13807 - if (PTRS_PER_PMD > 1)
13808 - pmd_cache = kmem_cache_create("pmd",
13809 - PTRS_PER_PMD*sizeof(pmd_t),
13810 - PTRS_PER_PMD*sizeof(pmd_t),
13816 * This function cannot be __init, since exceptions don't work in that
13817 * section. Put this after the callers, so that it cannot be inlined.
13819 -static int noinline do_test_wp_bit(void)
13820 +static noinline int do_test_wp_bit(void)
13825 __asm__ __volatile__(
13827 - "1: movb %1,%0 \n"
13829 + " movb %0, %1 \n"
13830 + "1: movb %1, %0 \n"
13831 + " xorl %2, %2 \n"
13833 - ".section __ex_table,\"a\"\n"
13835 - " .long 1b,2b \n"
13837 + _ASM_EXTABLE(1b,2b)
13838 :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
13848 #ifdef CONFIG_DEBUG_RODATA
13849 +const int rodata_test_data = 0xC3;
13850 +EXPORT_SYMBOL_GPL(rodata_test_data);
13852 void mark_rodata_ro(void)
13854 @@ -845,32 +775,58 @@ void mark_rodata_ro(void)
13855 if (num_possible_cpus() <= 1)
13858 - change_page_attr(virt_to_page(start),
13859 - size >> PAGE_SHIFT, PAGE_KERNEL_RX);
13860 - printk("Write protecting the kernel text: %luk\n", size >> 10);
13861 + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
13862 + printk(KERN_INFO "Write protecting the kernel text: %luk\n",
13865 +#ifdef CONFIG_CPA_DEBUG
13866 + printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
13867 + start, start+size);
13868 + set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
13870 + printk(KERN_INFO "Testing CPA: write protecting again\n");
13871 + set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
13876 size = (unsigned long)__end_rodata - start;
13877 - change_page_attr(virt_to_page(start),
13878 - size >> PAGE_SHIFT, PAGE_KERNEL_RO);
13879 - printk("Write protecting the kernel read-only data: %luk\n",
13881 + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
13882 + printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
13886 +#ifdef CONFIG_CPA_DEBUG
13887 + printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
13888 + set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
13891 - * change_page_attr() requires a global_flush_tlb() call after it.
13892 - * We do this after the printk so that if something went wrong in the
13893 - * change, the printk gets out at least to give a better debug hint
13894 - * of who is the culprit.
13896 - global_flush_tlb();
13897 + printk(KERN_INFO "Testing CPA: write protecting again\n");
13898 + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
13903 void free_init_pages(char *what, unsigned long begin, unsigned long end)
13905 +#ifdef CONFIG_DEBUG_PAGEALLOC
13907 + * If debugging page accesses then do not free this memory but
13908 + * mark them not present - any buggy init-section access will
13909 + * create a kernel page fault:
13911 + printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
13912 + begin, PAGE_ALIGN(end));
13913 + set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
13915 unsigned long addr;
13918 + * We just marked the kernel text read only above, now that
13919 + * we are going to free part of that, we need to make that
13920 + * writeable first.
13922 + set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
13924 for (addr = begin; addr < end; addr += PAGE_SIZE) {
13925 ClearPageReserved(virt_to_page(addr));
13926 init_page_count(virt_to_page(addr));
13927 @@ -879,6 +835,7 @@ void free_init_pages(char *what, unsigne
13930 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
13934 void free_initmem(void)
13935 @@ -894,4 +851,3 @@ void free_initrd_mem(unsigned long start
13936 free_init_pages("initrd memory", start, end);
13940 --- sle11-2009-10-16.orig/arch/x86/mm/init_64-xen.c 2009-02-16 16:18:36.000000000 +0100
13941 +++ sle11-2009-10-16/arch/x86/mm/init_64-xen.c 2009-03-16 16:33:40.000000000 +0100
13942 @@ -46,14 +46,13 @@
13943 #include <asm/proto.h>
13944 #include <asm/smp.h>
13945 #include <asm/sections.h>
13946 +#include <asm/kdebug.h>
13947 +#include <asm/numa.h>
13948 +#include <asm/cacheflush.h>
13950 #include <xen/features.h>
13953 -#define Dprintk(x...)
13956 -const struct dma_mapping_ops* dma_ops;
13957 +const struct dma_mapping_ops *dma_ops;
13958 EXPORT_SYMBOL(dma_ops);
13960 #if CONFIG_XEN_COMPAT <= 0x030002
13961 @@ -80,7 +79,21 @@ extern pte_t level1_fixmap_pgt[PTRS_PER_
13962 (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) + \
13963 __START_KERNEL_map)))
13965 -static void __meminit early_make_page_readonly(void *va, unsigned int feature)
13966 +pmd_t *__init early_get_pmd(unsigned long va)
13968 + unsigned long addr;
13969 + unsigned long *page = (unsigned long *)init_level4_pgt;
13971 + addr = page[pgd_index(va)];
13972 + addr_to_page(addr, page);
13974 + addr = page[pud_index(va)];
13975 + addr_to_page(addr, page);
13977 + return (pmd_t *)&page[pmd_index(va)];
13980 +void __meminit early_make_page_readonly(void *va, unsigned int feature)
13982 unsigned long addr, _va = (unsigned long)va;
13984 @@ -107,76 +120,6 @@ static void __meminit early_make_page_re
13988 -static void __make_page_readonly(void *va)
13990 - pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
13991 - unsigned long addr = (unsigned long) va;
13993 - pgd = pgd_offset_k(addr);
13994 - pud = pud_offset(pgd, addr);
13995 - pmd = pmd_offset(pud, addr);
13996 - ptep = pte_offset_kernel(pmd, addr);
13998 - pte.pte = ptep->pte & ~_PAGE_RW;
13999 - if (HYPERVISOR_update_va_mapping(addr, pte, 0))
14000 - xen_l1_entry_update(ptep, pte); /* fallback */
14002 - if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
14003 - __make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT));
14006 -static void __make_page_writable(void *va)
14008 - pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
14009 - unsigned long addr = (unsigned long) va;
14011 - pgd = pgd_offset_k(addr);
14012 - pud = pud_offset(pgd, addr);
14013 - pmd = pmd_offset(pud, addr);
14014 - ptep = pte_offset_kernel(pmd, addr);
14016 - pte.pte = ptep->pte | _PAGE_RW;
14017 - if (HYPERVISOR_update_va_mapping(addr, pte, 0))
14018 - xen_l1_entry_update(ptep, pte); /* fallback */
14020 - if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
14021 - __make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT));
14024 -void make_page_readonly(void *va, unsigned int feature)
14026 - if (!xen_feature(feature))
14027 - __make_page_readonly(va);
14030 -void make_page_writable(void *va, unsigned int feature)
14032 - if (!xen_feature(feature))
14033 - __make_page_writable(va);
14036 -void make_pages_readonly(void *va, unsigned nr, unsigned int feature)
14038 - if (xen_feature(feature))
14041 - while (nr-- != 0) {
14042 - __make_page_readonly(va);
14043 - va = (void*)((unsigned long)va + PAGE_SIZE);
14047 -void make_pages_writable(void *va, unsigned nr, unsigned int feature)
14049 - if (xen_feature(feature))
14052 - while (nr-- != 0) {
14053 - __make_page_writable(va);
14054 - va = (void*)((unsigned long)va + PAGE_SIZE);
14059 * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
14060 * physical space so we can cache the place of the first one and move
14061 @@ -187,22 +130,26 @@ void show_mem(void)
14063 long i, total = 0, reserved = 0;
14064 long shared = 0, cached = 0;
14065 - pg_data_t *pgdat;
14067 + pg_data_t *pgdat;
14069 printk(KERN_INFO "Mem-info:\n");
14071 - printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
14072 + printk(KERN_INFO "Free swap: %6ldkB\n",
14073 + nr_swap_pages << (PAGE_SHIFT-10));
14075 for_each_online_pgdat(pgdat) {
14076 - for (i = 0; i < pgdat->node_spanned_pages; ++i) {
14077 - /* this loop can take a while with 256 GB and 4k pages
14078 - so update the NMI watchdog */
14079 - if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
14080 + for (i = 0; i < pgdat->node_spanned_pages; ++i) {
14082 + * This loop can take a while with 256 GB and
14083 + * 4k pages so defer the NMI watchdog:
14085 + if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
14086 touch_nmi_watchdog();
14089 if (!pfn_valid(pgdat->node_start_pfn + i))
14092 page = pfn_to_page(pgdat->node_start_pfn + i);
14094 if (PageReserved(page))
14095 @@ -211,58 +158,67 @@ void show_mem(void)
14097 else if (page_count(page))
14098 shared += page_count(page) - 1;
14102 - printk(KERN_INFO "%lu pages of RAM\n", total);
14103 - printk(KERN_INFO "%lu reserved pages\n",reserved);
14104 - printk(KERN_INFO "%lu pages shared\n",shared);
14105 - printk(KERN_INFO "%lu pages swap cached\n",cached);
14106 + printk(KERN_INFO "%lu pages of RAM\n", total);
14107 + printk(KERN_INFO "%lu reserved pages\n", reserved);
14108 + printk(KERN_INFO "%lu pages shared\n", shared);
14109 + printk(KERN_INFO "%lu pages swap cached\n", cached);
14112 +static unsigned long __meminitdata table_start;
14113 +static unsigned long __meminitdata table_end;
14115 static __init void *spp_getpage(void)
14121 - ptr = (void *) get_zeroed_page(GFP_ATOMIC);
14122 + ptr = (void *) get_zeroed_page(GFP_ATOMIC);
14123 else if (start_pfn < table_end) {
14124 ptr = __va(start_pfn << PAGE_SHIFT);
14126 memset(ptr, 0, PAGE_SIZE);
14128 ptr = alloc_bootmem_pages(PAGE_SIZE);
14129 - if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
14130 - panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
14132 - Dprintk("spp_getpage %p\n", ptr);
14133 + if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
14134 + panic("set_pte_phys: cannot allocate page data %s\n",
14135 + after_bootmem ? "after bootmem" : "");
14138 + pr_debug("spp_getpage %p\n", ptr);
14144 #define pgd_offset_u(address) (__user_pgd(init_level4_pgt) + pgd_index(address))
14145 #define pud_offset_u(address) (level3_user_pgt + pud_index(address))
14147 -static __init void set_pte_phys(unsigned long vaddr,
14148 - unsigned long phys, pgprot_t prot, int user_mode)
14149 +static __init void
14150 +set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot, int user_mode)
14155 pte_t *pte, new_pte;
14157 - Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
14158 + pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
14160 pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
14161 if (pgd_none(*pgd)) {
14162 - printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
14164 + "PGD FIXMAP MISSING, it should be setup in head.S!\n");
14167 pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
14168 if (pud_none(*pud)) {
14169 - pmd = (pmd_t *) spp_getpage();
14170 + pmd = (pmd_t *) spp_getpage();
14171 make_page_readonly(pmd, XENFEAT_writable_page_tables);
14172 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
14173 if (pmd != pmd_offset(pud, 0)) {
14174 - printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
14175 + printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
14176 + pmd, pmd_offset(pud, 0));
14180 @@ -272,7 +228,7 @@ static __init void set_pte_phys(unsigned
14181 make_page_readonly(pte, XENFEAT_writable_page_tables);
14182 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
14183 if (pte != pte_offset_kernel(pmd, 0)) {
14184 - printk("PAGETABLE BUG #02!\n");
14185 + printk(KERN_ERR "PAGETABLE BUG #02!\n");
14189 @@ -294,30 +250,30 @@ static __init void set_pte_phys(unsigned
14190 __flush_tlb_one(vaddr);
14193 -static __init void set_pte_phys_ma(unsigned long vaddr,
14194 - unsigned long phys, pgprot_t prot)
14195 +static __init void
14196 +set_pte_phys_ma(unsigned long vaddr, unsigned long phys, pgprot_t prot)
14201 pte_t *pte, new_pte;
14203 - Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
14204 + pr_debug("set_pte_phys_ma %lx to %lx\n", vaddr, phys);
14206 pgd = pgd_offset_k(vaddr);
14207 if (pgd_none(*pgd)) {
14208 - printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
14210 + "PGD FIXMAP MISSING, it should be setup in head.S!\n");
14213 pud = pud_offset(pgd, vaddr);
14214 if (pud_none(*pud)) {
14216 - pmd = (pmd_t *) spp_getpage();
14217 + pmd = (pmd_t *) spp_getpage();
14218 make_page_readonly(pmd, XENFEAT_writable_page_tables);
14219 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
14220 if (pmd != pmd_offset(pud, 0)) {
14221 - printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
14223 + printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
14224 + pmd, pmd_offset(pud, 0));
14227 pmd = pmd_offset(pud, vaddr);
14228 @@ -326,7 +282,7 @@ static __init void set_pte_phys_ma(unsig
14229 make_page_readonly(pte, XENFEAT_writable_page_tables);
14230 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
14231 if (pte != pte_offset_kernel(pmd, 0)) {
14232 - printk("PAGETABLE BUG #02!\n");
14233 + printk(KERN_ERR "PAGETABLE BUG #02!\n");
14237 @@ -350,14 +306,44 @@ static __init void set_pte_phys_ma(unsig
14238 __flush_tlb_one(vaddr);
14241 +#ifndef CONFIG_XEN
14243 + * The head.S code sets up the kernel high mapping:
14245 + * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
14247 + * phys_addr holds the negative offset to the kernel, which is added
14248 + * to the compile time generated pmds. This results in invalid pmds up
14249 + * to the point where we hit the physaddr 0 mapping.
14251 + * We limit the mappings to the region from _text to _end. _end is
14252 + * rounded up to the 2MB boundary. This catches the invalid pmds as
14253 + * well, as they are located before _text:
14255 +void __init cleanup_highmap(void)
14257 + unsigned long vaddr = __START_KERNEL_map;
14258 + unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1;
14259 + pmd_t *pmd = level2_kernel_pgt;
14260 + pmd_t *last_pmd = pmd + PTRS_PER_PMD;
14262 + for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
14263 + if (!pmd_present(*pmd))
14265 + if (vaddr < (unsigned long) _text || vaddr > end)
14266 + set_pmd(pmd, __pmd(0));
14271 /* NOTE: this is meant to be run only at boot */
14273 -__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
14275 +__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
14277 unsigned long address = __fix_to_virt(idx);
14279 if (idx >= __end_of_fixed_addresses) {
14280 - printk("Invalid __set_fixmap\n");
14281 + printk(KERN_ERR "Invalid __set_fixmap\n");
14285 @@ -375,16 +361,14 @@ __set_fixmap (enum fixed_addresses idx,
14289 -unsigned long __meminitdata table_start, table_end;
14291 static __meminit void *alloc_static_page(unsigned long *phys)
14293 unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
14295 if (after_bootmem) {
14296 void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
14303 @@ -396,7 +380,7 @@ static __meminit void *alloc_static_page
14305 #define PTE_SIZE PAGE_SIZE
14307 -static inline int make_readonly(unsigned long paddr)
14308 +static inline int __meminit make_readonly(unsigned long paddr)
14310 extern char __vsyscall_0;
14312 @@ -430,33 +414,38 @@ static inline int make_readonly(unsigned
14313 /* Must run before zap_low_mappings */
14314 __meminit void *early_ioremap(unsigned long addr, unsigned long size)
14316 - unsigned long vaddr;
14317 pmd_t *pmd, *last_pmd;
14318 + unsigned long vaddr;
14321 pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
14322 vaddr = __START_KERNEL_map;
14323 pmd = level2_kernel_pgt;
14324 last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
14326 for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
14327 for (i = 0; i < pmds; i++) {
14328 if (pmd_present(pmd[i]))
14330 + goto continue_outer_loop;
14332 vaddr += addr & ~PMD_MASK;
14335 for (i = 0; i < pmds; i++, addr += PMD_SIZE)
14336 - set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE));
14338 + set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
14339 + __flush_tlb_all();
14341 return (void *)vaddr;
14343 +continue_outer_loop:
14346 printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
14350 -/* To avoid virtual aliases later */
14352 + * To avoid virtual aliases later:
14354 __meminit void early_iounmap(void *addr, unsigned long size)
14356 unsigned long vaddr;
14357 @@ -466,9 +455,11 @@ __meminit void early_iounmap(void *addr,
14358 vaddr = (unsigned long)addr;
14359 pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
14360 pmd = level2_kernel_pgt + pmd_index(vaddr);
14362 for (i = 0; i < pmds; i++)
14363 pmd_clear(pmd + i);
14366 + __flush_tlb_all();
14370 @@ -517,18 +508,19 @@ phys_pmd_init(pmd_t *pmd_page, unsigned
14371 static void __meminit
14372 phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
14374 - pmd_t *pmd = pmd_offset(pud,0);
14375 + pmd_t *pmd = pmd_offset(pud, 0);
14376 spin_lock(&init_mm.page_table_lock);
14377 phys_pmd_init(pmd, address, end);
14378 spin_unlock(&init_mm.page_table_lock);
14382 -static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
14384 +static void __meminit
14385 +phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
14387 int i = pud_index(addr);
14389 - for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
14390 + for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
14391 unsigned long pmd_phys;
14392 pud_t *pud = pud_page + pud_index(addr);
14394 @@ -550,8 +542,8 @@ static void __meminit phys_pud_init(pud_
14396 early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
14400 + __flush_tlb_all();
14403 void __init xen_init_pt(void)
14405 @@ -632,6 +624,7 @@ void __init xen_init_pt(void)
14406 static void __init extend_init_mapping(unsigned long tables_space)
14408 unsigned long va = __START_KERNEL_map;
14409 + unsigned long start = start_pfn;
14410 unsigned long phys, addr, *pte_page;
14412 pte_t *pte, new_pte;
14413 @@ -682,6 +675,10 @@ static void __init extend_init_mapping(u
14418 + if (start_pfn > start)
14419 + reserve_early(start << PAGE_SHIFT,
14420 + start_pfn << PAGE_SHIFT, "INITMAP");
14423 static void __init find_early_table_space(unsigned long end)
14424 @@ -706,7 +703,7 @@ static void __init find_early_table_spac
14425 (table_start << PAGE_SHIFT) + tables);
14428 -static void xen_finish_init_mapping(void)
14429 +static void __init xen_finish_init_mapping(void)
14431 unsigned long i, start, end;
14433 @@ -738,13 +735,6 @@ static void xen_finish_init_mapping(void
14434 /* Allocate pte's for initial fixmaps from 'start_pfn' allocator. */
14438 - * Prefetch pte's for the bt_ioremap() area. It gets used before the
14439 - * boot-time allocator is online, so allocate-on-demand would fail.
14441 - for (i = FIX_BTMAP_END; i <= FIX_BTMAP_BEGIN; i++)
14442 - __set_fixmap(i, 0, __pgprot(0));
14444 /* Switch to the real shared_info page, and clear the dummy page. */
14445 set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
14446 HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
14447 @@ -764,20 +754,23 @@ static void xen_finish_init_mapping(void
14448 table_end = start_pfn;
14451 -/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
14452 - This runs before bootmem is initialized and gets pages directly from the
14453 - physical memory. To access them they are temporarily mapped. */
14455 + * Setup the direct mapping of the physical memory at PAGE_OFFSET.
14456 + * This runs before bootmem is initialized and gets pages directly from
14457 + * the physical memory. To access them they are temporarily mapped.
14459 void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
14462 unsigned long next;
14464 - Dprintk("init_memory_mapping\n");
14465 + pr_debug("init_memory_mapping\n");
14469 * Find space for the kernel direct mapping tables.
14470 - * Later we should allocate these tables in the local node of the memory
14471 - * mapped. Unfortunately this is done currently before the nodes are
14474 + * Later we should allocate these tables in the local node of the
14475 + * memory mapped. Unfortunately this is done currently before the
14476 + * nodes are discovered.
14478 if (!after_bootmem)
14479 find_early_table_space(end);
14480 @@ -786,8 +779,8 @@ void __init_refok init_memory_mapping(un
14481 end = (unsigned long)__va(end);
14483 for (; start < end; start = next) {
14484 - unsigned long pud_phys;
14485 pgd_t *pgd = pgd_offset_k(start);
14486 + unsigned long pud_phys;
14490 @@ -795,8 +788,8 @@ void __init_refok init_memory_mapping(un
14492 pud = alloc_static_page(&pud_phys);
14493 next = start + PGDIR_SIZE;
14498 phys_pud_init(pud, __pa(start), __pa(next));
14499 if (!after_bootmem) {
14500 early_make_page_readonly(pud, XENFEAT_writable_page_tables);
14501 @@ -810,12 +803,17 @@ void __init_refok init_memory_mapping(un
14506 + if (!after_bootmem)
14507 + reserve_early(table_start << PAGE_SHIFT,
14508 + table_end << PAGE_SHIFT, "PGTABLE");
14511 #ifndef CONFIG_NUMA
14512 void __init paging_init(void)
14514 unsigned long max_zone_pfns[MAX_NR_ZONES];
14516 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
14517 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
14518 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
14519 @@ -829,40 +827,6 @@ void __init paging_init(void)
14523 -/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
14524 - from the CPU leading to inconsistent cache lines. address and size
14525 - must be aligned to 2MB boundaries.
14526 - Does nothing when the mapping doesn't exist. */
14527 -void __init clear_kernel_mapping(unsigned long address, unsigned long size)
14529 - unsigned long end = address + size;
14531 - BUG_ON(address & ~LARGE_PAGE_MASK);
14532 - BUG_ON(size & ~LARGE_PAGE_MASK);
14534 - for (; address < end; address += LARGE_PAGE_SIZE) {
14535 - pgd_t *pgd = pgd_offset_k(address);
14538 - if (pgd_none(*pgd))
14540 - pud = pud_offset(pgd, address);
14541 - if (pud_none(*pud))
14543 - pmd = pmd_offset(pud, address);
14544 - if (!pmd || pmd_none(*pmd))
14546 - if (0 == (__pmd_val(*pmd) & _PAGE_PSE)) {
14547 - /* Could handle this, but it should not happen currently. */
14549 - "clear_kernel_mapping: mapping has been split. will leak memory\n");
14552 - set_pmd(pmd, __pmd(0));
14554 - __flush_tlb_all();
14558 * Memory hotplug specific functions
14560 @@ -888,16 +852,12 @@ int arch_add_memory(int nid, u64 start,
14561 unsigned long nr_pages = size >> PAGE_SHIFT;
14564 - init_memory_mapping(start, (start + size -1));
14565 + init_memory_mapping(start, start + size-1);
14567 ret = __add_pages(zone, start_pfn, nr_pages);
14574 - printk("%s: Problem encountered in __add_pages!\n", __func__);
14577 EXPORT_SYMBOL_GPL(arch_add_memory);
14579 @@ -911,36 +871,8 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to
14581 #endif /* CONFIG_MEMORY_HOTPLUG */
14583 -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
14585 - * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
14586 - * just online the pages.
14588 -int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
14591 - unsigned long pfn;
14592 - unsigned long total = 0, mem = 0;
14593 - for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
14594 - if (pfn_valid(pfn)) {
14595 - online_page(pfn_to_page(pfn));
14602 - z->spanned_pages += total;
14603 - z->present_pages += mem;
14604 - z->zone_pgdat->node_spanned_pages += total;
14605 - z->zone_pgdat->node_present_pages += mem;
14611 -static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
14613 +static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
14614 + kcore_modules, kcore_vsyscall;
14616 void __init mem_init(void)
14618 @@ -949,8 +881,7 @@ void __init mem_init(void)
14622 - /* clear the zero-page */
14623 - memset(empty_zero_page, 0, PAGE_SIZE);
14624 + /* clear_bss() already cleared the empty_zero_page */
14628 @@ -968,7 +899,6 @@ void __init mem_init(void)
14630 reservedpages = end_pfn - totalram_pages -
14631 absent_pages_in_range(0, end_pfn);
14635 codesize = (unsigned long) &_etext - (unsigned long) &_text;
14636 @@ -976,46 +906,64 @@ void __init mem_init(void)
14637 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
14639 /* Register memory areas for /proc/kcore */
14640 - kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
14641 - kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
14642 + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
14643 + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
14644 VMALLOC_END-VMALLOC_START);
14645 kclist_add(&kcore_kernel, &_stext, _end - _stext);
14646 kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
14647 - kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
14648 + kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
14649 VSYSCALL_END - VSYSCALL_START);
14651 - printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
14652 + printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
14653 + "%ldk reserved, %ldk data, %ldk init)\n",
14654 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
14655 end_pfn << (PAGE_SHIFT-10),
14657 reservedpages << (PAGE_SHIFT-10),
14664 void free_init_pages(char *what, unsigned long begin, unsigned long end)
14666 - unsigned long addr;
14667 + unsigned long addr = begin;
14669 - if (begin >= end)
14674 + * If debugging page accesses then do not free this memory but
14675 + * mark them not present - any buggy init-section access will
14676 + * create a kernel page fault:
14678 +#ifdef CONFIG_DEBUG_PAGEALLOC
14679 + printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
14680 + begin, PAGE_ALIGN(end));
14681 + set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
14683 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
14684 - for (addr = begin; addr < end; addr += PAGE_SIZE) {
14686 + for (; addr < end; addr += PAGE_SIZE) {
14687 ClearPageReserved(virt_to_page(addr));
14688 init_page_count(virt_to_page(addr));
14689 memset((void *)(addr & ~(PAGE_SIZE-1)),
14690 POISON_FREE_INITMEM, PAGE_SIZE);
14691 if (addr >= __START_KERNEL_map) {
14692 /* make_readonly() reports all kernel addresses. */
14693 - __make_page_writable(__va(__pa(addr)));
14694 - change_page_attr_addr(addr, 1, __pgprot(0));
14695 + if (HYPERVISOR_update_va_mapping((unsigned long)__va(__pa(addr)),
14696 + pfn_pte(__pa(addr) >> PAGE_SHIFT,
14700 + if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
14706 - if (addr > __START_KERNEL_map)
14707 - global_flush_tlb();
14711 void free_initmem(void)
14712 @@ -1026,6 +974,8 @@ void free_initmem(void)
14715 #ifdef CONFIG_DEBUG_RODATA
14716 +const int rodata_test_data = 0xC3;
14717 +EXPORT_SYMBOL_GPL(rodata_test_data);
14719 void mark_rodata_ro(void)
14721 @@ -1047,18 +997,27 @@ void mark_rodata_ro(void)
14725 - change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO);
14727 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
14728 (end - start) >> 10);
14729 + set_memory_ro(start, (end - start) >> PAGE_SHIFT);
14732 - * change_page_attr_addr() requires a global_flush_tlb() call after it.
14733 - * We do this after the printk so that if something went wrong in the
14734 - * change, the printk gets out at least to give a better debug hint
14735 - * of who is the culprit.
14736 + * The rodata section (but not the kernel text!) should also be
14737 + * not-executable.
14739 - global_flush_tlb();
14740 + start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
14741 + set_memory_nx(start, (end - start) >> PAGE_SHIFT);
14745 +#ifdef CONFIG_CPA_DEBUG
14746 + printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
14747 + set_memory_rw(start, (end-start) >> PAGE_SHIFT);
14749 + printk(KERN_INFO "Testing CPA: again\n");
14750 + set_memory_ro(start, (end-start) >> PAGE_SHIFT);
14755 @@ -1069,17 +1028,21 @@ void free_initrd_mem(unsigned long start
14759 -void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
14761 +void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
14764 int nid = phys_to_nid(phys);
14766 unsigned long pfn = phys >> PAGE_SHIFT;
14768 if (pfn >= end_pfn) {
14769 - /* This can happen with kdump kernels when accessing firmware
14772 + * This can happen with kdump kernels when accessing
14773 + * firmware tables:
14775 if (pfn < end_pfn_map)
14778 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
14781 @@ -1087,9 +1050,9 @@ void __init reserve_bootmem_generic(unsi
14783 /* Should check here against the e820 map to avoid double free */
14785 - reserve_bootmem_node(NODE_DATA(nid), phys, len);
14787 - reserve_bootmem(phys, len);
14788 + reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
14790 + reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
14793 if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
14794 @@ -1099,46 +1062,49 @@ void __init reserve_bootmem_generic(unsi
14798 -int kern_addr_valid(unsigned long addr)
14800 +int kern_addr_valid(unsigned long addr)
14802 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
14812 if (above != 0 && above != -1UL)
14817 pgd = pgd_offset_k(addr);
14818 if (pgd_none(*pgd))
14821 pud = pud_offset(pgd, addr);
14822 if (pud_none(*pud))
14826 pmd = pmd_offset(pud, addr);
14827 if (pmd_none(*pmd))
14830 if (pmd_large(*pmd))
14831 return pfn_valid(pmd_pfn(*pmd));
14833 pte = pte_offset_kernel(pmd, addr);
14834 if (pte_none(*pte))
14837 return pfn_valid(pte_pfn(*pte));
14840 -/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
14841 - covers the 64bit vsyscall page now. 32bit has a real VMA now and does
14842 - not need special handling anymore. */
14845 + * A pseudo VMA to allow ptrace access for the vsyscall page. This only
14846 + * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
14847 + * not need special handling anymore:
14849 static struct vm_area_struct gate_vma = {
14850 - .vm_start = VSYSCALL_START,
14851 - .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
14852 - .vm_page_prot = PAGE_READONLY_EXEC,
14853 - .vm_flags = VM_READ | VM_EXEC
14854 + .vm_start = VSYSCALL_START,
14855 + .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
14856 + .vm_page_prot = PAGE_READONLY_EXEC,
14857 + .vm_flags = VM_READ | VM_EXEC
14860 struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
14861 @@ -1153,14 +1119,17 @@ struct vm_area_struct *get_gate_vma(stru
14862 int in_gate_area(struct task_struct *task, unsigned long addr)
14864 struct vm_area_struct *vma = get_gate_vma(task);
14869 return (addr >= vma->vm_start) && (addr < vma->vm_end);
14872 -/* Use this when you have no reliable task/vma, typically from interrupt
14873 - * context. It is less reliable than using the task's vma and may give
14874 - * false positives.
14876 + * Use this when you have no reliable task/vma, typically from interrupt
14877 + * context. It is less reliable than using the task's vma and may give
14878 + * false positives:
14880 int in_gate_area_no_task(unsigned long addr)
14882 @@ -1180,8 +1149,8 @@ const char *arch_vma_name(struct vm_area
14884 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
14886 -int __meminit vmemmap_populate(struct page *start_page,
14887 - unsigned long size, int node)
14889 +vmemmap_populate(struct page *start_page, unsigned long size, int node)
14891 unsigned long addr = (unsigned long)start_page;
14892 unsigned long end = (unsigned long)(start_page + size);
14893 @@ -1196,6 +1165,7 @@ int __meminit vmemmap_populate(struct pa
14894 pgd = vmemmap_pgd_populate(addr, node);
14898 pud = vmemmap_pud_populate(pgd, addr, node);
14901 @@ -1203,20 +1173,22 @@ int __meminit vmemmap_populate(struct pa
14902 pmd = pmd_offset(pud, addr);
14903 if (pmd_none(*pmd)) {
14905 - void *p = vmemmap_alloc_block(PMD_SIZE, node);
14908 + p = vmemmap_alloc_block(PMD_SIZE, node);
14912 - entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
14913 - mk_pte_huge(entry);
14914 - set_pmd(pmd, __pmd(pte_val(entry)));
14915 + entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
14916 + PAGE_KERNEL_LARGE);
14917 + set_pmd(pmd, __pmd_ma(__pte_val(entry)));
14919 printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
14920 addr, addr + PMD_SIZE - 1, p, node);
14923 vmemmap_verify((pte_t *)pmd, node, addr, next);
14930 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
14931 +++ sle11-2009-10-16/arch/x86/mm/ioremap-xen.c 2009-03-16 16:33:40.000000000 +0100
14934 + * Re-map IO memory to kernel address space so that we can access it.
14935 + * This is needed for high PCI addresses that aren't mapped in the
14936 + * 640k-1MB IO memory area on PCs
14938 + * (C) Copyright 1995 1996 Linus Torvalds
14941 +#include <linux/bootmem.h>
14942 +#include <linux/init.h>
14943 +#include <linux/io.h>
14944 +#include <linux/module.h>
14945 +#include <linux/pfn.h>
14946 +#include <linux/slab.h>
14947 +#include <linux/vmalloc.h>
14949 +#include <asm/cacheflush.h>
14950 +#include <asm/e820.h>
14951 +#include <asm/fixmap.h>
14952 +#include <asm/pgtable.h>
14953 +#include <asm/tlbflush.h>
14954 +#include <asm/pgalloc.h>
14956 +enum ioremap_mode {
14957 + IOR_MODE_UNCACHED,
14961 +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
14963 +unsigned long __phys_addr(unsigned long x)
14965 + if (x >= __START_KERNEL_map)
14966 + return x - __START_KERNEL_map + phys_base;
14967 + return x - PAGE_OFFSET;
14969 +EXPORT_SYMBOL(__phys_addr);
14973 +static int direct_remap_area_pte_fn(pte_t *pte,
14974 + struct page *pmd_page,
14975 + unsigned long address,
14978 + mmu_update_t **v = (mmu_update_t **)data;
14980 + BUG_ON(!pte_none(*pte));
14982 + (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) <<
14983 + PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
14989 +static int __direct_remap_pfn_range(struct mm_struct *mm,
14990 + unsigned long address,
14991 + unsigned long mfn,
14992 + unsigned long size,
14997 + unsigned long i, start_address;
14998 + mmu_update_t *u, *v, *w;
15000 + u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
15004 + start_address = address;
15006 + flush_cache_all();
15008 + for (i = 0; i < size; i += PAGE_SIZE) {
15009 + if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) {
15010 + /* Flush a full batch after filling in the PTE ptrs. */
15011 + rc = apply_to_page_range(mm, start_address,
15012 + address - start_address,
15013 + direct_remap_area_pte_fn, &w);
15017 + if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)
15020 + start_address = address;
15024 + * Fill in the machine address: PTE ptr is done later by
15025 + * apply_to_page_range().
15027 + v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO;
15030 + address += PAGE_SIZE;
15035 + /* Final batch. */
15036 + rc = apply_to_page_range(mm, start_address,
15037 + address - start_address,
15038 + direct_remap_area_pte_fn, &w);
15042 + if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0))
15051 + free_page((unsigned long)u);
15056 +int direct_remap_pfn_range(struct vm_area_struct *vma,
15057 + unsigned long address,
15058 + unsigned long mfn,
15059 + unsigned long size,
15063 + if (xen_feature(XENFEAT_auto_translated_physmap))
15064 + return remap_pfn_range(vma, address, mfn, size, prot);
15066 + if (domid == DOMID_SELF)
15069 + vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
15071 + vma->vm_mm->context.has_foreign_mappings = 1;
15073 + return __direct_remap_pfn_range(
15074 + vma->vm_mm, address, mfn, size, prot, domid);
15076 +EXPORT_SYMBOL(direct_remap_pfn_range);
15078 +int direct_kernel_remap_pfn_range(unsigned long address,
15079 + unsigned long mfn,
15080 + unsigned long size,
15084 + return __direct_remap_pfn_range(
15085 + &init_mm, address, mfn, size, prot, domid);
15087 +EXPORT_SYMBOL(direct_kernel_remap_pfn_range);
15089 +static int lookup_pte_fn(
15090 + pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
15092 + uint64_t *ptep = (uint64_t *)data;
15094 + *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) <<
15095 + PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
15099 +int create_lookup_pte_addr(struct mm_struct *mm,
15100 + unsigned long address,
15103 + return apply_to_page_range(mm, address, PAGE_SIZE,
15104 + lookup_pte_fn, ptep);
15107 +EXPORT_SYMBOL(create_lookup_pte_addr);
15109 +static int noop_fn(
15110 + pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
15115 +int touch_pte_range(struct mm_struct *mm,
15116 + unsigned long address,
15117 + unsigned long size)
15119 + return apply_to_page_range(mm, address, size, noop_fn, NULL);
15122 +EXPORT_SYMBOL(touch_pte_range);
15124 +#ifdef CONFIG_X86_32
15125 +int page_is_ram(unsigned long pagenr)
15127 + unsigned long addr, end;
15130 +#ifndef CONFIG_XEN
15132 + * A special case is the first 4Kb of memory;
15133 + * This is a BIOS owned area, not kernel ram, but generally
15134 + * not listed as such in the E820 table.
15140 + * Second special case: Some BIOSen report the PC BIOS
15141 + * area (640->1Mb) as ram even though it is not.
15143 + if (pagenr >= (BIOS_BEGIN >> PAGE_SHIFT) &&
15144 + pagenr < (BIOS_END >> PAGE_SHIFT))
15148 + for (i = 0; i < e820.nr_map; i++) {
15150 + * Not usable memory:
15152 + if (e820.map[i].type != E820_RAM)
15154 + addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT;
15155 + end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT;
15158 + if ((pagenr >= addr) && (pagenr < end))
15166 + * Fix up the linear direct mapping of the kernel to avoid cache attribute
15169 +static int ioremap_change_attr(unsigned long vaddr, unsigned long size,
15170 + enum ioremap_mode mode)
15172 + unsigned long nrpages = size >> PAGE_SHIFT;
15176 + case IOR_MODE_UNCACHED:
15178 + err = set_memory_uc(vaddr, nrpages);
15180 + case IOR_MODE_CACHED:
15181 + err = set_memory_wb(vaddr, nrpages);
15189 + * Remap an arbitrary physical address space into the kernel virtual
15190 + * address space. Needed when the kernel wants to access high addresses
15193 + * NOTE! We need to allow non-page-aligned mappings too: we will obviously
15194 + * have to convert them into an offset in a page-aligned mapping, but the
15195 + * caller shouldn't need to know that small detail.
15197 +static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size,
15198 + enum ioremap_mode mode)
15200 + unsigned long mfn, offset, last_addr, vaddr;
15201 + struct vm_struct *area;
15203 + domid_t domid = DOMID_IO;
15205 + /* Don't allow wraparound or zero size */
15206 + last_addr = phys_addr + size - 1;
15207 + if (!size || last_addr < phys_addr)
15211 + * Don't remap the low PCI/ISA area, it's always mapped..
15213 + if (is_initial_xendomain() && last_addr < ISA_END_ADDRESS)
15214 + return (__force void __iomem *)isa_bus_to_virt((unsigned long)phys_addr);
15217 + * Don't allow anybody to remap normal RAM that we're using..
15219 + for (mfn = PFN_DOWN(phys_addr); mfn < PFN_UP(last_addr); mfn++) {
15220 + unsigned long pfn = mfn_to_local_pfn(mfn);
15222 + if (pfn >= max_pfn)
15225 + domid = DOMID_SELF;
15227 + if (pfn >= max_pfn_mapped) /* bogus */
15230 + if (pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
15235 + case IOR_MODE_UNCACHED:
15238 + * FIXME: we will use UC MINUS for now, as video fb drivers
15239 + * depend on it. Upcoming ioremap_wc() will fix this behavior.
15241 + prot = PAGE_KERNEL_UC_MINUS;
15243 + case IOR_MODE_CACHED:
15244 + prot = PAGE_KERNEL;
15249 + * Mappings have to be page-aligned
15251 + offset = phys_addr & ~PAGE_MASK;
15252 + phys_addr &= PAGE_MASK;
15253 + size = PAGE_ALIGN(last_addr+1) - phys_addr;
15256 + * Ok, go for it..
15258 + area = get_vm_area(size, VM_IOREMAP | (mode << 20));
15261 + area->phys_addr = phys_addr;
15262 + vaddr = (unsigned long) area->addr;
15263 + if (__direct_remap_pfn_range(&init_mm, vaddr, PFN_DOWN(phys_addr),
15264 + size, prot, domid)) {
15265 + free_vm_area(area);
15269 + if (ioremap_change_attr(vaddr, size, mode) < 0) {
15270 + iounmap((void __iomem *) vaddr);
15274 + return (void __iomem *) (vaddr + offset);
15278 + * ioremap_nocache - map bus memory into CPU space
15279 + * @offset: bus address of the memory
15280 + * @size: size of the resource to map
15282 + * ioremap_nocache performs a platform specific sequence of operations to
15283 + * make bus memory CPU accessible via the readb/readw/readl/writeb/
15284 + * writew/writel functions and the other mmio helpers. The returned
15285 + * address is not guaranteed to be usable directly as a virtual
15288 + * This version of ioremap ensures that the memory is marked uncachable
15289 + * on the CPU as well as honouring existing caching rules from things like
15290 + * the PCI bus. Note that there are other caches and buffers on many
15291 + * busses. In particular driver authors should read up on PCI writes
15293 + * It's useful if some control registers are in such an area and
15294 + * write combining or read caching is not desirable:
15296 + * Must be freed with iounmap.
15298 +void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
15300 + return __ioremap(phys_addr, size, IOR_MODE_UNCACHED);
15302 +EXPORT_SYMBOL(ioremap_nocache);
15304 +void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
15306 + return __ioremap(phys_addr, size, IOR_MODE_CACHED);
15308 +EXPORT_SYMBOL(ioremap_cache);
15311 + * iounmap - Free an IO remapping
15312 + * @addr: virtual address from ioremap_*
15314 + * Caller must ensure there is only one unmapping for the same pointer.
15316 +void iounmap(volatile void __iomem *addr)
15318 + struct vm_struct *p, *o;
15320 + if ((void __force *)addr <= high_memory)
15324 + * __ioremap special-cases the PCI/ISA range by not instantiating a
15325 + * vm_area and by simply returning an address into the kernel mapping
15326 + * of ISA space. So handle that here.
15328 + if ((unsigned long)addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
15331 + addr = (volatile void __iomem *)
15332 + (PAGE_MASK & (unsigned long __force)addr);
15334 + /* Use the vm area unlocked, assuming the caller
15335 + ensures there isn't another iounmap for the same address
15336 + in parallel. Reuse of the virtual address is prevented by
15337 + leaving it in the global lists until we're done with it.
15338 + cpa takes care of the direct mappings. */
15339 + read_lock(&vmlist_lock);
15340 + for (p = vmlist; p; p = p->next) {
15341 + if (p->addr == addr)
15344 + read_unlock(&vmlist_lock);
15347 + printk(KERN_ERR "iounmap: bad address %p\n", addr);
15352 + if ((p->flags >> 20) != IOR_MODE_CACHED) {
15353 + unsigned long n = get_vm_area_size(p) >> PAGE_SHIFT;
15354 + unsigned long mfn = p->phys_addr;
15355 + unsigned long va = (unsigned long)addr;
15357 + for (; n > 0; n--, mfn++, va += PAGE_SIZE)
15358 + if (mfn_to_local_pfn(mfn) < max_pfn)
15359 + set_memory_wb(va, 1);
15362 + /* Finally remove it */
15363 + o = remove_vm_area((void *)addr);
15364 + BUG_ON(p != o || o == NULL);
15367 +EXPORT_SYMBOL(iounmap);
15369 +int __initdata early_ioremap_debug;
15371 +static int __init early_ioremap_debug_setup(char *str)
15373 + early_ioremap_debug = 1;
15377 +early_param("early_ioremap_debug", early_ioremap_debug_setup);
15379 +static __initdata int after_paging_init;
15380 +static __initdata pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
15381 + __attribute__((aligned(PAGE_SIZE)));
15383 +#ifdef CONFIG_X86_32
15384 +static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
15386 + /* Don't assume we're using swapper_pg_dir at this point */
15387 + pgd_t *base = __va(read_cr3());
15388 + pgd_t *pgd = &base[pgd_index(addr)];
15389 + pud_t *pud = pud_offset(pgd, addr);
15390 + pmd_t *pmd = pmd_offset(pud, addr);
15395 +#define early_ioremap_pmd early_get_pmd
15396 +#define make_lowmem_page_readonly early_make_page_readonly
15397 +#define make_lowmem_page_writable make_page_writable
15400 +static inline pte_t * __init early_ioremap_pte(unsigned long addr)
15402 + return &bm_pte[pte_index(addr)];
15405 +void __init early_ioremap_init(void)
15409 + if (early_ioremap_debug)
15410 + printk(KERN_INFO "early_ioremap_init()\n");
15412 + pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
15413 + memset(bm_pte, 0, sizeof(bm_pte));
15414 + make_lowmem_page_readonly(bm_pte, XENFEAT_writable_page_tables);
15415 + pmd_populate_kernel(&init_mm, pmd, bm_pte);
15418 + * The boot-ioremap range spans multiple pmds, for which
15419 + * we are not prepared:
15421 + if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) {
15423 + printk(KERN_WARNING "pmd %p != %p\n",
15424 + pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END)));
15425 + printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
15426 + fix_to_virt(FIX_BTMAP_BEGIN));
15427 + printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END): %08lx\n",
15428 + fix_to_virt(FIX_BTMAP_END));
15430 + printk(KERN_WARNING "FIX_BTMAP_END: %d\n", FIX_BTMAP_END);
15431 + printk(KERN_WARNING "FIX_BTMAP_BEGIN: %d\n",
15432 + FIX_BTMAP_BEGIN);
15436 +#ifdef CONFIG_X86_32
15437 +void __init early_ioremap_clear(void)
15441 + if (early_ioremap_debug)
15442 + printk(KERN_INFO "early_ioremap_clear()\n");
15444 + pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
15446 + make_lowmem_page_writable(bm_pte, XENFEAT_writable_page_tables);
15447 + /* paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT); */
15448 + __flush_tlb_all();
15451 +void __init early_ioremap_reset(void)
15453 + enum fixed_addresses idx;
15454 + unsigned long addr, phys;
15457 + after_paging_init = 1;
15458 + for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) {
15459 + addr = fix_to_virt(idx);
15460 + pte = early_ioremap_pte(addr);
15461 + if (pte_present(*pte)) {
15462 + phys = __pte_val(*pte) & PAGE_MASK;
15463 + set_fixmap(idx, phys);
15467 +#endif /* CONFIG_X86_32 */
15469 +static void __init __early_set_fixmap(enum fixed_addresses idx,
15470 + unsigned long phys, pgprot_t flags)
15472 + unsigned long addr = __fix_to_virt(idx);
15475 + if (idx >= __end_of_fixed_addresses) {
15479 + pte = early_ioremap_pte(addr);
15480 + if (pgprot_val(flags))
15481 + set_pte(pte, pfn_pte_ma(phys >> PAGE_SHIFT, flags));
15483 + pte_clear(NULL, addr, pte);
15484 + __flush_tlb_one(addr);
15487 +static inline void __init early_set_fixmap(enum fixed_addresses idx,
15488 + unsigned long phys)
15490 + if (after_paging_init)
15491 + set_fixmap(idx, phys);
15493 + __early_set_fixmap(idx, phys, PAGE_KERNEL);
15496 +static inline void __init early_clear_fixmap(enum fixed_addresses idx)
15498 + if (after_paging_init)
15499 + clear_fixmap(idx);
15501 + __early_set_fixmap(idx, 0, __pgprot(0));
15505 +int __initdata early_ioremap_nested;
15507 +static int __init check_early_ioremap_leak(void)
15509 + if (!early_ioremap_nested)
15512 + printk(KERN_WARNING
15513 + "Debug warning: early ioremap leak of %d areas detected.\n",
15514 + early_ioremap_nested);
15515 + printk(KERN_WARNING
15516 + "please boot with early_ioremap_debug and report the dmesg.\n");
15521 +late_initcall(check_early_ioremap_leak);
15523 +void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
15525 + unsigned long offset, last_addr;
15526 + unsigned int nrpages, nesting;
15527 + enum fixed_addresses idx0, idx;
15529 + WARN_ON(system_state != SYSTEM_BOOTING);
15531 + nesting = early_ioremap_nested;
15532 + if (early_ioremap_debug) {
15533 + printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ",
15534 + phys_addr, size, nesting);
15538 + /* Don't allow wraparound or zero size */
15539 + last_addr = phys_addr + size - 1;
15540 + if (!size || last_addr < phys_addr) {
15545 + if (nesting >= FIX_BTMAPS_NESTING) {
15549 + early_ioremap_nested++;
15551 + * Mappings have to be page-aligned
15553 + offset = phys_addr & ~PAGE_MASK;
15554 + phys_addr &= PAGE_MASK;
15555 + size = PAGE_ALIGN(last_addr) - phys_addr;
15558 + * Mappings have to fit in the FIX_BTMAP area.
15560 + nrpages = size >> PAGE_SHIFT;
15561 + if (nrpages > NR_FIX_BTMAPS) {
15567 + * Ok, go for it..
15569 + idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
15571 + while (nrpages > 0) {
15572 + early_set_fixmap(idx, phys_addr);
15573 + phys_addr += PAGE_SIZE;
15577 + if (early_ioremap_debug)
15578 + printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
15580 + return (void *) (offset + fix_to_virt(idx0));
15583 +void __init early_iounmap(void *addr, unsigned long size)
15585 + unsigned long virt_addr;
15586 + unsigned long offset;
15587 + unsigned int nrpages;
15588 + enum fixed_addresses idx;
15589 + unsigned int nesting;
15591 + nesting = --early_ioremap_nested;
15592 + WARN_ON(nesting < 0);
15594 + if (early_ioremap_debug) {
15595 + printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
15600 + virt_addr = (unsigned long)addr;
15601 + if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) {
15605 + offset = virt_addr & ~PAGE_MASK;
15606 + nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
15608 + idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
15609 + while (nrpages > 0) {
15610 + early_clear_fixmap(idx);
15616 +void __this_fixmap_does_not_exist(void)
15620 --- sle11-2009-10-16.orig/arch/x86/mm/ioremap_32-xen.c 2009-02-16 16:17:21.000000000 +0100
15621 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
15624 - * arch/i386/mm/ioremap.c
15626 - * Re-map IO memory to kernel address space so that we can access it.
15627 - * This is needed for high PCI addresses that aren't mapped in the
15628 - * 640k-1MB IO memory area on PC's
15630 - * (C) Copyright 1995 1996 Linus Torvalds
15633 -#include <linux/vmalloc.h>
15634 -#include <linux/init.h>
15635 -#include <linux/slab.h>
15636 -#include <linux/module.h>
15637 -#include <linux/io.h>
15638 -#include <linux/sched.h>
15639 -#include <asm/fixmap.h>
15640 -#include <asm/cacheflush.h>
15641 -#include <asm/tlbflush.h>
15642 -#include <asm/pgtable.h>
15643 -#include <asm/pgalloc.h>
15645 -#define ISA_START_ADDRESS 0x0
15646 -#define ISA_END_ADDRESS 0x100000
15648 -static int direct_remap_area_pte_fn(pte_t *pte,
15649 - struct page *pmd_page,
15650 - unsigned long address,
15653 - mmu_update_t **v = (mmu_update_t **)data;
15655 - BUG_ON(!pte_none(*pte));
15657 - (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) <<
15658 - PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
15664 -static int __direct_remap_pfn_range(struct mm_struct *mm,
15665 - unsigned long address,
15666 - unsigned long mfn,
15667 - unsigned long size,
15672 - unsigned long i, start_address;
15673 - mmu_update_t *u, *v, *w;
15675 - u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
15679 - start_address = address;
15681 - flush_cache_all();
15683 - for (i = 0; i < size; i += PAGE_SIZE) {
15684 - if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) {
15685 - /* Flush a full batch after filling in the PTE ptrs. */
15686 - rc = apply_to_page_range(mm, start_address,
15687 - address - start_address,
15688 - direct_remap_area_pte_fn, &w);
15692 - if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)
15695 - start_address = address;
15699 - * Fill in the machine address: PTE ptr is done later by
15700 - * apply_to_page_range().
15702 - v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO;
15705 - address += PAGE_SIZE;
15710 - /* Final batch. */
15711 - rc = apply_to_page_range(mm, start_address,
15712 - address - start_address,
15713 - direct_remap_area_pte_fn, &w);
15717 - if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0))
15726 - free_page((unsigned long)u);
15731 -int direct_remap_pfn_range(struct vm_area_struct *vma,
15732 - unsigned long address,
15733 - unsigned long mfn,
15734 - unsigned long size,
15738 - if (xen_feature(XENFEAT_auto_translated_physmap))
15739 - return remap_pfn_range(vma, address, mfn, size, prot);
15741 - if (domid == DOMID_SELF)
15744 - vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
15746 - vma->vm_mm->context.has_foreign_mappings = 1;
15748 - return __direct_remap_pfn_range(
15749 - vma->vm_mm, address, mfn, size, prot, domid);
15751 -EXPORT_SYMBOL(direct_remap_pfn_range);
15753 -int direct_kernel_remap_pfn_range(unsigned long address,
15754 - unsigned long mfn,
15755 - unsigned long size,
15759 - return __direct_remap_pfn_range(
15760 - &init_mm, address, mfn, size, prot, domid);
15762 -EXPORT_SYMBOL(direct_kernel_remap_pfn_range);
15764 -static int lookup_pte_fn(
15765 - pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
15767 - uint64_t *ptep = (uint64_t *)data;
15769 - *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) <<
15770 - PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
15774 -int create_lookup_pte_addr(struct mm_struct *mm,
15775 - unsigned long address,
15778 - return apply_to_page_range(mm, address, PAGE_SIZE,
15779 - lookup_pte_fn, ptep);
15782 -EXPORT_SYMBOL(create_lookup_pte_addr);
15784 -static int noop_fn(
15785 - pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
15790 -int touch_pte_range(struct mm_struct *mm,
15791 - unsigned long address,
15792 - unsigned long size)
15794 - return apply_to_page_range(mm, address, size, noop_fn, NULL);
15797 -EXPORT_SYMBOL(touch_pte_range);
15800 - * Does @address reside within a non-highmem page that is local to this virtual
15801 - * machine (i.e., not an I/O page, nor a memory page belonging to another VM).
15802 - * See the comment that accompanies mfn_to_local_pfn() in page.h to understand
15803 - * why this works.
15805 -static inline int is_local_lowmem(unsigned long address)
15807 - extern unsigned long max_low_pfn;
15808 - return (mfn_to_local_pfn(address >> PAGE_SHIFT) < max_low_pfn);
15812 - * Generic mapping function (not visible outside):
15816 - * Remap an arbitrary physical address space into the kernel virtual
15817 - * address space. Needed when the kernel wants to access high addresses
15820 - * NOTE! We need to allow non-page-aligned mappings too: we will obviously
15821 - * have to convert them into an offset in a page-aligned mapping, but the
15822 - * caller shouldn't need to know that small detail.
15824 -void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
15826 - void __iomem * addr;
15827 - struct vm_struct * area;
15828 - unsigned long offset, last_addr;
15830 - domid_t domid = DOMID_IO;
15832 - /* Don't allow wraparound or zero size */
15833 - last_addr = phys_addr + size - 1;
15834 - if (!size || last_addr < phys_addr)
15838 - * Don't remap the low PCI/ISA area, it's always mapped..
15840 - if (is_initial_xendomain() &&
15841 - phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
15842 - return (void __iomem *) isa_bus_to_virt(phys_addr);
15845 - * Don't allow anybody to remap normal RAM that we're using..
15847 - if (is_local_lowmem(phys_addr)) {
15848 - char *t_addr, *t_end;
15849 - struct page *page;
15851 - t_addr = bus_to_virt(phys_addr);
15852 - t_end = t_addr + (size - 1);
15854 - for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
15855 - if(!PageReserved(page))
15858 - domid = DOMID_SELF;
15861 - prot = __pgprot(_KERNPG_TABLE | flags);
15864 - * Mappings have to be page-aligned
15866 - offset = phys_addr & ~PAGE_MASK;
15867 - phys_addr &= PAGE_MASK;
15868 - size = PAGE_ALIGN(last_addr+1) - phys_addr;
15871 - * Ok, go for it..
15873 - area = get_vm_area(size, VM_IOREMAP | (flags << 20));
15876 - area->phys_addr = phys_addr;
15877 - addr = (void __iomem *) area->addr;
15878 - if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr,
15879 - phys_addr>>PAGE_SHIFT,
15880 - size, prot, domid)) {
15881 - vunmap((void __force *) addr);
15884 - return (void __iomem *) (offset + (char __iomem *)addr);
15886 -EXPORT_SYMBOL(__ioremap);
15889 - * ioremap_nocache - map bus memory into CPU space
15890 - * @offset: bus address of the memory
15891 - * @size: size of the resource to map
15893 - * ioremap_nocache performs a platform specific sequence of operations to
15894 - * make bus memory CPU accessible via the readb/readw/readl/writeb/
15895 - * writew/writel functions and the other mmio helpers. The returned
15896 - * address is not guaranteed to be usable directly as a virtual
15899 - * This version of ioremap ensures that the memory is marked uncachable
15900 - * on the CPU as well as honouring existing caching rules from things like
15901 - * the PCI bus. Note that there are other caches and buffers on many
15902 - * busses. In particular driver authors should read up on PCI writes
15904 - * It's useful if some control registers are in such an area and
15905 - * write combining or read caching is not desirable:
15907 - * Must be freed with iounmap.
15910 -void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
15912 - unsigned long last_addr;
15913 - void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
15917 - /* Guaranteed to be > phys_addr, as per __ioremap() */
15918 - last_addr = phys_addr + size - 1;
15920 - if (is_local_lowmem(last_addr)) {
15921 - struct page *ppage = virt_to_page(bus_to_virt(phys_addr));
15922 - unsigned long npages;
15924 - phys_addr &= PAGE_MASK;
15926 - /* This might overflow and become zero.. */
15927 - last_addr = PAGE_ALIGN(last_addr);
15929 - /* .. but that's ok, because modulo-2**n arithmetic will make
15930 - * the page-aligned "last - first" come out right.
15932 - npages = (last_addr - phys_addr) >> PAGE_SHIFT;
15934 - if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) {
15938 - global_flush_tlb();
15943 -EXPORT_SYMBOL(ioremap_nocache);
15946 - * iounmap - Free a IO remapping
15947 - * @addr: virtual address from ioremap_*
15949 - * Caller must ensure there is only one unmapping for the same pointer.
15951 -void iounmap(volatile void __iomem *addr)
15953 - struct vm_struct *p, *o;
15955 - if ((void __force *)addr <= high_memory)
15959 - * __ioremap special-cases the PCI/ISA range by not instantiating a
15960 - * vm_area and by simply returning an address into the kernel mapping
15961 - * of ISA space. So handle that here.
15963 - if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
15966 - addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
15968 - /* Use the vm area unlocked, assuming the caller
15969 - ensures there isn't another iounmap for the same address
15970 - in parallel. Reuse of the virtual address is prevented by
15971 - leaving it in the global lists until we're done with it.
15972 - cpa takes care of the direct mappings. */
15973 - read_lock(&vmlist_lock);
15974 - for (p = vmlist; p; p = p->next) {
15975 - if (p->addr == addr)
15978 - read_unlock(&vmlist_lock);
15981 - printk("iounmap: bad address %p\n", addr);
15986 - /* Reset the direct mapping. Can block */
15987 - if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) {
15988 - change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)),
15989 - get_vm_area_size(p) >> PAGE_SHIFT,
15991 - global_flush_tlb();
15994 - /* Finally remove it */
15995 - o = remove_vm_area((void *)addr);
15996 - BUG_ON(p != o || o == NULL);
15999 -EXPORT_SYMBOL(iounmap);
16001 -void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
16003 - unsigned long offset, last_addr;
16004 - unsigned int nrpages;
16005 - enum fixed_addresses idx;
16007 - /* Don't allow wraparound or zero size */
16008 - last_addr = phys_addr + size - 1;
16009 - if (!size || last_addr < phys_addr)
16013 - * Don't remap the low PCI/ISA area, it's always mapped..
16015 - if (is_initial_xendomain() &&
16016 - phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
16017 - return isa_bus_to_virt(phys_addr);
16020 - * Mappings have to be page-aligned
16022 - offset = phys_addr & ~PAGE_MASK;
16023 - phys_addr &= PAGE_MASK;
16024 - size = PAGE_ALIGN(last_addr) - phys_addr;
16027 - * Mappings have to fit in the FIX_BTMAP area.
16029 - nrpages = size >> PAGE_SHIFT;
16030 - if (nrpages > NR_FIX_BTMAPS)
16034 - * Ok, go for it..
16036 - idx = FIX_BTMAP_BEGIN;
16037 - while (nrpages > 0) {
16038 - set_fixmap(idx, phys_addr);
16039 - phys_addr += PAGE_SIZE;
16043 - return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN));
16046 -void __init bt_iounmap(void *addr, unsigned long size)
16048 - unsigned long virt_addr;
16049 - unsigned long offset;
16050 - unsigned int nrpages;
16051 - enum fixed_addresses idx;
16053 - virt_addr = (unsigned long)addr;
16054 - if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
16056 - if (virt_addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
16058 - offset = virt_addr & ~PAGE_MASK;
16059 - nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
16061 - idx = FIX_BTMAP_BEGIN;
16062 - while (nrpages > 0) {
16063 - clear_fixmap(idx);
16068 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
16069 +++ sle11-2009-10-16/arch/x86/mm/pageattr-xen.c 2009-03-16 16:37:14.000000000 +0100
16072 + * Copyright 2002 Andi Kleen, SuSE Labs.
16073 + * Thanks to Ben LaHaise for precious feedback.
16075 +#include <linux/highmem.h>
16076 +#include <linux/bootmem.h>
16077 +#include <linux/module.h>
16078 +#include <linux/sched.h>
16079 +#include <linux/slab.h>
16080 +#include <linux/mm.h>
16081 +#include <linux/interrupt.h>
16083 +#include <asm/e820.h>
16084 +#include <asm/processor.h>
16085 +#include <asm/tlbflush.h>
16086 +#include <asm/sections.h>
16087 +#include <asm/uaccess.h>
16088 +#include <asm/pgalloc.h>
16089 +#include <asm/proto.h>
16090 +#include <asm/mmu_context.h>
16092 +#ifndef CONFIG_X86_64
16093 +#define TASK_SIZE64 TASK_SIZE
16096 +static void _pin_lock(struct mm_struct *mm, int lock) {
16098 + spin_lock(&mm->page_table_lock);
16099 +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
16100 + /* While mm->page_table_lock protects us against insertions and
16101 + * removals of higher level page table pages, it doesn't protect
16102 + * against updates of pte-s. Such updates, however, require the
16103 + * pte pages to be in consistent state (unpinned+writable or
16104 + * pinned+readonly). The pinning and attribute changes, however
16105 + * cannot be done atomically, which is why such updates must be
16106 + * prevented from happening concurrently.
16107 + * Note that no pte lock can ever elsewhere be acquired nesting
16108 + * with an already acquired one in the same mm, or with the mm's
16109 + * page_table_lock already acquired, as that would break in the
16110 + * non-split case (where all these are actually resolving to the
16111 + * one page_table_lock). Thus acquiring all of them here is not
16112 + * going to result in deadlocks, and the order of acquires
16113 + * doesn't matter.
16116 + pgd_t *pgd = mm->pgd;
16119 + for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
16123 + if (pgd_none(*pgd))
16125 + pud = pud_offset(pgd, 0);
16126 + for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
16130 + if (pud_none(*pud))
16132 + pmd = pmd_offset(pud, 0);
16133 + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
16136 + if (pmd_none(*pmd))
16138 + ptl = pte_lockptr(0, pmd);
16142 + spin_unlock(ptl);
16149 + spin_unlock(&mm->page_table_lock);
16151 +#define pin_lock(mm) _pin_lock(mm, 1)
16152 +#define pin_unlock(mm) _pin_lock(mm, 0)
16154 +#define PIN_BATCH sizeof(void *)
16155 +static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
16157 +static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
16158 + unsigned int cpu, unsigned int seq)
16160 + unsigned long pfn = page_to_pfn(page);
16162 + if (PageHighMem(page)) {
16163 + if (pgprot_val(flags) & _PAGE_RW)
16164 + ClearPagePinned(page);
16166 + SetPagePinned(page);
16168 + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
16169 + (unsigned long)__va(pfn << PAGE_SHIFT),
16170 + pfn_pte(pfn, flags), 0);
16171 + if (unlikely(++seq == PIN_BATCH)) {
16172 + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
16173 + PIN_BATCH, NULL)))
16182 +static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
16184 + pgd_t *pgd = pgd_base;
16188 + unsigned int cpu, seq;
16189 + multicall_entry_t *mcl;
16191 + if (xen_feature(XENFEAT_auto_translated_physmap))
16197 + * Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables
16198 + * may not be the 'current' task's pagetables (e.g., current may be
16199 + * 32-bit, but the pagetables may be for a 64-bit task).
16200 + * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
16201 + * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
16203 + for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
16204 + if (pgd_none(*pgd))
16206 + pud = pud_offset(pgd, 0);
16207 + if (PTRS_PER_PUD > 1) /* not folded */
16208 + seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
16209 + for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
16210 + if (pud_none(*pud))
16212 + pmd = pmd_offset(pud, 0);
16213 + if (PTRS_PER_PMD > 1) /* not folded */
16214 + seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
16215 + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
16216 + if (pmd_none(*pmd))
16218 + seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
16223 + mcl = per_cpu(pb_mcl, cpu);
16224 +#ifdef CONFIG_X86_64
16225 + if (unlikely(seq > PIN_BATCH - 2)) {
16226 + if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
16230 + MULTI_update_va_mapping(mcl + seq,
16231 + (unsigned long)__user_pgd(pgd_base),
16232 + pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags),
16234 + MULTI_update_va_mapping(mcl + seq + 1,
16235 + (unsigned long)pgd_base,
16236 + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
16238 + if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
16241 + if (likely(seq != 0)) {
16242 + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
16243 + (unsigned long)pgd_base,
16244 + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
16246 + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
16249 + } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
16250 + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
16258 +static void __pgd_pin(pgd_t *pgd)
16260 + pgd_walk(pgd, PAGE_KERNEL_RO);
16261 + kmap_flush_unused();
16262 + xen_pgd_pin(__pa(pgd)); /* kernel */
16263 +#ifdef CONFIG_X86_64
16264 + xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
16266 + SetPagePinned(virt_to_page(pgd));
16269 +static void __pgd_unpin(pgd_t *pgd)
16271 + xen_pgd_unpin(__pa(pgd));
16272 +#ifdef CONFIG_X86_64
16273 + xen_pgd_unpin(__pa(__user_pgd(pgd)));
16275 + pgd_walk(pgd, PAGE_KERNEL);
16276 + ClearPagePinned(virt_to_page(pgd));
16279 +void pgd_test_and_unpin(pgd_t *pgd)
16281 + if (PagePinned(virt_to_page(pgd)))
16282 + __pgd_unpin(pgd);
16285 +void mm_pin(struct mm_struct *mm)
16287 + if (xen_feature(XENFEAT_writable_page_tables))
16291 + __pgd_pin(mm->pgd);
16295 +void mm_unpin(struct mm_struct *mm)
16297 + if (xen_feature(XENFEAT_writable_page_tables))
16301 + __pgd_unpin(mm->pgd);
16305 +void mm_pin_all(void)
16307 + struct page *page;
16308 + unsigned long flags;
16310 + if (xen_feature(XENFEAT_writable_page_tables))
16314 + * Allow uninterrupted access to the pgd_list. Also protects
16315 + * __pgd_pin() by disabling preemption.
16316 + * All other CPUs must be at a safe point (e.g., in stop_machine
16317 + * or offlined entirely).
16319 + spin_lock_irqsave(&pgd_lock, flags);
16320 + list_for_each_entry(page, &pgd_list, lru) {
16321 + if (!PagePinned(page))
16322 + __pgd_pin((pgd_t *)page_address(page));
16324 + spin_unlock_irqrestore(&pgd_lock, flags);
16327 +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
16329 + if (!PagePinned(virt_to_page(mm->pgd)))
16333 +void arch_exit_mmap(struct mm_struct *mm)
16335 + struct task_struct *tsk = current;
16340 + * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
16341 + * *much* faster this way, as no tlb flushes means bigger wrpt batches.
16343 + if (tsk->active_mm == mm) {
16344 + tsk->active_mm = &init_mm;
16345 + atomic_inc(&init_mm.mm_count);
16347 + switch_mm(mm, &init_mm, tsk);
16349 + atomic_dec(&mm->mm_count);
16350 + BUG_ON(atomic_read(&mm->mm_count) == 0);
16353 + task_unlock(tsk);
16355 + if (PagePinned(virt_to_page(mm->pgd))
16356 + && atomic_read(&mm->mm_count) == 1
16357 + && !mm->context.has_foreign_mappings)
16361 +static void _pte_free(struct page *page, unsigned int order)
16364 + __pte_free(page);
16367 +pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
16369 + struct page *pte;
16371 +#ifdef CONFIG_HIGHPTE
16372 + pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
16374 + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
16377 + pgtable_page_ctor(pte);
16378 + SetPageForeign(pte, _pte_free);
16379 + init_page_count(pte);
16384 +void __pte_free(pgtable_t pte)
16386 + if (!PageHighMem(pte)) {
16387 + unsigned long va = (unsigned long)page_address(pte);
16388 + unsigned int level;
16389 + pte_t *ptep = lookup_address(va, &level);
16391 + BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
16392 + if (!pte_write(*ptep)
16393 + && HYPERVISOR_update_va_mapping(va,
16394 + mk_pte(pte, PAGE_KERNEL),
16398 +#ifdef CONFIG_HIGHPTE
16399 + ClearPagePinned(pte);
16404 + ClearPageForeign(pte);
16405 + init_page_count(pte);
16406 + pgtable_page_dtor(pte);
16407 + __free_page(pte);
16410 +#if PAGETABLE_LEVELS >= 3
16411 +static void _pmd_free(struct page *page, unsigned int order)
16414 + __pmd_free(page);
16417 +pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
16419 + struct page *pmd;
16421 + pmd = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
16424 + SetPageForeign(pmd, _pmd_free);
16425 + init_page_count(pmd);
16426 + return page_address(pmd);
16429 +void __pmd_free(pgtable_t pmd)
16431 + unsigned long va = (unsigned long)page_address(pmd);
16432 + unsigned int level;
16433 + pte_t *ptep = lookup_address(va, &level);
16435 + BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
16436 + if (!pte_write(*ptep)
16437 + && HYPERVISOR_update_va_mapping(va, mk_pte(pmd, PAGE_KERNEL), 0))
16440 + ClearPageForeign(pmd);
16441 + init_page_count(pmd);
16442 + __free_page(pmd);
16446 +/* blktap and gntdev need this, as otherwise they would implicitly (and
16447 + * needlessly, as they never use it) reference init_mm. */
16448 +pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *vma,
16449 + unsigned long addr, pte_t *ptep, int full)
16451 + return ptep_get_and_clear_full(vma->vm_mm, addr, ptep, full);
16453 +EXPORT_SYMBOL_GPL(xen_ptep_get_and_clear_full);
16456 + * The current flushing context - we pass it instead of 5 arguments:
16459 + unsigned long vaddr;
16460 + pgprot_t mask_set;
16461 + pgprot_t mask_clr;
16464 + unsigned long pfn;
16467 +#ifdef CONFIG_X86_64
16469 +static inline unsigned long highmap_start_pfn(void)
16471 + return __pa(_text) >> PAGE_SHIFT;
16474 +static inline unsigned long highmap_end_pfn(void)
16476 + return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
16481 +#ifdef CONFIG_DEBUG_PAGEALLOC
16482 +# define debug_pagealloc 1
16484 +# define debug_pagealloc 0
16488 +within(unsigned long addr, unsigned long start, unsigned long end)
16490 + return addr >= start && addr < end;
16494 + * Flushing functions
16498 + * clflush_cache_range - flush a cache range with clflush
16499 + * @addr: virtual start address
16500 + * @size: number of bytes to flush
16502 + * clflush is an unordered instruction which needs fencing with mfence
16503 + * to avoid ordering issues.
16505 +void clflush_cache_range(void *vaddr, unsigned int size)
16507 + void *vend = vaddr + size - 1;
16511 + for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
16514 + * Flush any possible final partial cacheline:
16521 +static void __cpa_flush_all(void *arg)
16523 + unsigned long cache = (unsigned long)arg;
16526 + * Flush all to work around Errata in early athlons regarding
16527 + * large page flushing.
16529 + __flush_tlb_all();
16531 + if (cache && boot_cpu_data.x86_model >= 4)
16535 +static void cpa_flush_all(unsigned long cache)
16537 + BUG_ON(irqs_disabled());
16539 + on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1);
16542 +static void __cpa_flush_range(void *arg)
16545 + * We could optimize that further and do individual per page
16546 + * tlb invalidates for a low number of pages. Caveat: we must
16547 + * flush the high aliases on 64bit as well.
16549 + __flush_tlb_all();
16552 +static void cpa_flush_range(unsigned long start, int numpages, int cache)
16554 + unsigned int i, level;
16555 + unsigned long addr;
16557 + BUG_ON(irqs_disabled());
16558 + WARN_ON(PAGE_ALIGN(start) != start);
16560 + on_each_cpu(__cpa_flush_range, NULL, 1, 1);
16566 + * We only need to flush on one CPU,
16567 + * clflush is a MESI-coherent instruction that
16568 + * will cause all other CPUs to flush the same
16571 + for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
16572 + pte_t *pte = lookup_address(addr, &level);
16575 + * Only flush present addresses:
16577 + if (pte && (__pte_val(*pte) & _PAGE_PRESENT))
16578 + clflush_cache_range((void *) addr, PAGE_SIZE);
16583 + * Certain areas of memory on x86 require very specific protection flags,
16584 + * for example the BIOS area or kernel text. Callers don't always get this
16585 + * right (again, ioremap() on BIOS memory is not uncommon) so this function
16586 + * checks and fixes these known static required protection bits.
16588 +static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
16589 + unsigned long pfn)
16591 + pgprot_t forbidden = __pgprot(0);
16593 +#ifndef CONFIG_XEN
16595 + * The BIOS area between 640k and 1Mb needs to be executable for
16596 + * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
16598 + if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
16599 + pgprot_val(forbidden) |= _PAGE_NX;
16603 + * The kernel text needs to be executable for obvious reasons
16604 + * Does not cover __inittext since that is gone later on. On
16605 + * 64bit we do not enforce !NX on the low mapping
16607 + if (within(address, (unsigned long)_text, (unsigned long)_etext))
16608 + pgprot_val(forbidden) |= _PAGE_NX;
16611 + * The .rodata section needs to be read-only. Using the pfn
16612 + * catches all aliases.
16614 + if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
16615 + __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
16616 + pgprot_val(forbidden) |= _PAGE_RW;
16618 + prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
16624 + * Lookup the page table entry for a virtual address. Return a pointer
16625 + * to the entry and the level of the mapping.
16627 + * Note: We return pud and pmd either when the entry is marked large
16628 + * or when the present bit is not set. Otherwise we would return a
16629 + * pointer to a nonexistent mapping.
16631 +pte_t *lookup_address(unsigned long address, unsigned int *level)
16633 + pgd_t *pgd = pgd_offset_k(address);
16637 + *level = PG_LEVEL_NONE;
16639 + if (pgd_none(*pgd))
16642 + pud = pud_offset(pgd, address);
16643 + if (pud_none(*pud))
16646 + *level = PG_LEVEL_1G;
16647 + if (pud_large(*pud) || !pud_present(*pud))
16648 + return (pte_t *)pud;
16650 + pmd = pmd_offset(pud, address);
16651 + if (pmd_none(*pmd))
16654 + *level = PG_LEVEL_2M;
16655 + if (pmd_large(*pmd) || !pmd_present(*pmd))
16656 + return (pte_t *)pmd;
16658 + *level = PG_LEVEL_4K;
16660 + return pte_offset_kernel(pmd, address);
16664 + * Set the new pmd in all the pgds we know about:
16666 +static void __set_pmd_pte(pte_t *kpte, unsigned long address,
16667 + unsigned int level, pte_t pte)
16669 + /* change init_mm */
16671 + case PG_LEVEL_2M:
16672 + xen_l2_entry_update((pmd_t *)kpte, __pmd_ma(__pte_val(pte)));
16674 +#ifdef CONFIG_X86_64
16675 + case PG_LEVEL_1G:
16676 + xen_l3_entry_update((pud_t *)kpte, __pud_ma(__pte_val(pte)));
16682 +#ifdef CONFIG_X86_32
16683 + if (!SHARED_KERNEL_PMD) {
16684 + struct page *page;
16686 + list_for_each_entry(page, &pgd_list, lru) {
16691 + pgd = (pgd_t *)page_address(page) + pgd_index(address);
16692 + pud = pud_offset(pgd, address);
16693 + pmd = pmd_offset(pud, address);
16694 + xen_l2_entry_update(pmd, __pmd_ma(__pte_val(pte)));
16701 +try_preserve_large_page(pte_t *kpte, unsigned long address,
16702 + struct cpa_data *cpa)
16704 + unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
16705 + pte_t new_pte, old_pte, *tmp;
16706 + pgprot_t old_prot, new_prot;
16707 + int i, do_split = 1;
16708 + unsigned int level;
16710 + spin_lock_irqsave(&pgd_lock, flags);
16712 + * Check for races, another CPU might have split this page
16715 + tmp = lookup_address(address, &level);
16720 + case PG_LEVEL_2M:
16721 + psize = PMD_PAGE_SIZE;
16722 + pmask = PMD_PAGE_MASK;
16724 +#ifdef CONFIG_X86_64
16725 + case PG_LEVEL_1G:
16726 + psize = PUD_PAGE_SIZE;
16727 + pmask = PUD_PAGE_MASK;
16731 + do_split = -EINVAL;
16736 + * Calculate the number of pages, which fit into this large
16737 + * page starting at address:
16739 + nextpage_addr = (address + psize) & pmask;
16740 + numpages = (nextpage_addr - address) >> PAGE_SHIFT;
16741 + if (numpages < cpa->numpages)
16742 + cpa->numpages = numpages;
16745 + * We are safe now. Check whether the new pgprot is the same:
16748 + old_prot = new_prot = pte_pgprot(old_pte);
16750 + pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
16751 + pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
16754 + * old_pte points to the large page base address. So we need
16755 + * to add the offset of the virtual address:
16757 + pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
16760 + new_prot = static_protections(new_prot, address, pfn);
16763 + * We need to check the full range, whether
16764 + * static_protections() requires a different pgprot for one of
16765 + * the pages in the range we try to preserve:
16767 + if (pfn < max_mapnr) {
16768 + addr = address + PAGE_SIZE;
16769 + for (i = 1; i < cpa->numpages && ++pfn < max_mapnr;
16770 + i++, addr += PAGE_SIZE) {
16771 + pgprot_t chk_prot = static_protections(new_prot, addr, pfn);
16773 + if (pgprot_val(chk_prot) != pgprot_val(new_prot))
16779 + * If there are no changes, return. cpa->numpages has been updated
16782 + if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
16788 + * We need to change the attributes. Check, whether we can
16789 + * change the large page in one go. We request a split, when
16790 + * the address is not aligned and the number of pages is
16791 + * smaller than the number of pages in the large page. Note
16792 + * that we limited the number of possible pages already to
16793 + * the number of pages in the large page.
16795 + if (address == (nextpage_addr - psize) && cpa->numpages == numpages) {
16797 + * The address is aligned and the number of pages
16798 + * covers the full page.
16800 + new_pte = pfn_pte_ma(__pte_mfn(old_pte), canon_pgprot(new_prot));
16801 + __set_pmd_pte(kpte, address, level, new_pte);
16802 + cpa->flushtlb = 1;
16807 + spin_unlock_irqrestore(&pgd_lock, flags);
16812 +static LIST_HEAD(page_pool);
16813 +static unsigned long pool_size, pool_pages, pool_low;
16814 +static unsigned long pool_used, pool_failed;
16816 +static void cpa_fill_pool(struct page **ret)
16818 + gfp_t gfp = GFP_KERNEL;
16819 + unsigned long flags;
16823 + * Avoid recursion (on debug-pagealloc) and also signal
16824 + * our priority to get to these pagetables:
16826 + if (current->flags & PF_MEMALLOC)
16828 + current->flags |= PF_MEMALLOC;
16831 + * Allocate atomically from atomic contexts:
16833 + if (in_atomic() || irqs_disabled() || debug_pagealloc)
16834 + gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
16836 + while (pool_pages < pool_size || (ret && !*ret)) {
16837 + p = alloc_pages(gfp, 0);
16843 + * If the call site needs a page right now, provide it:
16845 + if (ret && !*ret) {
16849 + spin_lock_irqsave(&pgd_lock, flags);
16850 + list_add(&p->lru, &page_pool);
16852 + spin_unlock_irqrestore(&pgd_lock, flags);
16855 + current->flags &= ~PF_MEMALLOC;
16858 +#define SHIFT_MB (20 - PAGE_SHIFT)
16859 +#define ROUND_MB_GB ((1 << 10) - 1)
16860 +#define SHIFT_MB_GB 10
16861 +#define POOL_PAGES_PER_GB 16
16863 +void __init cpa_init(void)
16865 + struct sysinfo si;
16866 + unsigned long gb;
16870 + * Calculate the number of pool pages:
16872 + * Convert totalram (nr of pages) to MiB and round to the next
16873 + * GiB. Shift MiB to GiB and multiply the result by
16874 + * POOL_PAGES_PER_GB:
16876 + if (debug_pagealloc) {
16877 + gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
16878 + pool_size = POOL_PAGES_PER_GB * gb;
16882 + pool_low = pool_size;
16884 + cpa_fill_pool(NULL);
16885 + printk(KERN_DEBUG
16886 + "CPA: page pool initialized %lu of %lu pages preallocated\n",
16887 + pool_pages, pool_size);
16890 +static int split_large_page(pte_t *kpte, unsigned long address)
16892 + unsigned long flags, mfn, mfninc = 1;
16893 + unsigned int i, level;
16894 + pte_t *pbase, *tmp;
16895 + pgprot_t ref_prot;
16896 + struct page *base;
16899 + * Get a page from the pool. The pool list is protected by the
16900 + * pgd_lock, which we have to take anyway for the split
16903 + spin_lock_irqsave(&pgd_lock, flags);
16904 + if (list_empty(&page_pool)) {
16905 + spin_unlock_irqrestore(&pgd_lock, flags);
16907 + cpa_fill_pool(&base);
16910 + spin_lock_irqsave(&pgd_lock, flags);
16912 + base = list_first_entry(&page_pool, struct page, lru);
16913 + list_del(&base->lru);
16916 + if (pool_pages < pool_low)
16917 + pool_low = pool_pages;
16921 + * Check for races, another CPU might have split this page
16922 + * up for us already:
16924 + tmp = lookup_address(address, &level);
16928 + pbase = (pte_t *)page_address(base);
16929 +#ifdef CONFIG_X86_32
16930 + paravirt_alloc_pt(&init_mm, page_to_pfn(base));
16932 + ref_prot = pte_pgprot(pte_clrhuge(*kpte));
16934 +#ifdef CONFIG_X86_64
16935 + if (level == PG_LEVEL_1G) {
16936 + mfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
16937 + pgprot_val(ref_prot) |= _PAGE_PSE;
16942 + * Get the target mfn from the original entry:
16944 + mfn = __pte_mfn(*kpte);
16945 + for (i = 0; i < PTRS_PER_PTE; i++, mfn += mfninc)
16946 + set_pte(&pbase[i], pfn_pte_ma(mfn, ref_prot));
16949 + * Install the new, split up pagetable. Important details here:
16951 + * On Intel the NX bit of all levels must be cleared to make a
16952 + * page executable. See section 4.13.2 of Intel 64 and IA-32
16953 + * Architectures Software Developer's Manual.
16955 + * Mark the entry present. The current mapping might be
16956 + * set to not present, which we preserved above.
16958 + if (!xen_feature(XENFEAT_writable_page_tables) &&
16959 + HYPERVISOR_update_va_mapping((unsigned long)pbase,
16960 + mk_pte(base, PAGE_KERNEL_RO), 0))
16962 + ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte)));
16963 + pgprot_val(ref_prot) |= _PAGE_PRESENT;
16964 + __set_pmd_pte(kpte, address, level, mk_pte(base, ref_prot));
16969 + * If we dropped out via the lookup_address check under
16970 + * pgd_lock then stick the page back into the pool:
16973 + list_add(&base->lru, &page_pool);
16977 + spin_unlock_irqrestore(&pgd_lock, flags);
16982 +static int __change_page_attr(struct cpa_data *cpa, int primary)
16984 + unsigned long address = cpa->vaddr;
16985 + int do_split, err;
16986 + unsigned int level;
16987 + pte_t *kpte, old_pte;
16990 + kpte = lookup_address(address, &level);
16992 + return primary ? -EINVAL : 0;
16995 + if (!__pte_val(old_pte)) {
16998 + printk(KERN_WARNING "CPA: called for zero pte. "
16999 + "vaddr = %lx cpa->vaddr = %lx\n", address,
17005 + if (level == PG_LEVEL_4K) {
17007 + pgprot_t new_prot = pte_pgprot(old_pte);
17008 + unsigned long mfn = __pte_mfn(old_pte);
17010 + pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
17011 + pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
17013 + new_prot = static_protections(new_prot, address,
17014 + mfn_to_local_pfn(mfn));
17017 + * We need to keep the mfn from the existing PTE,
17018 + * after all we're only going to change its attributes
17019 + * not the memory it points to
17021 + new_pte = pfn_pte_ma(mfn, canon_pgprot(new_prot));
17022 + cpa->pfn = mfn_to_local_pfn(mfn);
17024 + * Do we really change anything ?
17026 + if (__pte_val(old_pte) != __pte_val(new_pte)) {
17027 + set_pte_atomic(kpte, new_pte);
17028 + cpa->flushtlb = 1;
17030 + cpa->numpages = 1;
17035 + * Check, whether we can keep the large page intact
17036 + * and just change the pte:
17038 + do_split = try_preserve_large_page(kpte, address, cpa);
17040 + * When the range fits into the existing large page,
17041 + * return. cpa->numpages and cpa->flushtlb have been updated in
17042 + * try_preserve_large_page:
17044 + if (do_split <= 0)
17048 + * We have to split the large page:
17050 + err = split_large_page(kpte, address);
17052 + cpa->flushtlb = 1;
17059 +static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
17061 +static int cpa_process_alias(struct cpa_data *cpa)
17063 + struct cpa_data alias_cpa;
17066 + if (cpa->pfn > max_pfn_mapped)
17070 + * No need to redo, when the primary call touched the direct
17071 + * mapping already:
17073 + if (!within(cpa->vaddr, PAGE_OFFSET,
17074 + PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
17076 + alias_cpa = *cpa;
17077 + alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
17079 + ret = __change_page_attr_set_clr(&alias_cpa, 0);
17082 +#ifdef CONFIG_X86_64
17086 + * No need to redo, when the primary call touched the high
17087 + * mapping already:
17089 + if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end))
17093 + * If the physical address is inside the kernel map, we need
17094 + * to touch the high mapped kernel as well:
17096 + if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn()))
17099 + alias_cpa = *cpa;
17100 + alias_cpa.vaddr =
17101 + (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map;
17104 + * The high mapping range is imprecise, so ignore the return value.
17106 + __change_page_attr_set_clr(&alias_cpa, 0);
17111 +static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
17113 + int ret, numpages = cpa->numpages;
17115 + while (numpages) {
17117 + * Store the remaining nr of pages for the large page
17118 + * preservation check.
17120 + cpa->numpages = numpages;
17122 + ret = __change_page_attr(cpa, checkalias);
17126 + if (checkalias) {
17127 + ret = cpa_process_alias(cpa);
17133 + * Adjust the number of pages with the result of the
17134 + * CPA operation. Either a large page has been
17135 + * preserved or a single page update happened.
17137 + BUG_ON(cpa->numpages > numpages);
17138 + numpages -= cpa->numpages;
17139 + cpa->vaddr += cpa->numpages * PAGE_SIZE;
17144 +static inline int cache_attr(pgprot_t attr)
17146 + return pgprot_val(attr) &
17147 + (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
17150 +static int change_page_attr_set_clr(unsigned long addr, int numpages,
17151 + pgprot_t mask_set, pgprot_t mask_clr)
17153 + struct cpa_data cpa;
17154 + int ret, cache, checkalias;
17157 + * Check, if we are requested to change a not supported
17160 + mask_set = canon_pgprot(mask_set);
17161 + mask_clr = canon_pgprot(mask_clr);
17162 + if (!pgprot_val(mask_set) && !pgprot_val(mask_clr))
17165 + /* Ensure we are PAGE_SIZE aligned */
17166 + if (addr & ~PAGE_MASK) {
17167 + addr &= PAGE_MASK;
17169 + * People should not be passing in unaligned addresses:
17174 + cpa.vaddr = addr;
17175 + cpa.numpages = numpages;
17176 + cpa.mask_set = mask_set;
17177 + cpa.mask_clr = mask_clr;
17178 + cpa.flushtlb = 0;
17180 + /* No alias checking for _NX bit modifications */
17181 + checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
17183 + ret = __change_page_attr_set_clr(&cpa, checkalias);
17186 + * Check whether we really changed something:
17188 + if (!cpa.flushtlb)
17192 + * No need to flush, when we did not set any of the caching
17195 + cache = cache_attr(mask_set);
17198 + * On success we use clflush, when the CPU supports it to
17199 + * avoid the wbinvd. If the CPU does not support it and in the
17200 + * error case we fall back to cpa_flush_all (which uses
17203 + if (!ret && cpu_has_clflush)
17204 + cpa_flush_range(addr, numpages, cache);
17206 + cpa_flush_all(cache);
17209 + cpa_fill_pool(NULL);
17214 +static inline int change_page_attr_set(unsigned long addr, int numpages,
17217 + return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
17220 +static inline int change_page_attr_clear(unsigned long addr, int numpages,
17223 + return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
17226 +int set_memory_uc(unsigned long addr, int numpages)
17228 + return change_page_attr_set(addr, numpages,
17229 + __pgprot(_PAGE_PCD));
17231 +EXPORT_SYMBOL(set_memory_uc);
17233 +int set_memory_wb(unsigned long addr, int numpages)
17235 + return change_page_attr_clear(addr, numpages,
17236 + __pgprot(_PAGE_PCD | _PAGE_PWT));
17238 +EXPORT_SYMBOL(set_memory_wb);
17240 +int set_memory_x(unsigned long addr, int numpages)
17242 + return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX));
17244 +EXPORT_SYMBOL(set_memory_x);
17246 +int set_memory_nx(unsigned long addr, int numpages)
17248 + return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX));
17250 +EXPORT_SYMBOL(set_memory_nx);
17252 +int set_memory_ro(unsigned long addr, int numpages)
17254 + return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW));
17257 +int set_memory_rw(unsigned long addr, int numpages)
17259 + return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW));
17262 +int set_memory_np(unsigned long addr, int numpages)
17264 + return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
17267 +int set_pages_uc(struct page *page, int numpages)
17269 + unsigned long addr = (unsigned long)page_address(page);
17271 + return set_memory_uc(addr, numpages);
17273 +EXPORT_SYMBOL(set_pages_uc);
17275 +int set_pages_wb(struct page *page, int numpages)
17277 + unsigned long addr = (unsigned long)page_address(page);
17279 + return set_memory_wb(addr, numpages);
17281 +EXPORT_SYMBOL(set_pages_wb);
17283 +int set_pages_x(struct page *page, int numpages)
17285 + unsigned long addr = (unsigned long)page_address(page);
17287 + return set_memory_x(addr, numpages);
17289 +EXPORT_SYMBOL(set_pages_x);
17291 +int set_pages_nx(struct page *page, int numpages)
17293 + unsigned long addr = (unsigned long)page_address(page);
17295 + return set_memory_nx(addr, numpages);
17297 +EXPORT_SYMBOL(set_pages_nx);
17299 +int set_pages_ro(struct page *page, int numpages)
17301 + unsigned long addr = (unsigned long)page_address(page);
17303 + return set_memory_ro(addr, numpages);
17306 +int set_pages_rw(struct page *page, int numpages)
17308 + unsigned long addr = (unsigned long)page_address(page);
17310 + return set_memory_rw(addr, numpages);
17313 +#ifdef CONFIG_DEBUG_PAGEALLOC
17315 +static int __set_pages_p(struct page *page, int numpages)
17317 + struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
17318 + .numpages = numpages,
17319 + .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
17320 + .mask_clr = __pgprot(0)};
17322 + return __change_page_attr_set_clr(&cpa, 1);
17325 +static int __set_pages_np(struct page *page, int numpages)
17327 + struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
17328 + .numpages = numpages,
17329 + .mask_set = __pgprot(0),
17330 + .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)};
17332 + return __change_page_attr_set_clr(&cpa, 1);
17335 +void kernel_map_pages(struct page *page, int numpages, int enable)
17337 + if (PageHighMem(page))
17340 + debug_check_no_locks_freed(page_address(page),
17341 + numpages * PAGE_SIZE);
17345 + * If page allocator is not up yet then do not call c_p_a():
17347 + if (!debug_pagealloc_enabled)
17351 + * The return value is ignored as the calls cannot fail.
17352 + * Large pages are kept enabled at boot time, and are
17353 + * split up quickly with DEBUG_PAGEALLOC. If a splitup
17354 + * fails here (due to temporary memory shortage) no damage
17355 + * is done because we just keep the largepage intact up
17356 + * to the next attempt when it will likely be split up:
17359 + __set_pages_p(page, numpages);
17361 + __set_pages_np(page, numpages);
17364 + * We should perform an IPI and flush all tlbs,
17365 + * but that can deadlock->flush only current cpu:
17367 + __flush_tlb_all();
17370 + * Try to refill the page pool here. We can do this only after
17373 + cpa_fill_pool(NULL);
17376 +#ifdef CONFIG_HIBERNATION
17378 +bool kernel_page_present(struct page *page)
17380 + unsigned int level;
17383 + if (PageHighMem(page))
17386 + pte = lookup_address((unsigned long)page_address(page), &level);
17387 + return (__pte_val(*pte) & _PAGE_PRESENT);
17390 +#endif /* CONFIG_HIBERNATION */
17392 +#endif /* CONFIG_DEBUG_PAGEALLOC */
17394 +static inline int in_secondary_range(unsigned long va)
17396 +#ifdef CONFIG_X86_64
17397 + return va >= VMALLOC_START && va < VMALLOC_END;
17399 + return va >= (unsigned long)high_memory;
17403 +static void __make_page_readonly(unsigned long va)
17406 + unsigned int level;
17408 + pte = lookup_address(va, &level);
17409 + BUG_ON(!pte || level != PG_LEVEL_4K);
17410 + if (HYPERVISOR_update_va_mapping(va, pte_wrprotect(*pte), 0))
17412 + if (in_secondary_range(va)) {
17413 + unsigned long pfn = pte_pfn(*pte);
17415 +#ifdef CONFIG_HIGHMEM
17416 + if (pfn >= highstart_pfn)
17417 + kmap_flush_unused(); /* flush stale writable kmaps */
17420 + __make_page_readonly((unsigned long)__va(pfn << PAGE_SHIFT));
17424 +static void __make_page_writable(unsigned long va)
17427 + unsigned int level;
17429 + pte = lookup_address(va, &level);
17430 + BUG_ON(!pte || level != PG_LEVEL_4K);
17431 + if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), 0))
17433 + if (in_secondary_range(va)) {
17434 + unsigned long pfn = pte_pfn(*pte);
17436 +#ifdef CONFIG_HIGHMEM
17437 + if (pfn < highstart_pfn)
17439 + __make_page_writable((unsigned long)__va(pfn << PAGE_SHIFT));
17443 +void make_page_readonly(void *va, unsigned int feature)
17445 + if (!xen_feature(feature))
17446 + __make_page_readonly((unsigned long)va);
17449 +void make_page_writable(void *va, unsigned int feature)
17451 + if (!xen_feature(feature))
17452 + __make_page_writable((unsigned long)va);
17455 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature)
17457 + unsigned long addr;
17459 + if (xen_feature(feature))
17462 + for (addr = (unsigned long)va; nr--; addr += PAGE_SIZE)
17463 + __make_page_readonly(addr);
17466 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
17468 + unsigned long addr;
17470 + if (xen_feature(feature))
17473 + for (addr = (unsigned long)va; nr--; addr += PAGE_SIZE)
17474 + __make_page_writable(addr);
17478 + * The testcases use internal knowledge of the implementation that shouldn't
17479 + * be exposed to the rest of the kernel. Include these directly here.
17481 +#ifdef CONFIG_CPA_DEBUG
17482 +#include "pageattr-test.c"
17484 --- sle11-2009-10-16.orig/arch/x86/mm/pageattr_64-xen.c 2009-02-16 16:18:36.000000000 +0100
17485 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
17488 - * Copyright 2002 Andi Kleen, SuSE Labs.
17489 - * Thanks to Ben LaHaise for precious feedback.
17492 -#include <linux/mm.h>
17493 -#include <linux/sched.h>
17494 -#include <linux/highmem.h>
17495 -#include <linux/module.h>
17496 -#include <linux/slab.h>
17497 -#include <asm/uaccess.h>
17498 -#include <asm/processor.h>
17499 -#include <asm/tlbflush.h>
17500 -#include <asm/io.h>
17503 -#include <asm/pgalloc.h>
17504 -#include <asm/mmu_context.h>
17506 -static void _pin_lock(struct mm_struct *mm, int lock) {
17508 - spin_lock(&mm->page_table_lock);
17509 -#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
17510 - /* While mm->page_table_lock protects us against insertions and
17511 - * removals of higher level page table pages, it doesn't protect
17512 - * against updates of pte-s. Such updates, however, require the
17513 - * pte pages to be in consistent state (unpinned+writable or
17514 - * pinned+readonly). The pinning and attribute changes, however
17515 - * cannot be done atomically, which is why such updates must be
17516 - * prevented from happening concurrently.
17517 - * Note that no pte lock can ever elsewhere be acquired nesting
17518 - * with an already acquired one in the same mm, or with the mm's
17519 - * page_table_lock already acquired, as that would break in the
17520 - * non-split case (where all these are actually resolving to the
17521 - * one page_table_lock). Thus acquiring all of them here is not
17522 - * going to result in dead locks, and the order of acquires
17523 - * doesn't matter.
17526 - pgd_t *pgd = mm->pgd;
17529 - for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
17533 - if (pgd_none(*pgd))
17535 - pud = pud_offset(pgd, 0);
17536 - for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
17540 - if (pud_none(*pud))
17542 - pmd = pmd_offset(pud, 0);
17543 - for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
17546 - if (pmd_none(*pmd))
17548 - ptl = pte_lockptr(0, pmd);
17552 - spin_unlock(ptl);
17559 - spin_unlock(&mm->page_table_lock);
17561 -#define pin_lock(mm) _pin_lock(mm, 1)
17562 -#define pin_unlock(mm) _pin_lock(mm, 0)
17564 -#define PIN_BATCH 8
17565 -static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
17567 -static inline unsigned int pgd_walk_set_prot(void *pt, pgprot_t flags,
17568 - unsigned int cpu, unsigned int seq)
17570 - struct page *page = virt_to_page(pt);
17571 - unsigned long pfn = page_to_pfn(page);
17573 - MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
17574 - (unsigned long)__va(pfn << PAGE_SHIFT),
17575 - pfn_pte(pfn, flags), 0);
17576 - if (unlikely(++seq == PIN_BATCH)) {
17577 - if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
17578 - PIN_BATCH, NULL)))
17586 -static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
17588 - pgd_t *pgd = pgd_base;
17593 - unsigned int cpu, seq;
17594 - multicall_entry_t *mcl;
17599 - * Cannot iterate up to USER_PTRS_PER_PGD as these pagetables may not
17600 - * be the 'current' task's pagetables (e.g., current may be 32-bit,
17601 - * but the pagetables may be for a 64-bit task).
17602 - * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
17603 - * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
17605 - for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
17606 - if (pgd_none(*pgd))
17608 - pud = pud_offset(pgd, 0);
17609 - if (PTRS_PER_PUD > 1) /* not folded */
17610 - seq = pgd_walk_set_prot(pud,flags,cpu,seq);
17611 - for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
17612 - if (pud_none(*pud))
17614 - pmd = pmd_offset(pud, 0);
17615 - if (PTRS_PER_PMD > 1) /* not folded */
17616 - seq = pgd_walk_set_prot(pmd,flags,cpu,seq);
17617 - for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
17618 - if (pmd_none(*pmd))
17620 - pte = pte_offset_kernel(pmd,0);
17621 - seq = pgd_walk_set_prot(pte,flags,cpu,seq);
17626 - mcl = per_cpu(pb_mcl, cpu);
17627 - if (unlikely(seq > PIN_BATCH - 2)) {
17628 - if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
17632 - MULTI_update_va_mapping(mcl + seq,
17633 - (unsigned long)__user_pgd(pgd_base),
17634 - pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags),
17636 - MULTI_update_va_mapping(mcl + seq + 1,
17637 - (unsigned long)pgd_base,
17638 - pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
17640 - if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
17646 -static void __pgd_pin(pgd_t *pgd)
17648 - pgd_walk(pgd, PAGE_KERNEL_RO);
17649 - xen_pgd_pin(__pa(pgd)); /* kernel */
17650 - xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
17651 - SetPagePinned(virt_to_page(pgd));
17654 -static void __pgd_unpin(pgd_t *pgd)
17656 - xen_pgd_unpin(__pa(pgd));
17657 - xen_pgd_unpin(__pa(__user_pgd(pgd)));
17658 - pgd_walk(pgd, PAGE_KERNEL);
17659 - ClearPagePinned(virt_to_page(pgd));
17662 -void pgd_test_and_unpin(pgd_t *pgd)
17664 - if (PagePinned(virt_to_page(pgd)))
17665 - __pgd_unpin(pgd);
17668 -void mm_pin(struct mm_struct *mm)
17670 - if (xen_feature(XENFEAT_writable_page_tables))
17674 - __pgd_pin(mm->pgd);
17678 -void mm_unpin(struct mm_struct *mm)
17680 - if (xen_feature(XENFEAT_writable_page_tables))
17684 - __pgd_unpin(mm->pgd);
17688 -void mm_pin_all(void)
17690 - struct page *page;
17691 - unsigned long flags;
17693 - if (xen_feature(XENFEAT_writable_page_tables))
17697 - * Allow uninterrupted access to the pgd_list. Also protects
17698 - * __pgd_pin() by disabling preemption.
17699 - * All other CPUs must be at a safe point (e.g., in stop_machine
17700 - * or offlined entirely).
17702 - spin_lock_irqsave(&pgd_lock, flags);
17703 - list_for_each_entry(page, &pgd_list, lru) {
17704 - if (!PagePinned(page))
17705 - __pgd_pin((pgd_t *)page_address(page));
17707 - spin_unlock_irqrestore(&pgd_lock, flags);
17710 -void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
17712 - if (!PagePinned(virt_to_page(mm->pgd)))
17716 -void arch_exit_mmap(struct mm_struct *mm)
17718 - struct task_struct *tsk = current;
17723 - * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
17724 - * *much* faster this way, as no tlb flushes means bigger wrpt batches.
17726 - if (tsk->active_mm == mm) {
17727 - tsk->active_mm = &init_mm;
17728 - atomic_inc(&init_mm.mm_count);
17730 - switch_mm(mm, &init_mm, tsk);
17732 - atomic_dec(&mm->mm_count);
17733 - BUG_ON(atomic_read(&mm->mm_count) == 0);
17736 - task_unlock(tsk);
17738 - if (PagePinned(virt_to_page(mm->pgd))
17739 - && (atomic_read(&mm->mm_count) == 1)
17740 - && !mm->context.has_foreign_mappings)
17744 -static void _pte_free(struct page *page, unsigned int order)
17750 -struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
17752 - struct page *pte;
17754 - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
17756 - SetPageForeign(pte, _pte_free);
17757 - init_page_count(pte);
17762 -void pte_free(struct page *pte)
17764 - unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
17766 - if (!pte_write(*virt_to_ptep(va)))
17767 - if (HYPERVISOR_update_va_mapping(
17768 - va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0))
17771 - ClearPageForeign(pte);
17772 - init_page_count(pte);
17774 - __free_page(pte);
17776 -#endif /* CONFIG_XEN */
17778 -pte_t *lookup_address(unsigned long address)
17780 - pgd_t *pgd = pgd_offset_k(address);
17784 - if (pgd_none(*pgd))
17786 - pud = pud_offset(pgd, address);
17787 - if (!pud_present(*pud))
17789 - pmd = pmd_offset(pud, address);
17790 - if (!pmd_present(*pmd))
17792 - if (pmd_large(*pmd))
17793 - return (pte_t *)pmd;
17794 - pte = pte_offset_kernel(pmd, address);
17795 - if (pte && !pte_present(*pte))
17800 -static struct page *split_large_page(unsigned long address, pgprot_t prot,
17801 - pgprot_t ref_prot)
17804 - unsigned long addr;
17805 - struct page *base = alloc_pages(GFP_KERNEL, 0);
17810 - * page_private is used to track the number of entries in
17811 - * the page table page have non standard attributes.
17813 - SetPagePrivate(base);
17814 - page_private(base) = 0;
17816 - address = __pa(address);
17817 - addr = address & LARGE_PAGE_MASK;
17818 - pbase = (pte_t *)page_address(base);
17819 - for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
17820 - pbase[i] = pfn_pte(addr >> PAGE_SHIFT,
17821 - addr == address ? prot : ref_prot);
17826 -void clflush_cache_range(void *adr, int size)
17829 - for (i = 0; i < size; i += boot_cpu_data.x86_clflush_size)
17833 -static void flush_kernel_map(void *arg)
17835 - struct list_head *l = (struct list_head *)arg;
17838 - /* When clflush is available always use it because it is
17839 - much cheaper than WBINVD. */
17840 - /* clflush is still broken. Disable for now. */
17841 - if (1 || !cpu_has_clflush)
17842 - asm volatile("wbinvd" ::: "memory");
17843 - else list_for_each_entry(pg, l, lru) {
17844 - void *adr = page_address(pg);
17845 - clflush_cache_range(adr, PAGE_SIZE);
17847 - __flush_tlb_all();
17850 -static inline void flush_map(struct list_head *l)
17852 - on_each_cpu(flush_kernel_map, l, 1, 1);
17855 -static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */
17857 -static inline void save_page(struct page *fpage)
17859 - if (!test_and_set_bit(PG_arch_1, &fpage->flags))
17860 - list_add(&fpage->lru, &deferred_pages);
17864 - * No more special protections in this 2/4MB area - revert to a
17865 - * large page again.
17867 -static void revert_page(unsigned long address, pgprot_t ref_prot)
17873 - unsigned long pfn;
17875 - pgd = pgd_offset_k(address);
17876 - BUG_ON(pgd_none(*pgd));
17877 - pud = pud_offset(pgd,address);
17878 - BUG_ON(pud_none(*pud));
17879 - pmd = pmd_offset(pud, address);
17880 - BUG_ON(__pmd_val(*pmd) & _PAGE_PSE);
17881 - pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT;
17882 - large_pte = pfn_pte(pfn, ref_prot);
17883 - large_pte = pte_mkhuge(large_pte);
17884 - set_pte((pte_t *)pmd, large_pte);
17888 -__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
17889 - pgprot_t ref_prot)
17892 - struct page *kpte_page;
17893 - pgprot_t ref_prot2;
17895 - kpte = lookup_address(address);
17896 - if (!kpte) return 0;
17897 - kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
17898 - BUG_ON(PageLRU(kpte_page));
17899 - BUG_ON(PageCompound(kpte_page));
17900 - if (pgprot_val(prot) != pgprot_val(ref_prot)) {
17901 - if (!pte_huge(*kpte)) {
17902 - set_pte(kpte, pfn_pte(pfn, prot));
17905 - * split_large_page will take the reference for this
17906 - * change_page_attr on the split page.
17908 - struct page *split;
17909 - ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
17910 - split = split_large_page(address, prot, ref_prot2);
17913 - pgprot_val(ref_prot2) &= ~_PAGE_NX;
17914 - set_pte(kpte, mk_pte(split, ref_prot2));
17915 - kpte_page = split;
17917 - page_private(kpte_page)++;
17918 - } else if (!pte_huge(*kpte)) {
17919 - set_pte(kpte, pfn_pte(pfn, ref_prot));
17920 - BUG_ON(page_private(kpte_page) == 0);
17921 - page_private(kpte_page)--;
17925 - /* on x86-64 the direct mapping set at boot is not using 4k pages */
17927 - * ..., but the XEN guest kernels (currently) do:
17928 - * If the pte was reserved, it means it was created at boot
17929 - * time (not via split_large_page) and in turn we must not
17930 - * replace it with a large page.
17932 -#ifndef CONFIG_XEN
17933 - BUG_ON(PageReserved(kpte_page));
17935 - if (PageReserved(kpte_page))
17939 - save_page(kpte_page);
17940 - if (page_private(kpte_page) == 0)
17941 - revert_page(address, ref_prot);
17946 - * Change the page attributes of an page in the linear mapping.
17948 - * This should be used when a page is mapped with a different caching policy
17949 - * than write-back somewhere - some CPUs do not like it when mappings with
17950 - * different caching policies exist. This changes the page attributes of the
17951 - * in kernel linear mapping too.
17953 - * The caller needs to ensure that there are no conflicting mappings elsewhere.
17954 - * This function only deals with the kernel linear map.
17956 - * Caller must call global_flush_tlb() after this.
17958 -int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
17960 - int err = 0, kernel_map = 0;
17963 - if (address >= __START_KERNEL_map
17964 - && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) {
17965 - address = (unsigned long)__va(__pa(address));
17969 - down_write(&init_mm.mmap_sem);
17970 - for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
17971 - unsigned long pfn = __pa(address) >> PAGE_SHIFT;
17973 - if (!kernel_map || pte_present(pfn_pte(0, prot))) {
17974 - err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
17978 - /* Handle kernel mapping too which aliases part of the
17980 - if (__pa(address) < KERNEL_TEXT_SIZE) {
17981 - unsigned long addr2;
17983 - addr2 = __START_KERNEL_map + __pa(address);
17984 - /* Make sure the kernel mappings stay executable */
17985 - prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot)));
17986 - err = __change_page_attr(addr2, pfn, prot2,
17987 - PAGE_KERNEL_EXEC);
17990 - up_write(&init_mm.mmap_sem);
17994 -/* Don't call this for MMIO areas that may not have a mem_map entry */
17995 -int change_page_attr(struct page *page, int numpages, pgprot_t prot)
17997 - unsigned long addr = (unsigned long)page_address(page);
17998 - return change_page_attr_addr(addr, numpages, prot);
18001 -void global_flush_tlb(void)
18003 - struct page *pg, *next;
18004 - struct list_head l;
18007 - * Write-protect the semaphore, to exclude two contexts
18008 - * doing a list_replace_init() call in parallel and to
18009 - * exclude new additions to the deferred_pages list:
18011 - down_write(&init_mm.mmap_sem);
18012 - list_replace_init(&deferred_pages, &l);
18013 - up_write(&init_mm.mmap_sem);
18017 - list_for_each_entry_safe(pg, next, &l, lru) {
18018 - list_del(&pg->lru);
18019 - clear_bit(PG_arch_1, &pg->flags);
18020 - if (page_private(pg) != 0)
18022 - ClearPagePrivate(pg);
18027 -EXPORT_SYMBOL(change_page_attr);
18028 -EXPORT_SYMBOL(global_flush_tlb);
18029 --- sle11-2009-10-16.orig/arch/x86/mm/pgtable_32-xen.c 2009-02-16 16:18:36.000000000 +0100
18030 +++ sle11-2009-10-16/arch/x86/mm/pgtable_32-xen.c 2009-03-16 16:33:40.000000000 +0100
18032 #include <xen/features.h>
18033 #include <asm/hypervisor.h>
18035 -static void pgd_test_and_unpin(pgd_t *pgd);
18037 void show_mem(void)
18039 int total = 0, reserved = 0;
18040 @@ -167,53 +165,6 @@ pte_t *pte_alloc_one_kernel(struct mm_st
18044 -static void _pte_free(struct page *page, unsigned int order)
18050 -struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
18052 - struct page *pte;
18054 -#ifdef CONFIG_HIGHPTE
18055 - pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
18057 - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
18060 - SetPageForeign(pte, _pte_free);
18061 - init_page_count(pte);
18066 -void pte_free(struct page *pte)
18068 - unsigned long pfn = page_to_pfn(pte);
18070 - if (!PageHighMem(pte)) {
18071 - unsigned long va = (unsigned long)__va(pfn << PAGE_SHIFT);
18073 - if (!pte_write(*virt_to_ptep(va)))
18074 - if (HYPERVISOR_update_va_mapping(
18075 - va, pfn_pte(pfn, PAGE_KERNEL), 0))
18078 - ClearPagePinned(pte);
18080 - ClearPageForeign(pte);
18081 - init_page_count(pte);
18083 - __free_page(pte);
18086 -void pmd_ctor(struct kmem_cache *cache, void *pmd)
18088 - memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
18092 * List of all pgd's needed for non-PAE so it can invalidate entries
18093 * in both cached and uncached pgd's; not needed for PAE since the
18094 @@ -224,224 +175,191 @@ void pmd_ctor(struct kmem_cache *cache,
18095 * vmalloc faults work because attached pagetables are never freed.
18098 -DEFINE_SPINLOCK(pgd_lock);
18099 -struct page *pgd_list;
18101 static inline void pgd_list_add(pgd_t *pgd)
18103 struct page *page = virt_to_page(pgd);
18104 - page->index = (unsigned long)pgd_list;
18106 - set_page_private(pgd_list, (unsigned long)&page->index);
18108 - set_page_private(page, (unsigned long)&pgd_list);
18110 + list_add(&page->lru, &pgd_list);
18113 static inline void pgd_list_del(pgd_t *pgd)
18115 - struct page *next, **pprev, *page = virt_to_page(pgd);
18116 - next = (struct page *)page->index;
18117 - pprev = (struct page **)page_private(page);
18120 - set_page_private(next, (unsigned long)pprev);
18122 + struct page *page = virt_to_page(pgd);
18124 + list_del(&page->lru);
18127 +#define UNSHARED_PTRS_PER_PGD \
18128 + (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
18130 -#if (PTRS_PER_PMD == 1)
18131 -/* Non-PAE pgd constructor */
18132 -static void pgd_ctor(void *pgd)
18133 +static void pgd_ctor(void *p)
18136 unsigned long flags;
18138 - /* !PAE, no pagetable sharing */
18139 + pgd_test_and_unpin(pgd);
18141 + /* Clear usermode parts of PGD */
18142 memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
18144 spin_lock_irqsave(&pgd_lock, flags);
18146 - /* must happen under lock */
18147 - clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
18148 - swapper_pg_dir + USER_PTRS_PER_PGD,
18149 - KERNEL_PGD_PTRS);
18151 - paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
18152 - __pa(swapper_pg_dir) >> PAGE_SHIFT,
18153 - USER_PTRS_PER_PGD,
18154 - KERNEL_PGD_PTRS);
18155 - pgd_list_add(pgd);
18156 - spin_unlock_irqrestore(&pgd_lock, flags);
18158 -#else /* PTRS_PER_PMD > 1 */
18159 -/* PAE pgd constructor */
18160 -static void pgd_ctor(void *pgd)
18162 - /* PAE, kernel PMD may be shared */
18164 - if (SHARED_KERNEL_PMD) {
18165 - clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
18166 + /* If the pgd points to a shared pagetable level (either the
18167 + ptes in non-PAE, or shared PMD in PAE), then just copy the
18168 + references from swapper_pg_dir. */
18169 + if (PAGETABLE_LEVELS == 2 ||
18170 + (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) {
18171 + clone_pgd_range(pgd + USER_PTRS_PER_PGD,
18172 swapper_pg_dir + USER_PTRS_PER_PGD,
18175 - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
18176 + paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
18177 + __pa(swapper_pg_dir) >> PAGE_SHIFT,
18178 + USER_PTRS_PER_PGD,
18179 + KERNEL_PGD_PTRS);
18182 + /* list required to sync kernel mapping updates */
18183 + if (PAGETABLE_LEVELS == 2)
18184 + pgd_list_add(pgd);
18186 + spin_unlock_irqrestore(&pgd_lock, flags);
18188 -#endif /* PTRS_PER_PMD */
18190 static void pgd_dtor(void *pgd)
18192 unsigned long flags; /* can be called from interrupt context */
18194 - if (SHARED_KERNEL_PMD)
18197 - paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
18198 - spin_lock_irqsave(&pgd_lock, flags);
18199 - pgd_list_del(pgd);
18200 - spin_unlock_irqrestore(&pgd_lock, flags);
18201 + if (!SHARED_KERNEL_PMD) {
18202 + spin_lock_irqsave(&pgd_lock, flags);
18203 + pgd_list_del(pgd);
18204 + spin_unlock_irqrestore(&pgd_lock, flags);
18207 pgd_test_and_unpin(pgd);
18210 -#define UNSHARED_PTRS_PER_PGD \
18211 - (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
18213 -/* If we allocate a pmd for part of the kernel address space, then
18214 - make sure its initialized with the appropriate kernel mappings.
18215 - Otherwise use a cached zeroed pmd. */
18216 -static pmd_t *pmd_cache_alloc(int idx)
18217 +#ifdef CONFIG_X86_PAE
18219 + * Mop up any pmd pages which may still be attached to the pgd.
18220 + * Normally they will be freed by munmap/exit_mmap, but any pmd we
18221 + * preallocate which never got a corresponding vma will need to be
18222 + * freed manually.
18224 +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
18229 - if (idx >= USER_PTRS_PER_PGD) {
18230 - pmd = (pmd_t *)__get_free_page(GFP_KERNEL);
18231 + for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
18232 + pgd_t pgd = pgdp[i];
18234 -#ifndef CONFIG_XEN
18237 - (void *)pgd_page_vaddr(swapper_pg_dir[idx]),
18238 - sizeof(pmd_t) * PTRS_PER_PMD);
18241 - pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
18242 + if (__pgd_val(pgd) != 0) {
18243 + pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
18247 + pgdp[i] = xen_make_pgd(0);
18249 -static void pmd_cache_free(pmd_t *pmd, int idx)
18251 - if (idx >= USER_PTRS_PER_PGD) {
18252 - make_lowmem_page_writable(pmd, XENFEAT_writable_page_tables);
18253 - memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
18254 - free_page((unsigned long)pmd);
18256 - kmem_cache_free(pmd_cache, pmd);
18257 + paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
18258 + pmd_free(mm, pmd);
18263 -pgd_t *pgd_alloc(struct mm_struct *mm)
18265 + * In PAE mode, we need to do a cr3 reload (=tlb flush) when
18266 + * updating the top-level pagetable entries to guarantee the
18267 + * processor notices the update. Since this is expensive, and
18268 + * all 4 top-level entries are used almost immediately in a
18269 + * new process's life, we just pre-populate them here.
18271 + * Also, if we're in a paravirt environment where the kernel pmd is
18272 + * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
18273 + * and initialize the kernel pmds here.
18275 +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
18278 + pmd_t *pmds[UNSHARED_PTRS_PER_PGD];
18279 + unsigned long addr, flags;
18281 - pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);
18282 - pmd_t **pmds = NULL;
18283 - unsigned long flags;
18285 - pgd_test_and_unpin(pgd);
18287 - if (PTRS_PER_PMD == 1 || !pgd)
18291 - if (!SHARED_KERNEL_PMD) {
18293 - * We can race save/restore (if we sleep during a GFP_KERNEL memory
18294 - * allocation). We therefore store virtual addresses of pmds as they
18295 - * do not change across save/restore, and poke the machine addresses
18296 - * into the pgdir under the pgd_lock.
18298 - pmds = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL);
18300 - quicklist_free(0, pgd_dtor, pgd);
18306 - /* Allocate pmds, remember virtual addresses. */
18307 - for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
18308 - pmd_t *pmd = pmd_cache_alloc(i);
18312 + * We can race save/restore (if we sleep during a GFP_KERNEL memory
18313 + * allocation). We therefore store virtual addresses of pmds as they
18314 + * do not change across save/restore, and poke the machine addresses
18315 + * into the pgdir under the pgd_lock.
18317 + for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) {
18318 + pmds[i] = pmd_alloc_one(mm, addr);
18322 - paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
18326 - set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
18330 - if (SHARED_KERNEL_PMD)
18333 spin_lock_irqsave(&pgd_lock, flags);
18335 /* Protect against save/restore: move below 4GB under pgd_lock. */
18336 - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) {
18337 - int rc = xen_create_contiguous_region(
18338 - (unsigned long)pgd, 0, 32);
18340 - spin_unlock_irqrestore(&pgd_lock, flags);
18343 + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
18344 + && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
18345 + spin_unlock_irqrestore(&pgd_lock, flags);
18348 + pmd_free(mm, pmds[i]);
18352 /* Copy kernel pmd contents and write-protect the new pmds. */
18353 - for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
18355 - (void *)pgd_page_vaddr(swapper_pg_dir[i]),
18356 - sizeof(pmd_t) * PTRS_PER_PMD);
18357 - make_lowmem_page_readonly(
18358 - pmds[i], XENFEAT_writable_page_tables);
18360 + pud = pud_offset(pgd, 0);
18361 + for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
18362 + i++, pud++, addr += PUD_SIZE) {
18363 + if (i >= USER_PTRS_PER_PGD) {
18365 + (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
18366 + sizeof(pmd_t) * PTRS_PER_PMD);
18367 + make_lowmem_page_readonly(
18368 + pmds[i], XENFEAT_writable_page_tables);
18371 - /* It is safe to poke machine addresses of pmds under the pmd_lock. */
18372 - for (i = 0; i < PTRS_PER_PGD; i++)
18373 - set_pgd(&pgd[i], __pgd(1 + __pa(pmds[i])));
18374 + /* It is safe to poke machine addresses of pmds under the pgd_lock. */
18375 + pud_populate(mm, pud, pmds[i]);
18378 - /* Ensure this pgd gets picked up and pinned on save/restore. */
18379 + /* List required to sync kernel mapping updates and
18380 + * to pin/unpin on save/restore. */
18383 spin_unlock_irqrestore(&pgd_lock, flags);
18389 +#else /* !CONFIG_X86_PAE */
18390 +/* No need to prepopulate any pagetable entries in non-PAE modes. */
18391 +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
18397 +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
18400 +#endif /* CONFIG_X86_PAE */
18404 - for (i--; i >= 0; i--) {
18405 - pgd_t pgdent = pgd[i];
18406 - void* pmd = (void *)__va(pgd_val(pgdent)-1);
18407 - paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
18408 - pmd_cache_free(pmd, i);
18411 - for (i--; i >= 0; i--) {
18412 - paravirt_release_pd(__pa(pmds[i]) >> PAGE_SHIFT);
18413 - pmd_cache_free(pmds[i], i);
18416 +pgd_t *pgd_alloc(struct mm_struct *mm)
18418 + pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
18420 + /* so that alloc_pd can use it */
18425 + if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
18426 + free_page((unsigned long)pgd);
18429 - quicklist_free(0, pgd_dtor, pgd);
18435 -void pgd_free(pgd_t *pgd)
18436 +void pgd_free(struct mm_struct *mm, pgd_t *pgd)
18441 * After this the pgd should not be pinned for the duration of this
18442 * function's execution. We should never sleep and thus never race:
18443 @@ -450,39 +368,43 @@ void pgd_free(pgd_t *pgd)
18444 * 2. The machine addresses in PGD entries will not become invalid
18445 * due to a concurrent save/restore.
18447 - pgd_test_and_unpin(pgd);
18450 - /* in the PAE case user pgd entries are overwritten before usage */
18451 - if (PTRS_PER_PMD > 1) {
18452 - for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
18453 - pgd_t pgdent = pgd[i];
18454 - void* pmd = (void *)__va(pgd_val(pgdent)-1);
18455 - paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
18456 - pmd_cache_free(pmd, i);
18458 + if (PTRS_PER_PMD > 1 && !xen_feature(XENFEAT_pae_pgdir_above_4gb))
18459 + xen_destroy_contiguous_region((unsigned long)pgd, 0);
18461 - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
18462 - xen_destroy_contiguous_region((unsigned long)pgd, 0);
18464 + pgd_mop_up_pmds(mm, pgd);
18465 + free_page((unsigned long)pgd);
18468 - /* in the non-PAE case, free_pgtables() clears user pgd entries */
18469 - quicklist_free(0, pgd_dtor, pgd);
18470 +void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
18472 + pgtable_page_dtor(pte);
18473 + paravirt_release_pt(page_to_pfn(pte));
18474 + tlb_remove_page(tlb, pte);
18477 -void check_pgt_cache(void)
18478 +#ifdef CONFIG_X86_PAE
18480 +void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
18482 - quicklist_trim(0, pgd_dtor, 25, 16);
18483 + paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
18484 + tlb_remove_page(tlb, virt_to_page(pmd));
18489 void make_lowmem_page_readonly(void *va, unsigned int feature)
18492 + unsigned int level;
18495 if (xen_feature(feature))
18498 - pte = virt_to_ptep(va);
18499 + pte = lookup_address((unsigned long)va, &level);
18500 + BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte));
18501 rc = HYPERVISOR_update_va_mapping(
18502 (unsigned long)va, pte_wrprotect(*pte), 0);
18504 @@ -491,313 +413,15 @@ void make_lowmem_page_readonly(void *va,
18505 void make_lowmem_page_writable(void *va, unsigned int feature)
18508 + unsigned int level;
18511 if (xen_feature(feature))
18514 - pte = virt_to_ptep(va);
18515 + pte = lookup_address((unsigned long)va, &level);
18516 + BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte));
18517 rc = HYPERVISOR_update_va_mapping(
18518 (unsigned long)va, pte_mkwrite(*pte), 0);
18522 -void make_page_readonly(void *va, unsigned int feature)
18527 - if (xen_feature(feature))
18530 - pte = virt_to_ptep(va);
18531 - rc = HYPERVISOR_update_va_mapping(
18532 - (unsigned long)va, pte_wrprotect(*pte), 0);
18533 - if (rc) /* fallback? */
18534 - xen_l1_entry_update(pte, pte_wrprotect(*pte));
18535 - if ((unsigned long)va >= (unsigned long)high_memory) {
18536 - unsigned long pfn = pte_pfn(*pte);
18537 -#ifdef CONFIG_HIGHMEM
18538 - if (pfn >= highstart_pfn)
18539 - kmap_flush_unused(); /* flush stale writable kmaps */
18542 - make_lowmem_page_readonly(
18543 - phys_to_virt(pfn << PAGE_SHIFT), feature);
18547 -void make_page_writable(void *va, unsigned int feature)
18552 - if (xen_feature(feature))
18555 - pte = virt_to_ptep(va);
18556 - rc = HYPERVISOR_update_va_mapping(
18557 - (unsigned long)va, pte_mkwrite(*pte), 0);
18558 - if (rc) /* fallback? */
18559 - xen_l1_entry_update(pte, pte_mkwrite(*pte));
18560 - if ((unsigned long)va >= (unsigned long)high_memory) {
18561 - unsigned long pfn = pte_pfn(*pte);
18562 -#ifdef CONFIG_HIGHMEM
18563 - if (pfn < highstart_pfn)
18565 - make_lowmem_page_writable(
18566 - phys_to_virt(pfn << PAGE_SHIFT), feature);
18570 -void make_pages_readonly(void *va, unsigned int nr, unsigned int feature)
18572 - if (xen_feature(feature))
18575 - while (nr-- != 0) {
18576 - make_page_readonly(va, feature);
18577 - va = (void *)((unsigned long)va + PAGE_SIZE);
18581 -void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
18583 - if (xen_feature(feature))
18586 - while (nr-- != 0) {
18587 - make_page_writable(va, feature);
18588 - va = (void *)((unsigned long)va + PAGE_SIZE);
18592 -static void _pin_lock(struct mm_struct *mm, int lock) {
18594 - spin_lock(&mm->page_table_lock);
18595 -#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
18596 - /* While mm->page_table_lock protects us against insertions and
18597 - * removals of higher level page table pages, it doesn't protect
18598 - * against updates of pte-s. Such updates, however, require the
18599 - * pte pages to be in consistent state (unpinned+writable or
18600 - * pinned+readonly). The pinning and attribute changes, however
18601 - * cannot be done atomically, which is why such updates must be
18602 - * prevented from happening concurrently.
18603 - * Note that no pte lock can ever elsewhere be acquired nesting
18604 - * with an already acquired one in the same mm, or with the mm's
18605 - * page_table_lock already acquired, as that would break in the
18606 - * non-split case (where all these are actually resolving to the
18607 - * one page_table_lock). Thus acquiring all of them here is not
18608 - * going to result in dead locks, and the order of acquires
18609 - * doesn't matter.
18612 - pgd_t *pgd = mm->pgd;
18615 - for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
18619 - if (pgd_none(*pgd))
18621 - pud = pud_offset(pgd, 0);
18622 - for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
18626 - if (pud_none(*pud))
18628 - pmd = pmd_offset(pud, 0);
18629 - for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
18632 - if (pmd_none(*pmd))
18634 - ptl = pte_lockptr(0, pmd);
18638 - spin_unlock(ptl);
18645 - spin_unlock(&mm->page_table_lock);
18647 -#define pin_lock(mm) _pin_lock(mm, 1)
18648 -#define pin_unlock(mm) _pin_lock(mm, 0)
18650 -#define PIN_BATCH 4
18651 -static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
18653 -static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
18654 - unsigned int cpu, unsigned seq)
18656 - unsigned long pfn = page_to_pfn(page);
18658 - if (PageHighMem(page)) {
18659 - if (pgprot_val(flags) & _PAGE_RW)
18660 - ClearPagePinned(page);
18662 - SetPagePinned(page);
18664 - MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
18665 - (unsigned long)__va(pfn << PAGE_SHIFT),
18666 - pfn_pte(pfn, flags), 0);
18667 - if (unlikely(++seq == PIN_BATCH)) {
18668 - if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
18669 - PIN_BATCH, NULL)))
18678 -static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
18680 - pgd_t *pgd = pgd_base;
18684 - unsigned int cpu, seq;
18686 - if (xen_feature(XENFEAT_auto_translated_physmap))
18691 - for (g = 0, seq = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
18692 - if (pgd_none(*pgd))
18694 - pud = pud_offset(pgd, 0);
18695 - if (PTRS_PER_PUD > 1) /* not folded */
18696 - seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
18697 - for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
18698 - if (pud_none(*pud))
18700 - pmd = pmd_offset(pud, 0);
18701 - if (PTRS_PER_PMD > 1) /* not folded */
18702 - seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
18703 - for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
18704 - if (pmd_none(*pmd))
18706 - seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
18711 - if (likely(seq != 0)) {
18712 - MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
18713 - (unsigned long)pgd_base,
18714 - pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
18716 - if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
18719 - } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
18720 - pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
18727 -static void __pgd_pin(pgd_t *pgd)
18729 - pgd_walk(pgd, PAGE_KERNEL_RO);
18730 - kmap_flush_unused();
18731 - xen_pgd_pin(__pa(pgd));
18732 - SetPagePinned(virt_to_page(pgd));
18735 -static void __pgd_unpin(pgd_t *pgd)
18737 - xen_pgd_unpin(__pa(pgd));
18738 - pgd_walk(pgd, PAGE_KERNEL);
18739 - ClearPagePinned(virt_to_page(pgd));
18742 -static void pgd_test_and_unpin(pgd_t *pgd)
18744 - if (PagePinned(virt_to_page(pgd)))
18745 - __pgd_unpin(pgd);
18748 -void mm_pin(struct mm_struct *mm)
18750 - if (xen_feature(XENFEAT_writable_page_tables))
18753 - __pgd_pin(mm->pgd);
18757 -void mm_unpin(struct mm_struct *mm)
18759 - if (xen_feature(XENFEAT_writable_page_tables))
18762 - __pgd_unpin(mm->pgd);
18766 -void mm_pin_all(void)
18768 - struct page *page;
18769 - unsigned long flags;
18771 - if (xen_feature(XENFEAT_writable_page_tables))
18775 - * Allow uninterrupted access to the pgd_list. Also protects
18776 - * __pgd_pin() by disabling preemption.
18777 - * All other CPUs must be at a safe point (e.g., in stop_machine
18778 - * or offlined entirely).
18780 - spin_lock_irqsave(&pgd_lock, flags);
18781 - for (page = pgd_list; page; page = (struct page *)page->index) {
18782 - if (!PagePinned(page))
18783 - __pgd_pin((pgd_t *)page_address(page));
18785 - spin_unlock_irqrestore(&pgd_lock, flags);
18788 -void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
18790 - if (!PagePinned(virt_to_page(mm->pgd)))
18794 -void arch_exit_mmap(struct mm_struct *mm)
18796 - struct task_struct *tsk = current;
18801 - * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
18802 - * *much* faster this way, as no tlb flushes means bigger wrpt batches.
18804 - if (tsk->active_mm == mm) {
18805 - tsk->active_mm = &init_mm;
18806 - atomic_inc(&init_mm.mm_count);
18808 - switch_mm(mm, &init_mm, tsk);
18810 - atomic_dec(&mm->mm_count);
18811 - BUG_ON(atomic_read(&mm->mm_count) == 0);
18814 - task_unlock(tsk);
18816 - if (PagePinned(virt_to_page(mm->pgd)) &&
18817 - (atomic_read(&mm->mm_count) == 1) &&
18818 - !mm->context.has_foreign_mappings)
18821 --- sle11-2009-10-16.orig/arch/x86/pci/irq-xen.c 2009-02-16 16:18:36.000000000 +0100
18822 +++ sle11-2009-10-16/arch/x86/pci/irq-xen.c 2009-03-16 16:33:40.000000000 +0100
18823 @@ -204,6 +204,7 @@ static int pirq_ali_get(struct pci_dev *
18825 static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
18827 + WARN_ON_ONCE(pirq >= 16);
18828 return irqmap[read_config_nybble(router, 0x48, pirq-1)];
18831 @@ -211,7 +212,8 @@ static int pirq_ali_set(struct pci_dev *
18833 static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
18834 unsigned int val = irqmap[irq];
18837 + WARN_ON_ONCE(pirq >= 16);
18839 write_config_nybble(router, 0x48, pirq-1, val);
18841 @@ -261,12 +263,16 @@ static int pirq_via_set(struct pci_dev *
18842 static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18844 static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
18846 + WARN_ON_ONCE(pirq >= 5);
18847 return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
18850 static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18852 static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
18854 + WARN_ON_ONCE(pirq >= 5);
18855 write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
18858 @@ -279,12 +285,16 @@ static int pirq_via586_set(struct pci_de
18859 static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18861 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
18863 + WARN_ON_ONCE(pirq >= 4);
18864 return read_config_nybble(router,0x43, pirqmap[pirq-1]);
18867 static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18869 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
18871 + WARN_ON_ONCE(pirq >= 4);
18872 write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
18875 @@ -423,6 +433,7 @@ static int pirq_sis_set(struct pci_dev *
18877 static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18879 + WARN_ON_ONCE(pirq >= 9);
18881 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
18883 @@ -432,6 +443,7 @@ static int pirq_vlsi_get(struct pci_dev
18885 static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18887 + WARN_ON_ONCE(pirq >= 9);
18889 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
18891 @@ -453,14 +465,14 @@ static int pirq_vlsi_set(struct pci_dev
18893 static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
18895 - outb_p(pirq, 0xc00);
18896 + outb(pirq, 0xc00);
18897 return inb(0xc01) & 0xf;
18900 static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
18902 - outb_p(pirq, 0xc00);
18903 - outb_p(irq, 0xc01);
18904 + outb(pirq, 0xc00);
18905 + outb(irq, 0xc01);
18909 @@ -575,6 +587,10 @@ static __init int intel_router_probe(str
18910 case PCI_DEVICE_ID_INTEL_ICH9_4:
18911 case PCI_DEVICE_ID_INTEL_ICH9_5:
18912 case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
18913 + case PCI_DEVICE_ID_INTEL_ICH10_0:
18914 + case PCI_DEVICE_ID_INTEL_ICH10_1:
18915 + case PCI_DEVICE_ID_INTEL_ICH10_2:
18916 + case PCI_DEVICE_ID_INTEL_ICH10_3:
18917 r->name = "PIIX/ICH";
18918 r->get = pirq_piix_get;
18919 r->set = pirq_piix_set;
18920 --- sle11-2009-10-16.orig/arch/x86/vdso/Makefile 2008-11-25 12:35:54.000000000 +0100
18921 +++ sle11-2009-10-16/arch/x86/vdso/Makefile 2009-03-16 16:33:40.000000000 +0100
18922 @@ -66,6 +66,7 @@ vdso32.so-$(VDSO32-y) += int80
18923 vdso32.so-$(CONFIG_COMPAT) += syscall
18924 vdso32.so-$(VDSO32-y) += sysenter
18925 xen-vdso32-$(subst 1,$(CONFIG_COMPAT),$(shell expr $(CONFIG_XEN_COMPAT)0 '<' 0x0302000)) += int80
18926 +xen-vdso32-$(CONFIG_X86_32) += syscall
18927 vdso32.so-$(CONFIG_XEN) += $(xen-vdso32-y)
18929 vdso32-images = $(vdso32.so-y:%=vdso32-%.so)
18930 --- sle11-2009-10-16.orig/arch/x86/vdso/vdso32/syscall.S 2009-10-28 14:55:04.000000000 +0100
18931 +++ sle11-2009-10-16/arch/x86/vdso/vdso32/syscall.S 2009-03-16 16:33:40.000000000 +0100
18932 @@ -19,8 +19,10 @@ __kernel_vsyscall:
18936 +#ifndef CONFIG_XEN
18937 movl $__USER32_DS, %ecx
18943 --- sle11-2009-10-16.orig/arch/x86/vdso/vdso32.S 2009-10-28 14:55:04.000000000 +0100
18944 +++ sle11-2009-10-16/arch/x86/vdso/vdso32.S 2009-03-16 16:33:40.000000000 +0100
18945 @@ -19,4 +19,16 @@ vdso32_sysenter_start:
18946 .incbin "arch/x86/vdso/vdso32-sysenter.so"
18947 vdso32_sysenter_end:
18949 +#if defined(CONFIG_X86_64_XEN) && CONFIG_XEN_COMPAT < 0x030200
18950 + .globl vdso32_int80_start, vdso32_int80_end
18951 +vdso32_int80_start:
18952 + .incbin "arch/x86/vdso/vdso32-int80.so"
18954 +#elif defined(CONFIG_X86_XEN)
18955 + .globl vdso32_syscall_start, vdso32_syscall_end
18956 +vdso32_syscall_start:
18957 + .incbin "arch/x86/vdso/vdso32-syscall.so"
18958 +vdso32_syscall_end:
18962 --- sle11-2009-10-16.orig/arch/x86/vdso/vdso32-setup.c 2008-11-25 12:35:53.000000000 +0100
18963 +++ sle11-2009-10-16/arch/x86/vdso/vdso32-setup.c 2009-03-16 16:33:40.000000000 +0100
18965 #include <asm/vdso.h>
18966 #include <asm/proto.h>
18969 -#include <xen/interface/callback.h>
18975 @@ -229,7 +225,6 @@ static inline void map_compat_vdso(int m
18977 void enable_sep_cpu(void)
18979 -#ifndef CONFIG_XEN
18980 int cpu = get_cpu();
18981 struct tss_struct *tss = &per_cpu(init_tss, cpu);
18983 @@ -244,35 +239,6 @@ void enable_sep_cpu(void)
18984 wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0);
18985 wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0);
18988 - extern asmlinkage void ia32pv_sysenter_target(void);
18989 - static struct callback_register sysenter = {
18990 - .type = CALLBACKTYPE_sysenter,
18991 - .address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target },
18994 - if (!boot_cpu_has(X86_FEATURE_SEP))
18999 - if (xen_feature(XENFEAT_supervisor_mode_kernel))
19000 - sysenter.address.eip = (unsigned long)ia32_sysenter_target;
19002 - switch (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter)) {
19005 -#if CONFIG_XEN_COMPAT < 0x030200
19007 - sysenter.type = CALLBACKTYPE_sysenter_deprecated;
19008 - if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) == 0)
19012 - clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability);
19018 static struct vm_area_struct gate_vma;
19019 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
19020 +++ sle11-2009-10-16/arch/x86/vdso/vdso32-setup-xen.c 2009-03-16 16:33:40.000000000 +0100
19023 + * (C) Copyright 2002 Linus Torvalds
19024 + * Portions based on the vdso-randomization code from exec-shield:
19025 + * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar
19027 + * This file contains the needed initializations to support sysenter.
19030 +#include <linux/init.h>
19031 +#include <linux/smp.h>
19032 +#include <linux/thread_info.h>
19033 +#include <linux/sched.h>
19034 +#include <linux/gfp.h>
19035 +#include <linux/string.h>
19036 +#include <linux/elf.h>
19037 +#include <linux/mm.h>
19038 +#include <linux/err.h>
19039 +#include <linux/module.h>
19041 +#include <asm/cpufeature.h>
19042 +#include <asm/msr.h>
19043 +#include <asm/pgtable.h>
19044 +#include <asm/unistd.h>
19045 +#include <asm/elf.h>
19046 +#include <asm/tlbflush.h>
19047 +#include <asm/vdso.h>
19048 +#include <asm/proto.h>
19050 +#include <xen/interface/callback.h>
19053 + VDSO_DISABLED = 0,
19054 + VDSO_ENABLED = 1,
19058 +#ifdef CONFIG_COMPAT_VDSO
19059 +#define VDSO_DEFAULT VDSO_COMPAT
19061 +#define VDSO_DEFAULT VDSO_ENABLED
19064 +#ifdef CONFIG_X86_64
19065 +#define vdso_enabled sysctl_vsyscall32
19066 +#define arch_setup_additional_pages syscall32_setup_pages
19070 + * This is the difference between the prelinked addresses in the vDSO images
19071 + * and the VDSO_HIGH_BASE address where CONFIG_COMPAT_VDSO places the vDSO
19072 + * in the user address space.
19074 +#define VDSO_ADDR_ADJUST (VDSO_HIGH_BASE - (unsigned long)VDSO32_PRELINK)
19077 + * Should the kernel map a VDSO page into processes and pass its
19078 + * address down to glibc upon exec()?
19080 +unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT;
19082 +static int __init vdso_setup(char *s)
19084 + vdso_enabled = simple_strtoul(s, NULL, 0);
19090 + * For consistency, the argument vdso32=[012] affects the 32-bit vDSO
19091 + * behavior on both 64-bit and 32-bit kernels.
19092 + * On 32-bit kernels, vdso=[012] means the same thing.
19094 +__setup("vdso32=", vdso_setup);
19096 +#ifdef CONFIG_X86_32
19097 +__setup_param("vdso=", vdso32_setup, vdso_setup, 0);
19099 +EXPORT_SYMBOL_GPL(vdso_enabled);
19102 +static __init void reloc_symtab(Elf32_Ehdr *ehdr,
19103 + unsigned offset, unsigned size)
19105 + Elf32_Sym *sym = (void *)ehdr + offset;
19106 + unsigned nsym = size / sizeof(*sym);
19109 + for(i = 0; i < nsym; i++, sym++) {
19110 + if (sym->st_shndx == SHN_UNDEF ||
19111 + sym->st_shndx == SHN_ABS)
19112 + continue; /* skip */
19114 + if (sym->st_shndx > SHN_LORESERVE) {
19115 + printk(KERN_INFO "VDSO: unexpected st_shndx %x\n",
19120 + switch(ELF_ST_TYPE(sym->st_info)) {
19123 + case STT_SECTION:
19125 + sym->st_value += VDSO_ADDR_ADJUST;
19130 +static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset)
19132 + Elf32_Dyn *dyn = (void *)ehdr + offset;
19134 + for(; dyn->d_tag != DT_NULL; dyn++)
19135 + switch(dyn->d_tag) {
19149 + case DT_ADDRRNGLO ... DT_ADDRRNGHI:
19150 + /* definitely pointers needing relocation */
19151 + dyn->d_un.d_ptr += VDSO_ADDR_ADJUST;
19154 + case DT_ENCODING ... OLD_DT_LOOS-1:
19155 + case DT_LOOS ... DT_HIOS-1:
19156 + /* Tags above DT_ENCODING are pointers if
19158 + if (dyn->d_tag >= DT_ENCODING &&
19159 + (dyn->d_tag & 1) == 0)
19160 + dyn->d_un.d_ptr += VDSO_ADDR_ADJUST;
19163 + case DT_VERDEFNUM:
19164 + case DT_VERNEEDNUM:
19166 + case DT_RELACOUNT:
19167 + case DT_RELCOUNT:
19168 + case DT_VALRNGLO ... DT_VALRNGHI:
19169 + /* definitely not pointers */
19172 + case OLD_DT_LOOS ... DT_LOOS-1:
19173 + case DT_HIOS ... DT_VALRNGLO-1:
19175 + if (dyn->d_tag > DT_ENCODING)
19176 + printk(KERN_INFO "VDSO: unexpected DT_tag %x\n",
19182 +static __init void relocate_vdso(Elf32_Ehdr *ehdr)
19184 + Elf32_Phdr *phdr;
19185 + Elf32_Shdr *shdr;
19188 + BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 ||
19189 + !elf_check_arch_ia32(ehdr) ||
19190 + ehdr->e_type != ET_DYN);
19192 + ehdr->e_entry += VDSO_ADDR_ADJUST;
19194 + /* rebase phdrs */
19195 + phdr = (void *)ehdr + ehdr->e_phoff;
19196 + for (i = 0; i < ehdr->e_phnum; i++) {
19197 + phdr[i].p_vaddr += VDSO_ADDR_ADJUST;
19199 + /* relocate dynamic stuff */
19200 + if (phdr[i].p_type == PT_DYNAMIC)
19201 + reloc_dyn(ehdr, phdr[i].p_offset);
19204 + /* rebase sections */
19205 + shdr = (void *)ehdr + ehdr->e_shoff;
19206 + for(i = 0; i < ehdr->e_shnum; i++) {
19207 + if (!(shdr[i].sh_flags & SHF_ALLOC))
19210 + shdr[i].sh_addr += VDSO_ADDR_ADJUST;
19212 + if (shdr[i].sh_type == SHT_SYMTAB ||
19213 + shdr[i].sh_type == SHT_DYNSYM)
19214 + reloc_symtab(ehdr, shdr[i].sh_offset,
19215 + shdr[i].sh_size);
19220 + * These symbols are defined by vdso32.S to mark the bounds
19221 + * of the ELF DSO images included therein.
19223 +extern const char vdso32_default_start, vdso32_default_end;
19224 +extern const char vdso32_sysenter_start, vdso32_sysenter_end;
19225 +static struct page *vdso32_pages[1];
19227 +#ifdef CONFIG_X86_64
19229 +#if CONFIG_XEN_COMPAT < 0x030200
19230 +static int use_int80 = 1;
19232 +static int use_sysenter __read_mostly = -1;
19234 +#define vdso32_sysenter() (use_sysenter > 0)
19236 +/* May not be __init: called during resume */
19237 +void syscall32_cpu_init(void)
19239 + static const struct callback_register cstar = {
19240 + .type = CALLBACKTYPE_syscall32,
19241 + .address = (unsigned long)ia32_cstar_target
19243 + static const struct callback_register sysenter = {
19244 + .type = CALLBACKTYPE_sysenter,
19245 + .address = (unsigned long)ia32_sysenter_target
19248 + if ((HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0) ||
19249 + (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0))
19250 +#if CONFIG_XEN_COMPAT < 0x030200
19257 + if (use_sysenter < 0)
19258 + use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
19261 +#define compat_uses_vma 1
19263 +static inline void map_compat_vdso(int map)
19267 +#else /* CONFIG_X86_32 */
19269 +#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
19271 +extern asmlinkage void ia32pv_cstar_target(void);
19272 +static const struct callback_register __cpuinitconst cstar = {
19273 + .type = CALLBACKTYPE_syscall32,
19274 + .address = { __KERNEL_CS, (unsigned long)ia32pv_cstar_target },
19277 +void __cpuinit enable_sep_cpu(void)
19279 + extern asmlinkage void ia32pv_sysenter_target(void);
19280 + static struct callback_register __cpuinitdata sysenter = {
19281 + .type = CALLBACKTYPE_sysenter,
19282 + .address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target },
19285 + if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
19286 + if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0)
19291 + if (!boot_cpu_has(X86_FEATURE_SEP))
19294 + if (xen_feature(XENFEAT_supervisor_mode_kernel))
19295 + sysenter.address.eip = (unsigned long)ia32_sysenter_target;
19297 + switch (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter)) {
19300 +#if CONFIG_XEN_COMPAT < 0x030200
19302 + sysenter.type = CALLBACKTYPE_sysenter_deprecated;
19303 + if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) == 0)
19307 + setup_clear_cpu_cap(X86_FEATURE_SEP);
19312 +static struct vm_area_struct gate_vma;
19314 +static int __init gate_vma_init(void)
19316 + gate_vma.vm_mm = NULL;
19317 + gate_vma.vm_start = FIXADDR_USER_START;
19318 + gate_vma.vm_end = FIXADDR_USER_END;
19319 + gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
19320 + gate_vma.vm_page_prot = __P101;
19322 + * Make sure the vDSO gets into every core dump.
19323 + * Dumping its contents makes post-mortem fully interpretable later
19324 + * without matching up the same kernel and hardware config to see
19325 + * what PC values meant.
19327 + gate_vma.vm_flags |= VM_ALWAYSDUMP;
19331 +#define compat_uses_vma 0
19333 +static void map_compat_vdso(int map)
19335 + static int vdso_mapped;
19337 + if (map == vdso_mapped)
19340 + vdso_mapped = map;
19342 + __set_fixmap(FIX_VDSO, page_to_pfn(vdso32_pages[0]) << PAGE_SHIFT,
19343 + map ? PAGE_READONLY_EXEC : PAGE_NONE);
19345 + /* flush stray tlbs */
19349 +#endif /* CONFIG_X86_64 */
19351 +int __init sysenter_setup(void)
19353 + void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
19354 + const void *vsyscall;
19355 + size_t vsyscall_len;
19357 + vdso32_pages[0] = virt_to_page(syscall_page);
19359 +#ifdef CONFIG_X86_32
19362 + printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
19365 +#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT < 0x030200
19367 + extern const char vdso32_int80_start, vdso32_int80_end;
19369 + vsyscall = &vdso32_int80_start;
19370 + vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
19372 +#elif defined(CONFIG_X86_32)
19373 + if (boot_cpu_has(X86_FEATURE_SYSCALL)
19374 + && (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
19375 + || HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0))
19376 + setup_clear_cpu_cap(X86_FEATURE_SYSCALL);
19377 + barrier(); /* until clear_bit()'s constraints are correct ... */
19378 + if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
19379 + extern const char vdso32_syscall_start, vdso32_syscall_end;
19381 + vsyscall = &vdso32_syscall_start;
19382 + vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
19385 + if (!vdso32_sysenter()) {
19386 + vsyscall = &vdso32_default_start;
19387 + vsyscall_len = &vdso32_default_end - &vdso32_default_start;
19389 + vsyscall = &vdso32_sysenter_start;
19390 + vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
19393 + memcpy(syscall_page, vsyscall, vsyscall_len);
19394 + relocate_vdso(syscall_page);
19399 +/* Setup a VMA at program startup for the vsyscall page */
19400 +int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
19402 + struct mm_struct *mm = current->mm;
19403 + unsigned long addr;
19407 + down_write(&mm->mmap_sem);
19409 + /* Test compat mode once here, in case someone
19410 + changes it via sysctl */
19411 + compat = (vdso_enabled == VDSO_COMPAT);
19413 + map_compat_vdso(compat);
19416 + addr = VDSO_HIGH_BASE;
19418 + addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
19419 + if (IS_ERR_VALUE(addr)) {
19425 + if (compat_uses_vma || !compat) {
19427 + * MAYWRITE to allow gdb to COW and set breakpoints
19429 + * Make sure the vDSO gets into every core dump.
19430 + * Dumping its contents makes post-mortem fully
19431 + * interpretable later without matching up the same
19432 + * kernel and hardware config to see what PC values
19435 + ret = install_special_mapping(mm, addr, PAGE_SIZE,
19437 + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
19445 + current->mm->context.vdso = (void *)addr;
19446 + current_thread_info()->sysenter_return =
19447 + VDSO32_SYMBOL(addr, SYSENTER_RETURN);
19450 + up_write(&mm->mmap_sem);
19455 +#ifdef CONFIG_X86_64
19458 + * This must be done early in case we have an initrd containing 32-bit
19459 + * binaries (e.g., hotplug). This could be pushed upstream.
19461 +core_initcall(sysenter_setup);
19463 +#ifdef CONFIG_SYSCTL
19464 +/* Register vsyscall32 into the ABI table */
19465 +#include <linux/sysctl.h>
19467 +static ctl_table abi_table2[] = {
19469 + .procname = "vsyscall32",
19470 + .data = &sysctl_vsyscall32,
19471 + .maxlen = sizeof(int),
19473 + .proc_handler = proc_dointvec
19478 +static ctl_table abi_root_table2[] = {
19480 + .ctl_name = CTL_ABI,
19481 + .procname = "abi",
19483 + .child = abi_table2
19488 +static __init int ia32_binfmt_init(void)
19490 + register_sysctl_table(abi_root_table2);
19493 +__initcall(ia32_binfmt_init);
19496 +#else /* CONFIG_X86_32 */
19498 +const char *arch_vma_name(struct vm_area_struct *vma)
19500 + if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
19505 +struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
19507 + struct mm_struct *mm = tsk->mm;
19509 + /* Check to see if this task was created in compat vdso mode */
19510 + if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE)
19511 + return &gate_vma;
19515 +int in_gate_area(struct task_struct *task, unsigned long addr)
19517 + const struct vm_area_struct *vma = get_gate_vma(task);
19519 + return vma && addr >= vma->vm_start && addr < vma->vm_end;
19522 +int in_gate_area_no_task(unsigned long addr)
19527 +#endif /* CONFIG_X86_64 */
19528 --- sle11-2009-10-16.orig/drivers/pci/msi-xen.c 2009-02-16 16:18:36.000000000 +0100
19529 +++ sle11-2009-10-16/drivers/pci/msi-xen.c 2009-03-16 16:33:40.000000000 +0100
19530 @@ -43,6 +43,53 @@ struct msi_pirq_entry {
19536 +int __attribute__ ((weak))
19537 +arch_msi_check_device(struct pci_dev *dev, int nvec, int type)
19542 +#ifndef CONFIG_XEN
19543 +int __attribute__ ((weak))
19544 +arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *entry)
19549 +int __attribute__ ((weak))
19550 +arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
19552 + struct msi_desc *entry;
19555 + list_for_each_entry(entry, &dev->msi_list, list) {
19556 + ret = arch_setup_msi_irq(dev, entry);
19564 +void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq)
19569 +void __attribute__ ((weak))
19570 +arch_teardown_msi_irqs(struct pci_dev *dev)
19572 + struct msi_desc *entry;
19574 + list_for_each_entry(entry, &dev->msi_list, list) {
19575 + if (entry->irq != 0)
19576 + arch_teardown_msi_irq(entry->irq);
19581 static void msi_set_enable(struct pci_dev *dev, int enable)
19584 @@ -270,7 +317,6 @@ static void pci_intx_for_msi(struct pci_
19585 pci_intx(dev, enable);
19589 static void __pci_restore_msi_state(struct pci_dev *dev)
19592 @@ -328,7 +374,7 @@ void pci_restore_msi_state(struct pci_de
19593 __pci_restore_msi_state(dev);
19594 __pci_restore_msix_state(dev);
19596 -#endif /* CONFIG_PM */
19597 +EXPORT_SYMBOL_GPL(pci_restore_msi_state);
19600 * msi_capability_init - configure device's MSI capability structure
19601 @@ -755,51 +801,3 @@ void pci_msi_init_pci_dev(struct pci_dev
19602 INIT_LIST_HEAD(&dev->msi_list);
19609 -int __attribute__ ((weak))
19610 -arch_msi_check_device(struct pci_dev* dev, int nvec, int type)
19615 -#ifndef CONFIG_XEN
19616 -int __attribute__ ((weak))
19617 -arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *entry)
19622 -int __attribute__ ((weak))
19623 -arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
19625 - struct msi_desc *entry;
19628 - list_for_each_entry(entry, &dev->msi_list, list) {
19629 - ret = arch_setup_msi_irq(dev, entry);
19637 -void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq)
19642 -void __attribute__ ((weak))
19643 -arch_teardown_msi_irqs(struct pci_dev *dev)
19645 - struct msi_desc *entry;
19647 - list_for_each_entry(entry, &dev->msi_list, list) {
19648 - if (entry->irq != 0)
19649 - arch_teardown_msi_irq(entry->irq);
19653 --- sle11-2009-10-16.orig/drivers/pci/pci.c 2009-10-28 14:55:04.000000000 +0100
19654 +++ sle11-2009-10-16/drivers/pci/pci.c 2009-03-16 16:33:40.000000000 +0100
19655 @@ -353,7 +353,12 @@ pci_find_parent_resource(const struct pc
19656 * Restore the BAR values for a given device, so as to make it
19657 * accessible by its driver.
19659 +#ifndef CONFIG_XEN
19662 +EXPORT_SYMBOL_GPL(pci_restore_bars);
19665 pci_restore_bars(struct pci_dev *dev)
19668 --- sle11-2009-10-16.orig/drivers/xen/balloon/sysfs.c 2009-03-04 11:25:55.000000000 +0100
19669 +++ sle11-2009-10-16/drivers/xen/balloon/sysfs.c 2009-06-29 15:29:24.000000000 +0200
19670 @@ -104,7 +104,7 @@ static struct attribute_group balloon_in
19673 static struct sysdev_class balloon_sysdev_class = {
19674 - set_kset_name(BALLOON_CLASS_NAME),
19675 + .name = BALLOON_CLASS_NAME,
19678 static struct sys_device balloon_sysdev;
19679 --- sle11-2009-10-16.orig/drivers/xen/blkback/blkback.c 2009-02-16 16:18:36.000000000 +0100
19680 +++ sle11-2009-10-16/drivers/xen/blkback/blkback.c 2009-03-16 16:33:40.000000000 +0100
19681 @@ -148,7 +148,7 @@ static void unplug_queue(blkif_t *blkif)
19683 if (blkif->plug->unplug_fn)
19684 blkif->plug->unplug_fn(blkif->plug);
19685 - blk_put_queue(blkif->plug);
19686 + kobject_put(&blkif->plug->kobj);
19687 blkif->plug = NULL;
19690 @@ -159,7 +159,8 @@ static void plug_queue(blkif_t *blkif, s
19691 if (q == blkif->plug)
19693 unplug_queue(blkif);
19694 - blk_get_queue(q);
19695 + WARN_ON(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags));
19696 + kobject_get(&q->kobj);
19700 --- sle11-2009-10-16.orig/drivers/xen/blkfront/blkfront.c 2009-02-16 16:18:36.000000000 +0100
19701 +++ sle11-2009-10-16/drivers/xen/blkfront/blkfront.c 2009-03-24 10:12:53.000000000 +0100
19702 @@ -713,7 +713,6 @@ static irqreturn_t blkif_int(int irq, vo
19704 unsigned long flags;
19705 struct blkfront_info *info = (struct blkfront_info *)dev_id;
19708 spin_lock_irqsave(&blkif_io_lock, flags);
19710 @@ -738,13 +737,13 @@ static irqreturn_t blkif_int(int irq, vo
19712 ADD_ID_TO_FREELIST(info, id);
19714 - uptodate = (bret->status == BLKIF_RSP_OKAY);
19715 + ret = bret->status == BLKIF_RSP_OKAY ? 0 : -EIO;
19716 switch (bret->operation) {
19717 case BLKIF_OP_WRITE_BARRIER:
19718 if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
19719 printk("blkfront: %s: write barrier op failed\n",
19720 info->gd->disk_name);
19721 - uptodate = -EOPNOTSUPP;
19722 + ret = -EOPNOTSUPP;
19723 info->feature_barrier = 0;
19724 xlvbd_barrier(info);
19726 @@ -755,10 +754,8 @@ static irqreturn_t blkif_int(int irq, vo
19727 DPRINTK("Bad return from blkdev data "
19728 "request: %x\n", bret->status);
19730 - ret = end_that_request_first(req, uptodate,
19731 - req->hard_nr_sectors);
19732 + ret = __blk_end_request(req, ret, blk_rq_bytes(req));
19734 - end_that_request_last(req, uptodate);
19738 --- sle11-2009-10-16.orig/drivers/xen/blktap/blktap.c 2009-04-20 11:37:50.000000000 +0200
19739 +++ sle11-2009-10-16/drivers/xen/blktap/blktap.c 2009-04-20 11:38:54.000000000 +0200
19740 @@ -331,8 +331,8 @@ static pte_t blktap_clear_pte(struct vm_
19741 * if vm_file is NULL (meaning mmap failed and we have nothing to do)
19743 if (uvaddr < uvstart || vma->vm_file == NULL)
19744 - return ptep_get_and_clear_full(vma->vm_mm, uvaddr,
19745 - ptep, is_fullmm);
19746 + return xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
19749 info = vma->vm_file->private_data;
19750 priv = vma->vm_private_data;
19751 @@ -379,8 +379,8 @@ static pte_t blktap_clear_pte(struct vm_
19752 BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap));
19754 /* USING SHADOW PAGE TABLES. */
19755 - copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep,
19757 + copy = xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
19762 --- sle11-2009-10-16.orig/drivers/xen/core/Makefile 2009-10-28 14:55:04.000000000 +0100
19763 +++ sle11-2009-10-16/drivers/xen/core/Makefile 2009-03-16 16:33:40.000000000 +0100
19764 @@ -10,5 +10,6 @@ obj-$(CONFIG_SYS_HYPERVISOR) += hypervis
19765 obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
19766 obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o
19767 obj-$(CONFIG_XEN_SMPBOOT) += smpboot.o
19768 +obj-$(CONFIG_X86_SMP) += spinlock.o
19769 obj-$(CONFIG_KEXEC) += machine_kexec.o
19770 obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
19771 --- sle11-2009-10-16.orig/drivers/xen/core/evtchn.c 2009-03-04 11:25:55.000000000 +0100
19772 +++ sle11-2009-10-16/drivers/xen/core/evtchn.c 2009-03-16 16:33:40.000000000 +0100
19773 @@ -194,7 +194,7 @@ static inline unsigned int cpu_from_evtc
19775 /* Upcall to generic IRQ layer. */
19777 -extern fastcall unsigned int do_IRQ(struct pt_regs *regs);
19778 +extern unsigned int do_IRQ(struct pt_regs *regs);
19779 void __init xen_init_IRQ(void);
19780 void __init init_IRQ(void)
19782 @@ -203,13 +203,11 @@ void __init init_IRQ(void)
19784 #if defined (__i386__)
19785 static inline void exit_idle(void) {}
19786 -#define IRQ_REG orig_eax
19787 #elif defined (__x86_64__)
19788 #include <asm/idle.h>
19789 -#define IRQ_REG orig_rax
19791 #define do_IRQ(irq, regs) do { \
19792 - (regs)->IRQ_REG = ~(irq); \
19793 + (regs)->orig_ax = ~(irq); \
19797 @@ -670,13 +668,12 @@ static void set_affinity_irq(unsigned in
19798 int resend_irq_on_evtchn(unsigned int irq)
19800 int masked, evtchn = evtchn_from_irq(irq);
19801 - shared_info_t *s = HYPERVISOR_shared_info;
19803 if (!VALID_EVTCHN(evtchn))
19806 masked = test_and_set_evtchn_mask(evtchn);
19807 - synch_set_bit(evtchn, s->evtchn_pending);
19808 + set_evtchn(evtchn);
19810 unmask_evtchn(evtchn);
19812 @@ -969,6 +966,43 @@ void disable_all_local_evtchn(void)
19813 synch_set_bit(i, &s->evtchn_mask[0]);
19816 +/* Clear an irq's pending state, in preparation for polling on it. */
19817 +void xen_clear_irq_pending(int irq)
19819 + int evtchn = evtchn_from_irq(irq);
19821 + if (VALID_EVTCHN(evtchn))
19822 + clear_evtchn(evtchn);
19825 +/* Set an irq's pending state, to avoid blocking on it. */
19826 +void xen_set_irq_pending(int irq)
19828 + int evtchn = evtchn_from_irq(irq);
19830 + if (VALID_EVTCHN(evtchn))
19831 + set_evtchn(evtchn);
19834 +/* Test an irq's pending state. */
19835 +int xen_test_irq_pending(int irq)
19837 + int evtchn = evtchn_from_irq(irq);
19839 + return VALID_EVTCHN(evtchn) && test_evtchn(evtchn);
19842 +/* Poll waiting for an irq to become pending. In the usual case, the
19843 + irq will be disabled so it won't deliver an interrupt. */
19844 +void xen_poll_irq(int irq)
19846 + evtchn_port_t evtchn = evtchn_from_irq(irq);
19848 + if (VALID_EVTCHN(evtchn)
19849 + && HYPERVISOR_poll_no_timeout(&evtchn, 1))
19853 static void restore_cpu_virqs(unsigned int cpu)
19855 struct evtchn_bind_virq bind_virq;
19856 @@ -1022,8 +1056,8 @@ static void restore_cpu_ipis(unsigned in
19857 bind_evtchn_to_cpu(evtchn, cpu);
19859 /* Ready for use. */
19860 - unmask_evtchn(evtchn);
19862 + if (!(irq_desc[irq].status & IRQ_DISABLED))
19863 + unmask_evtchn(evtchn);
19867 --- sle11-2009-10-16.orig/drivers/xen/core/hypervisor_sysfs.c 2008-12-15 11:27:22.000000000 +0100
19868 +++ sle11-2009-10-16/drivers/xen/core/hypervisor_sysfs.c 2009-03-16 16:33:40.000000000 +0100
19869 @@ -50,7 +50,7 @@ static int __init hypervisor_subsys_init
19870 if (!is_running_on_xen())
19873 - hypervisor_subsys.kobj.ktype = &hyp_sysfs_kobj_type;
19874 + hypervisor_kobj->ktype = &hyp_sysfs_kobj_type;
19878 --- sle11-2009-10-16.orig/drivers/xen/core/smpboot.c 2009-02-16 16:18:36.000000000 +0100
19879 +++ sle11-2009-10-16/drivers/xen/core/smpboot.c 2009-03-16 16:33:40.000000000 +0100
19880 @@ -135,6 +135,10 @@ static int __cpuinit xen_smp_intr_init(u
19882 per_cpu(callfunc_irq, cpu) = rc;
19884 + rc = xen_spinlock_init(cpu);
19888 if ((cpu != 0) && ((rc = local_setup_timer(cpu)) != 0))
19891 @@ -145,6 +149,7 @@ static int __cpuinit xen_smp_intr_init(u
19892 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
19893 if (per_cpu(callfunc_irq, cpu) >= 0)
19894 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
19895 + xen_spinlock_cleanup(cpu);
19899 @@ -156,6 +161,7 @@ static void xen_smp_intr_exit(unsigned i
19901 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
19902 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
19903 + xen_spinlock_cleanup(cpu);
19907 @@ -208,36 +214,25 @@ static void __cpuinit cpu_initialize_con
19908 smp_trap_init(ctxt.trap_ctxt);
19911 - ctxt.gdt_ents = GDT_SIZE / 8;
19914 ctxt.gdt_frames[0] = virt_to_mfn(get_cpu_gdt_table(cpu));
19915 + ctxt.gdt_ents = GDT_SIZE / 8;
19917 ctxt.user_regs.cs = __KERNEL_CS;
19918 - ctxt.user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
19919 + ctxt.user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
19921 ctxt.kernel_ss = __KERNEL_DS;
19922 - ctxt.kernel_sp = idle->thread.esp0;
19923 + ctxt.kernel_sp = idle->thread.sp0;
19925 - ctxt.event_callback_cs = __KERNEL_CS;
19926 ctxt.event_callback_eip = (unsigned long)hypervisor_callback;
19927 - ctxt.failsafe_callback_cs = __KERNEL_CS;
19928 ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
19930 + ctxt.event_callback_cs = __KERNEL_CS;
19931 + ctxt.failsafe_callback_cs = __KERNEL_CS;
19933 ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
19935 ctxt.user_regs.fs = __KERNEL_PERCPU;
19936 #else /* __x86_64__ */
19937 - ctxt.gdt_frames[0] = virt_to_mfn(cpu_gdt_descr[cpu].address);
19939 - ctxt.user_regs.cs = __KERNEL_CS;
19940 - ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
19942 - ctxt.kernel_ss = __KERNEL_DS;
19943 - ctxt.kernel_sp = idle->thread.rsp0;
19945 - ctxt.event_callback_eip = (unsigned long)hypervisor_callback;
19946 - ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
19947 ctxt.syscall_callback_eip = (unsigned long)system_call;
19949 ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));
19950 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
19951 +++ sle11-2009-10-16/drivers/xen/core/spinlock.c 2009-03-16 16:33:40.000000000 +0100
19954 + * Xen spinlock functions
19956 + * See arch/x86/xen/smp.c for copyright and credits for derived
19957 + * portions of this file.
19960 +#include <linux/init.h>
19961 +#include <linux/irq.h>
19962 +#include <linux/kernel.h>
19963 +#include <linux/kernel_stat.h>
19964 +#include <linux/module.h>
19965 +#include <xen/evtchn.h>
19967 +extern irqreturn_t smp_reschedule_interrupt(int, void *);
19969 +static DEFINE_PER_CPU(int, spinlock_irq) = -1;
19970 +static char spinlock_name[NR_CPUS][15];
19973 + raw_spinlock_t *lock;
19974 + unsigned int ticket;
19975 + struct spinning *prev;
19977 +static DEFINE_PER_CPU(struct spinning *, spinning);
19979 + * Protect removal of objects: Addition can be done lockless, and even
19980 + * removal itself doesn't need protection - what needs to be prevented is
19981 + * removed objects going out of scope (as they're allocated on the stack).
19983 +static DEFINE_PER_CPU(raw_rwlock_t, spinning_rm_lock) = __RAW_RW_LOCK_UNLOCKED;
19985 +int __cpuinit xen_spinlock_init(unsigned int cpu)
19989 + sprintf(spinlock_name[cpu], "spinlock%u", cpu);
19990 + rc = bind_ipi_to_irqhandler(SPIN_UNLOCK_VECTOR,
19992 + smp_reschedule_interrupt,
19993 + IRQF_DISABLED|IRQF_NOBALANCING,
19994 + spinlock_name[cpu],
19999 + disable_irq(rc); /* make sure it's never delivered */
20000 + per_cpu(spinlock_irq, cpu) = rc;
20005 +void __cpuinit xen_spinlock_cleanup(unsigned int cpu)
20007 + if (per_cpu(spinlock_irq, cpu) >= 0)
20008 + unbind_from_irqhandler(per_cpu(spinlock_irq, cpu), NULL);
20009 + per_cpu(spinlock_irq, cpu) = -1;
20012 +int xen_spin_wait(raw_spinlock_t *lock, unsigned int token)
20014 + int rc = 0, irq = __get_cpu_var(spinlock_irq);
20015 + raw_rwlock_t *rm_lock;
20016 + unsigned long flags;
20017 + struct spinning spinning;
20019 + /* If kicker interrupt not initialized yet, just spin. */
20020 + if (unlikely(irq < 0) || unlikely(!cpu_online(raw_smp_processor_id())))
20023 + token >>= TICKET_SHIFT;
20025 + /* announce we're spinning */
20026 + spinning.ticket = token;
20027 + spinning.lock = lock;
20028 + spinning.prev = __get_cpu_var(spinning);
20030 + __get_cpu_var(spinning) = &spinning;
20032 + /* clear pending */
20033 + xen_clear_irq_pending(irq);
20036 + /* Check again to make sure it didn't become free while
20037 + * we weren't looking. */
20038 + if ((lock->slock & ((1U << TICKET_SHIFT) - 1)) == token) {
20039 + /* If we interrupted another spinlock while it was
20040 + * blocking, make sure it doesn't block (again)
20041 + * without rechecking the lock. */
20042 + if (spinning.prev)
20043 + xen_set_irq_pending(irq);
20048 + /* block until irq becomes pending */
20049 + xen_poll_irq(irq);
20050 + } while (!xen_test_irq_pending(irq));
20052 + /* Leave the irq pending so that any interrupted blocker will
20054 + kstat_this_cpu.irqs[irq] += !rc;
20056 + /* announce we're done */
20057 + __get_cpu_var(spinning) = spinning.prev;
20058 + rm_lock = &__get_cpu_var(spinning_rm_lock);
20059 + raw_local_irq_save(flags);
20060 + __raw_write_lock(rm_lock);
20061 + __raw_write_unlock(rm_lock);
20062 + raw_local_irq_restore(flags);
20066 +EXPORT_SYMBOL(xen_spin_wait);
20068 +unsigned int xen_spin_adjust(raw_spinlock_t *lock, unsigned int token)
20070 + return token;//todo
20072 +EXPORT_SYMBOL(xen_spin_adjust);
20074 +int xen_spin_wait_flags(raw_spinlock_t *lock, unsigned int *token,
20075 + unsigned int flags)
20077 + return xen_spin_wait(lock, *token);//todo
20079 +EXPORT_SYMBOL(xen_spin_wait_flags);
20081 +void xen_spin_kick(raw_spinlock_t *lock, unsigned int token)
20083 + unsigned int cpu;
20085 + token &= (1U << TICKET_SHIFT) - 1;
20086 + for_each_online_cpu(cpu) {
20087 + raw_rwlock_t *rm_lock;
20088 + unsigned long flags;
20089 + struct spinning *spinning;
20091 + if (cpu == raw_smp_processor_id())
20094 + rm_lock = &per_cpu(spinning_rm_lock, cpu);
20095 + raw_local_irq_save(flags);
20096 + __raw_read_lock(rm_lock);
20098 + spinning = per_cpu(spinning, cpu);
20101 + && (spinning->lock != lock || spinning->ticket != token))
20104 + __raw_read_unlock(rm_lock);
20105 + raw_local_irq_restore(flags);
20107 + if (unlikely(spinning)) {
20108 + notify_remote_via_irq(per_cpu(spinlock_irq, cpu));
20113 +EXPORT_SYMBOL(xen_spin_kick);
20114 --- sle11-2009-10-16.orig/drivers/xen/core/xen_sysfs.c 2008-12-15 11:27:22.000000000 +0100
20115 +++ sle11-2009-10-16/drivers/xen/core/xen_sysfs.c 2009-03-16 16:33:40.000000000 +0100
20116 @@ -29,12 +29,12 @@ HYPERVISOR_ATTR_RO(type);
20118 static int __init xen_sysfs_type_init(void)
20120 - return sysfs_create_file(&hypervisor_subsys.kobj, &type_attr.attr);
20121 + return sysfs_create_file(hypervisor_kobj, &type_attr.attr);
20124 static void xen_sysfs_type_destroy(void)
20126 - sysfs_remove_file(&hypervisor_subsys.kobj, &type_attr.attr);
20127 + sysfs_remove_file(hypervisor_kobj, &type_attr.attr);
20130 /* xen version attributes */
20131 @@ -90,13 +90,12 @@ static struct attribute_group version_gr
20133 static int __init xen_sysfs_version_init(void)
20135 - return sysfs_create_group(&hypervisor_subsys.kobj,
20137 + return sysfs_create_group(hypervisor_kobj, &version_group);
20140 static void xen_sysfs_version_destroy(void)
20142 - sysfs_remove_group(&hypervisor_subsys.kobj, &version_group);
20143 + sysfs_remove_group(hypervisor_kobj, &version_group);
20147 @@ -126,12 +125,12 @@ HYPERVISOR_ATTR_RO(uuid);
20149 static int __init xen_sysfs_uuid_init(void)
20151 - return sysfs_create_file(&hypervisor_subsys.kobj, &uuid_attr.attr);
20152 + return sysfs_create_file(hypervisor_kobj, &uuid_attr.attr);
20155 static void xen_sysfs_uuid_destroy(void)
20157 - sysfs_remove_file(&hypervisor_subsys.kobj, &uuid_attr.attr);
20158 + sysfs_remove_file(hypervisor_kobj, &uuid_attr.attr);
20161 /* xen compilation attributes */
20162 @@ -204,14 +203,12 @@ static struct attribute_group xen_compil
20164 int __init static xen_compilation_init(void)
20166 - return sysfs_create_group(&hypervisor_subsys.kobj,
20167 - &xen_compilation_group);
20168 + return sysfs_create_group(hypervisor_kobj, &xen_compilation_group);
20171 static void xen_compilation_destroy(void)
20173 - sysfs_remove_group(&hypervisor_subsys.kobj,
20174 - &xen_compilation_group);
20175 + sysfs_remove_group(hypervisor_kobj, &xen_compilation_group);
20178 /* xen properties info */
20179 @@ -325,14 +322,12 @@ static struct attribute_group xen_proper
20181 static int __init xen_properties_init(void)
20183 - return sysfs_create_group(&hypervisor_subsys.kobj,
20184 - &xen_properties_group);
20185 + return sysfs_create_group(hypervisor_kobj, &xen_properties_group);
20188 static void xen_properties_destroy(void)
20190 - sysfs_remove_group(&hypervisor_subsys.kobj,
20191 - &xen_properties_group);
20192 + sysfs_remove_group(hypervisor_kobj, &xen_properties_group);
20195 #ifdef CONFIG_KEXEC
20196 @@ -350,13 +345,12 @@ HYPERVISOR_ATTR_RO(vmcoreinfo);
20198 static int __init xen_sysfs_vmcoreinfo_init(void)
20200 - return sysfs_create_file(&hypervisor_subsys.kobj,
20201 - &vmcoreinfo_attr.attr);
20202 + return sysfs_create_file(hypervisor_kobj, &vmcoreinfo_attr.attr);
20205 static void xen_sysfs_vmcoreinfo_destroy(void)
20207 - sysfs_remove_file(&hypervisor_subsys.kobj, &vmcoreinfo_attr.attr);
20208 + sysfs_remove_file(hypervisor_kobj, &vmcoreinfo_attr.attr);
20212 --- sle11-2009-10-16.orig/drivers/xen/gntdev/gntdev.c 2009-03-04 11:28:34.000000000 +0100
20213 +++ sle11-2009-10-16/drivers/xen/gntdev/gntdev.c 2009-03-16 16:33:40.000000000 +0100
20214 @@ -782,7 +782,7 @@ static pte_t gntdev_clear_pte(struct vm_
20217 /* USING SHADOW PAGE TABLES. */
20218 - copy = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
20219 + copy = xen_ptep_get_and_clear_full(vma, addr, ptep, is_fullmm);
20222 /* Finally, we unmap the grant from kernel space. */
20223 @@ -810,7 +810,7 @@ static pte_t gntdev_clear_pte(struct vm_
20224 >> PAGE_SHIFT, INVALID_P2M_ENTRY);
20227 - copy = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
20228 + copy = xen_ptep_get_and_clear_full(vma, addr, ptep, is_fullmm);
20232 --- sle11-2009-10-16.orig/drivers/xen/scsifront/scsifront.c 2009-02-16 16:18:36.000000000 +0100
20233 +++ sle11-2009-10-16/drivers/xen/scsifront/scsifront.c 2009-03-16 16:33:40.000000000 +0100
20234 @@ -260,19 +260,19 @@ static int map_data_for_request(struct v
20238 - if (sc->use_sg) {
20239 + if (scsi_bufflen(sc)) {
20240 /* quoted scsi_lib.c/scsi_req_map_sg . */
20241 - struct scatterlist *sg, *sgl = (struct scatterlist *)sc->request_buffer;
20242 - unsigned int data_len = sc->request_bufflen;
20243 + struct scatterlist *sg, *sgl = scsi_sglist(sc);
20244 + unsigned int data_len = scsi_bufflen(sc);
20246 - nr_pages = (sc->request_bufflen + sgl->offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
20247 + nr_pages = (data_len + sgl->offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
20248 if (nr_pages > VSCSIIF_SG_TABLESIZE) {
20249 printk(KERN_ERR "scsifront: Unable to map request_buffer for command!\n");
20250 ref_cnt = (-E2BIG);
20254 - for_each_sg (sgl, sg, sc->use_sg, i) {
20255 + for_each_sg (sgl, sg, scsi_sg_count(sc), i) {
20256 page = sg_page(sg);
20259 @@ -306,45 +306,6 @@ static int map_data_for_request(struct v
20263 - } else if (sc->request_bufflen) {
20264 - unsigned long end = ((unsigned long)sc->request_buffer
20265 - + sc->request_bufflen + PAGE_SIZE - 1) >> PAGE_SHIFT;
20266 - unsigned long start = (unsigned long)sc->request_buffer >> PAGE_SHIFT;
20268 - page = virt_to_page(sc->request_buffer);
20269 - nr_pages = end - start;
20270 - len = sc->request_bufflen;
20272 - if (nr_pages > VSCSIIF_SG_TABLESIZE) {
20273 - ref_cnt = (-E2BIG);
20277 - buffer_pfn = page_to_phys(page) >> PAGE_SHIFT;
20279 - off = offset_in_page((unsigned long)sc->request_buffer);
20280 - for (i = 0; i < nr_pages; i++) {
20281 - bytes = PAGE_SIZE - off;
20286 - ref = gnttab_claim_grant_reference(&gref_head);
20287 - BUG_ON(ref == -ENOSPC);
20289 - gnttab_grant_foreign_access_ref(ref, info->dev->otherend_id,
20290 - buffer_pfn, write);
20292 - info->shadow[id].gref[i] = ref;
20293 - ring_req->seg[i].gref = ref;
20294 - ring_req->seg[i].offset = (uint16_t)off;
20295 - ring_req->seg[i].length = (uint16_t)bytes;
20305 --- sle11-2009-10-16.orig/drivers/xen/xenoprof/xenoprofile.c 2009-03-11 15:39:38.000000000 +0100
20306 +++ sle11-2009-10-16/drivers/xen/xenoprof/xenoprofile.c 2009-03-16 16:33:40.000000000 +0100
20307 @@ -78,7 +78,7 @@ static int xenoprof_resume(struct sys_de
20310 static struct sysdev_class oprofile_sysclass = {
20311 - set_kset_name("oprofile"),
20312 + .name = "oprofile",
20313 .resume = xenoprof_resume,
20314 .suspend = xenoprof_suspend
20316 --- sle11-2009-10-16.orig/include/asm-x86/e820.h 2009-10-28 14:55:04.000000000 +0100
20317 +++ sle11-2009-10-16/include/asm-x86/e820.h 2009-03-16 16:33:40.000000000 +0100
20318 @@ -127,7 +127,11 @@ extern char *memory_setup(void);
20319 #endif /* __KERNEL__ */
20320 #endif /* __ASSEMBLY__ */
20322 +#ifndef CONFIG_XEN
20323 #define ISA_START_ADDRESS 0xa0000
20325 +#define ISA_START_ADDRESS 0
20327 #define ISA_END_ADDRESS 0x100000
20328 #define is_ISA_range(s, e) ((s) >= ISA_START_ADDRESS && (e) < ISA_END_ADDRESS)
20330 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/agp.h 2009-02-16 16:18:36.000000000 +0100
20331 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/agp.h 2009-03-16 16:33:40.000000000 +0100
20332 @@ -13,18 +13,13 @@
20333 * page. This avoids data corruption on some CPUs.
20337 - * Caller's responsibility to call global_flush_tlb() for performance
20340 #define map_page_into_agp(page) ( \
20341 xen_create_contiguous_region((unsigned long)page_address(page), 0, 32) \
20342 - ?: change_page_attr(page, 1, PAGE_KERNEL_NOCACHE))
20343 + ?: set_pages_uc(page, 1))
20344 #define unmap_page_from_agp(page) ( \
20345 xen_destroy_contiguous_region((unsigned long)page_address(page), 0), \
20346 /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \
20347 - change_page_attr(page, 1, PAGE_KERNEL))
20348 -#define flush_agp_mappings() global_flush_tlb()
20349 + set_pages_wb(page, 1))
20352 * Could use CLFLUSH here if the cpu supports it. But then it would
20353 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/desc.h 2009-02-16 16:18:36.000000000 +0100
20354 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/desc.h 2009-03-16 16:33:40.000000000 +0100
20356 +#ifndef _ASM_DESC_H_
20357 +#define _ASM_DESC_H_
20359 +#ifndef __ASSEMBLY__
20360 +#include <asm/desc_defs.h>
20361 +#include <asm/ldt.h>
20362 +#include <asm/mmu.h>
20363 +#include <linux/smp.h>
20365 +static inline void fill_ldt(struct desc_struct *desc,
20366 + const struct user_desc *info)
20368 + desc->limit0 = info->limit & 0x0ffff;
20369 + desc->base0 = info->base_addr & 0x0000ffff;
20371 + desc->base1 = (info->base_addr & 0x00ff0000) >> 16;
20372 + desc->type = (info->read_exec_only ^ 1) << 1;
20373 + desc->type |= info->contents << 2;
20376 + desc->p = info->seg_not_present ^ 1;
20377 + desc->limit = (info->limit & 0xf0000) >> 16;
20378 + desc->avl = info->useable;
20379 + desc->d = info->seg_32bit;
20380 + desc->g = info->limit_in_pages;
20381 + desc->base2 = (info->base_addr & 0xff000000) >> 24;
20384 +#ifndef CONFIG_X86_NO_IDT
20385 +extern struct desc_ptr idt_descr;
20386 +extern gate_desc idt_table[];
20389 +#ifdef CONFIG_X86_64
20390 +extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
20391 +extern struct desc_ptr cpu_gdt_descr[];
20392 +/* the cpu gdt accessor */
20393 +#define get_cpu_gdt_table(x) ((struct desc_struct *)cpu_gdt_descr[x].address)
20395 +static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
20396 + unsigned dpl, unsigned ist, unsigned seg)
20398 + gate->offset_low = PTR_LOW(func);
20399 + gate->segment = __KERNEL_CS;
20405 + gate->type = type;
20406 + gate->offset_middle = PTR_MIDDLE(func);
20407 + gate->offset_high = PTR_HIGH(func);
20412 + struct desc_struct gdt[GDT_ENTRIES];
20413 +} __attribute__((aligned(PAGE_SIZE)));
20414 +DECLARE_PER_CPU(struct gdt_page, gdt_page);
20416 +static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
20418 + return per_cpu(gdt_page, cpu).gdt;
20421 +static inline void pack_gate(gate_desc *gate, unsigned char type,
20422 + unsigned long base, unsigned dpl, unsigned flags, unsigned short seg)
20425 + gate->a = (seg << 16) | (base & 0xffff);
20426 + gate->b = (base & 0xffff0000) |
20427 + (((0x80 | type | (dpl << 5)) & 0xff) << 8);
20432 +static inline int desc_empty(const void *ptr)
20434 + const u32 *desc = ptr;
20435 + return !(desc[0] | desc[1]);
20438 +#ifndef CONFIG_XEN
20439 +#define load_TR_desc() native_load_tr_desc()
20440 +#define load_gdt(dtr) native_load_gdt(dtr)
20441 +#define load_idt(dtr) native_load_idt(dtr)
20442 +#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
20443 +#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
20445 +#define store_gdt(dtr) native_store_gdt(dtr)
20446 +#define store_idt(dtr) native_store_idt(dtr)
20447 +#define store_tr(tr) (tr = native_store_tr())
20448 +#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
20450 +#define load_TLS(t, cpu) native_load_tls(t, cpu)
20451 +#define set_ldt native_set_ldt
20453 +#define write_ldt_entry(dt, entry, desc) \
20454 + native_write_ldt_entry(dt, entry, desc)
20455 +#define write_gdt_entry(dt, entry, desc, type) \
20456 + native_write_gdt_entry(dt, entry, desc, type)
20457 +#define write_idt_entry(dt, entry, g) native_write_idt_entry(dt, entry, g)
20459 +static inline void native_write_idt_entry(gate_desc *idt, int entry,
20460 + const gate_desc *gate)
20462 + memcpy(&idt[entry], gate, sizeof(*gate));
20465 +static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry,
20466 + const void *desc)
20468 + memcpy(&ldt[entry], desc, 8);
20471 +static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry,
20472 + const void *desc, int type)
20474 + unsigned int size;
20477 + size = sizeof(tss_desc);
20480 + size = sizeof(ldt_desc);
20483 + size = sizeof(struct desc_struct);
20486 + memcpy(&gdt[entry], desc, size);
20490 +static inline void pack_descriptor(struct desc_struct *desc, unsigned long base,
20491 + unsigned long limit, unsigned char type,
20492 + unsigned char flags)
20494 + desc->a = ((base & 0xffff) << 16) | (limit & 0xffff);
20495 + desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
20496 + (limit & 0x000f0000) | ((type & 0xff) << 8) |
20497 + ((flags & 0xf) << 20);
20502 +#ifndef CONFIG_XEN
20503 +static inline void set_tssldt_descriptor(void *d, unsigned long addr,
20504 + unsigned type, unsigned size)
20506 +#ifdef CONFIG_X86_64
20507 + struct ldttss_desc64 *desc = d;
20508 + memset(desc, 0, sizeof(*desc));
20509 + desc->limit0 = size & 0xFFFF;
20510 + desc->base0 = PTR_LOW(addr);
20511 + desc->base1 = PTR_MIDDLE(addr) & 0xFF;
20512 + desc->type = type;
20514 + desc->limit1 = (size >> 16) & 0xF;
20515 + desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
20516 + desc->base3 = PTR_HIGH(addr);
20519 + pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
20523 +static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
20525 + struct desc_struct *d = get_cpu_gdt_table(cpu);
20529 + * sizeof(unsigned long) coming from an extra "long" at the end
20530 + * of the iobitmap. See tss_struct definition in processor.h
20532 + * -1? seg base+limit should be pointing to the address of the
20533 + * last valid byte
20535 + set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
20536 + IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
20537 + write_gdt_entry(d, entry, &tss, DESC_TSS);
20540 +#define set_tss_desc(cpu, addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
20542 +static inline void native_set_ldt(const void *addr, unsigned int entries)
20544 + if (likely(entries == 0))
20545 + __asm__ __volatile__("lldt %w0"::"q" (0));
20547 + unsigned cpu = smp_processor_id();
20550 + set_tssldt_descriptor(&ldt, (unsigned long)addr,
20551 + DESC_LDT, entries * sizeof(ldt) - 1);
20552 + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT,
20554 + __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
20558 +static inline void native_load_tr_desc(void)
20560 + asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
20563 +static inline void native_load_gdt(const struct desc_ptr *dtr)
20565 + asm volatile("lgdt %0"::"m" (*dtr));
20568 +static inline void native_load_idt(const struct desc_ptr *dtr)
20570 + asm volatile("lidt %0"::"m" (*dtr));
20573 +static inline void native_store_gdt(struct desc_ptr *dtr)
20575 + asm volatile("sgdt %0":"=m" (*dtr));
20578 +static inline void native_store_idt(struct desc_ptr *dtr)
20580 + asm volatile("sidt %0":"=m" (*dtr));
20583 +static inline unsigned long native_store_tr(void)
20585 + unsigned long tr;
20586 + asm volatile("str %0":"=r" (tr));
20590 +static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
20593 + struct desc_struct *gdt = get_cpu_gdt_table(cpu);
20595 + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
20596 + gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
20599 +#define load_TLS(t, cpu) xen_load_tls(t, cpu)
20600 +#define set_ldt xen_set_ldt
20602 +extern int write_ldt_entry(struct desc_struct *ldt, int entry,
20603 + const void *desc);
20604 +extern int write_gdt_entry(struct desc_struct *gdt, int entry,
20605 + const void *desc, int type);
20607 +static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu)
20610 + struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN;
20612 + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
20613 + if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
20614 + *(u64 *)&t->tls_array[i]))
20619 +#define _LDT_empty(info) (\
20620 + (info)->base_addr == 0 && \
20621 + (info)->limit == 0 && \
20622 + (info)->contents == 0 && \
20623 + (info)->read_exec_only == 1 && \
20624 + (info)->seg_32bit == 0 && \
20625 + (info)->limit_in_pages == 0 && \
20626 + (info)->seg_not_present == 1 && \
20627 + (info)->useable == 0)
20629 +#ifdef CONFIG_X86_64
20630 +#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0))
20632 +#define LDT_empty(info) (_LDT_empty(info))
20635 +static inline void clear_LDT(void)
20637 + set_ldt(NULL, 0);
20641 + * load one particular LDT into the current CPU
20643 +static inline void load_LDT_nolock(mm_context_t *pc)
20645 + set_ldt(pc->ldt, pc->size);
20648 +static inline void load_LDT(mm_context_t *pc)
20650 + preempt_disable();
20651 + load_LDT_nolock(pc);
20652 + preempt_enable();
20655 +static inline unsigned long get_desc_base(const struct desc_struct *desc)
20657 + return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24);
20660 +static inline unsigned long get_desc_limit(const struct desc_struct *desc)
20662 + return desc->limit0 | (desc->limit << 16);
20665 +#ifndef CONFIG_X86_NO_IDT
20666 +static inline void _set_gate(int gate, unsigned type, void *addr,
20667 + unsigned dpl, unsigned ist, unsigned seg)
20670 + pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
20672 + * does not need to be atomic because it is only done once at
20675 + write_idt_entry(idt_table, gate, &s);
20679 + * This needs to use 'idt_table' rather than 'idt', and
20680 + * thus use the _nonmapped_ version of the IDT, as the
20681 + * Pentium F0 0F bugfix can have resulted in the mapped
20682 + * IDT being write-protected.
20684 +static inline void set_intr_gate(unsigned int n, void *addr)
20686 + BUG_ON((unsigned)n > 0xFF);
20687 + _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
20691 + * This routine sets up an interrupt gate at descriptor privilege level 3.
20693 +static inline void set_system_intr_gate(unsigned int n, void *addr)
20695 + BUG_ON((unsigned)n > 0xFF);
20696 + _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
20699 +static inline void set_trap_gate(unsigned int n, void *addr)
20701 + BUG_ON((unsigned)n > 0xFF);
20702 + _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS);
20705 +static inline void set_system_gate(unsigned int n, void *addr)
20707 + BUG_ON((unsigned)n > 0xFF);
20708 #ifdef CONFIG_X86_32
20709 -# include "desc_32.h"
20710 + _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS);
20712 + _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
20716 +static inline void set_task_gate(unsigned int n, unsigned int gdt_entry)
20718 + BUG_ON((unsigned)n > 0xFF);
20719 + _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3));
20722 +static inline void set_intr_gate_ist(int n, void *addr, unsigned ist)
20724 + BUG_ON((unsigned)n > 0xFF);
20725 + _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS);
20728 +static inline void set_system_gate_ist(int n, void *addr, unsigned ist)
20730 + BUG_ON((unsigned)n > 0xFF);
20731 + _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);
20736 -# include "desc_64.h"
20738 + * GET_DESC_BASE reads the descriptor base of the specified segment.
20741 + * idx - descriptor index
20742 + * gdt - GDT pointer
20743 + * base - 32bit register to which the base will be written
20744 + * lo_w - lo word of the "base" register
20745 + * lo_b - lo byte of the "base" register
20746 + * hi_b - hi byte of the low word of the "base" register
20749 + * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
20750 + * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
20752 +#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
20753 + movb idx*8+4(gdt), lo_b; \
20754 + movb idx*8+7(gdt), hi_b; \
20755 + shll $16, base; \
20756 + movw idx*8+2(gdt), lo_w;
20759 +#endif /* __ASSEMBLY__ */
20762 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/desc_32.h 2008-12-15 11:27:22.000000000 +0100
20763 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
20765 -#ifndef __ARCH_DESC_H
20766 -#define __ARCH_DESC_H
20768 -#include <asm/ldt.h>
20769 -#include <asm/segment.h>
20771 -#ifndef __ASSEMBLY__
20773 -#include <linux/preempt.h>
20774 -#include <linux/smp.h>
20776 -#include <asm/mmu.h>
20778 -struct Xgt_desc_struct {
20779 - unsigned short size;
20780 - unsigned long address __attribute__((packed));
20781 - unsigned short pad;
20782 -} __attribute__ ((packed));
20786 - struct desc_struct gdt[GDT_ENTRIES];
20787 -} __attribute__((aligned(PAGE_SIZE)));
20788 -DECLARE_PER_CPU(struct gdt_page, gdt_page);
20790 -static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
20792 - return per_cpu(gdt_page, cpu).gdt;
20795 -extern struct Xgt_desc_struct idt_descr;
20796 -extern struct desc_struct idt_table[];
20797 -extern void set_intr_gate(unsigned int irq, void * addr);
20799 -static inline void pack_descriptor(__u32 *a, __u32 *b,
20800 - unsigned long base, unsigned long limit, unsigned char type, unsigned char flags)
20802 - *a = ((base & 0xffff) << 16) | (limit & 0xffff);
20803 - *b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
20804 - (limit & 0x000f0000) | ((type & 0xff) << 8) | ((flags & 0xf) << 20);
20807 -static inline void pack_gate(__u32 *a, __u32 *b,
20808 - unsigned long base, unsigned short seg, unsigned char type, unsigned char flags)
20810 - *a = (seg << 16) | (base & 0xffff);
20811 - *b = (base & 0xffff0000) | ((type & 0xff) << 8) | (flags & 0xff);
20814 -#define DESCTYPE_LDT 0x82 /* present, system, DPL-0, LDT */
20815 -#define DESCTYPE_TSS 0x89 /* present, system, DPL-0, 32-bit TSS */
20816 -#define DESCTYPE_TASK 0x85 /* present, system, DPL-0, task gate */
20817 -#define DESCTYPE_INT 0x8e /* present, system, DPL-0, interrupt gate */
20818 -#define DESCTYPE_TRAP 0x8f /* present, system, DPL-0, trap gate */
20819 -#define DESCTYPE_DPL3 0x60 /* DPL-3 */
20820 -#define DESCTYPE_S 0x10 /* !system */
20822 -#ifndef CONFIG_XEN
20823 -#define load_TR_desc() native_load_tr_desc()
20824 -#define load_gdt(dtr) native_load_gdt(dtr)
20825 -#define load_idt(dtr) native_load_idt(dtr)
20826 -#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
20827 -#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
20829 -#define store_gdt(dtr) native_store_gdt(dtr)
20830 -#define store_idt(dtr) native_store_idt(dtr)
20831 -#define store_tr(tr) (tr = native_store_tr())
20832 -#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
20834 -#define load_TLS(t, cpu) native_load_tls(t, cpu)
20835 -#define set_ldt native_set_ldt
20837 -#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
20838 -#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
20839 -#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
20841 -static inline void write_dt_entry(struct desc_struct *dt,
20842 - int entry, u32 entry_low, u32 entry_high)
20844 - dt[entry].a = entry_low;
20845 - dt[entry].b = entry_high;
20848 -static inline void native_set_ldt(const void *addr, unsigned int entries)
20850 - if (likely(entries == 0))
20851 - __asm__ __volatile__("lldt %w0"::"q" (0));
20853 - unsigned cpu = smp_processor_id();
20856 - pack_descriptor(&a, &b, (unsigned long)addr,
20857 - entries * sizeof(struct desc_struct) - 1,
20858 - DESCTYPE_LDT, 0);
20859 - write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b);
20860 - __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
20865 -static inline void native_load_tr_desc(void)
20867 - asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
20870 -static inline void native_load_gdt(const struct Xgt_desc_struct *dtr)
20872 - asm volatile("lgdt %0"::"m" (*dtr));
20875 -static inline void native_load_idt(const struct Xgt_desc_struct *dtr)
20877 - asm volatile("lidt %0"::"m" (*dtr));
20880 -static inline void native_store_gdt(struct Xgt_desc_struct *dtr)
20882 - asm ("sgdt %0":"=m" (*dtr));
20885 -static inline void native_store_idt(struct Xgt_desc_struct *dtr)
20887 - asm ("sidt %0":"=m" (*dtr));
20890 -static inline unsigned long native_store_tr(void)
20892 - unsigned long tr;
20893 - asm ("str %0":"=r" (tr));
20897 -static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
20900 - struct desc_struct *gdt = get_cpu_gdt_table(cpu);
20902 - for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
20903 - gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
20906 -#define load_TLS(t, cpu) xen_load_tls(t, cpu)
20907 -#define set_ldt xen_set_ldt
20909 -extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
20910 -extern int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b);
20912 -static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu)
20915 - struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN;
20917 - for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
20918 - if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
20919 - *(u64 *)&t->tls_array[i]))
20924 -#ifndef CONFIG_X86_NO_IDT
20925 -static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)
20928 - pack_gate(&a, &b, (unsigned long)addr, seg, type, 0);
20929 - write_idt_entry(idt_table, gate, a, b);
20933 -#ifndef CONFIG_X86_NO_TSS
20934 -static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const void *addr)
20937 - pack_descriptor(&a, &b, (unsigned long)addr,
20938 - offsetof(struct tss_struct, __cacheline_filler) - 1,
20939 - DESCTYPE_TSS, 0);
20940 - write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b);
20945 -#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
20947 -#define LDT_entry_a(info) \
20948 - ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
20950 -#define LDT_entry_b(info) \
20951 - (((info)->base_addr & 0xff000000) | \
20952 - (((info)->base_addr & 0x00ff0000) >> 16) | \
20953 - ((info)->limit & 0xf0000) | \
20954 - (((info)->read_exec_only ^ 1) << 9) | \
20955 - ((info)->contents << 10) | \
20956 - (((info)->seg_not_present ^ 1) << 15) | \
20957 - ((info)->seg_32bit << 22) | \
20958 - ((info)->limit_in_pages << 23) | \
20959 - ((info)->useable << 20) | \
20962 -#define LDT_empty(info) (\
20963 - (info)->base_addr == 0 && \
20964 - (info)->limit == 0 && \
20965 - (info)->contents == 0 && \
20966 - (info)->read_exec_only == 1 && \
20967 - (info)->seg_32bit == 0 && \
20968 - (info)->limit_in_pages == 0 && \
20969 - (info)->seg_not_present == 1 && \
20970 - (info)->useable == 0 )
20972 -static inline void clear_LDT(void)
20974 - set_ldt(NULL, 0);
20978 - * load one particular LDT into the current CPU
20980 -static inline void load_LDT_nolock(mm_context_t *pc)
20982 - set_ldt(pc->ldt, pc->size);
20985 -static inline void load_LDT(mm_context_t *pc)
20987 - preempt_disable();
20988 - load_LDT_nolock(pc);
20989 - preempt_enable();
20992 -static inline unsigned long get_desc_base(unsigned long *desc)
20994 - unsigned long base;
20995 - base = ((desc[0] >> 16) & 0x0000ffff) |
20996 - ((desc[1] << 16) & 0x00ff0000) |
20997 - (desc[1] & 0xff000000);
21001 -#else /* __ASSEMBLY__ */
21004 - * GET_DESC_BASE reads the descriptor base of the specified segment.
21007 - * idx - descriptor index
21008 - * gdt - GDT pointer
21009 - * base - 32bit register to which the base will be written
21010 - * lo_w - lo word of the "base" register
21011 - * lo_b - lo byte of the "base" register
21012 - * hi_b - hi byte of the low word of the "base" register
21015 - * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
21016 - * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
21018 -#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
21019 - movb idx*8+4(gdt), lo_b; \
21020 - movb idx*8+7(gdt), hi_b; \
21021 - shll $16, base; \
21022 - movw idx*8+2(gdt), lo_w;
21024 -#endif /* !__ASSEMBLY__ */
21027 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/desc_64.h 2009-02-16 16:18:36.000000000 +0100
21028 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
21030 -/* Written 2000 by Andi Kleen */
21031 -#ifndef __ARCH_DESC_H
21032 -#define __ARCH_DESC_H
21034 -#include <linux/threads.h>
21035 -#include <asm/ldt.h>
21037 -#ifndef __ASSEMBLY__
21039 -#include <linux/string.h>
21040 -#include <linux/smp.h>
21041 -#include <asm/desc_defs.h>
21043 -#include <asm/segment.h>
21044 -#include <asm/mmu.h>
21046 -extern struct desc_ptr idt_descr, cpu_gdt_descr[NR_CPUS];
21048 -extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
21050 -#define load_TR_desc() asm volatile("ltr %w0"::"r" (GDT_ENTRY_TSS*8))
21051 -#define load_LDT_desc() asm volatile("lldt %w0"::"r" (GDT_ENTRY_LDT*8))
21053 -static inline void clear_LDT(void)
21055 - int cpu = get_cpu();
21058 - * NB. We load the default_ldt for lcall7/27 handling on demand, as
21059 - * it slows down context switching. Noone uses it anyway.
21061 - cpu = cpu; /* XXX avoid compiler warning */
21062 - xen_set_ldt(NULL, 0);
21066 -#ifndef CONFIG_X86_NO_TSS
21067 -static inline unsigned long __store_tr(void)
21069 - unsigned long tr;
21071 - asm volatile ("str %w0":"=r" (tr));
21075 -#define store_tr(tr) (tr) = __store_tr()
21079 - * This is the ldt that every process will get unless we need
21080 - * something other than this.
21082 -extern struct desc_struct default_ldt[];
21083 -#ifndef CONFIG_X86_NO_IDT
21084 -extern struct gate_struct idt_table[];
21086 -extern struct desc_ptr cpu_gdt_descr[];
21088 -/* the cpu gdt accessor */
21089 -#define cpu_gdt(_cpu) ((struct desc_struct *)cpu_gdt_descr[_cpu].address)
21091 -#ifndef CONFIG_XEN
21092 -static inline void load_gdt(const struct desc_ptr *ptr)
21094 - asm volatile("lgdt %w0"::"m" (*ptr));
21097 -static inline void store_gdt(struct desc_ptr *ptr)
21099 - asm("sgdt %w0":"=m" (*ptr));
21103 -static inline void _set_gate(void *adr, unsigned type, unsigned long func, unsigned dpl, unsigned ist)
21105 - struct gate_struct s;
21106 - s.offset_low = PTR_LOW(func);
21107 - s.segment = __KERNEL_CS;
21114 - s.offset_middle = PTR_MIDDLE(func);
21115 - s.offset_high = PTR_HIGH(func);
21116 - /* does not need to be atomic because it is only done once at setup time */
21117 - memcpy(adr, &s, 16);
21120 -#ifndef CONFIG_X86_NO_IDT
21121 -static inline void set_intr_gate(int nr, void *func)
21123 - BUG_ON((unsigned)nr > 0xFF);
21124 - _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0);
21127 -static inline void set_intr_gate_ist(int nr, void *func, unsigned ist)
21129 - BUG_ON((unsigned)nr > 0xFF);
21130 - _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist);
21133 -static inline void set_system_gate(int nr, void *func)
21135 - BUG_ON((unsigned)nr > 0xFF);
21136 - _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0);
21139 -static inline void set_system_gate_ist(int nr, void *func, unsigned ist)
21141 - _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, ist);
21144 -static inline void load_idt(const struct desc_ptr *ptr)
21146 - asm volatile("lidt %w0"::"m" (*ptr));
21149 -static inline void store_idt(struct desc_ptr *dtr)
21151 - asm("sidt %w0":"=m" (*dtr));
21155 -static inline void set_tssldt_descriptor(void *ptr, unsigned long tss, unsigned type,
21158 - struct ldttss_desc d;
21159 - memset(&d,0,sizeof(d));
21160 - d.limit0 = size & 0xFFFF;
21161 - d.base0 = PTR_LOW(tss);
21162 - d.base1 = PTR_MIDDLE(tss) & 0xFF;
21165 - d.limit1 = (size >> 16) & 0xF;
21166 - d.base2 = (PTR_MIDDLE(tss) >> 8) & 0xFF;
21167 - d.base3 = PTR_HIGH(tss);
21168 - memcpy(ptr, &d, 16);
21171 -#ifndef CONFIG_X86_NO_TSS
21172 -static inline void set_tss_desc(unsigned cpu, void *addr)
21175 - * sizeof(unsigned long) coming from an extra "long" at the end
21176 - * of the iobitmap. See tss_struct definition in processor.h
21178 - * -1? seg base+limit should be pointing to the address of the
21179 - * last valid byte
21181 - set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_TSS],
21182 - (unsigned long)addr, DESC_TSS,
21183 - IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
21187 -static inline void set_ldt_desc(unsigned cpu, void *addr, int size)
21189 - set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_LDT], (unsigned long)addr,
21190 - DESC_LDT, size * 8 - 1);
21193 -#define LDT_entry_a(info) \
21194 - ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
21195 -/* Don't allow setting of the lm bit. It is useless anyways because
21196 - 64bit system calls require __USER_CS. */
21197 -#define LDT_entry_b(info) \
21198 - (((info)->base_addr & 0xff000000) | \
21199 - (((info)->base_addr & 0x00ff0000) >> 16) | \
21200 - ((info)->limit & 0xf0000) | \
21201 - (((info)->read_exec_only ^ 1) << 9) | \
21202 - ((info)->contents << 10) | \
21203 - (((info)->seg_not_present ^ 1) << 15) | \
21204 - ((info)->seg_32bit << 22) | \
21205 - ((info)->limit_in_pages << 23) | \
21206 - ((info)->useable << 20) | \
21207 - /* ((info)->lm << 21) | */ \
21210 -#define LDT_empty(info) (\
21211 - (info)->base_addr == 0 && \
21212 - (info)->limit == 0 && \
21213 - (info)->contents == 0 && \
21214 - (info)->read_exec_only == 1 && \
21215 - (info)->seg_32bit == 0 && \
21216 - (info)->limit_in_pages == 0 && \
21217 - (info)->seg_not_present == 1 && \
21218 - (info)->useable == 0 && \
21221 -static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
21224 - u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN);
21226 - for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
21227 - if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
21228 - t->tls_array[i]))
21233 - * load one particular LDT into the current CPU
21235 -static inline void load_LDT_nolock (mm_context_t *pc, int cpu)
21237 - void *segments = pc->ldt;
21238 - int count = pc->size;
21240 - if (likely(!count))
21243 - xen_set_ldt(segments, count);
21246 -static inline void load_LDT(mm_context_t *pc)
21248 - int cpu = get_cpu();
21249 - load_LDT_nolock(pc, cpu);
21253 -extern struct desc_ptr idt_descr;
21255 -#endif /* !__ASSEMBLY__ */
21258 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/dma-mapping_32.h 2009-02-16 16:18:36.000000000 +0100
21259 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/dma-mapping_32.h 2009-03-16 16:33:40.000000000 +0100
21260 @@ -84,23 +84,13 @@ dma_sync_single_range_for_device(struct
21261 dma_sync_single_for_device(dev, dma_handle+offset, size, direction);
21264 -static inline void
21266 dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
21267 - enum dma_data_direction direction)
21270 - swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
21271 - flush_write_buffers();
21273 + enum dma_data_direction direction);
21275 -static inline void
21277 dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
21278 - enum dma_data_direction direction)
21281 - swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
21282 - flush_write_buffers();
21284 + enum dma_data_direction direction);
21287 dma_mapping_error(dma_addr_t dma_addr);
21288 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/fixmap_32.h 2009-02-16 16:17:21.000000000 +0100
21289 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/fixmap_32.h 2009-03-16 16:33:40.000000000 +0100
21290 @@ -64,7 +64,7 @@ enum fixed_addresses {
21292 #ifdef CONFIG_X86_VISWS_APIC
21293 FIX_CO_CPU, /* Cobalt timer */
21294 - FIX_CO_APIC, /* Cobalt APIC Redirection Table */
21295 + FIX_CO_APIC, /* Cobalt APIC Redirection Table */
21296 FIX_LI_PCIA, /* Lithium PCI Bridge A */
21297 FIX_LI_PCIB, /* Lithium PCI Bridge B */
21299 @@ -73,7 +73,7 @@ enum fixed_addresses {
21301 #ifdef CONFIG_X86_CYCLONE_TIMER
21302 FIX_CYCLONE_TIMER, /*cyclone timer register*/
21305 #ifdef CONFIG_HIGHMEM
21306 FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
21307 FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
21308 @@ -93,11 +93,23 @@ enum fixed_addresses {
21310 FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
21311 __end_of_permanent_fixed_addresses,
21312 - /* temporary boot-time mappings, used before ioremap() is functional */
21313 -#define NR_FIX_BTMAPS 16
21314 - FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
21315 - FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
21317 + * 256 temporary boot-time mappings, used by early_ioremap(),
21318 + * before ioremap() is functional.
21320 + * We round it up to the next 512 pages boundary so that we
21321 + * can have a single pgd entry and a single pte table:
21323 +#define NR_FIX_BTMAPS 64
21324 +#define FIX_BTMAPS_NESTING 4
21326 + __end_of_permanent_fixed_addresses + 512 -
21327 + (__end_of_permanent_fixed_addresses & 511),
21328 + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
21330 +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
21331 + FIX_OHCI1394_BASE,
21333 __end_of_fixed_addresses
21336 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/fixmap_64.h 2009-02-16 16:17:21.000000000 +0100
21337 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/fixmap_64.h 2009-03-16 16:33:40.000000000 +0100
21339 #include <asm/apicdef.h>
21340 #include <asm/page.h>
21341 #include <asm/vsyscall.h>
21342 +#include <asm/efi.h>
21343 #include <asm/acpi.h>
21346 @@ -46,6 +47,10 @@ enum fixed_addresses {
21347 FIX_IO_APIC_BASE_0,
21348 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
21351 + FIX_EFI_IO_MAP_LAST_PAGE,
21352 + FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE+MAX_EFI_IO_PAGES-1,
21356 FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
21357 @@ -55,10 +60,22 @@ enum fixed_addresses {
21359 FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
21360 __end_of_permanent_fixed_addresses,
21361 - /* temporary boot-time mappings, used before ioremap() is functional */
21362 -#define NR_FIX_BTMAPS 16
21363 - FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
21364 - FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
21366 + * 256 temporary boot-time mappings, used by early_ioremap(),
21367 + * before ioremap() is functional.
21369 + * We round it up to the next 512 pages boundary so that we
21370 + * can have a single pgd entry and a single pte table:
21372 +#define NR_FIX_BTMAPS 64
21373 +#define FIX_BTMAPS_NESTING 4
21375 + __end_of_permanent_fixed_addresses + 512 -
21376 + (__end_of_permanent_fixed_addresses & 511),
21377 + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
21378 +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
21379 + FIX_OHCI1394_BASE,
21381 __end_of_fixed_addresses
21384 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/highmem.h 2009-02-16 16:17:21.000000000 +0100
21385 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/highmem.h 2009-03-16 16:33:40.000000000 +0100
21386 @@ -37,11 +37,6 @@ extern pte_t *pkmap_page_table;
21387 * easily, subsequent pte tables have to be allocated in one physical
21390 -#ifdef CONFIG_X86_PAE
21391 -#define LAST_PKMAP 512
21393 -#define LAST_PKMAP 1024
21398 @@ -57,13 +52,12 @@ extern pte_t *pkmap_page_table;
21402 -#define PKMAP_BASE ( (FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK )
21403 #define LAST_PKMAP_MASK (LAST_PKMAP-1)
21404 #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT)
21405 #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
21407 -extern void * FASTCALL(kmap_high(struct page *page));
21408 -extern void FASTCALL(kunmap_high(struct page *page));
21409 +extern void *kmap_high(struct page *page);
21410 +extern void kunmap_high(struct page *page);
21412 void *kmap(struct page *page);
21413 void kunmap(struct page *page);
21414 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/hypervisor.h 2009-02-16 16:18:36.000000000 +0100
21415 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/hypervisor.h 2009-03-16 16:33:40.000000000 +0100
21416 @@ -264,6 +264,25 @@ HYPERVISOR_poll(
21420 +static inline int __must_check
21421 +HYPERVISOR_poll_no_timeout(
21422 + evtchn_port_t *ports, unsigned int nr_ports)
21425 + struct sched_poll sched_poll = {
21426 + .nr_ports = nr_ports
21428 + set_xen_guest_handle(sched_poll.ports, ports);
21430 + rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll);
21431 +#if CONFIG_XEN_COMPAT <= 0x030002
21432 + if (rc == -ENOSYS)
21433 + rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
21442 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/irqflags.h 2009-02-16 16:18:36.000000000 +0100
21443 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/irqflags.h 2009-03-16 16:33:40.000000000 +0100
21445 -#ifdef CONFIG_X86_32
21446 -# include "irqflags_32.h"
21447 +#ifndef _X86_IRQFLAGS_H_
21448 +#define _X86_IRQFLAGS_H_
21450 +#include <asm/processor-flags.h>
21452 +#ifndef __ASSEMBLY__
21454 + * The use of 'barrier' in the following reflects their use as local-lock
21455 + * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
21456 + * critical operations are executed. All critical operations must complete
21457 + * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
21458 + * includes these barriers, for example.
21461 +#define xen_save_fl(void) (current_vcpu_info()->evtchn_upcall_mask)
21463 +#define xen_restore_fl(f) \
21465 + vcpu_info_t *_vcpu; \
21467 + _vcpu = current_vcpu_info(); \
21468 + if ((_vcpu->evtchn_upcall_mask = (f)) == 0) { \
21469 + barrier(); /* unmask then check (avoid races) */\
21470 + if (unlikely(_vcpu->evtchn_upcall_pending)) \
21471 + force_evtchn_callback(); \
21475 +#define xen_irq_disable() \
21477 + current_vcpu_info()->evtchn_upcall_mask = 1; \
21481 +#define xen_irq_enable() \
21483 + vcpu_info_t *_vcpu; \
21485 + _vcpu = current_vcpu_info(); \
21486 + _vcpu->evtchn_upcall_mask = 0; \
21487 + barrier(); /* unmask then check (avoid races) */ \
21488 + if (unlikely(_vcpu->evtchn_upcall_pending)) \
21489 + force_evtchn_callback(); \
21492 +void xen_safe_halt(void);
21494 +void xen_halt(void);
21496 +#define __raw_local_save_flags() xen_save_fl()
21498 +#define raw_local_irq_restore(flags) xen_restore_fl(flags)
21500 +#define raw_local_irq_disable() xen_irq_disable()
21502 +#define raw_local_irq_enable() xen_irq_enable()
21505 + * Used in the idle loop; sti takes one instruction cycle
21508 +static inline void raw_safe_halt(void)
21514 + * Used when interrupts are already enabled or to
21515 + * shutdown the processor:
21517 +static inline void halt(void)
21523 + * For spinlocks, etc:
21525 +#define __raw_local_irq_save() \
21527 + unsigned long flags = __raw_local_save_flags(); \
21529 + raw_local_irq_disable(); \
21534 -# include "irqflags_64.h"
21536 +/* Offsets into shared_info_t. */
21537 +#define evtchn_upcall_pending /* 0 */
21538 +#define evtchn_upcall_mask 1
21540 +#define sizeof_vcpu_shift 6
21542 +#ifdef CONFIG_X86_64
21543 +# define __REG_si %rsi
21544 +# define __CPU_num %gs:pda_cpunumber
21546 +# define __REG_si %esi
21547 +# define __CPU_num TI_cpu(%ebp)
21551 +#define GET_VCPU_INFO movl __CPU_num,%esi ; \
21552 + shl $sizeof_vcpu_shift,%esi ; \
21553 + add HYPERVISOR_shared_info,__REG_si
21555 +#define GET_VCPU_INFO mov HYPERVISOR_shared_info,__REG_si
21558 +#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(__REG_si)
21559 +#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(__REG_si)
21560 +#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(__REG_si)
21561 +#define DISABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
21562 + __DISABLE_INTERRUPTS
21563 +#define ENABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
21564 + __ENABLE_INTERRUPTS
21566 +#ifndef CONFIG_X86_64
21567 +#define INTERRUPT_RETURN iret
21568 +#define ENABLE_INTERRUPTS_SYSCALL_RET __ENABLE_INTERRUPTS ; \
21569 +sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
21570 + __TEST_PENDING ; \
21571 + jnz 14f /* process more events if necessary... */ ; \
21572 + movl PT_ESI(%esp), %esi ; \
21574 +14: __DISABLE_INTERRUPTS ; \
21575 + TRACE_IRQS_OFF ; \
21576 +sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \
21578 + call evtchn_do_upcall ; \
21580 + jmp ret_from_intr
21584 +#endif /* __ASSEMBLY__ */
21586 +#ifndef __ASSEMBLY__
21587 +#define raw_local_save_flags(flags) \
21588 + do { (flags) = __raw_local_save_flags(); } while (0)
21590 +#define raw_local_irq_save(flags) \
21591 + do { (flags) = __raw_local_irq_save(); } while (0)
21593 +static inline int raw_irqs_disabled_flags(unsigned long flags)
21595 + return (flags != 0);
21598 +#define raw_irqs_disabled() \
21600 + unsigned long flags = __raw_local_save_flags(); \
21602 + raw_irqs_disabled_flags(flags); \
21606 + * makes the traced hardirq state match with the machine state
21608 + * should be a rarely used function, only in places where it's
21609 + * otherwise impossible to know the irq state, like in traps.
21611 +static inline void trace_hardirqs_fixup_flags(unsigned long flags)
21613 + if (raw_irqs_disabled_flags(flags))
21614 + trace_hardirqs_off();
21616 + trace_hardirqs_on();
21619 +#define trace_hardirqs_fixup() \
21620 + trace_hardirqs_fixup_flags(__raw_local_save_flags())
21624 +#ifdef CONFIG_X86_64
21626 + * Currently paravirt can't handle swapgs nicely when we
21627 + * don't have a stack we can rely on (such as a user space
21628 + * stack). So we either find a way around these or just fault
21629 + * and emulate if a guest tries to call swapgs directly.
21631 + * Either way, this is a good way to document that we don't
21632 + * have a reliable stack. x86_64 only.
21634 +#define SWAPGS_UNSAFE_STACK swapgs
21635 +#define ARCH_TRACE_IRQS_ON call trace_hardirqs_on_thunk
21636 +#define ARCH_TRACE_IRQS_OFF call trace_hardirqs_off_thunk
21637 +#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
21638 +#define ARCH_LOCKDEP_SYS_EXIT_IRQ \
21640 + ENABLE_INTERRUPTS(CLBR_NONE); \
21642 + LOCKDEP_SYS_EXIT; \
21644 + __DISABLE_INTERRUPTS; \
21648 +#define ARCH_TRACE_IRQS_ON \
21652 + call trace_hardirqs_on; \
21657 +#define ARCH_TRACE_IRQS_OFF \
21661 + call trace_hardirqs_off; \
21666 +#define ARCH_LOCKDEP_SYS_EXIT \
21670 + call lockdep_sys_exit; \
21675 +#define ARCH_LOCKDEP_SYS_EXIT_IRQ
21678 +#ifdef CONFIG_TRACE_IRQFLAGS
21679 +# define TRACE_IRQS_ON ARCH_TRACE_IRQS_ON
21680 +# define TRACE_IRQS_OFF ARCH_TRACE_IRQS_OFF
21682 +# define TRACE_IRQS_ON
21683 +# define TRACE_IRQS_OFF
21685 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
21686 +# define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT
21687 +# define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ
21689 +# define LOCKDEP_SYS_EXIT
21690 +# define LOCKDEP_SYS_EXIT_IRQ
21693 +#endif /* __ASSEMBLY__ */
21695 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/irqflags_32.h 2009-02-16 16:18:36.000000000 +0100
21696 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
21699 - * include/asm-i386/irqflags.h
21701 - * IRQ flags handling
21703 - * This file gets included from lowlevel asm headers too, to provide
21704 - * wrapped versions of the local_irq_*() APIs, based on the
21705 - * raw_local_irq_*() functions from the lowlevel headers.
21707 -#ifndef _ASM_IRQFLAGS_H
21708 -#define _ASM_IRQFLAGS_H
21710 -#ifndef __ASSEMBLY__
21711 -#define xen_save_fl(void) (current_vcpu_info()->evtchn_upcall_mask)
21713 -#define xen_restore_fl(f) \
21715 - vcpu_info_t *_vcpu; \
21717 - _vcpu = current_vcpu_info(); \
21718 - if ((_vcpu->evtchn_upcall_mask = (f)) == 0) { \
21719 - barrier(); /* unmask then check (avoid races) */\
21720 - if (unlikely(_vcpu->evtchn_upcall_pending)) \
21721 - force_evtchn_callback(); \
21725 -#define xen_irq_disable() \
21727 - current_vcpu_info()->evtchn_upcall_mask = 1; \
21731 -#define xen_irq_enable() \
21733 - vcpu_info_t *_vcpu; \
21735 - _vcpu = current_vcpu_info(); \
21736 - _vcpu->evtchn_upcall_mask = 0; \
21737 - barrier(); /* unmask then check (avoid races) */ \
21738 - if (unlikely(_vcpu->evtchn_upcall_pending)) \
21739 - force_evtchn_callback(); \
21742 -void xen_safe_halt(void);
21744 -void xen_halt(void);
21747 - * The use of 'barrier' in the following reflects their use as local-lock
21748 - * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
21749 - * critical operations are executed. All critical operations must complete
21750 - * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
21751 - * includes these barriers, for example.
21754 -#define __raw_local_save_flags() xen_save_fl()
21756 -#define raw_local_irq_restore(flags) xen_restore_fl(flags)
21758 -#define raw_local_irq_disable() xen_irq_disable()
21760 -#define raw_local_irq_enable() xen_irq_enable()
21763 - * Used in the idle loop; sti takes one instruction cycle
21766 -static inline void raw_safe_halt(void)
21772 - * Used when interrupts are already enabled or to
21773 - * shutdown the processor:
21775 -static inline void halt(void)
21781 - * For spinlocks, etc:
21783 -#define __raw_local_irq_save() \
21785 - unsigned long flags = __raw_local_save_flags(); \
21787 - raw_local_irq_disable(); \
21793 -/* Offsets into shared_info_t. */
21794 -#define evtchn_upcall_pending /* 0 */
21795 -#define evtchn_upcall_mask 1
21797 -#define sizeof_vcpu_shift 6
21800 -#define GET_VCPU_INFO movl TI_cpu(%ebp),%esi ; \
21801 - shl $sizeof_vcpu_shift,%esi ; \
21802 - addl HYPERVISOR_shared_info,%esi
21804 -#define GET_VCPU_INFO movl HYPERVISOR_shared_info,%esi
21807 -#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi)
21808 -#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi)
21809 -#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi)
21810 -#define DISABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
21811 - __DISABLE_INTERRUPTS
21812 -#define ENABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
21813 - __ENABLE_INTERRUPTS
21814 -#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS ; \
21815 -sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
21816 - __TEST_PENDING ; \
21817 - jnz 14f /* process more events if necessary... */ ; \
21818 - movl PT_ESI(%esp), %esi ; \
21820 -14: __DISABLE_INTERRUPTS ; \
21821 - TRACE_IRQS_OFF ; \
21822 -sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \
21824 - call evtchn_do_upcall ; \
21826 - jmp ret_from_intr
21827 -#define INTERRUPT_RETURN iret
21828 -#endif /* __ASSEMBLY__ */
21830 -#ifndef __ASSEMBLY__
21831 -#define raw_local_save_flags(flags) \
21832 - do { (flags) = __raw_local_save_flags(); } while (0)
21834 -#define raw_local_irq_save(flags) \
21835 - do { (flags) = __raw_local_irq_save(); } while (0)
21837 -static inline int raw_irqs_disabled_flags(unsigned long flags)
21839 - return (flags != 0);
21842 -#define raw_irqs_disabled() \
21844 - unsigned long flags = __raw_local_save_flags(); \
21846 - raw_irqs_disabled_flags(flags); \
21850 - * makes the traced hardirq state match with the machine state
21852 - * should be a rarely used function, only in places where its
21853 - * otherwise impossible to know the irq state, like in traps.
21855 -static inline void trace_hardirqs_fixup_flags(unsigned long flags)
21857 - if (raw_irqs_disabled_flags(flags))
21858 - trace_hardirqs_off();
21860 - trace_hardirqs_on();
21863 -#define trace_hardirqs_fixup() \
21864 - trace_hardirqs_fixup_flags(__raw_local_save_flags())
21865 -#endif /* __ASSEMBLY__ */
21868 - * Do the CPU's IRQ-state tracing from assembly code. We call a
21869 - * C function, so save all the C-clobbered registers:
21871 -#ifdef CONFIG_TRACE_IRQFLAGS
21873 -# define TRACE_IRQS_ON \
21877 - call trace_hardirqs_on; \
21882 -# define TRACE_IRQS_OFF \
21886 - call trace_hardirqs_off; \
21892 -# define TRACE_IRQS_ON
21893 -# define TRACE_IRQS_OFF
21896 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
21897 -# define LOCKDEP_SYS_EXIT \
21901 - call lockdep_sys_exit; \
21906 -# define LOCKDEP_SYS_EXIT
21910 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/irqflags_64.h 2009-02-16 16:18:36.000000000 +0100
21911 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
21914 - * include/asm-x86_64/irqflags.h
21916 - * IRQ flags handling
21918 - * This file gets included from lowlevel asm headers too, to provide
21919 - * wrapped versions of the local_irq_*() APIs, based on the
21920 - * raw_local_irq_*() functions from the lowlevel headers.
21922 -#ifndef _ASM_IRQFLAGS_H
21923 -#define _ASM_IRQFLAGS_H
21924 -#include <asm/processor-flags.h>
21926 -#ifndef __ASSEMBLY__
21928 - * Interrupt control:
21932 - * The use of 'barrier' in the following reflects their use as local-lock
21933 - * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
21934 - * critical operations are executed. All critical operations must complete
21935 - * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
21936 - * includes these barriers, for example.
21939 -#define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask)
21941 -#define raw_local_save_flags(flags) \
21942 - do { (flags) = __raw_local_save_flags(); } while (0)
21944 -#define raw_local_irq_restore(x) \
21946 - vcpu_info_t *_vcpu; \
21948 - _vcpu = current_vcpu_info(); \
21949 - if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \
21950 - barrier(); /* unmask then check (avoid races) */ \
21951 - if ( unlikely(_vcpu->evtchn_upcall_pending) ) \
21952 - force_evtchn_callback(); \
21956 -#ifdef CONFIG_X86_VSMP
21959 - * Interrupt control for the VSMP architecture:
21962 -static inline void raw_local_irq_disable(void)
21964 - unsigned long flags = __raw_local_save_flags();
21966 - raw_local_irq_restore((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);
21969 -static inline void raw_local_irq_enable(void)
21971 - unsigned long flags = __raw_local_save_flags();
21973 - raw_local_irq_restore((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
21976 -static inline int raw_irqs_disabled_flags(unsigned long flags)
21978 - return !(flags & X86_EFLAGS_IF) || (flags & X86_EFLAGS_AC);
21981 -#else /* CONFIG_X86_VSMP */
21983 -#define raw_local_irq_disable() \
21985 - current_vcpu_info()->evtchn_upcall_mask = 1; \
21989 -#define raw_local_irq_enable() \
21991 - vcpu_info_t *_vcpu; \
21993 - _vcpu = current_vcpu_info(); \
21994 - _vcpu->evtchn_upcall_mask = 0; \
21995 - barrier(); /* unmask then check (avoid races) */ \
21996 - if ( unlikely(_vcpu->evtchn_upcall_pending) ) \
21997 - force_evtchn_callback(); \
22000 -static inline int raw_irqs_disabled_flags(unsigned long flags)
22002 - return (flags != 0);
22008 - * For spinlocks, etc.:
22011 -#define __raw_local_irq_save() \
22013 - unsigned long flags = __raw_local_save_flags(); \
22015 - raw_local_irq_disable(); \
22020 -#define raw_local_irq_save(flags) \
22021 - do { (flags) = __raw_local_irq_save(); } while (0)
22023 -#define raw_irqs_disabled() \
22025 - unsigned long flags = __raw_local_save_flags(); \
22027 - raw_irqs_disabled_flags(flags); \
22031 - * makes the traced hardirq state match with the machine state
22033 - * should be a rarely used function, only in places where its
22034 - * otherwise impossible to know the irq state, like in traps.
22036 -static inline void trace_hardirqs_fixup_flags(unsigned long flags)
22038 - if (raw_irqs_disabled_flags(flags))
22039 - trace_hardirqs_off();
22041 - trace_hardirqs_on();
22044 -#define trace_hardirqs_fixup() \
22045 - trace_hardirqs_fixup_flags(__raw_local_save_flags())
22047 - * Used in the idle loop; sti takes one instruction cycle
22050 -void xen_safe_halt(void);
22051 -static inline void raw_safe_halt(void)
22057 - * Used when interrupts are already enabled or to
22058 - * shutdown the processor:
22060 -void xen_halt(void);
22061 -static inline void halt(void)
22066 -#else /* __ASSEMBLY__: */
22067 -# ifdef CONFIG_TRACE_IRQFLAGS
22068 -# define TRACE_IRQS_ON call trace_hardirqs_on_thunk
22069 -# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk
22071 -# define TRACE_IRQS_ON
22072 -# define TRACE_IRQS_OFF
22074 -# ifdef CONFIG_DEBUG_LOCK_ALLOC
22075 -# define LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
22076 -# define LOCKDEP_SYS_EXIT_IRQ \
22080 - LOCKDEP_SYS_EXIT; \
22085 -# define LOCKDEP_SYS_EXIT
22086 -# define LOCKDEP_SYS_EXIT_IRQ
22091 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/maddr_32.h 2009-02-16 16:17:21.000000000 +0100
22092 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/maddr_32.h 2009-03-16 16:33:40.000000000 +0100
22094 #ifndef _I386_MADDR_H
22095 #define _I386_MADDR_H
22097 +#include <asm/bug.h>
22098 #include <xen/features.h>
22099 #include <xen/interface/xen.h>
22101 @@ -151,25 +152,9 @@ static inline paddr_t pte_machine_to_phy
22102 phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK);
22107 -#ifdef CONFIG_X86_PAE
22108 -#define __pte_ma(x) ((pte_t) { (x), (maddr_t)(x) >> 32 } )
22109 -extern unsigned long long __supported_pte_mask;
22110 -static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
22114 - pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) | \
22115 - (pgprot_val(pgprot) >> 32);
22116 - pte.pte_high &= (__supported_pte_mask >> 32);
22117 - pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)) & \
22118 - __supported_pte_mask;
22122 -#define __pte_ma(x) ((pte_t) { (x) } )
22123 -#define pfn_pte_ma(pfn, prot) __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
22124 +#define pte_phys_to_machine phys_to_machine
22125 +#define pte_machine_to_phys machine_to_phys
22128 #else /* !CONFIG_XEN */
22129 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/maddr_64.h 2009-10-28 14:55:04.000000000 +0100
22130 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/maddr_64.h 2009-03-16 16:33:40.000000000 +0100
22132 #ifndef _X86_64_MADDR_H
22133 #define _X86_64_MADDR_H
22135 +#include <asm/bug.h>
22136 #include <xen/features.h>
22137 #include <xen/interface/xen.h>
22139 @@ -16,6 +17,7 @@ typedef unsigned long maddr_t;
22142 extern unsigned long *phys_to_machine_mapping;
22143 +extern unsigned long max_mapnr;
22145 #undef machine_to_phys_mapping
22146 extern unsigned long *machine_to_phys_mapping;
22147 @@ -25,7 +27,7 @@ static inline unsigned long pfn_to_mfn(u
22149 if (xen_feature(XENFEAT_auto_translated_physmap))
22151 - BUG_ON(end_pfn && pfn >= end_pfn);
22152 + BUG_ON(max_mapnr && pfn >= max_mapnr);
22153 return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT;
22156 @@ -33,7 +35,7 @@ static inline int phys_to_machine_mappin
22158 if (xen_feature(XENFEAT_auto_translated_physmap))
22160 - BUG_ON(end_pfn && pfn >= end_pfn);
22161 + BUG_ON(max_mapnr && pfn >= max_mapnr);
22162 return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
22165 @@ -45,7 +47,7 @@ static inline unsigned long mfn_to_pfn(u
22168 if (unlikely((mfn >> machine_to_phys_order) != 0))
22170 + return max_mapnr;
22172 /* The array access can fail (e.g., device space beyond end of RAM). */
22174 @@ -60,7 +62,7 @@ static inline unsigned long mfn_to_pfn(u
22178 - : "m" (machine_to_phys_mapping[mfn]), "m" (end_pfn) );
22179 + : "m" (machine_to_phys_mapping[mfn]), "m" (max_mapnr) );
22183 @@ -88,16 +90,16 @@ static inline unsigned long mfn_to_pfn(u
22184 static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
22186 unsigned long pfn = mfn_to_pfn(mfn);
22187 - if ((pfn < end_pfn)
22188 + if ((pfn < max_mapnr)
22189 && !xen_feature(XENFEAT_auto_translated_physmap)
22190 && (phys_to_machine_mapping[pfn] != mfn))
22191 - return end_pfn; /* force !pfn_valid() */
22192 + return max_mapnr; /* force !pfn_valid() */
22196 static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
22198 - BUG_ON(end_pfn && pfn >= end_pfn);
22199 + BUG_ON(max_mapnr && pfn >= max_mapnr);
22200 if (xen_feature(XENFEAT_auto_translated_physmap)) {
22201 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
22203 @@ -135,9 +137,6 @@ static inline paddr_t pte_machine_to_phy
22207 -#define __pte_ma(x) ((pte_t) { (x) } )
22208 -#define pfn_pte_ma(pfn, prot) __pte_ma((((pfn) << PAGE_SHIFT) | pgprot_val(prot)) & __supported_pte_mask)
22210 #else /* !CONFIG_XEN */
22212 #define pfn_to_mfn(pfn) (pfn)
22213 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/mmu_context_32.h 2009-02-16 16:17:21.000000000 +0100
22214 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/mmu_context_32.h 2009-03-16 16:33:40.000000000 +0100
22215 @@ -51,8 +51,6 @@ static inline void __prepare_arch_switch
22219 -void leave_mm(unsigned long cpu);
22221 static inline void switch_mm(struct mm_struct *prev,
22222 struct mm_struct *next,
22223 struct task_struct *tsk)
22224 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/mmu_context_64.h 2009-02-16 16:17:21.000000000 +0100
22225 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/mmu_context_64.h 2009-03-16 16:33:40.000000000 +0100
22226 @@ -62,12 +62,6 @@ extern void mm_pin(struct mm_struct *mm)
22227 extern void mm_unpin(struct mm_struct *mm);
22228 void mm_pin_all(void);
22230 -static inline void load_cr3(pgd_t *pgd)
22232 - asm volatile("movq %0,%%cr3" :: "r" (phys_to_machine(__pa(pgd))) :
22236 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
22237 struct task_struct *tsk)
22239 @@ -97,7 +91,7 @@ static inline void switch_mm(struct mm_s
22242 if (unlikely(next->context.ldt != prev->context.ldt)) {
22243 - /* load_LDT_nolock(&next->context, cpu) */
22244 + /* load_LDT_nolock(&next->context) */
22245 op->cmd = MMUEXT_SET_LDT;
22246 op->arg1.linear_addr = (unsigned long)next->context.ldt;
22247 op->arg2.nr_ents = next->context.size;
22248 @@ -110,7 +104,7 @@ static inline void switch_mm(struct mm_s
22250 write_pda(mmu_state, TLBSTATE_OK);
22251 if (read_pda(active_mm) != next)
22252 - out_of_line_bug();
22254 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
22255 /* We were in lazy tlb mode and leave_mm disabled
22256 * tlb flush IPI delivery. We must reload CR3
22257 @@ -118,7 +112,7 @@ static inline void switch_mm(struct mm_s
22259 load_cr3(next->pgd);
22260 xen_new_user_pt(__pa(__user_pgd(next->pgd)));
22261 - load_LDT_nolock(&next->context, cpu);
22262 + load_LDT_nolock(&next->context);
22266 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/page.h 2009-02-16 16:18:36.000000000 +0100
22267 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/page.h 2009-03-16 16:33:40.000000000 +0100
22269 +#ifndef _ASM_X86_PAGE_H
22270 +#define _ASM_X86_PAGE_H
22272 +#include <linux/const.h>
22274 +/* PAGE_SHIFT determines the page size */
22275 +#define PAGE_SHIFT 12
22276 +#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
22277 +#define PAGE_MASK (~(PAGE_SIZE-1))
22280 -# ifdef CONFIG_X86_32
22281 -# include "page_32.h"
22283 -# include "page_64.h"
22287 + * Need to repeat this here in order to not include pgtable.h (which in turn
22288 + * depends on definitions made here), but to be able to use the symbolic names
22289 + * below. The preprocessor will warn if the two definitions aren't identical.
22291 +#define _PAGE_BIT_PRESENT 0
22292 +#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
22293 +#define _PAGE_BIT_IO 9
22294 +#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
22296 +#define PHYSICAL_PAGE_MASK (~(_AT(phys_addr_t, PAGE_SIZE) - 1) & __PHYSICAL_MASK)
22297 +#define PTE_MASK _AT(pteval_t, PHYSICAL_PAGE_MASK)
22299 +#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
22300 +#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
22302 +#define HPAGE_SHIFT PMD_SHIFT
22303 +#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
22304 +#define HPAGE_MASK (~(HPAGE_SIZE - 1))
22305 +#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
22307 +/* to align the pointer to the (next) page boundary */
22308 +#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
22310 +#define __PHYSICAL_MASK _AT(phys_addr_t, (_AC(1,ULL) << __PHYSICAL_MASK_SHIFT) - 1)
22311 +#define __VIRTUAL_MASK ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
22313 +#ifndef __ASSEMBLY__
22314 +#include <linux/types.h>
22317 +#ifdef CONFIG_X86_64
22318 +#include <asm/page_64.h>
22319 +#define max_pfn_mapped end_pfn_map
22321 +#include <asm/page_32.h>
22322 +#define max_pfn_mapped max_low_pfn
22323 +#endif /* CONFIG_X86_64 */
22325 +#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
22327 +#define VM_DATA_DEFAULT_FLAGS \
22328 + (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
22329 + VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
22332 +#ifndef __ASSEMBLY__
22334 +extern int page_is_ram(unsigned long pagenr);
22338 +static inline void clear_user_page(void *page, unsigned long vaddr,
22341 + clear_page(page);
22344 +static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
22345 + struct page *topage)
22347 + copy_page(to, from);
22350 +#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
22351 + alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
22352 +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
22354 +typedef struct { pgprotval_t pgprot; } pgprot_t;
22356 +#define pgprot_val(x) ((x).pgprot)
22357 +#define __pgprot(x) ((pgprot_t) { (x) } )
22359 +#include <asm/maddr.h>
22361 +typedef struct { pgdval_t pgd; } pgd_t;
22363 +#define __pgd_ma(x) ((pgd_t) { (x) } )
22364 +static inline pgd_t xen_make_pgd(pgdval_t val)
22366 + if (val & _PAGE_PRESENT)
22367 + val = pte_phys_to_machine(val);
22368 + return (pgd_t) { val };
22371 +#define __pgd_val(x) ((x).pgd)
22372 +static inline pgdval_t xen_pgd_val(pgd_t pgd)
22374 + pgdval_t ret = __pgd_val(pgd);
22375 +#if PAGETABLE_LEVELS == 2 && CONFIG_XEN_COMPAT <= 0x030002
22377 + ret = machine_to_phys(ret) | _PAGE_PRESENT;
22379 + if (ret & _PAGE_PRESENT)
22380 + ret = pte_machine_to_phys(ret);
22385 +#if PAGETABLE_LEVELS >= 3
22386 +#if PAGETABLE_LEVELS == 4
22387 +typedef struct { pudval_t pud; } pud_t;
22389 +#define __pud_ma(x) ((pud_t) { (x) } )
22390 +static inline pud_t xen_make_pud(pudval_t val)
22392 + if (val & _PAGE_PRESENT)
22393 + val = pte_phys_to_machine(val);
22394 + return (pud_t) { val };
22397 +#define __pud_val(x) ((x).pud)
22398 +static inline pudval_t xen_pud_val(pud_t pud)
22400 + pudval_t ret = __pud_val(pud);
22401 + if (ret & _PAGE_PRESENT)
22402 + ret = pte_machine_to_phys(ret);
22405 +#else /* PAGETABLE_LEVELS == 3 */
22406 +#include <asm-generic/pgtable-nopud.h>
22408 +#define __pud_val(x) __pgd_val((x).pgd)
22409 +static inline pudval_t xen_pud_val(pud_t pud)
22411 + return xen_pgd_val(pud.pgd);
22413 +#endif /* PAGETABLE_LEVELS == 4 */
22415 +typedef struct { pmdval_t pmd; } pmd_t;
22417 +#define __pmd_ma(x) ((pmd_t) { (x) } )
22418 +static inline pmd_t xen_make_pmd(pmdval_t val)
22420 + if (val & _PAGE_PRESENT)
22421 + val = pte_phys_to_machine(val);
22422 + return (pmd_t) { val };
22425 +#define __pmd_val(x) ((x).pmd)
22426 +static inline pmdval_t xen_pmd_val(pmd_t pmd)
22428 + pmdval_t ret = __pmd_val(pmd);
22429 +#if CONFIG_XEN_COMPAT <= 0x030002
22431 + ret = pte_machine_to_phys(ret) | _PAGE_PRESENT;
22434 -# include "page_32.h"
22436 -# include "page_64.h"
22438 + if (ret & _PAGE_PRESENT)
22439 + ret = pte_machine_to_phys(ret);
22443 +#else /* PAGETABLE_LEVELS == 2 */
22444 +#include <asm-generic/pgtable-nopmd.h>
22446 +#define __pmd_ma(x) ((pmd_t) { .pud.pgd = __pgd_ma(x) } )
22447 +#define __pmd_val(x) __pgd_val((x).pud.pgd)
22448 +static inline pmdval_t xen_pmd_val(pmd_t pmd)
22450 + return xen_pgd_val(pmd.pud.pgd);
22452 +#endif /* PAGETABLE_LEVELS >= 3 */
22454 +#define __pte_ma(x) ((pte_t) { .pte = (x) } )
22455 +static inline pte_t xen_make_pte(pteval_t val)
22457 + if ((val & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT)
22458 + val = pte_phys_to_machine(val);
22459 + return (pte_t) { .pte = val };
22462 +#define __pte_val(x) ((x).pte)
22463 +static inline pteval_t xen_pte_val(pte_t pte)
22465 + pteval_t ret = __pte_val(pte);
22466 + if ((pte.pte_low & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT)
22467 + ret = pte_machine_to_phys(ret);
22471 +#define pgd_val(x) xen_pgd_val(x)
22472 +#define __pgd(x) xen_make_pgd(x)
22474 +#ifndef __PAGETABLE_PUD_FOLDED
22475 +#define pud_val(x) xen_pud_val(x)
22476 +#define __pud(x) xen_make_pud(x)
22479 +#ifndef __PAGETABLE_PMD_FOLDED
22480 +#define pmd_val(x) xen_pmd_val(x)
22481 +#define __pmd(x) xen_make_pmd(x)
22484 +#define pte_val(x) xen_pte_val(x)
22485 +#define __pte(x) xen_make_pte(x)
22487 +#define __pa(x) __phys_addr((unsigned long)(x))
22488 +/* __pa_symbol should be used for C visible symbols.
22489 + This seems to be the official gcc blessed way to do such arithmetic. */
22490 +#define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x)))
22492 +#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
22494 +#define __boot_va(x) __va(x)
22495 +#define __boot_pa(x) __pa(x)
22497 +#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
22498 +#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
22499 +#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
22501 +#endif /* __ASSEMBLY__ */
22503 +#include <asm-generic/memory_model.h>
22504 +#include <asm-generic/page.h>
22506 +#define __HAVE_ARCH_GATE_AREA 1
22508 +#endif /* __KERNEL__ */
22509 +#endif /* _ASM_X86_PAGE_H */
22510 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/page_64.h 2009-02-16 16:18:36.000000000 +0100
22511 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/page_64.h 2009-03-16 16:33:40.000000000 +0100
22513 #ifndef _X86_64_PAGE_H
22514 #define _X86_64_PAGE_H
22516 -/* #include <linux/string.h> */
22517 -#ifndef __ASSEMBLY__
22518 -#include <linux/kernel.h>
22519 -#include <linux/types.h>
22520 -#include <asm/bug.h>
22522 -#include <linux/const.h>
22523 -#include <xen/interface/xen.h>
22526 - * Need to repeat this here in order to not include pgtable.h (which in turn
22527 - * depends on definitions made here), but to be able to use the symbolic
22528 - * below. The preprocessor will warn if the two definitions aren't identical.
22530 -#define _PAGE_PRESENT 0x001
22531 -#define _PAGE_IO 0x200
22533 -/* PAGE_SHIFT determines the page size */
22534 -#define PAGE_SHIFT 12
22535 -#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
22536 -#define PAGE_MASK (~(PAGE_SIZE-1))
22538 -/* See Documentation/x86_64/mm.txt for a description of the memory map. */
22539 -#define __PHYSICAL_MASK_SHIFT 46
22540 -#define __PHYSICAL_MASK ((_AC(1,UL) << __PHYSICAL_MASK_SHIFT) - 1)
22541 -#define __VIRTUAL_MASK_SHIFT 48
22542 -#define __VIRTUAL_MASK ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
22544 -#define PHYSICAL_PAGE_MASK (~(PAGE_SIZE-1) & __PHYSICAL_MASK)
22545 +#define PAGETABLE_LEVELS 4
22547 -#define THREAD_ORDER 1
22548 +#define THREAD_ORDER 1
22549 #define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
22550 #define CURRENT_MASK (~(THREAD_SIZE-1))
22552 @@ -51,106 +23,10 @@
22553 #define MCE_STACK 5
22554 #define N_EXCEPTION_STACKS 5 /* hw limit: 7 */
22556 -#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
22557 -#define LARGE_PAGE_SIZE (_AC(1,UL) << PMD_SHIFT)
22559 -#define HPAGE_SHIFT PMD_SHIFT
22560 -#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
22561 -#define HPAGE_MASK (~(HPAGE_SIZE - 1))
22562 -#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
22565 -#ifndef __ASSEMBLY__
22567 -extern unsigned long end_pfn;
22569 -#include <asm/maddr.h>
22571 -void clear_page(void *);
22572 -void copy_page(void *, void *);
22574 -#define clear_user_page(page, vaddr, pg) clear_page(page)
22575 -#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
22577 -#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
22578 - alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
22579 -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
22582 - * These are used to make use of C type-checking..
22584 -typedef struct { unsigned long pte; } pte_t;
22585 -typedef struct { unsigned long pmd; } pmd_t;
22586 -typedef struct { unsigned long pud; } pud_t;
22587 -typedef struct { unsigned long pgd; } pgd_t;
22588 -#define PTE_MASK PHYSICAL_PAGE_MASK
22590 -typedef struct { unsigned long pgprot; } pgprot_t;
22592 -#define __pte_val(x) ((x).pte)
22593 -#define pte_val(x) ((__pte_val(x) & (_PAGE_PRESENT|_PAGE_IO)) \
22594 - == _PAGE_PRESENT ? \
22595 - pte_machine_to_phys(__pte_val(x)) : \
22598 -#define __pmd_val(x) ((x).pmd)
22599 -static inline unsigned long pmd_val(pmd_t x)
22601 - unsigned long ret = __pmd_val(x);
22602 -#if CONFIG_XEN_COMPAT <= 0x030002
22603 - if (ret) ret = pte_machine_to_phys(ret) | _PAGE_PRESENT;
22605 - if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
22610 -#define __pud_val(x) ((x).pud)
22611 -static inline unsigned long pud_val(pud_t x)
22613 - unsigned long ret = __pud_val(x);
22614 - if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
22618 -#define __pgd_val(x) ((x).pgd)
22619 -static inline unsigned long pgd_val(pgd_t x)
22621 - unsigned long ret = __pgd_val(x);
22622 - if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret);
22626 -#define pgprot_val(x) ((x).pgprot)
22628 -static inline pte_t __pte(unsigned long x)
22630 - if ((x & (_PAGE_PRESENT|_PAGE_IO)) == _PAGE_PRESENT)
22631 - x = pte_phys_to_machine(x);
22632 - return ((pte_t) { (x) });
22635 -static inline pmd_t __pmd(unsigned long x)
22637 - if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
22638 - return ((pmd_t) { (x) });
22641 -static inline pud_t __pud(unsigned long x)
22643 - if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
22644 - return ((pud_t) { (x) });
22647 -static inline pgd_t __pgd(unsigned long x)
22649 - if (x & _PAGE_PRESENT) x = pte_phys_to_machine(x);
22650 - return ((pgd_t) { (x) });
22653 -#define __pgprot(x) ((pgprot_t) { (x) } )
22654 +#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
22655 +#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
22657 -#endif /* !__ASSEMBLY__ */
22658 +#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
22660 #define __PHYSICAL_START CONFIG_PHYSICAL_START
22661 #define __KERNEL_ALIGN 0x200000
22662 @@ -166,52 +42,58 @@ static inline pgd_t __pgd(unsigned long
22664 #define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
22665 #define __START_KERNEL_map _AC(0xffffffff80000000, UL)
22666 -#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
22668 #if CONFIG_XEN_COMPAT <= 0x030002
22670 #define LOAD_OFFSET 0
22673 -/* to align the pointer to the (next) page boundary */
22674 -#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
22676 -#define KERNEL_TEXT_SIZE (40*1024*1024)
22677 -#define KERNEL_TEXT_START _AC(0xffffffff80000000, UL)
22678 +/* See Documentation/x86_64/mm.txt for a description of the memory map. */
22679 +#define __PHYSICAL_MASK_SHIFT 46
22680 +#define __VIRTUAL_MASK_SHIFT 48
22682 -#define PAGE_OFFSET __PAGE_OFFSET
22684 + * Kernel image size is limited to 128 MB (see level2_kernel_pgt in
22685 + * arch/x86/kernel/head_64.S), and it is mapped here:
22687 +#define KERNEL_IMAGE_SIZE (128*1024*1024)
22688 +#define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL)
22690 #ifndef __ASSEMBLY__
22691 +void clear_page(void *page);
22692 +void copy_page(void *to, void *from);
22694 +extern unsigned long end_pfn;
22695 +extern unsigned long end_pfn_map;
22697 static inline unsigned long __phys_addr(unsigned long x)
22699 - return x - (x >= __START_KERNEL_map ? __START_KERNEL_map : PAGE_OFFSET);
22700 + return x - (x >= __START_KERNEL_map ? __START_KERNEL_map : __PAGE_OFFSET);
22704 -#define __pa(x) __phys_addr((unsigned long)(x))
22705 -#define __pa_symbol(x) __phys_addr((unsigned long)(x))
22706 +#define __phys_reloc_hide(x) (x)
22708 -#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
22709 -#define __boot_va(x) __va(x)
22710 -#define __boot_pa(x) __pa(x)
22711 -#ifdef CONFIG_FLATMEM
22712 -#define pfn_valid(pfn) ((pfn) < end_pfn)
22715 + * These are used to make use of C type-checking..
22717 +typedef unsigned long pteval_t;
22718 +typedef unsigned long pmdval_t;
22719 +typedef unsigned long pudval_t;
22720 +typedef unsigned long pgdval_t;
22721 +typedef unsigned long pgprotval_t;
22722 +typedef unsigned long phys_addr_t;
22724 -#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
22725 -#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
22726 -#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
22728 -#define VM_DATA_DEFAULT_FLAGS \
22729 - (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
22730 - VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
22731 +typedef struct page *pgtable_t;
22733 +typedef union { pteval_t pte; unsigned int pte_low; } pte_t;
22735 -#define __HAVE_ARCH_GATE_AREA 1
22736 #define vmemmap ((struct page *)VMEMMAP_START)
22738 -#include <asm-generic/memory_model.h>
22739 -#include <asm-generic/page.h>
22740 +#endif /* !__ASSEMBLY__ */
22742 +#ifdef CONFIG_FLATMEM
22743 +#define pfn_valid(pfn) ((pfn) < max_mapnr)
22746 -#endif /* __KERNEL__ */
22748 #endif /* _X86_64_PAGE_H */
22749 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/pci.h 2009-02-16 16:18:36.000000000 +0100
22750 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/pci.h 2009-03-16 16:33:40.000000000 +0100
22751 @@ -71,6 +71,7 @@ extern int pci_mmap_page_range(struct pc
22755 +extern void early_quirks(void);
22756 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
22757 enum pci_dma_burst_strategy *strat,
22758 unsigned long *strategy_parameter)
22759 @@ -78,9 +79,10 @@ static inline void pci_dma_burst_advice(
22760 *strat = PCI_DMA_BURST_INFINITY;
22761 *strategy_parameter = ~0UL;
22764 +static inline void early_quirks(void) { }
22768 #endif /* __KERNEL__ */
22770 #ifdef CONFIG_X86_32
22771 @@ -95,6 +97,19 @@ static inline void pci_dma_burst_advice(
22772 /* generic pci stuff */
22773 #include <asm-generic/pci.h>
22775 +#ifdef CONFIG_NUMA
22776 +/* Returns the node based on pci bus */
22777 +static inline int __pcibus_to_node(struct pci_bus *bus)
22779 + struct pci_sysdata *sd = bus->sysdata;
22784 +static inline cpumask_t __pcibus_to_cpumask(struct pci_bus *bus)
22786 + return node_to_cpumask(__pcibus_to_node(bus));
22791 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/pgalloc_32.h 2009-02-16 16:17:21.000000000 +0100
22792 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/pgalloc_32.h 2009-03-16 16:33:40.000000000 +0100
22795 #include <linux/threads.h>
22796 #include <linux/mm.h> /* for struct page */
22797 +#include <linux/pagemap.h>
22798 +#include <asm/tlb.h>
22799 +#include <asm-generic/tlb.h>
22800 #include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
22802 #define paravirt_alloc_pt(mm, pfn) do { } while (0)
22803 -#define paravirt_alloc_pd(pfn) do { } while (0)
22804 -#define paravirt_alloc_pd(pfn) do { } while (0)
22805 +#define paravirt_alloc_pd(mm, pfn) do { } while (0)
22806 #define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0)
22807 #define paravirt_release_pt(pfn) do { } while (0)
22808 #define paravirt_release_pd(pfn) do { } while (0)
22810 -#define pmd_populate_kernel(mm, pmd, pte) \
22812 - paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT); \
22813 - set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))); \
22816 -#define pmd_populate(mm, pmd, pte) \
22818 - unsigned long pfn = page_to_pfn(pte); \
22819 - paravirt_alloc_pt(mm, pfn); \
22820 - if (PagePinned(virt_to_page((mm)->pgd))) { \
22821 - if (!PageHighMem(pte)) \
22822 - BUG_ON(HYPERVISOR_update_va_mapping( \
22823 - (unsigned long)__va(pfn << PAGE_SHIFT), \
22824 - pfn_pte(pfn, PAGE_KERNEL_RO), 0)); \
22825 - else if (!test_and_set_bit(PG_pinned, &pte->flags)) \
22826 - kmap_flush_unused(); \
22828 - __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT))); \
22830 - *(pmd) = __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT)); \
22832 +static inline void pmd_populate_kernel(struct mm_struct *mm,
22833 + pmd_t *pmd, pte_t *pte)
22835 + paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT);
22836 + set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
22839 +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
22841 + unsigned long pfn = page_to_pfn(pte);
22843 + paravirt_alloc_pt(mm, pfn);
22844 + if (PagePinned(virt_to_page(mm->pgd))) {
22845 + if (!PageHighMem(pte))
22846 + BUG_ON(HYPERVISOR_update_va_mapping(
22847 + (unsigned long)__va(pfn << PAGE_SHIFT),
22848 + pfn_pte(pfn, PAGE_KERNEL_RO), 0));
22849 + else if (!test_and_set_bit(PG_pinned, &pte->flags))
22850 + kmap_flush_unused();
22851 + set_pmd(pmd, __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
22853 + *pmd = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE);
22855 +#define pmd_pgtable(pmd) pmd_page(pmd)
22858 * Allocate and free page tables.
22860 +extern void pgd_test_and_unpin(pgd_t *);
22861 extern pgd_t *pgd_alloc(struct mm_struct *);
22862 -extern void pgd_free(pgd_t *pgd);
22863 +extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
22865 extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
22866 -extern struct page *pte_alloc_one(struct mm_struct *, unsigned long);
22867 +extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
22869 -static inline void pte_free_kernel(pte_t *pte)
22870 +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
22872 make_lowmem_page_writable(pte, XENFEAT_writable_page_tables);
22873 free_page((unsigned long)pte);
22876 -extern void pte_free(struct page *pte);
22877 +extern void __pte_free(pgtable_t);
22878 +static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
22884 -#define __pte_free_tlb(tlb,pte) \
22886 - paravirt_release_pt(page_to_pfn(pte)); \
22887 - tlb_remove_page((tlb),(pte)); \
22889 +extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
22891 #ifdef CONFIG_X86_PAE
22893 * In the PAE case we free the pmds as part of the pgd.
22895 -#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); })
22896 -#define pmd_free(x) do { } while (0)
22897 -#define __pmd_free_tlb(tlb,x) do { } while (0)
22898 -#define pud_populate(mm, pmd, pte) BUG()
22900 +extern pmd_t *pmd_alloc_one(struct mm_struct *, unsigned long);
22902 +extern void __pmd_free(pgtable_t);
22903 +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
22905 + BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
22906 + __pmd_free(virt_to_page(pmd));
22909 +extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
22911 +static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
22913 + struct page *page = virt_to_page(pmd);
22914 + unsigned long pfn = page_to_pfn(page);
22916 + paravirt_alloc_pd(mm, pfn);
22918 + /* Note: almost everything apart from _PAGE_PRESENT is
22919 + reserved at the pmd (PDPT) level. */
22920 + if (PagePinned(virt_to_page(mm->pgd))) {
22921 + BUG_ON(PageHighMem(page));
22922 + BUG_ON(HYPERVISOR_update_va_mapping(
22923 + (unsigned long)__va(pfn << PAGE_SHIFT),
22924 + pfn_pte(pfn, PAGE_KERNEL_RO), 0));
22925 + set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
22927 + *pudp = __pud(__pa(pmd) | _PAGE_PRESENT);
22930 + * According to Intel App note "TLBs, Paging-Structure Caches,
22931 + * and Their Invalidation", April 2007, document 317080-001,
22932 + * section 8.1: in PAE mode we explicitly have to flush the
22933 + * TLB via cr3 if the top-level pgd is changed...
22935 + if (mm == current->active_mm)
22938 +#endif /* CONFIG_X86_PAE */
22940 #endif /* _I386_PGALLOC_H */
22941 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/pgalloc_64.h 2009-02-16 16:18:36.000000000 +0100
22942 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/pgalloc_64.h 2009-03-16 16:33:40.000000000 +0100
22944 #include <linux/mm.h>
22945 #include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
22947 -#include <xen/features.h>
22948 -void make_page_readonly(void *va, unsigned int feature);
22949 -void make_page_writable(void *va, unsigned int feature);
22950 -void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
22951 -void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
22952 +pmd_t *early_get_pmd(unsigned long va);
22953 +void early_make_page_readonly(void *va, unsigned int feature);
22955 #define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
22957 -static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
22959 - set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)));
22962 -static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
22964 - if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
22965 - BUG_ON(HYPERVISOR_update_va_mapping(
22966 - (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
22967 - pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
22968 - set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
22970 - *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
22973 +#define pmd_populate_kernel(mm, pmd, pte) \
22974 + set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)))
22976 static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
22978 @@ -63,53 +46,58 @@ static inline void pgd_populate(struct m
22982 -extern struct page *pte_alloc_one(struct mm_struct *mm, unsigned long addr);
22983 -extern void pte_free(struct page *pte);
22984 +#define pmd_pgtable(pmd) pmd_page(pmd)
22986 -static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
22987 +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
22991 - pg = pte_alloc_one(mm, addr);
22992 - return pg ? page_address(pg) : NULL;
22993 + if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
22994 + BUG_ON(HYPERVISOR_update_va_mapping(
22995 + (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
22996 + pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
22997 + set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
22999 + *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
23003 -static inline void pmd_free(pmd_t *pmd)
23004 +extern void __pmd_free(pgtable_t);
23005 +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
23007 BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
23008 - pte_free(virt_to_page(pmd));
23009 + __pmd_free(virt_to_page(pmd));
23012 +extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr);
23014 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
23018 - pg = pte_alloc_one(mm, addr);
23019 - return pg ? page_address(pg) : NULL;
23020 + return (pud_t *)pmd_alloc_one(mm, addr);
23023 -static inline void pud_free(pud_t *pud)
23024 +static inline void pud_free(struct mm_struct *mm, pud_t *pud)
23026 BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
23027 - pte_free(virt_to_page(pud));
23028 + __pmd_free(virt_to_page(pud));
23031 static inline void pgd_list_add(pgd_t *pgd)
23033 struct page *page = virt_to_page(pgd);
23034 + unsigned long flags;
23036 - spin_lock(&pgd_lock);
23037 + spin_lock_irqsave(&pgd_lock, flags);
23038 list_add(&page->lru, &pgd_list);
23039 - spin_unlock(&pgd_lock);
23040 + spin_unlock_irqrestore(&pgd_lock, flags);
23043 static inline void pgd_list_del(pgd_t *pgd)
23045 struct page *page = virt_to_page(pgd);
23046 + unsigned long flags;
23048 - spin_lock(&pgd_lock);
23049 + spin_lock_irqsave(&pgd_lock, flags);
23050 list_del(&page->lru);
23051 - spin_unlock(&pgd_lock);
23052 + spin_unlock_irqrestore(&pgd_lock, flags);
23055 extern void pgd_test_and_unpin(pgd_t *);
23056 @@ -145,7 +133,7 @@ static inline pgd_t *pgd_alloc(struct mm
23060 -static inline void pgd_free(pgd_t *pgd)
23061 +static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
23063 pgd_test_and_unpin(pgd);
23065 @@ -161,17 +149,30 @@ static inline pte_t *pte_alloc_one_kerne
23069 +extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr);
23071 /* Should really implement gc for free page table pages. This could be
23072 done with a reference count in struct page. */
23074 -static inline void pte_free_kernel(pte_t *pte)
23075 +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
23077 BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
23078 make_page_writable(pte, XENFEAT_writable_page_tables);
23079 free_page((unsigned long)pte);
23082 -#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
23083 +extern void __pte_free(pgtable_t);
23084 +static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
23089 +#define __pte_free_tlb(tlb,pte) \
23091 + pgtable_page_dtor((pte)); \
23092 + tlb_remove_page((tlb), (pte)); \
23095 #define __pmd_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
23096 #define __pud_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
23098 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/pgtable.h 2009-02-16 16:18:36.000000000 +0100
23099 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/pgtable.h 2009-03-16 16:33:40.000000000 +0100
23101 +#ifndef _ASM_X86_PGTABLE_H
23102 +#define _ASM_X86_PGTABLE_H
23104 +#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
23105 +#define FIRST_USER_ADDRESS 0
23107 +#define _PAGE_BIT_PRESENT 0
23108 +#define _PAGE_BIT_RW 1
23109 +#define _PAGE_BIT_USER 2
23110 +#define _PAGE_BIT_PWT 3
23111 +#define _PAGE_BIT_PCD 4
23112 +#define _PAGE_BIT_ACCESSED 5
23113 +#define _PAGE_BIT_DIRTY 6
23114 +#define _PAGE_BIT_FILE 6
23115 +#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
23116 +#define _PAGE_BIT_PAT 7 /* on 4KB pages */
23117 +#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
23118 +#define _PAGE_BIT_IO 9 /* Mapped page is I/O or foreign and
23119 + * has no associated page struct. */
23120 +#define _PAGE_BIT_UNUSED2 10 /* available for programmer */
23121 +#define _PAGE_BIT_UNUSED3 11
23122 +#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
23123 +#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
23126 + * Note: we use _AC(1, L) instead of _AC(1, UL) so that we get a
23127 + * sign-extended value on 32-bit with all 1's in the upper word,
23128 + * which preserves the upper pte values on 64-bit ptes:
23130 +#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
23131 +#define _PAGE_RW (_AC(1, L)<<_PAGE_BIT_RW)
23132 +#define _PAGE_USER (_AC(1, L)<<_PAGE_BIT_USER)
23133 +#define _PAGE_PWT (_AC(1, L)<<_PAGE_BIT_PWT)
23134 +#define _PAGE_PCD (_AC(1, L)<<_PAGE_BIT_PCD)
23135 +#define _PAGE_ACCESSED (_AC(1, L)<<_PAGE_BIT_ACCESSED)
23136 +#define _PAGE_DIRTY (_AC(1, L)<<_PAGE_BIT_DIRTY)
23137 +#define _PAGE_PSE (_AC(1, L)<<_PAGE_BIT_PSE) /* 2MB page */
23138 +#define _PAGE_GLOBAL (_AC(1, L)<<_PAGE_BIT_GLOBAL) /* Global TLB entry */
23139 +#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
23140 +#define _PAGE_UNUSED2 (_AC(1, L)<<_PAGE_BIT_UNUSED2)
23141 +#define _PAGE_UNUSED3 (_AC(1, L)<<_PAGE_BIT_UNUSED3)
23142 +#define _PAGE_PAT (_AC(1, L)<<_PAGE_BIT_PAT)
23143 +#define _PAGE_PAT_LARGE (_AC(1, L)<<_PAGE_BIT_PAT_LARGE)
23145 +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
23146 +#define _PAGE_NX (_AC(1, ULL) << _PAGE_BIT_NX)
23148 +#define _PAGE_NX 0
23151 +/* If _PAGE_PRESENT is clear, we use these: */
23152 +#define _PAGE_FILE _PAGE_DIRTY /* nonlinear file mapping, saved PTE; unset:swap */
23153 +#define _PAGE_PROTNONE _PAGE_PSE /* if the user mapped it with PROT_NONE;
23154 + pte_present gives true */
23156 +#ifndef __ASSEMBLY__
23157 +#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
23158 +extern unsigned int __kernel_page_user;
23160 +#define __kernel_page_user 0
23164 +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
23165 +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
23167 +#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
23169 +#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
23170 +#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
23172 +#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
23173 +#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
23174 +#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
23175 +#define PAGE_COPY PAGE_COPY_NOEXEC
23176 +#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
23177 +#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
23179 +#ifdef CONFIG_X86_32
23180 +#define _PAGE_KERNEL_EXEC \
23181 + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
23182 +#define _PAGE_KERNEL (_PAGE_KERNEL_EXEC | _PAGE_NX)
23184 +#ifndef __ASSEMBLY__
23185 +extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
23186 +#endif /* __ASSEMBLY__ */
23188 +#define __PAGE_KERNEL_EXEC \
23189 + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
23190 +#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
23193 +#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
23194 +#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
23195 +#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
23196 +#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
23197 +#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
23198 +#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
23199 +#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
23200 +#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
23201 +#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
23204 + * We don't support GLOBAL page in xenolinux64
23206 +#define MAKE_GLOBAL(x) __pgprot((x))
23208 +#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
23209 +#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
23210 +#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
23211 +#define PAGE_KERNEL_RX MAKE_GLOBAL(__PAGE_KERNEL_RX)
23212 +#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
23213 +#define PAGE_KERNEL_UC_MINUS MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS)
23214 +#define PAGE_KERNEL_EXEC_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE)
23215 +#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
23216 +#define PAGE_KERNEL_LARGE_EXEC MAKE_GLOBAL(__PAGE_KERNEL_LARGE_EXEC)
23217 +#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
23218 +#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
23221 +#define __P000 PAGE_NONE
23222 +#define __P001 PAGE_READONLY
23223 +#define __P010 PAGE_COPY
23224 +#define __P011 PAGE_COPY
23225 +#define __P100 PAGE_READONLY_EXEC
23226 +#define __P101 PAGE_READONLY_EXEC
23227 +#define __P110 PAGE_COPY_EXEC
23228 +#define __P111 PAGE_COPY_EXEC
23230 +#define __S000 PAGE_NONE
23231 +#define __S001 PAGE_READONLY
23232 +#define __S010 PAGE_SHARED
23233 +#define __S011 PAGE_SHARED
23234 +#define __S100 PAGE_READONLY_EXEC
23235 +#define __S101 PAGE_READONLY_EXEC
23236 +#define __S110 PAGE_SHARED_EXEC
23237 +#define __S111 PAGE_SHARED_EXEC
23239 +#ifndef __ASSEMBLY__
23242 + * ZERO_PAGE is a global shared page that is always zero: used
23243 + * for zero-mapped memory areas etc..
23245 +extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
23246 +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
23248 +extern spinlock_t pgd_lock;
23249 +extern struct list_head pgd_list;
23252 + * The following only work if pte_present() is true.
23253 + * Undefined behaviour if not..
23255 +static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; }
23256 +static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; }
23257 +static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; }
23258 +static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; }
23259 +static inline int pte_huge(pte_t pte) { return __pte_val(pte) & _PAGE_PSE; }
23260 +static inline int pte_global(pte_t pte) { return 0; }
23261 +static inline int pte_exec(pte_t pte) { return !(__pte_val(pte) & _PAGE_NX); }
23263 +static inline int pmd_large(pmd_t pte) {
23264 + return (__pmd_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) ==
23265 + (_PAGE_PSE|_PAGE_PRESENT);
23268 +static inline pte_t pte_mkclean(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY); }
23269 +static inline pte_t pte_mkold(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED); }
23270 +static inline pte_t pte_wrprotect(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW); }
23271 +static inline pte_t pte_mkexec(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX); }
23272 +static inline pte_t pte_mkdirty(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_DIRTY); }
23273 +static inline pte_t pte_mkyoung(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_ACCESSED); }
23274 +static inline pte_t pte_mkwrite(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_RW); }
23275 +static inline pte_t pte_mkhuge(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_PSE); }
23276 +static inline pte_t pte_clrhuge(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE); }
23277 +static inline pte_t pte_mkglobal(pte_t pte) { return pte; }
23278 +static inline pte_t pte_clrglobal(pte_t pte) { return pte; }
23280 +extern pteval_t __supported_pte_mask;
23282 +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
23284 + return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) |
23285 + pgprot_val(pgprot)) & __supported_pte_mask);
23288 +static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
23290 + return __pte_ma((((phys_addr_t)page_nr << PAGE_SHIFT) |
23291 + pgprot_val(pgprot)) & __supported_pte_mask);
23294 +static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
23296 + return __pmd((((phys_addr_t)page_nr << PAGE_SHIFT) |
23297 + pgprot_val(pgprot)) & __supported_pte_mask);
23300 +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
23302 + pteval_t val = pte_val(pte);
23304 + val &= _PAGE_CHG_MASK;
23305 + val |= pgprot_val(newprot) & __supported_pte_mask;
23307 + return __pte(val);
23310 +#define pte_pgprot(x) __pgprot(pte_val(x) & (0xfff | _PAGE_NX))
23312 +#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
23314 +#define set_pte(ptep, pte) xen_set_pte(ptep, pte)
23315 +#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
23317 +#define set_pte_atomic(ptep, pte) \
23318 + xen_set_pte_atomic(ptep, pte)
23320 +#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd)
23322 +#ifndef __PAGETABLE_PUD_FOLDED
23323 +#define set_pgd(pgdp, pgd) xen_set_pgd(pgdp, pgd)
23324 +#define pgd_clear(pgd) xen_pgd_clear(pgd)
23328 +# define set_pud(pudp, pud) xen_set_pud(pudp, pud)
23331 +#ifndef __PAGETABLE_PMD_FOLDED
23332 +#define pud_clear(pud) xen_pud_clear(pud)
23335 +#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep)
23336 +#define pmd_clear(pmd) xen_pmd_clear(pmd)
23338 +#define pte_update(mm, addr, ptep) do { } while (0)
23339 +#define pte_update_defer(mm, addr, ptep) do { } while (0)
23341 +#endif /* __ASSEMBLY__ */
23343 #ifdef CONFIG_X86_32
23344 # include "pgtable_32.h"
23346 # include "pgtable_64.h"
23349 +#ifndef __ASSEMBLY__
23359 + * Helper function that returns the kernel pagetable entry controlling
23360 + * the virtual address 'address'. NULL means no pagetable entry present.
23361 + * NOTE: the return type is pte_t but if the pmd is PSE then we return it
23364 +extern pte_t *lookup_address(unsigned long address, unsigned int *level);
23366 +/* local pte updates need not use xchg for locking */
23367 +static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res)
23369 + xen_set_pte(ptep, __pte(0));
23373 +static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
23374 + pte_t *ptep , pte_t pte)
23376 + if ((mm != current->mm && mm != &init_mm) ||
23377 + HYPERVISOR_update_va_mapping(addr, pte, 0))
23378 + xen_set_pte(ptep, pte);
23381 +static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr,
23384 + if ((mm != current->mm && mm != &init_mm)
23385 + || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
23386 + __xen_pte_clear(ptep);
23389 +#ifndef CONFIG_PARAVIRT
23391 + * Rules for using pte_update - it must be called after any PTE update which
23392 + * has not been done using the set_pte / clear_pte interfaces. It is used by
23393 + * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE
23394 + * updates should either be sets, clears, or set_pte_atomic for P->P
23395 + * transitions, which means this hook should only be called for user PTEs.
23396 + * This hook implies a P->P protection or access change has taken place, which
23397 + * requires a subsequent TLB flush. The notification can optionally be delayed
23398 + * until the TLB flush event by using the pte_update_defer form of the
23399 + * interface, but care must be taken to assure that the flush happens while
23400 + * still holding the same page table lock so that the shadow and primary pages
23401 + * do not become out of sync on SMP.
23403 +#define pte_update(mm, addr, ptep) do { } while (0)
23404 +#define pte_update_defer(mm, addr, ptep) do { } while (0)
23408 + * We only update the dirty/accessed state if we set
23409 + * the dirty bit by hand in the kernel, since the hardware
23410 + * will do the accessed bit for us, and we don't want to
23411 + * race with other CPUs that might be updating the dirty
23412 + * bit at the same time.
23414 +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
23415 +#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
23417 + int __changed = !pte_same(*(ptep), entry); \
23418 + if (__changed && (dirty)) { \
23419 + if ( likely((vma)->vm_mm == current->mm) ) { \
23420 + BUG_ON(HYPERVISOR_update_va_mapping(address, \
23422 + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
23423 + UVMF_INVLPG|UVMF_MULTI)); \
23425 + xen_l1_entry_update(ptep, entry); \
23426 + flush_tlb_page(vma, address); \
23432 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
23433 +#define ptep_test_and_clear_young(vma, addr, ptep) ({ \
23435 + if (pte_young(*(ptep))) \
23436 + __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \
23439 + pte_update((vma)->vm_mm, addr, ptep); \
23443 +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
23444 +#define ptep_clear_flush_young(vma, address, ptep) \
23446 + pte_t __pte = *(ptep); \
23447 + int __young = pte_young(__pte); \
23448 + __pte = pte_mkold(__pte); \
23449 + if (PagePinned(virt_to_page((vma)->vm_mm->pgd))) \
23450 + (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
23451 + else if (__young) \
23452 + (ptep)->pte_low = __pte.pte_low; \
23456 +#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
23457 +#define ptep_clear_flush(vma, addr, ptep) \
23459 + pte_t *__ptep = (ptep); \
23460 + pte_t __res = *__ptep; \
23461 + if (!pte_none(__res) && \
23462 + ((vma)->vm_mm != current->mm || \
23463 + HYPERVISOR_update_va_mapping(addr, __pte(0), \
23464 + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
23465 + UVMF_INVLPG|UVMF_MULTI))) { \
23466 + __xen_pte_clear(__ptep); \
23467 + flush_tlb_page(vma, addr); \
23472 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
23473 +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
23475 + pte_t pte = *ptep;
23476 + if (!pte_none(pte)
23477 + && (mm != &init_mm
23478 + || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) {
23479 + pte = xen_ptep_get_and_clear(ptep, pte);
23480 + pte_update(mm, addr, ptep);
23485 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
23486 +#define ptep_get_and_clear_full(mm, addr, ptep, full) \
23488 + pte_t *__ptep = (ptep); \
23489 + pte_t __res = *__ptep; \
23490 + if (!PagePinned(virt_to_page((mm)->pgd))) \
23491 + __xen_pte_clear(__ptep); \
23492 + else if (!pte_none(__res)) \
23493 + xen_l1_entry_update(__ptep, __pte(0)); \
23496 + ptep_get_and_clear(mm, addr, ptep))
23498 +pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *, unsigned long, pte_t *, int);
23500 +#define __HAVE_ARCH_PTEP_SET_WRPROTECT
23501 +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
23503 + pte_t pte = *ptep;
23504 + if (pte_write(pte))
23505 + set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
23508 +#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
23509 + xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
23511 +#define arbitrary_virt_to_machine(va) \
23513 + unsigned int __lvl; \
23514 + pte_t *__ptep = lookup_address((unsigned long)(va), &__lvl); \
23515 + BUG_ON(!__ptep || __lvl != PG_LEVEL_4K || !pte_present(*__ptep));\
23516 + (((maddr_t)pte_mfn(*__ptep) << PAGE_SHIFT) \
23517 + | ((unsigned long)(va) & (PAGE_SIZE - 1))); \
23520 +#ifdef CONFIG_HIGHPTE
23521 +#include <asm/io.h>
23522 +struct page *kmap_atomic_to_page(void *);
23523 +#define ptep_to_machine(ptep) \
23525 + pte_t *__ptep = (ptep); \
23526 + page_to_phys(kmap_atomic_to_page(__ptep)) \
23527 + | ((unsigned long)__ptep & (PAGE_SIZE - 1)); \
23530 +#define ptep_to_machine(ptep) virt_to_machine(ptep)
23533 +#include <asm-generic/pgtable.h>
23535 +#include <xen/features.h>
23536 +void make_page_readonly(void *va, unsigned int feature);
23537 +void make_page_writable(void *va, unsigned int feature);
23538 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
23539 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
23541 +struct vm_area_struct;
23543 +int direct_remap_pfn_range(struct vm_area_struct *vma,
23544 + unsigned long address,
23545 + unsigned long mfn,
23546 + unsigned long size,
23549 +int direct_kernel_remap_pfn_range(unsigned long address,
23550 + unsigned long mfn,
23551 + unsigned long size,
23554 +int create_lookup_pte_addr(struct mm_struct *mm,
23555 + unsigned long address,
23557 +int touch_pte_range(struct mm_struct *mm,
23558 + unsigned long address,
23559 + unsigned long size);
23561 +int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
23562 + unsigned long addr, unsigned long end, pgprot_t newprot,
23563 + int dirty_accountable);
23565 +#endif /* __ASSEMBLY__ */
23567 +#endif /* _ASM_X86_PGTABLE_H */
23568 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/pgtable-3level.h 2009-02-16 16:17:21.000000000 +0100
23569 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/pgtable-3level.h 2009-03-16 16:33:40.000000000 +0100
23570 @@ -18,16 +18,18 @@
23571 printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \
23572 &(e), __pgd_val(e), (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
23574 -#define pud_none(pud) 0
23575 -#define pud_bad(pud) 0
23576 -#define pud_present(pud) 1
23579 - * All present pages with !NX bit are kernel-executable:
23581 -static inline int pte_exec_kernel(pte_t pte)
23582 +static inline int pud_none(pud_t pud)
23584 + return __pud_val(pud) == 0;
23586 +static inline int pud_bad(pud_t pud)
23588 - return !(__pte_val(pte) & _PAGE_NX);
23589 + return (__pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
23591 +static inline int pud_present(pud_t pud)
23593 + return __pud_val(pud) & _PAGE_PRESENT;
23596 /* Rules for using set_pte: the pte being assigned *must* be
23597 @@ -44,14 +46,6 @@ static inline void xen_set_pte(pte_t *pt
23598 ptep->pte_low = pte.pte_low;
23601 -static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
23602 - pte_t *ptep , pte_t pte)
23604 - if ((mm != current->mm && mm != &init_mm) ||
23605 - HYPERVISOR_update_va_mapping(addr, pte, 0))
23606 - xen_set_pte(ptep, pte);
23609 static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
23611 set_64bit((unsigned long long *)(ptep),__pte_val(pte));
23612 @@ -70,14 +64,11 @@ static inline void xen_set_pud(pud_t *pu
23613 * entry, so clear the bottom half first and enforce ordering with a compiler
23616 -static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
23617 +static inline void __xen_pte_clear(pte_t *ptep)
23619 - if ((mm != current->mm && mm != &init_mm)
23620 - || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
23621 - ptep->pte_low = 0;
23623 - ptep->pte_high = 0;
23625 + ptep->pte_low = 0;
23627 + ptep->pte_high = 0;
23630 static inline void xen_pmd_clear(pmd_t *pmd)
23631 @@ -85,21 +76,25 @@ static inline void xen_pmd_clear(pmd_t *
23632 xen_l2_entry_update(pmd, __pmd(0));
23635 -#define set_pte(ptep, pte) xen_set_pte(ptep, pte)
23636 -#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
23637 -#define set_pte_atomic(ptep, pte) xen_set_pte_atomic(ptep, pte)
23638 -#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd)
23639 -#define set_pud(pudp, pud) xen_set_pud(pudp, pud)
23640 -#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep)
23641 -#define pmd_clear(pmd) xen_pmd_clear(pmd)
23642 +static inline void pud_clear(pud_t *pudp)
23646 + set_pud(pudp, __pud(0));
23649 - * Pentium-II erratum A13: in PAE mode we explicitly have to flush
23650 - * the TLB via cr3 if the top-level pgd is changed...
23651 - * We do not let the generic code free and clear pgd entries due to
23654 -static inline void pud_clear (pud_t * pud) { }
23656 + * According to Intel App note "TLBs, Paging-Structure Caches,
23657 + * and Their Invalidation", April 2007, document 317080-001,
23658 + * section 8.1: in PAE mode we explicitly have to flush the
23659 + * TLB via cr3 if the top-level pgd is changed...
23661 + * Make sure the pud entry we're updating is within the
23662 + * current pgd to avoid unnecessary TLB flushes.
23664 + pgd = read_cr3();
23665 + if (__pa(pudp) >= pgd && __pa(pudp) < (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
23669 #define pud_page(pud) \
23670 ((struct page *) __va(pud_val(pud) & PAGE_MASK))
23671 @@ -128,24 +123,6 @@ static inline pte_t xen_ptep_get_and_cle
23672 #define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte)
23675 -#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
23676 -#define ptep_clear_flush(vma, addr, ptep) \
23678 - pte_t *__ptep = (ptep); \
23679 - pte_t __res = *__ptep; \
23680 - if (!pte_none(__res) && \
23681 - ((vma)->vm_mm != current->mm || \
23682 - HYPERVISOR_update_va_mapping(addr, __pte(0), \
23683 - (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
23684 - UVMF_INVLPG|UVMF_MULTI))) { \
23685 - __ptep->pte_low = 0; \
23687 - __ptep->pte_high = 0; \
23688 - flush_tlb_page(vma, addr); \
23693 #define __HAVE_ARCH_PTE_SAME
23694 static inline int pte_same(pte_t a, pte_t b)
23696 @@ -168,26 +145,12 @@ static inline int pte_none(pte_t pte)
23697 mfn_to_local_pfn(__pte_mfn(_pte)) : \
23700 -extern unsigned long long __supported_pte_mask;
23702 -static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
23704 - return __pte((((unsigned long long)page_nr << PAGE_SHIFT) |
23705 - pgprot_val(pgprot)) & __supported_pte_mask);
23708 -static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
23710 - return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) |
23711 - pgprot_val(pgprot)) & __supported_pte_mask);
23715 * Bits 0, 6 and 7 are taken in the low part of the pte,
23716 * put the 32 bits of offset into the high part.
23718 #define pte_to_pgoff(pte) ((pte).pte_high)
23719 -#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) })
23720 +#define pgoff_to_pte(off) ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
23721 #define PTE_FILE_MAX_BITS 32
23723 /* Encode and de-code a swap entry */
23724 @@ -195,8 +158,6 @@ static inline pmd_t pfn_pmd(unsigned lon
23725 #define __swp_offset(x) ((x).val >> 5)
23726 #define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
23727 #define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
23728 -#define __swp_entry_to_pte(x) ((pte_t){ 0, (x).val })
23730 -#define __pmd_free_tlb(tlb, x) do { } while (0)
23731 +#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } })
23733 #endif /* _I386_PGTABLE_3LEVEL_H */
23734 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/pgtable_32.h 2009-02-16 16:18:36.000000000 +0100
23735 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/pgtable_32.h 2009-03-16 16:33:40.000000000 +0100
23737 #ifndef _I386_PGTABLE_H
23738 #define _I386_PGTABLE_H
23740 -#include <asm/hypervisor.h>
23743 * The Linux memory management assumes a three-level page table setup. On
23744 * the i386, we use that, but "fold" the mid level into the top-level page
23745 @@ -25,20 +23,10 @@
23747 struct vm_area_struct;
23750 - * ZERO_PAGE is a global shared page that is always zero: used
23751 - * for zero-mapped memory areas etc..
23753 -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
23754 -extern unsigned long empty_zero_page[1024];
23755 extern pgd_t *swapper_pg_dir;
23756 -extern struct kmem_cache *pmd_cache;
23757 -extern spinlock_t pgd_lock;
23758 -extern struct page *pgd_list;
23759 -void check_pgt_cache(void);
23761 -void pmd_ctor(struct kmem_cache *, void *);
23762 -void pgtable_cache_init(void);
23763 +static inline void pgtable_cache_init(void) { }
23764 +static inline void check_pgt_cache(void) { }
23765 void paging_init(void);
23768 @@ -58,16 +46,9 @@ void paging_init(void);
23769 #define PGDIR_SIZE (1UL << PGDIR_SHIFT)
23770 #define PGDIR_MASK (~(PGDIR_SIZE-1))
23772 -#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE)
23773 -#define FIRST_USER_ADDRESS 0
23775 #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
23776 #define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
23778 -#define TWOLEVEL_PGDIR_SHIFT 22
23779 -#define BOOT_USER_PGD_PTRS (__PAGE_OFFSET >> TWOLEVEL_PGDIR_SHIFT)
23780 -#define BOOT_KERNEL_PGD_PTRS (1024-BOOT_USER_PGD_PTRS)
23782 /* Just any arbitrary offset to the start of the vmalloc VM area: the
23783 * current 8MB value just means that there will be a 8MB "hole" after the
23784 * physical memory until the kernel virtual memory starts. That means that
23785 @@ -78,121 +59,19 @@ void paging_init(void);
23786 #define VMALLOC_OFFSET (8*1024*1024)
23787 #define VMALLOC_START (((unsigned long) high_memory + \
23788 2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
23789 -#ifdef CONFIG_HIGHMEM
23790 -# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE)
23792 -# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE)
23796 - * _PAGE_PSE set in the page directory entry just means that
23797 - * the page directory entry points directly to a 4MB-aligned block of
23800 -#define _PAGE_BIT_PRESENT 0
23801 -#define _PAGE_BIT_RW 1
23802 -#define _PAGE_BIT_USER 2
23803 -#define _PAGE_BIT_PWT 3
23804 -#define _PAGE_BIT_PCD 4
23805 -#define _PAGE_BIT_ACCESSED 5
23806 -#define _PAGE_BIT_DIRTY 6
23807 -#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page, Pentium+, if present.. */
23808 -#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
23809 -/*#define _PAGE_BIT_UNUSED1 9*/ /* available for programmer */
23810 -#define _PAGE_BIT_UNUSED2 10
23811 -#define _PAGE_BIT_UNUSED3 11
23812 -#define _PAGE_BIT_NX 63
23814 -#define _PAGE_PRESENT 0x001
23815 -#define _PAGE_RW 0x002
23816 -#define _PAGE_USER 0x004
23817 -#define _PAGE_PWT 0x008
23818 -#define _PAGE_PCD 0x010
23819 -#define _PAGE_ACCESSED 0x020
23820 -#define _PAGE_DIRTY 0x040
23821 -#define _PAGE_PSE 0x080 /* 4 MB (or 2MB) page, Pentium+, if present.. */
23822 -#define _PAGE_GLOBAL 0x100 /* Global TLB entry PPro+ */
23823 -/*#define _PAGE_UNUSED1 0x200*/ /* available for programmer */
23824 -#define _PAGE_UNUSED2 0x400
23825 -#define _PAGE_UNUSED3 0x800
23827 -/* If _PAGE_PRESENT is clear, we use these: */
23828 -#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */
23829 -#define _PAGE_PROTNONE 0x080 /* if the user mapped it with PROT_NONE;
23830 - pte_present gives true */
23831 #ifdef CONFIG_X86_PAE
23832 -#define _PAGE_NX (1ULL<<_PAGE_BIT_NX)
23833 +#define LAST_PKMAP 512
23835 -#define _PAGE_NX 0
23836 +#define LAST_PKMAP 1024
23839 -/* Mapped page is I/O or foreign and has no associated page struct. */
23840 -#define _PAGE_IO 0x200
23841 +#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK)
23843 -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
23844 -#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
23845 -#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
23847 -#define PAGE_NONE \
23848 - __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
23849 -#define PAGE_SHARED \
23850 - __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
23852 -#define PAGE_SHARED_EXEC \
23853 - __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
23854 -#define PAGE_COPY_NOEXEC \
23855 - __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
23856 -#define PAGE_COPY_EXEC \
23857 - __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
23858 -#define PAGE_COPY \
23860 -#define PAGE_READONLY \
23861 - __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
23862 -#define PAGE_READONLY_EXEC \
23863 - __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
23865 -#define _PAGE_KERNEL \
23866 - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX)
23867 -#define _PAGE_KERNEL_EXEC \
23868 - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
23870 -extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
23871 -#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
23872 -#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
23873 -#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD)
23874 -#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
23875 -#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
23877 -#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
23878 -#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
23879 -#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
23880 -#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
23881 -#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
23882 -#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
23883 -#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
23886 - * The i386 can't do page protection for execute, and considers that
23887 - * the same are read. Also, write permissions imply read permissions.
23888 - * This is the closest we can get..
23890 -#define __P000 PAGE_NONE
23891 -#define __P001 PAGE_READONLY
23892 -#define __P010 PAGE_COPY
23893 -#define __P011 PAGE_COPY
23894 -#define __P100 PAGE_READONLY_EXEC
23895 -#define __P101 PAGE_READONLY_EXEC
23896 -#define __P110 PAGE_COPY_EXEC
23897 -#define __P111 PAGE_COPY_EXEC
23899 -#define __S000 PAGE_NONE
23900 -#define __S001 PAGE_READONLY
23901 -#define __S010 PAGE_SHARED
23902 -#define __S011 PAGE_SHARED
23903 -#define __S100 PAGE_READONLY_EXEC
23904 -#define __S101 PAGE_READONLY_EXEC
23905 -#define __S110 PAGE_SHARED_EXEC
23906 -#define __S111 PAGE_SHARED_EXEC
23907 +#ifdef CONFIG_HIGHMEM
23908 +# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE)
23910 +# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE)
23914 * Define this if things work differently on an i386 and an i486:
23915 @@ -221,28 +100,6 @@ extern unsigned long pg0[];
23917 #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
23920 - * The following only work if pte_present() is true.
23921 - * Undefined behaviour if not..
23923 -static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; }
23924 -static inline int pte_young(pte_t pte) { return (pte).pte_low & _PAGE_ACCESSED; }
23925 -static inline int pte_write(pte_t pte) { return (pte).pte_low & _PAGE_RW; }
23926 -static inline int pte_huge(pte_t pte) { return (pte).pte_low & _PAGE_PSE; }
23929 - * The following only works if pte_present() is not true.
23931 -static inline int pte_file(pte_t pte) { return (pte).pte_low & _PAGE_FILE; }
23933 -static inline pte_t pte_mkclean(pte_t pte) { (pte).pte_low &= ~_PAGE_DIRTY; return pte; }
23934 -static inline pte_t pte_mkold(pte_t pte) { (pte).pte_low &= ~_PAGE_ACCESSED; return pte; }
23935 -static inline pte_t pte_wrprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_RW; return pte; }
23936 -static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; }
23937 -static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; }
23938 -static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; }
23939 -static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return pte; }
23941 #ifdef CONFIG_X86_PAE
23942 # include <asm/pgtable-3level.h>
23944 @@ -250,111 +107,6 @@ static inline pte_t pte_mkhuge(pte_t pte
23948 - * Rules for using pte_update - it must be called after any PTE update which
23949 - * has not been done using the set_pte / clear_pte interfaces. It is used by
23950 - * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE
23951 - * updates should either be sets, clears, or set_pte_atomic for P->P
23952 - * transitions, which means this hook should only be called for user PTEs.
23953 - * This hook implies a P->P protection or access change has taken place, which
23954 - * requires a subsequent TLB flush. The notification can optionally be delayed
23955 - * until the TLB flush event by using the pte_update_defer form of the
23956 - * interface, but care must be taken to assure that the flush happens while
23957 - * still holding the same page table lock so that the shadow and primary pages
23958 - * do not become out of sync on SMP.
23960 -#define pte_update(mm, addr, ptep) do { } while (0)
23961 -#define pte_update_defer(mm, addr, ptep) do { } while (0)
23963 -/* local pte updates need not use xchg for locking */
23964 -static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res)
23966 - xen_set_pte(ptep, __pte(0));
23971 - * We only update the dirty/accessed state if we set
23972 - * the dirty bit by hand in the kernel, since the hardware
23973 - * will do the accessed bit for us, and we don't want to
23974 - * race with other CPU's that might be updating the dirty
23975 - * bit at the same time.
23977 -#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
23978 -#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
23980 - int __changed = !pte_same(*(ptep), entry); \
23981 - if (__changed && (dirty)) { \
23982 - if ( likely((vma)->vm_mm == current->mm) ) { \
23983 - BUG_ON(HYPERVISOR_update_va_mapping(address, \
23985 - (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
23986 - UVMF_INVLPG|UVMF_MULTI)); \
23988 - xen_l1_entry_update(ptep, entry); \
23989 - flush_tlb_page(vma, address); \
23995 -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
23996 -#define ptep_test_and_clear_young(vma, addr, ptep) ({ \
23998 - if (pte_young(*(ptep))) \
23999 - __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \
24000 - &(ptep)->pte_low); \
24002 - pte_update((vma)->vm_mm, addr, ptep); \
24006 -#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
24007 -#define ptep_clear_flush_young(vma, address, ptep) \
24009 - pte_t __pte = *(ptep); \
24010 - int __young = pte_young(__pte); \
24011 - __pte = pte_mkold(__pte); \
24012 - if (PagePinned(virt_to_page((vma)->vm_mm->pgd))) \
24013 - (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
24014 - else if (__young) \
24015 - (ptep)->pte_low = __pte.pte_low; \
24019 -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
24020 -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
24022 - pte_t pte = *ptep;
24023 - if (!pte_none(pte)
24024 - && (mm != &init_mm
24025 - || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) {
24026 - pte = xen_ptep_get_and_clear(ptep, pte);
24027 - pte_update(mm, addr, ptep);
24032 -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
24033 -#define ptep_get_and_clear_full(mm, addr, ptep, full) \
24035 - pte_t __res = *(ptep); \
24036 - if (PagePinned(virt_to_page((mm)->pgd))) \
24037 - xen_l1_entry_update(ptep, __pte(0)); \
24039 - *(ptep) = __pte(0); \
24042 - ptep_get_and_clear(mm, addr, ptep))
24044 -#define __HAVE_ARCH_PTEP_SET_WRPROTECT
24045 -static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
24047 - pte_t pte = *ptep;
24048 - if (pte_write(pte))
24049 - set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
24053 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
24055 * dst - pointer to pgd range anwhere on a pgd page
24056 @@ -383,26 +135,6 @@ static inline void clone_pgd_range(pgd_t
24058 #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
24060 -static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
24063 - * Since this might change the present bit (which controls whether
24064 - * a pte_t object has undergone p2m translation), we must use
24065 - * pte_val() on the input pte and __pte() for the return value.
24067 - paddr_t pteval = pte_val(pte);
24069 - pteval &= _PAGE_CHG_MASK;
24070 - pteval |= pgprot_val(newprot);
24071 -#ifdef CONFIG_X86_PAE
24072 - pteval &= __supported_pte_mask;
24074 - return __pte(pteval);
24077 -#define pmd_large(pmd) \
24078 -((__pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT))
24081 * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
24083 @@ -424,6 +156,8 @@ static inline pte_t pte_modify(pte_t pte
24085 #define pgd_offset_k(address) pgd_offset(&init_mm, address)
24087 +static inline int pud_large(pud_t pud) { return 0; }
24090 * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
24092 @@ -449,26 +183,6 @@ static inline pte_t pte_modify(pte_t pte
24093 #define pmd_page_vaddr(pmd) \
24094 ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
24097 - * Helper function that returns the kernel pagetable entry controlling
24098 - * the virtual address 'address'. NULL means no pagetable entry present.
24099 - * NOTE: the return type is pte_t but if the pmd is PSE then we return it
24102 -extern pte_t *lookup_address(unsigned long address);
24105 - * Make a given kernel text page executable/non-executable.
24106 - * Returns the previous executability setting of that page (which
24107 - * is used to restore the previous state). Used by the SMP bootup code.
24108 - * NOTE: this is an __init function for security reasons.
24110 -#ifdef CONFIG_X86_PAE
24111 - extern int set_kernel_exec(unsigned long vaddr, int enable);
24113 - static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;}
24116 #if defined(CONFIG_HIGHPTE)
24117 #define pte_offset_map(dir, address) \
24118 ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + pte_index(address))
24119 @@ -496,72 +210,22 @@ extern pte_t *lookup_address(unsigned lo
24121 #define update_mmu_cache(vma,address,pte) do { } while (0)
24123 -#include <xen/features.h>
24124 void make_lowmem_page_readonly(void *va, unsigned int feature);
24125 void make_lowmem_page_writable(void *va, unsigned int feature);
24126 -void make_page_readonly(void *va, unsigned int feature);
24127 -void make_page_writable(void *va, unsigned int feature);
24128 -void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
24129 -void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
24131 -#define virt_to_ptep(va) \
24133 - pte_t *__ptep = lookup_address((unsigned long)(va)); \
24134 - BUG_ON(!__ptep || !pte_present(*__ptep)); \
24138 -#define arbitrary_virt_to_machine(va) \
24139 - (((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT) \
24140 - | ((unsigned long)(va) & (PAGE_SIZE - 1)))
24142 -#ifdef CONFIG_HIGHPTE
24143 -#include <asm/io.h>
24144 -struct page *kmap_atomic_to_page(void *);
24145 -#define ptep_to_machine(ptep) \
24147 - pte_t *__ptep = (ptep); \
24148 - page_to_phys(kmap_atomic_to_page(__ptep)) \
24149 - | ((unsigned long)__ptep & (PAGE_SIZE - 1)); \
24152 -#define ptep_to_machine(ptep) virt_to_machine(ptep)
24155 #endif /* !__ASSEMBLY__ */
24158 + * kern_addr_valid() is (1) for FLATMEM and (0) for
24159 + * SPARSEMEM and DISCONTIGMEM
24161 #ifdef CONFIG_FLATMEM
24162 #define kern_addr_valid(addr) (1)
24163 -#endif /* CONFIG_FLATMEM */
24165 -int direct_remap_pfn_range(struct vm_area_struct *vma,
24166 - unsigned long address,
24167 - unsigned long mfn,
24168 - unsigned long size,
24171 -int direct_kernel_remap_pfn_range(unsigned long address,
24172 - unsigned long mfn,
24173 - unsigned long size,
24176 -int create_lookup_pte_addr(struct mm_struct *mm,
24177 - unsigned long address,
24179 -int touch_pte_range(struct mm_struct *mm,
24180 - unsigned long address,
24181 - unsigned long size);
24183 -int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
24184 - unsigned long addr, unsigned long end, pgprot_t newprot,
24185 - int dirty_accountable);
24187 -#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
24188 - xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
24190 +#define kern_addr_valid(kaddr) (0)
24193 #define io_remap_pfn_range(vma,from,pfn,size,prot) \
24194 direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO)
24196 -#include <asm-generic/pgtable.h>
24198 #endif /* _I386_PGTABLE_H */
24199 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/pgtable_64.h 2009-02-16 16:18:36.000000000 +0100
24200 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/pgtable_64.h 2009-03-16 16:33:40.000000000 +0100
24201 @@ -13,49 +13,26 @@
24202 #include <linux/threads.h>
24203 #include <linux/sched.h>
24204 #include <asm/pda.h>
24206 -#include <asm/hypervisor.h>
24209 extern pud_t level3_user_pgt[512];
24211 extern void xen_init_pt(void);
24213 -extern pte_t *lookup_address(unsigned long address);
24215 -#define virt_to_ptep(va) \
24217 - pte_t *__ptep = lookup_address((unsigned long)(va)); \
24218 - BUG_ON(!__ptep || !pte_present(*__ptep)); \
24222 -#define arbitrary_virt_to_machine(va) \
24223 - (((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT) \
24224 - | ((unsigned long)(va) & (PAGE_SIZE - 1)))
24226 -#define ptep_to_machine(ptep) virt_to_machine(ptep)
24229 extern pud_t level3_kernel_pgt[512];
24230 extern pud_t level3_ident_pgt[512];
24231 extern pmd_t level2_kernel_pgt[512];
24232 extern pgd_t init_level4_pgt[];
24233 -extern unsigned long __supported_pte_mask;
24235 #define swapper_pg_dir init_level4_pgt
24237 extern void paging_init(void);
24238 -extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
24241 - * ZERO_PAGE is a global shared page that is always zero: used
24242 - * for zero-mapped memory areas etc..
24244 -extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
24245 -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
24247 #endif /* !__ASSEMBLY__ */
24249 +#define SHARED_KERNEL_PMD 1
24252 * PGDIR_SHIFT determines what a top-level page table entry can map
24254 @@ -98,31 +75,63 @@ extern unsigned long empty_zero_page[PAG
24255 #define pgd_none(x) (!__pgd_val(x))
24256 #define pud_none(x) (!__pud_val(x))
24258 -static inline void set_pte(pte_t *dst, pte_t val)
24261 +#define __xen_pte_clear(ptep) xen_set_pte(ptep, __pte(0))
24263 +static inline void xen_set_pte(pte_t *ptep, pte_t pte)
24268 +static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
24271 + xen_set_pte(ptep, pte);
24274 -#define set_pmd(pmdptr, pmdval) xen_l2_entry_update(pmdptr, (pmdval))
24275 -#define set_pud(pudptr, pudval) xen_l3_entry_update(pudptr, (pudval))
24276 -#define set_pgd(pgdptr, pgdval) xen_l4_entry_update(pgdptr, (pgdval))
24278 +static inline pte_t xen_ptep_get_and_clear(pte_t *xp, pte_t ret)
24280 + return __pte_ma(xchg(&xp->pte, 0));
24283 +#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte)
24286 -static inline void pud_clear (pud_t * pud)
24287 +static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd)
24289 - set_pud(pud, __pud(0));
24290 + xen_l2_entry_update(pmdp, pmd);
24293 +static inline void xen_pmd_clear(pmd_t *pmd)
24295 + xen_set_pmd(pmd, xen_make_pmd(0));
24298 +static inline void xen_set_pud(pud_t *pudp, pud_t pud)
24300 + xen_l3_entry_update(pudp, pud);
24303 +static inline void xen_pud_clear(pud_t *pud)
24305 + xen_set_pud(pud, xen_make_pud(0));
24308 #define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
24310 -static inline void pgd_clear (pgd_t * pgd)
24311 +static inline void xen_set_pgd(pgd_t *pgdp, pgd_t pgd)
24313 - set_pgd(pgd, __pgd(0));
24314 - set_pgd(__user_pgd(pgd), __pgd(0));
24315 + xen_l4_entry_update(pgdp, pgd);
24318 -#define pte_same(a, b) ((a).pte == (b).pte)
24319 +static inline void xen_pgd_clear(pgd_t * pgd)
24321 + xen_set_pgd(pgd, xen_make_pgd(0));
24322 + xen_set_pgd(__user_pgd(pgd), xen_make_pgd(0));
24325 -#define pte_pgprot(a) (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
24326 +#define pte_same(a, b) ((a).pte == (b).pte)
24328 #endif /* !__ASSEMBLY__ */
24330 @@ -133,8 +142,6 @@ static inline void pgd_clear (pgd_t * pg
24331 #define PGDIR_SIZE (_AC(1,UL) << PGDIR_SHIFT)
24332 #define PGDIR_MASK (~(PGDIR_SIZE-1))
24334 -#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
24335 -#define FIRST_USER_ADDRESS 0
24337 #define MAXMEM _AC(0x3fffffffffff, UL)
24338 #define VMALLOC_START _AC(0xffffc20000000000, UL)
24339 @@ -144,105 +151,6 @@ static inline void pgd_clear (pgd_t * pg
24340 #define MODULES_END _AC(0xfffffffffff00000, UL)
24341 #define MODULES_LEN (MODULES_END - MODULES_VADDR)
24343 -#define _PAGE_BIT_PRESENT 0
24344 -#define _PAGE_BIT_RW 1
24345 -#define _PAGE_BIT_USER 2
24346 -#define _PAGE_BIT_PWT 3
24347 -#define _PAGE_BIT_PCD 4
24348 -#define _PAGE_BIT_ACCESSED 5
24349 -#define _PAGE_BIT_DIRTY 6
24350 -#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
24351 -#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
24352 -#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
24354 -#define _PAGE_PRESENT 0x001
24355 -#define _PAGE_RW 0x002
24356 -#define _PAGE_USER 0x004
24357 -#define _PAGE_PWT 0x008
24358 -#define _PAGE_PCD 0x010
24359 -#define _PAGE_ACCESSED 0x020
24360 -#define _PAGE_DIRTY 0x040
24361 -#define _PAGE_PSE 0x080 /* 2MB page */
24362 -#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */
24363 -#define _PAGE_GLOBAL 0x100 /* Global TLB entry */
24365 -#define _PAGE_PROTNONE 0x080 /* If not present */
24366 -#define _PAGE_NX (_AC(1,UL)<<_PAGE_BIT_NX)
24368 -/* Mapped page is I/O or foreign and has no associated page struct. */
24369 -#define _PAGE_IO 0x200
24371 -#ifndef __ASSEMBLY__
24372 -#if CONFIG_XEN_COMPAT <= 0x030002
24373 -extern unsigned int __kernel_page_user;
24375 -#define __kernel_page_user 0
24379 -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
24380 -#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
24382 -#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
24384 -#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
24385 -#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
24386 -#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
24387 -#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
24388 -#define PAGE_COPY PAGE_COPY_NOEXEC
24389 -#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
24390 -#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
24391 -#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
24392 -#define __PAGE_KERNEL \
24393 - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
24394 -#define __PAGE_KERNEL_EXEC \
24395 - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
24396 -#define __PAGE_KERNEL_NOCACHE \
24397 - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
24398 -#define __PAGE_KERNEL_RO \
24399 - (_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
24400 -#define __PAGE_KERNEL_VSYSCALL \
24401 - (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
24402 -#define __PAGE_KERNEL_VSYSCALL_NOCACHE \
24403 - (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_PCD)
24404 -#define __PAGE_KERNEL_LARGE \
24405 - (__PAGE_KERNEL | _PAGE_PSE)
24406 -#define __PAGE_KERNEL_LARGE_EXEC \
24407 - (__PAGE_KERNEL_EXEC | _PAGE_PSE)
24410 - * We don't support GLOBAL page in xenolinux64
24412 -#define MAKE_GLOBAL(x) __pgprot((x))
24414 -#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
24415 -#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
24416 -#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
24417 -#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
24418 -#define PAGE_KERNEL_VSYSCALL32 __pgprot(__PAGE_KERNEL_VSYSCALL)
24419 -#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
24420 -#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
24421 -#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
24424 -#define __P000 PAGE_NONE
24425 -#define __P001 PAGE_READONLY
24426 -#define __P010 PAGE_COPY
24427 -#define __P011 PAGE_COPY
24428 -#define __P100 PAGE_READONLY_EXEC
24429 -#define __P101 PAGE_READONLY_EXEC
24430 -#define __P110 PAGE_COPY_EXEC
24431 -#define __P111 PAGE_COPY_EXEC
24433 -#define __S000 PAGE_NONE
24434 -#define __S001 PAGE_READONLY
24435 -#define __S010 PAGE_SHARED
24436 -#define __S011 PAGE_SHARED
24437 -#define __S100 PAGE_READONLY_EXEC
24438 -#define __S101 PAGE_READONLY_EXEC
24439 -#define __S110 PAGE_SHARED_EXEC
24440 -#define __S111 PAGE_SHARED_EXEC
24442 #ifndef __ASSEMBLY__
24444 static inline unsigned long pgd_bad(pgd_t pgd)
24445 @@ -260,119 +168,26 @@ static inline unsigned long pmd_bad(pmd_
24446 return __pmd_val(pmd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
24449 -#define set_pte_at(_mm,addr,ptep,pteval) do { \
24450 - if (((_mm) != current->mm && (_mm) != &init_mm) || \
24451 - HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \
24452 - set_pte((ptep), (pteval)); \
24455 #define pte_none(x) (!(x).pte)
24456 #define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE))
24457 -#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
24459 -#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
24460 +#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) /* FIXME: is this right? */
24462 #define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
24463 #define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \
24464 __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
24465 -#define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? end_pfn : \
24466 +#define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? max_mapnr : \
24467 (_pte).pte & _PAGE_PRESENT ? \
24468 mfn_to_local_pfn(__pte_mfn(_pte)) : \
24471 #define pte_page(x) pfn_to_page(pte_pfn(x))
24473 -static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
24475 - unsigned long pte = page_nr << PAGE_SHIFT;
24476 - pte |= pgprot_val(pgprot);
24477 - pte &= __supported_pte_mask;
24478 - return __pte(pte);
24481 -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
24483 - pte_t pte = *ptep;
24484 - if (!pte_none(pte)) {
24485 - if ((mm != &init_mm) ||
24486 - HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
24487 - pte = __pte_ma(xchg(&ptep->pte, 0));
24492 -static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
24495 - pte_t pte = *ptep;
24496 - if (PagePinned(virt_to_page(mm->pgd)))
24497 - xen_l1_entry_update(ptep, __pte(0));
24499 - *ptep = __pte(0);
24502 - return ptep_get_and_clear(mm, addr, ptep);
24505 -#define ptep_clear_flush(vma, addr, ptep) \
24507 - pte_t *__ptep = (ptep); \
24508 - pte_t __res = *__ptep; \
24509 - if (!pte_none(__res) && \
24510 - ((vma)->vm_mm != current->mm || \
24511 - HYPERVISOR_update_va_mapping(addr, __pte(0), \
24512 - (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
24513 - UVMF_INVLPG|UVMF_MULTI))) { \
24514 - __ptep->pte = 0; \
24515 - flush_tlb_page(vma, addr); \
24521 - * The following only work if pte_present() is true.
24522 - * Undefined behaviour if not..
24524 -#define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT)
24525 -static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; }
24526 -static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; }
24527 -static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; }
24528 -static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; }
24529 -static inline int pte_huge(pte_t pte) { return __pte_val(pte) & _PAGE_PSE; }
24531 -static inline pte_t pte_mkclean(pte_t pte) { __pte_val(pte) &= ~_PAGE_DIRTY; return pte; }
24532 -static inline pte_t pte_mkold(pte_t pte) { __pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
24533 -static inline pte_t pte_wrprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_RW; return pte; }
24534 -static inline pte_t pte_mkexec(pte_t pte) { __pte_val(pte) &= ~_PAGE_NX; return pte; }
24535 -static inline pte_t pte_mkdirty(pte_t pte) { __pte_val(pte) |= _PAGE_DIRTY; return pte; }
24536 -static inline pte_t pte_mkyoung(pte_t pte) { __pte_val(pte) |= _PAGE_ACCESSED; return pte; }
24537 -static inline pte_t pte_mkwrite(pte_t pte) { __pte_val(pte) |= _PAGE_RW; return pte; }
24538 -static inline pte_t pte_mkhuge(pte_t pte) { __pte_val(pte) |= _PAGE_PSE; return pte; }
24539 -static inline pte_t pte_clrhuge(pte_t pte) { __pte_val(pte) &= ~_PAGE_PSE; return pte; }
24541 -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
24543 - if (!pte_young(*ptep))
24545 - return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte);
24548 -static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
24550 - pte_t pte = *ptep;
24551 - if (pte_write(pte))
24552 - set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
24556 * Macro to mark a page protection value as "uncacheable".
24558 #define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT))
24560 -static inline int pmd_large(pmd_t pte) {
24561 - return (__pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE;
24566 * Conversion functions: convert a page and protection to a page entry,
24567 @@ -388,6 +203,7 @@ static inline int pmd_large(pmd_t pte) {
24568 #define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
24569 #define pgd_offset_k(address) (init_level4_pgt + pgd_index(address))
24570 #define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT)
24571 +static inline int pgd_large(pgd_t pgd) { return 0; }
24572 #define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
24574 /* PUD - Level3 access */
24575 @@ -398,6 +214,12 @@ static inline int pmd_large(pmd_t pte) {
24576 #define pud_offset(pgd, address) ((pud_t *) pgd_page_vaddr(*(pgd)) + pud_index(address))
24577 #define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT)
24579 +static inline int pud_large(pud_t pte)
24581 + return (__pud_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) ==
24582 + (_PAGE_PSE|_PAGE_PRESENT);
24585 /* PMD - Level 2 access */
24586 #define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
24587 #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
24588 @@ -413,36 +235,18 @@ static inline int pmd_large(pmd_t pte) {
24590 #define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
24592 -#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
24593 #define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
24594 #define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
24596 #define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
24597 -#define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE })
24598 +#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | _PAGE_FILE })
24599 #define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
24601 /* PTE - Level 1 access. */
24603 /* page, protection -> pte */
24604 #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
24605 -#define mk_pte_huge(entry) (__pte_val(entry) |= _PAGE_PRESENT | _PAGE_PSE)
24607 -/* Change flags of a PTE */
24608 -static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
24611 - * Since this might change the present bit (which controls whether
24612 - * a pte_t object has undergone p2m translation), we must use
24613 - * pte_val() on the input pte and __pte() for the return value.
24615 - unsigned long pteval = pte_val(pte);
24617 - pteval &= _PAGE_CHG_MASK;
24618 - pteval |= pgprot_val(newprot);
24619 - pteval &= __supported_pte_mask;
24620 - return __pte(pteval);
24623 #define pte_index(address) \
24624 (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
24625 #define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
24626 @@ -456,101 +260,21 @@ static inline pte_t pte_modify(pte_t pte
24628 #define update_mmu_cache(vma,address,pte) do { } while (0)
24631 - * Rules for using ptep_establish: the pte MUST be a user pte, and
24632 - * must be a present->present transition.
24634 -#define __HAVE_ARCH_PTEP_ESTABLISH
24635 -#define ptep_establish(vma, address, ptep, pteval) \
24637 - if ( likely((vma)->vm_mm == current->mm) ) { \
24638 - BUG_ON(HYPERVISOR_update_va_mapping(address, \
24640 - (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
24641 - UVMF_INVLPG|UVMF_MULTI)); \
24643 - xen_l1_entry_update(ptep, pteval); \
24644 - flush_tlb_page(vma, address); \
24648 -/* We only update the dirty/accessed state if we set
24649 - * the dirty bit by hand in the kernel, since the hardware
24650 - * will do the accessed bit for us, and we don't want to
24651 - * race with other CPU's that might be updating the dirty
24652 - * bit at the same time. */
24653 -#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
24654 -#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
24656 - int __changed = !pte_same(*(ptep), entry); \
24657 - if (__changed && (dirty)) \
24658 - ptep_establish(vma, address, ptep, entry); \
24662 -#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
24663 -#define ptep_clear_flush_young(vma, address, ptep) \
24665 - pte_t __pte = *(ptep); \
24666 - int __young = pte_young(__pte); \
24667 - __pte = pte_mkold(__pte); \
24668 - if (PagePinned(virt_to_page((vma)->vm_mm->pgd))) \
24669 - (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
24670 - else if (__young) \
24671 - set_pte(ptep, __pte); \
24675 /* Encode and de-code a swap entry */
24676 #define __swp_type(x) (((x).val >> 1) & 0x3f)
24677 #define __swp_offset(x) ((x).val >> 8)
24678 #define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
24679 #define __pte_to_swp_entry(pte) ((swp_entry_t) { __pte_val(pte) })
24680 -#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
24682 -extern spinlock_t pgd_lock;
24683 -extern struct list_head pgd_list;
24684 +#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
24686 extern int kern_addr_valid(unsigned long addr);
24688 -#define DOMID_LOCAL (0xFFFFU)
24690 -struct vm_area_struct;
24692 -int direct_remap_pfn_range(struct vm_area_struct *vma,
24693 - unsigned long address,
24694 - unsigned long mfn,
24695 - unsigned long size,
24699 -int direct_kernel_remap_pfn_range(unsigned long address,
24700 - unsigned long mfn,
24701 - unsigned long size,
24705 -int create_lookup_pte_addr(struct mm_struct *mm,
24706 - unsigned long address,
24709 -int touch_pte_range(struct mm_struct *mm,
24710 - unsigned long address,
24711 - unsigned long size);
24713 -int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
24714 - unsigned long addr, unsigned long end, pgprot_t newprot,
24715 - int dirty_accountable);
24717 -#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
24718 - xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
24720 -pte_t *lookup_address(unsigned long addr);
24721 +extern void cleanup_highmap(void);
24723 #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
24724 direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
24726 #define HAVE_ARCH_UNMAPPED_AREA
24727 +#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
24729 #define pgtable_cache_init() do { } while (0)
24730 #define check_pgt_cache() do { } while (0)
24731 @@ -563,13 +287,7 @@ pte_t *lookup_address(unsigned long addr
24732 #define kc_offset_to_vaddr(o) \
24733 (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
24735 -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
24736 -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
24737 -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
24738 -#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
24739 -#define __HAVE_ARCH_PTEP_SET_WRPROTECT
24740 #define __HAVE_ARCH_PTE_SAME
24741 -#include <asm-generic/pgtable.h>
24742 #endif /* !__ASSEMBLY__ */
24744 #endif /* _X86_64_PGTABLE_H */
24745 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/processor.h 2009-02-16 16:18:36.000000000 +0100
24746 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/processor.h 2009-03-16 16:33:40.000000000 +0100
24748 +#ifndef __ASM_X86_PROCESSOR_H
24749 +#define __ASM_X86_PROCESSOR_H
24751 +#include <asm/processor-flags.h>
24753 +/* migration helpers, for KVM - will be removed in 2.6.25: */
24754 +#include <asm/vm86.h>
24755 +#define Xgt_desc_struct desc_ptr
24757 +/* Forward declaration, a strange C thing */
24758 +struct task_struct;
24761 +#include <asm/vm86.h>
24762 +#include <asm/math_emu.h>
24763 +#include <asm/segment.h>
24764 +#include <asm/types.h>
24765 +#include <asm/sigcontext.h>
24766 +#include <asm/current.h>
24767 +#include <asm/cpufeature.h>
24768 +#include <asm/system.h>
24769 +#include <asm/page.h>
24770 +#include <asm/percpu.h>
24771 +#include <asm/msr.h>
24772 +#include <asm/desc_defs.h>
24773 +#include <asm/nops.h>
24774 +#include <linux/personality.h>
24775 +#include <linux/cpumask.h>
24776 +#include <linux/cache.h>
24777 +#include <linux/threads.h>
24778 +#include <linux/init.h>
24779 +#include <xen/interface/physdev.h>
24782 + * Default implementation of macro that returns current
24783 + * instruction pointer ("program counter").
24785 +static inline void *current_text_addr(void)
24788 + asm volatile("mov $1f,%0\n1:":"=r" (pc));
24792 +#ifdef CONFIG_X86_VSMP
24793 +#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
24794 +#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
24796 +#define ARCH_MIN_TASKALIGN 16
24797 +#define ARCH_MIN_MMSTRUCT_ALIGN 0
24801 + * CPU type and hardware bug flags. Kept separately for each CPU.
24802 + * Members of this structure are referenced in head.S, so think twice
24803 + * before touching them. [mj]
24806 +struct cpuinfo_x86 {
24807 + __u8 x86; /* CPU family */
24808 + __u8 x86_vendor; /* CPU vendor */
24811 +#ifdef CONFIG_X86_32
24812 + char wp_works_ok; /* It doesn't on 386's */
24813 + char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */
24821 + /* number of 4K pages in DTLB/ITLB combined (in pages) */
24823 + __u8 x86_virt_bits, x86_phys_bits;
24824 + /* cpuid returned core id bits */
24825 + __u8 x86_coreid_bits;
24826 + /* Max extended CPUID function supported */
24827 + __u32 extended_cpuid_level;
24829 + int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
24830 + __u32 x86_capability[NCAPINTS];
24831 + char x86_vendor_id[16];
24832 + char x86_model_id[64];
24833 + int x86_cache_size; /* in KB - valid for CPUS which support this
24835 + int x86_cache_alignment; /* In bytes */
24837 + unsigned long loops_per_jiffy;
24839 + cpumask_t llc_shared_map; /* cpus sharing the last level cache */
24841 + u16 x86_max_cores; /* cpuid returned max cores value */
24843 + u16 x86_clflush_size;
24845 + u16 booted_cores; /* number of cores as seen by OS */
24846 + u16 phys_proc_id; /* Physical processor id. */
24847 + u16 cpu_core_id; /* Core id */
24848 + u16 cpu_index; /* index into per_cpu list */
24850 +} __attribute__((__aligned__(SMP_CACHE_BYTES)));
24852 +#define X86_VENDOR_INTEL 0
24853 +#define X86_VENDOR_CYRIX 1
24854 +#define X86_VENDOR_AMD 2
24855 +#define X86_VENDOR_UMC 3
24856 +#define X86_VENDOR_NEXGEN 4
24857 +#define X86_VENDOR_CENTAUR 5
24858 +#define X86_VENDOR_TRANSMETA 7
24859 +#define X86_VENDOR_NSC 8
24860 +#define X86_VENDOR_NUM 9
24861 +#define X86_VENDOR_UNKNOWN 0xff
24864 + * capabilities of CPUs
24866 +extern struct cpuinfo_x86 boot_cpu_data;
24867 +extern struct cpuinfo_x86 new_cpu_data;
24868 +extern __u32 cleared_cpu_caps[NCAPINTS];
24871 +DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
24872 +#define cpu_data(cpu) per_cpu(cpu_info, cpu)
24873 +#define current_cpu_data cpu_data(smp_processor_id())
24875 +#define cpu_data(cpu) boot_cpu_data
24876 +#define current_cpu_data boot_cpu_data
24879 +void cpu_detect(struct cpuinfo_x86 *c);
24881 +extern void identify_cpu(struct cpuinfo_x86 *);
24882 +extern void identify_boot_cpu(void);
24883 +extern void identify_secondary_cpu(struct cpuinfo_x86 *);
24884 +extern void print_cpu_info(struct cpuinfo_x86 *);
24885 +extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
24886 +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
24887 +extern unsigned short num_cache_leaves;
24889 +#if defined(CONFIG_X86_HT) || defined(CONFIG_X86_64)
24890 +extern void detect_ht(struct cpuinfo_x86 *c);
24892 +static inline void detect_ht(struct cpuinfo_x86 *c) {}
24895 +static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx,
24896 + unsigned int *ecx, unsigned int *edx)
24898 + /* ecx is often an input as well as an output. */
24899 + __asm__(XEN_CPUID
24904 + : "0" (*eax), "2" (*ecx));
24907 +static inline void load_cr3(pgd_t *pgdir)
24909 + write_cr3(__pa(pgdir));
24912 +#ifndef CONFIG_X86_NO_TSS
24913 +#ifdef CONFIG_X86_32
24914 +/* This is the TSS defined by the hardware. */
24915 +struct x86_hw_tss {
24916 + unsigned short back_link, __blh;
24917 + unsigned long sp0;
24918 + unsigned short ss0, __ss0h;
24919 + unsigned long sp1;
24920 + unsigned short ss1, __ss1h; /* ss1 caches MSR_IA32_SYSENTER_CS */
24921 + unsigned long sp2;
24922 + unsigned short ss2, __ss2h;
24923 + unsigned long __cr3;
24924 + unsigned long ip;
24925 + unsigned long flags;
24926 + unsigned long ax, cx, dx, bx;
24927 + unsigned long sp, bp, si, di;
24928 + unsigned short es, __esh;
24929 + unsigned short cs, __csh;
24930 + unsigned short ss, __ssh;
24931 + unsigned short ds, __dsh;
24932 + unsigned short fs, __fsh;
24933 + unsigned short gs, __gsh;
24934 + unsigned short ldt, __ldth;
24935 + unsigned short trace, io_bitmap_base;
24936 +} __attribute__((packed));
24937 +extern struct tss_struct doublefault_tss;
24939 +struct x86_hw_tss {
24949 + u16 io_bitmap_base;
24950 +} __attribute__((packed)) ____cacheline_aligned;
24952 +#endif /* CONFIG_X86_NO_TSS */
24955 + * Size of io_bitmap.
24957 +#define IO_BITMAP_BITS 65536
24958 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
24959 +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
24960 +#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
24961 +#define INVALID_IO_BITMAP_OFFSET 0x8000
24962 +#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
24964 +#ifndef CONFIG_X86_NO_TSS
24965 +struct tss_struct {
24966 + struct x86_hw_tss x86_tss;
24969 + * The extra 1 is there because the CPU will access an
24970 + * additional byte beyond the end of the IO permission
24971 + * bitmap. The extra byte must be all 1 bits, and must
24972 + * be within the limit.
24974 + unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
24976 + * Cache the current maximum and the last task that used the bitmap:
24978 + unsigned long io_bitmap_max;
24979 + struct thread_struct *io_bitmap_owner;
24981 + * pads the TSS to be cacheline-aligned (size is 0x100)
24983 + unsigned long __cacheline_filler[35];
24985 + * .. and then another 0x100 bytes for emergency kernel stack
24987 + unsigned long stack[64];
24988 +} __attribute__((packed));
24990 +DECLARE_PER_CPU(struct tss_struct, init_tss);
24992 +/* Save the original ist values for checking stack pointers during debugging */
24994 + unsigned long ist[7];
24996 +#endif /* CONFIG_X86_NO_TSS */
24998 +#define MXCSR_DEFAULT 0x1f80
25000 +struct i387_fsave_struct {
25008 + u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
25009 + u32 status; /* software status information */
25012 +struct i387_fxsave_struct {
25031 + u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
25032 + u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
25034 +} __attribute__((aligned(16)));
25036 +struct i387_soft_struct {
25044 + u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
25045 + u8 ftop, changed, lookahead, no_update, rm, alimit;
25046 + struct info *info;
25050 +union i387_union {
25051 + struct i387_fsave_struct fsave;
25052 + struct i387_fxsave_struct fxsave;
25053 + struct i387_soft_struct soft;
25056 +#ifdef CONFIG_X86_32
25057 +DECLARE_PER_CPU(u8, cpu_llc_id);
25058 +#elif !defined(CONFIG_X86_NO_TSS)
25059 +DECLARE_PER_CPU(struct orig_ist, orig_ist);
25062 +extern void print_cpu_info(struct cpuinfo_x86 *);
25063 +extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
25064 +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
25065 +extern unsigned short num_cache_leaves;
25067 +struct thread_struct {
25068 +/* cached TLS descriptors. */
25069 + struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
25070 + unsigned long sp0;
25071 + unsigned long sp;
25072 +#ifdef CONFIG_X86_32
25073 + unsigned long sysenter_cs;
25075 + unsigned long usersp; /* Copy from PDA */
25076 + unsigned short es, ds, fsindex, gsindex;
25078 + unsigned long ip;
25079 + unsigned long fs;
25080 + unsigned long gs;
25081 +/* Hardware debugging registers */
25082 + unsigned long debugreg0;
25083 + unsigned long debugreg1;
25084 + unsigned long debugreg2;
25085 + unsigned long debugreg3;
25086 + unsigned long debugreg6;
25087 + unsigned long debugreg7;
25089 + unsigned long cr2, trap_no, error_code;
25090 +/* floating point info */
25091 + union i387_union i387 __attribute__((aligned(16)));;
25092 +#ifdef CONFIG_X86_32
25093 +/* virtual 86 mode info */
25094 + struct vm86_struct __user *vm86_info;
25095 + unsigned long screen_bitmap;
25096 + unsigned long v86flags, v86mask, saved_sp0;
25097 + unsigned int saved_fs, saved_gs;
25099 +/* IO permissions */
25100 + unsigned long *io_bitmap_ptr;
25101 + unsigned long iopl;
25102 +/* max allowed port in the bitmap, in bytes: */
25103 + unsigned io_bitmap_max;
25104 +/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */
25105 + unsigned long debugctlmsr;
25106 +/* Debug Store - if not 0 points to a DS Save Area configuration;
25107 + * goes into MSR_IA32_DS_AREA */
25108 + unsigned long ds_area_msr;
25111 +static inline unsigned long xen_get_debugreg(int regno)
25113 + return HYPERVISOR_get_debugreg(regno);
25116 +static inline void xen_set_debugreg(int regno, unsigned long value)
25118 + WARN_ON(HYPERVISOR_set_debugreg(regno, value));
25122 + * Set IOPL bits in EFLAGS from given mask
25124 +static inline void xen_set_iopl_mask(unsigned mask)
25126 + struct physdev_set_iopl set_iopl;
25128 + /* Force the change at ring 0. */
25129 + set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
25130 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
25133 +#ifndef CONFIG_X86_NO_TSS
25134 +static inline void native_load_sp0(struct tss_struct *tss,
25135 + struct thread_struct *thread)
25137 + tss->x86_tss.sp0 = thread->sp0;
25138 +#ifdef CONFIG_X86_32
25139 + /* Only happens when SEP is enabled, no need to test "SEP"arately */
25140 + if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
25141 + tss->x86_tss.ss1 = thread->sysenter_cs;
25142 + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
25147 +#define xen_load_sp0(tss, thread) do { \
25148 + if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->sp0)) \
25153 +#define __cpuid xen_cpuid
25154 +#define paravirt_enabled() 0
25157 + * These special macros can be used to get or set a debugging register
25159 +#define get_debugreg(var, register) \
25160 + (var) = xen_get_debugreg(register)
25161 +#define set_debugreg(value, register) \
25162 + xen_set_debugreg(register, value)
25164 +#define load_sp0 xen_load_sp0
25166 +#define set_iopl_mask xen_set_iopl_mask
25169 + * Save the cr4 feature set we're using (i.e.
25170 + * Pentium 4MB enable and PPro Global page
25171 + * enable), so that any CPUs that boot up
25172 + * after us can get the correct flags.
25174 +extern unsigned long mmu_cr4_features;
25176 +static inline void set_in_cr4(unsigned long mask)
25179 + mmu_cr4_features |= mask;
25180 + cr4 = read_cr4();
25185 +static inline void clear_in_cr4(unsigned long mask)
25188 + mmu_cr4_features &= ~mask;
25189 + cr4 = read_cr4();
25194 +struct microcode_header {
25195 + unsigned int hdrver;
25196 + unsigned int rev;
25197 + unsigned int date;
25198 + unsigned int sig;
25199 + unsigned int cksum;
25200 + unsigned int ldrver;
25202 + unsigned int datasize;
25203 + unsigned int totalsize;
25204 + unsigned int reserved[3];
25207 +struct microcode {
25208 + struct microcode_header hdr;
25209 + unsigned int bits[0];
25212 +typedef struct microcode microcode_t;
25213 +typedef struct microcode_header microcode_header_t;
25215 +/* microcode format is extended from prescott processors */
25216 +struct extended_signature {
25217 + unsigned int sig;
25219 + unsigned int cksum;
25222 +struct extended_sigtable {
25223 + unsigned int count;
25224 + unsigned int cksum;
25225 + unsigned int reserved[3];
25226 + struct extended_signature sigs[0];
25230 + unsigned long seg;
25235 + * create a kernel thread without removing it from tasklists
25237 +extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
25239 +/* Free all resources held by a thread. */
25240 +extern void release_thread(struct task_struct *);
25242 +/* Prepare to copy thread state - unlazy all lazy status */
25243 +extern void prepare_to_copy(struct task_struct *tsk);
25245 +unsigned long get_wchan(struct task_struct *p);
25248 + * Generic CPUID function
25249 + * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
25250 + * resulting in stale register contents being returned.
25252 +static inline void cpuid(unsigned int op,
25253 + unsigned int *eax, unsigned int *ebx,
25254 + unsigned int *ecx, unsigned int *edx)
25258 + __cpuid(eax, ebx, ecx, edx);
25261 +/* Some CPUID calls want 'count' to be placed in ecx */
25262 +static inline void cpuid_count(unsigned int op, int count,
25263 + unsigned int *eax, unsigned int *ebx,
25264 + unsigned int *ecx, unsigned int *edx)
25268 + __cpuid(eax, ebx, ecx, edx);
25272 + * CPUID functions returning a single datum
25274 +static inline unsigned int cpuid_eax(unsigned int op)
25276 + unsigned int eax, ebx, ecx, edx;
25278 + cpuid(op, &eax, &ebx, &ecx, &edx);
25281 +static inline unsigned int cpuid_ebx(unsigned int op)
25283 + unsigned int eax, ebx, ecx, edx;
25285 + cpuid(op, &eax, &ebx, &ecx, &edx);
25288 +static inline unsigned int cpuid_ecx(unsigned int op)
25290 + unsigned int eax, ebx, ecx, edx;
25292 + cpuid(op, &eax, &ebx, &ecx, &edx);
25295 +static inline unsigned int cpuid_edx(unsigned int op)
25297 + unsigned int eax, ebx, ecx, edx;
25299 + cpuid(op, &eax, &ebx, &ecx, &edx);
25303 +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
25304 +static inline void rep_nop(void)
25306 + __asm__ __volatile__("rep;nop": : :"memory");
25309 +/* Stop speculative execution */
25310 +static inline void sync_core(void)
25313 + asm volatile("cpuid" : "=a" (tmp) : "0" (1)
25314 + : "ebx", "ecx", "edx", "memory");
25317 +#define cpu_relax() rep_nop()
25319 +static inline void __monitor(const void *eax, unsigned long ecx,
25320 + unsigned long edx)
25322 + /* "monitor %eax,%ecx,%edx;" */
25324 + ".byte 0x0f,0x01,0xc8;"
25325 + : :"a" (eax), "c" (ecx), "d"(edx));
25328 +static inline void __mwait(unsigned long eax, unsigned long ecx)
25330 + /* "mwait %eax,%ecx;" */
25332 + ".byte 0x0f,0x01,0xc9;"
25333 + : :"a" (eax), "c" (ecx));
25336 +static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
25338 + /* "mwait %eax,%ecx;" */
25340 + "sti; .byte 0x0f,0x01,0xc9;"
25341 + : :"a" (eax), "c" (ecx));
25344 +extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
25346 +extern int force_mwait;
25348 +extern void select_idle_routine(const struct cpuinfo_x86 *c);
25350 +extern unsigned long boot_option_idle_override;
25352 +extern void enable_sep_cpu(void);
25353 +extern int sysenter_setup(void);
25355 +/* Defined in head.S */
25356 +extern struct desc_ptr early_gdt_descr;
25358 +extern void cpu_set_gdt(int);
25359 +extern void switch_to_new_gdt(void);
25360 +extern void cpu_init(void);
25361 +extern void init_gdt(int cpu);
25363 +/* from system description table in BIOS. Mostly for MCA use, but
25364 + * others may find it useful. */
25365 +extern unsigned int machine_id;
25366 +extern unsigned int machine_submodel_id;
25367 +extern unsigned int BIOS_revision;
25369 +/* Boot loader type from the setup header */
25370 +extern int bootloader_type;
25372 +extern char ignore_fpu_irq;
25373 +#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
25375 +#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
25376 +#define ARCH_HAS_PREFETCHW
25377 +#define ARCH_HAS_SPINLOCK_PREFETCH
25379 +#ifdef CONFIG_X86_32
25380 +#define BASE_PREFETCH ASM_NOP4
25381 +#define ARCH_HAS_PREFETCH
25383 +#define BASE_PREFETCH "prefetcht0 (%1)"
25386 +/* Prefetch instructions for Pentium III and AMD Athlon */
25387 +/* It's not worth caring about 3dnow! prefetches for the K6
25388 + because they are microcoded there and very slow.
25389 + However, we currently don't do prefetches for pre-XP Athlons.
25390 + That should be fixed. */
25391 +static inline void prefetch(const void *x)
25393 + alternative_input(BASE_PREFETCH,
25394 + "prefetchnta (%1)",
25399 +/* 3dnow! prefetch to get an exclusive cache line. Useful for
25400 + spinlocks to avoid one state transition in the cache coherency protocol. */
25401 +static inline void prefetchw(const void *x)
25403 + alternative_input(BASE_PREFETCH,
25404 + "prefetchw (%1)",
25405 + X86_FEATURE_3DNOW,
25409 +#define spin_lock_prefetch(x) prefetchw(x)
25410 #ifdef CONFIG_X86_32
25411 -# include "processor_32.h"
25413 + * User space process size: 3GB (default).
25415 +#define TASK_SIZE (PAGE_OFFSET)
25416 +#define STACK_TOP TASK_SIZE
25417 +#define STACK_TOP_MAX STACK_TOP
25419 +#define INIT_THREAD { \
25420 + .sp0 = sizeof(init_stack) + (long)&init_stack, \
25421 + .vm86_info = NULL, \
25422 + .sysenter_cs = __KERNEL_CS, \
25423 + .io_bitmap_ptr = NULL, \
25424 + .fs = __KERNEL_PERCPU, \
25428 + * Note that the .io_bitmap member must be extra-big. This is because
25429 + * the CPU will access an additional byte beyond the end of the IO
25430 + * permission bitmap. The extra byte must be all 1 bits, and must
25431 + * be within the limit.
25433 +#define INIT_TSS { \
25435 + .sp0 = sizeof(init_stack) + (long)&init_stack, \
25436 + .ss0 = __KERNEL_DS, \
25437 + .ss1 = __KERNEL_CS, \
25438 + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
25440 + .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
25443 +#define start_thread(regs, new_eip, new_esp) do { \
25444 + __asm__("movl %0,%%gs": :"r" (0)); \
25446 + set_fs(USER_DS); \
25447 + regs->ds = __USER_DS; \
25448 + regs->es = __USER_DS; \
25449 + regs->ss = __USER_DS; \
25450 + regs->cs = __USER_CS; \
25451 + regs->ip = new_eip; \
25452 + regs->sp = new_esp; \
25456 +extern unsigned long thread_saved_pc(struct task_struct *tsk);
25458 +#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
25459 +#define KSTK_TOP(info) \
25461 + unsigned long *__ptr = (unsigned long *)(info); \
25462 + (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
25466 + * The below -8 is to reserve 8 bytes on top of the ring0 stack.
25467 + * This is necessary to guarantee that the entire "struct pt_regs"
25468 + * is accessible even if the CPU hasn't stored the SS/ESP registers
25469 + * on the stack (interrupt gate does not save these registers
25470 + * when switching to the same priv ring).
25471 + * Therefore beware: accessing the ss/esp fields of the
25472 + * "struct pt_regs" is possible, but they may contain
25473 + * completely wrong values.
25475 +#define task_pt_regs(task) \
25477 + struct pt_regs *__regs__; \
25478 + __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
25482 +#define KSTK_ESP(task) (task_pt_regs(task)->sp)
25485 -# include "processor_64.h"
25487 + * User space process size. 47bits minus one guard page.
25489 +#define TASK_SIZE64 (0x800000000000UL - 4096)
25491 +/* This decides where the kernel will search for a free chunk of vm
25492 + * space during mmap's.
25494 +#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
25495 + 0xc0000000 : 0xFFFFe000)
25497 +#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \
25498 + IA32_PAGE_OFFSET : TASK_SIZE64)
25499 +#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \
25500 + IA32_PAGE_OFFSET : TASK_SIZE64)
25502 +#define STACK_TOP TASK_SIZE
25503 +#define STACK_TOP_MAX TASK_SIZE64
25505 +#define INIT_THREAD { \
25506 + .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
25509 +#define INIT_TSS { \
25510 + .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
25513 +#define start_thread(regs, new_rip, new_rsp) do { \
25514 + asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \
25515 + load_gs_index(0); \
25516 + (regs)->ip = (new_rip); \
25517 + (regs)->sp = (new_rsp); \
25518 + write_pda(oldrsp, (new_rsp)); \
25519 + (regs)->cs = __USER_CS; \
25520 + (regs)->ss = __USER_DS; \
25521 + (regs)->flags = 0x200; \
25522 + set_fs(USER_DS); \
25526 + * Return saved PC of a blocked thread.
25527 + * What is this good for? it will be always the scheduler or ret_from_fork.
25529 +#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
25531 +#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
25532 +#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
25533 +#endif /* CONFIG_X86_64 */
25535 +/* This decides where the kernel will search for a free chunk of vm
25536 + * space during mmap's.
25538 +#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
25540 +#define KSTK_EIP(task) (task_pt_regs(task)->ip)
25543 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/processor_32.h 2009-02-16 16:18:36.000000000 +0100
25544 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
25547 - * include/asm-i386/processor.h
25549 - * Copyright (C) 1994 Linus Torvalds
25552 -#ifndef __ASM_I386_PROCESSOR_H
25553 -#define __ASM_I386_PROCESSOR_H
25555 -#include <asm/vm86.h>
25556 -#include <asm/math_emu.h>
25557 -#include <asm/segment.h>
25558 -#include <asm/page.h>
25559 -#include <asm/types.h>
25560 -#include <asm/sigcontext.h>
25561 -#include <asm/cpufeature.h>
25562 -#include <asm/msr.h>
25563 -#include <asm/system.h>
25564 -#include <linux/cache.h>
25565 -#include <linux/threads.h>
25566 -#include <asm/percpu.h>
25567 -#include <linux/cpumask.h>
25568 -#include <linux/init.h>
25569 -#include <asm/processor-flags.h>
25570 -#include <xen/interface/physdev.h>
25572 -/* flag for disabling the tsc */
25573 -#define tsc_disable 0
25575 -struct desc_struct {
25576 - unsigned long a,b;
25579 -#define desc_empty(desc) \
25580 - (!((desc)->a | (desc)->b))
25582 -#define desc_equal(desc1, desc2) \
25583 - (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
25585 - * Default implementation of macro that returns current
25586 - * instruction pointer ("program counter").
25588 -#define current_text_addr() ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; })
25591 - * CPU type and hardware bug flags. Kept separately for each CPU.
25592 - * Members of this structure are referenced in head.S, so think twice
25593 - * before touching them. [mj]
25596 -struct cpuinfo_x86 {
25597 - __u8 x86; /* CPU family */
25598 - __u8 x86_vendor; /* CPU vendor */
25601 - char wp_works_ok; /* It doesn't on 386's */
25602 - char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */
25605 - int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
25606 - unsigned long x86_capability[NCAPINTS];
25607 - char x86_vendor_id[16];
25608 - char x86_model_id[64];
25609 - int x86_cache_size; /* in KB - valid for CPUS which support this
25611 - int x86_cache_alignment; /* In bytes */
25617 - unsigned long loops_per_jiffy;
25619 - cpumask_t llc_shared_map; /* cpus sharing the last level cache */
25621 - unsigned char x86_max_cores; /* cpuid returned max cores value */
25622 - unsigned char apicid;
25623 - unsigned short x86_clflush_size;
25625 - unsigned char booted_cores; /* number of cores as seen by OS */
25626 - __u8 phys_proc_id; /* Physical processor id. */
25627 - __u8 cpu_core_id; /* Core id */
25628 - __u8 cpu_index; /* index into per_cpu list */
25630 -} __attribute__((__aligned__(SMP_CACHE_BYTES)));
25632 -#define X86_VENDOR_INTEL 0
25633 -#define X86_VENDOR_CYRIX 1
25634 -#define X86_VENDOR_AMD 2
25635 -#define X86_VENDOR_UMC 3
25636 -#define X86_VENDOR_NEXGEN 4
25637 -#define X86_VENDOR_CENTAUR 5
25638 -#define X86_VENDOR_TRANSMETA 7
25639 -#define X86_VENDOR_NSC 8
25640 -#define X86_VENDOR_NUM 9
25641 -#define X86_VENDOR_UNKNOWN 0xff
25644 - * capabilities of CPUs
25647 -extern struct cpuinfo_x86 boot_cpu_data;
25648 -extern struct cpuinfo_x86 new_cpu_data;
25649 -#ifndef CONFIG_X86_NO_TSS
25650 -extern struct tss_struct doublefault_tss;
25651 -DECLARE_PER_CPU(struct tss_struct, init_tss);
25655 -DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
25656 -#define cpu_data(cpu) per_cpu(cpu_info, cpu)
25657 -#define current_cpu_data cpu_data(smp_processor_id())
25659 -#define cpu_data(cpu) boot_cpu_data
25660 -#define current_cpu_data boot_cpu_data
25664 - * the following now lives in the per cpu area:
25665 - * extern int cpu_llc_id[NR_CPUS];
25667 -DECLARE_PER_CPU(u8, cpu_llc_id);
25668 -extern char ignore_fpu_irq;
25670 -void __init cpu_detect(struct cpuinfo_x86 *c);
25672 -extern void identify_boot_cpu(void);
25673 -extern void identify_secondary_cpu(struct cpuinfo_x86 *);
25674 -extern void print_cpu_info(struct cpuinfo_x86 *);
25675 -extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
25676 -extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
25677 -extern unsigned short num_cache_leaves;
25679 -#ifdef CONFIG_X86_HT
25680 -extern void detect_ht(struct cpuinfo_x86 *c);
25682 -static inline void detect_ht(struct cpuinfo_x86 *c) {}
25685 -static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx,
25686 - unsigned int *ecx, unsigned int *edx)
25688 - /* ecx is often an input as well as an output. */
25689 - __asm__(XEN_CPUID
25694 - : "0" (*eax), "2" (*ecx));
25697 -#define load_cr3(pgdir) write_cr3(__pa(pgdir))
25700 - * Save the cr4 feature set we're using (ie
25701 - * Pentium 4MB enable and PPro Global page
25702 - * enable), so that any CPU's that boot up
25703 - * after us can get the correct flags.
25705 -extern unsigned long mmu_cr4_features;
25707 -static inline void set_in_cr4 (unsigned long mask)
25710 - mmu_cr4_features |= mask;
25711 - cr4 = read_cr4();
25716 -static inline void clear_in_cr4 (unsigned long mask)
25719 - mmu_cr4_features &= ~mask;
25720 - cr4 = read_cr4();
25725 -/* Stop speculative execution */
25726 -static inline void sync_core(void)
25729 - asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
25732 -static inline void __monitor(const void *eax, unsigned long ecx,
25733 - unsigned long edx)
25735 - /* "monitor %eax,%ecx,%edx;" */
25737 - ".byte 0x0f,0x01,0xc8;"
25738 - : :"a" (eax), "c" (ecx), "d"(edx));
25741 -static inline void __mwait(unsigned long eax, unsigned long ecx)
25743 - /* "mwait %eax,%ecx;" */
25745 - ".byte 0x0f,0x01,0xc9;"
25746 - : :"a" (eax), "c" (ecx));
25749 -extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
25751 -/* from system description table in BIOS. Mostly for MCA use, but
25752 -others may find it useful. */
25753 -extern unsigned int machine_id;
25754 -extern unsigned int machine_submodel_id;
25755 -extern unsigned int BIOS_revision;
25756 -extern unsigned int mca_pentium_flag;
25758 -/* Boot loader type from the setup header */
25759 -extern int bootloader_type;
25762 - * User space process size: 3GB (default).
25764 -#define TASK_SIZE (PAGE_OFFSET)
25766 -/* This decides where the kernel will search for a free chunk of vm
25767 - * space during mmap's.
25769 -#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
25771 -#define HAVE_ARCH_PICK_MMAP_LAYOUT
25773 -extern void hard_disable_TSC(void);
25774 -extern void disable_TSC(void);
25775 -extern void hard_enable_TSC(void);
25778 - * Size of io_bitmap.
25780 -#define IO_BITMAP_BITS 65536
25781 -#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
25782 -#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
25783 -#ifndef CONFIG_X86_NO_TSS
25784 -#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
25786 -#define INVALID_IO_BITMAP_OFFSET 0x8000
25787 -#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
25789 -struct i387_fsave_struct {
25797 - long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
25798 - long status; /* software status information */
25801 -struct i387_fxsave_struct {
25802 - unsigned short cwd;
25803 - unsigned short swd;
25804 - unsigned short twd;
25805 - unsigned short fop;
25812 - long st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
25813 - long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
25814 - long padding[56];
25815 -} __attribute__ ((aligned (16)));
25817 -struct i387_soft_struct {
25825 - long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
25826 - unsigned char ftop, changed, lookahead, no_update, rm, alimit;
25827 - struct info *info;
25828 - unsigned long entry_eip;
25831 -union i387_union {
25832 - struct i387_fsave_struct fsave;
25833 - struct i387_fxsave_struct fxsave;
25834 - struct i387_soft_struct soft;
25838 - unsigned long seg;
25841 -struct thread_struct;
25843 -#ifndef CONFIG_X86_NO_TSS
25844 -/* This is the TSS defined by the hardware. */
25845 -struct i386_hw_tss {
25846 - unsigned short back_link,__blh;
25847 - unsigned long esp0;
25848 - unsigned short ss0,__ss0h;
25849 - unsigned long esp1;
25850 - unsigned short ss1,__ss1h; /* ss1 is used to cache MSR_IA32_SYSENTER_CS */
25851 - unsigned long esp2;
25852 - unsigned short ss2,__ss2h;
25853 - unsigned long __cr3;
25854 - unsigned long eip;
25855 - unsigned long eflags;
25856 - unsigned long eax,ecx,edx,ebx;
25857 - unsigned long esp;
25858 - unsigned long ebp;
25859 - unsigned long esi;
25860 - unsigned long edi;
25861 - unsigned short es, __esh;
25862 - unsigned short cs, __csh;
25863 - unsigned short ss, __ssh;
25864 - unsigned short ds, __dsh;
25865 - unsigned short fs, __fsh;
25866 - unsigned short gs, __gsh;
25867 - unsigned short ldt, __ldth;
25868 - unsigned short trace, io_bitmap_base;
25869 -} __attribute__((packed));
25871 -struct tss_struct {
25872 - struct i386_hw_tss x86_tss;
25875 - * The extra 1 is there because the CPU will access an
25876 - * additional byte beyond the end of the IO permission
25877 - * bitmap. The extra byte must be all 1 bits, and must
25878 - * be within the limit.
25880 - unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
25882 - * Cache the current maximum and the last task that used the bitmap:
25884 - unsigned long io_bitmap_max;
25885 - struct thread_struct *io_bitmap_owner;
25887 - * pads the TSS to be cacheline-aligned (size is 0x100)
25889 - unsigned long __cacheline_filler[35];
25891 - * .. and then another 0x100 bytes for emergency kernel stack
25893 - unsigned long stack[64];
25894 -} __attribute__((packed));
25897 -#define ARCH_MIN_TASKALIGN 16
25899 -struct thread_struct {
25900 -/* cached TLS descriptors. */
25901 - struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
25902 - unsigned long esp0;
25903 - unsigned long sysenter_cs;
25904 - unsigned long eip;
25905 - unsigned long esp;
25906 - unsigned long fs;
25907 - unsigned long gs;
25908 -/* Hardware debugging registers */
25909 - unsigned long debugreg[8]; /* %%db0-7 debug registers */
25911 - unsigned long cr2, trap_no, error_code;
25912 -/* floating point info */
25913 - union i387_union i387;
25914 -/* virtual 86 mode info */
25915 - struct vm86_struct __user * vm86_info;
25916 - unsigned long screen_bitmap;
25917 - unsigned long v86flags, v86mask, saved_esp0;
25918 - unsigned int saved_fs, saved_gs;
25919 -/* IO permissions */
25920 - unsigned long *io_bitmap_ptr;
25921 - unsigned long iopl;
25922 -/* max allowed port in the bitmap, in bytes: */
25923 - unsigned long io_bitmap_max;
25926 -#define INIT_THREAD { \
25927 - .esp0 = sizeof(init_stack) + (long)&init_stack, \
25928 - .vm86_info = NULL, \
25929 - .sysenter_cs = __KERNEL_CS, \
25930 - .io_bitmap_ptr = NULL, \
25931 - .fs = __KERNEL_PERCPU, \
25935 - * Note that the .io_bitmap member must be extra-big. This is because
25936 - * the CPU will access an additional byte beyond the end of the IO
25937 - * permission bitmap. The extra byte must be all 1 bits, and must
25938 - * be within the limit.
25940 -#define INIT_TSS { \
25942 - .esp0 = sizeof(init_stack) + (long)&init_stack, \
25943 - .ss0 = __KERNEL_DS, \
25944 - .ss1 = __KERNEL_CS, \
25945 - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
25947 - .io_bitmap = { [ 0 ... IO_BITMAP_LONGS] = ~0 }, \
25950 -#define start_thread(regs, new_eip, new_esp) do { \
25951 - __asm__("movl %0,%%gs": :"r" (0)); \
25953 - set_fs(USER_DS); \
25954 - regs->xds = __USER_DS; \
25955 - regs->xes = __USER_DS; \
25956 - regs->xss = __USER_DS; \
25957 - regs->xcs = __USER_CS; \
25958 - regs->eip = new_eip; \
25959 - regs->esp = new_esp; \
25962 -/* Forward declaration, a strange C thing */
25963 -struct task_struct;
25966 -/* Free all resources held by a thread. */
25967 -extern void release_thread(struct task_struct *);
25969 -/* Prepare to copy thread state - unlazy all lazy status */
25970 -extern void prepare_to_copy(struct task_struct *tsk);
25973 - * create a kernel thread without removing it from tasklists
25975 -extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
25977 -extern unsigned long thread_saved_pc(struct task_struct *tsk);
25978 -void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack);
25980 -unsigned long get_wchan(struct task_struct *p);
25982 -#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
25983 -#define KSTK_TOP(info) \
25985 - unsigned long *__ptr = (unsigned long *)(info); \
25986 - (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
25990 - * The below -8 is to reserve 8 bytes on top of the ring0 stack.
25991 - * This is necessary to guarantee that the entire "struct pt_regs"
25992 - * is accessable even if the CPU haven't stored the SS/ESP registers
25993 - * on the stack (interrupt gate does not save these registers
25994 - * when switching to the same priv ring).
25995 - * Therefore beware: accessing the xss/esp fields of the
25996 - * "struct pt_regs" is possible, but they may contain the
25997 - * completely wrong values.
25999 -#define task_pt_regs(task) \
26001 - struct pt_regs *__regs__; \
26002 - __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
26006 -#define KSTK_EIP(task) (task_pt_regs(task)->eip)
26007 -#define KSTK_ESP(task) (task_pt_regs(task)->esp)
26010 -struct microcode_header {
26011 - unsigned int hdrver;
26012 - unsigned int rev;
26013 - unsigned int date;
26014 - unsigned int sig;
26015 - unsigned int cksum;
26016 - unsigned int ldrver;
26018 - unsigned int datasize;
26019 - unsigned int totalsize;
26020 - unsigned int reserved[3];
26023 -struct microcode {
26024 - struct microcode_header hdr;
26025 - unsigned int bits[0];
26028 -typedef struct microcode microcode_t;
26029 -typedef struct microcode_header microcode_header_t;
26031 -/* microcode format is extended from prescott processors */
26032 -struct extended_signature {
26033 - unsigned int sig;
26035 - unsigned int cksum;
26038 -struct extended_sigtable {
26039 - unsigned int count;
26040 - unsigned int cksum;
26041 - unsigned int reserved[3];
26042 - struct extended_signature sigs[0];
26045 -/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
26046 -static inline void rep_nop(void)
26048 - __asm__ __volatile__("rep;nop": : :"memory");
26051 -#define cpu_relax() rep_nop()
26053 -#ifndef CONFIG_X86_NO_TSS
26054 -static inline void native_load_esp0(struct tss_struct *tss, struct thread_struct *thread)
26056 - tss->x86_tss.esp0 = thread->esp0;
26057 - /* This can only happen when SEP is enabled, no need to test "SEP"arately */
26058 - if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
26059 - tss->x86_tss.ss1 = thread->sysenter_cs;
26060 - wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
26064 -#define xen_load_esp0(tss, thread) do { \
26065 - if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)) \
26071 -static inline unsigned long xen_get_debugreg(int regno)
26073 - return HYPERVISOR_get_debugreg(regno);
26076 -static inline void xen_set_debugreg(int regno, unsigned long value)
26078 - WARN_ON(HYPERVISOR_set_debugreg(regno, value));
26082 - * Set IOPL bits in EFLAGS from given mask
26084 -static inline void xen_set_iopl_mask(unsigned mask)
26086 - struct physdev_set_iopl set_iopl;
26088 - /* Force the change at ring 0. */
26089 - set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
26090 - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
26094 -#define paravirt_enabled() 0
26095 -#define __cpuid xen_cpuid
26097 -#define load_esp0 xen_load_esp0
26100 - * These special macros can be used to get or set a debugging register
26102 -#define get_debugreg(var, register) \
26103 - (var) = xen_get_debugreg(register)
26104 -#define set_debugreg(value, register) \
26105 - xen_set_debugreg(register, value)
26107 -#define set_iopl_mask xen_set_iopl_mask
26110 - * Generic CPUID function
26111 - * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
26112 - * resulting in stale register contents being returned.
26114 -static inline void cpuid(unsigned int op,
26115 - unsigned int *eax, unsigned int *ebx,
26116 - unsigned int *ecx, unsigned int *edx)
26120 - __cpuid(eax, ebx, ecx, edx);
26123 -/* Some CPUID calls want 'count' to be placed in ecx */
26124 -static inline void cpuid_count(unsigned int op, int count,
26125 - unsigned int *eax, unsigned int *ebx,
26126 - unsigned int *ecx, unsigned int *edx)
26130 - __cpuid(eax, ebx, ecx, edx);
26134 - * CPUID functions returning a single datum
26136 -static inline unsigned int cpuid_eax(unsigned int op)
26138 - unsigned int eax, ebx, ecx, edx;
26140 - cpuid(op, &eax, &ebx, &ecx, &edx);
26143 -static inline unsigned int cpuid_ebx(unsigned int op)
26145 - unsigned int eax, ebx, ecx, edx;
26147 - cpuid(op, &eax, &ebx, &ecx, &edx);
26150 -static inline unsigned int cpuid_ecx(unsigned int op)
26152 - unsigned int eax, ebx, ecx, edx;
26154 - cpuid(op, &eax, &ebx, &ecx, &edx);
26157 -static inline unsigned int cpuid_edx(unsigned int op)
26159 - unsigned int eax, ebx, ecx, edx;
26161 - cpuid(op, &eax, &ebx, &ecx, &edx);
26165 -/* generic versions from gas */
26166 -#define GENERIC_NOP1 ".byte 0x90\n"
26167 -#define GENERIC_NOP2 ".byte 0x89,0xf6\n"
26168 -#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n"
26169 -#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n"
26170 -#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4
26171 -#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
26172 -#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
26173 -#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7
26175 -/* Opteron nops */
26176 -#define K8_NOP1 GENERIC_NOP1
26177 -#define K8_NOP2 ".byte 0x66,0x90\n"
26178 -#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
26179 -#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
26180 -#define K8_NOP5 K8_NOP3 K8_NOP2
26181 -#define K8_NOP6 K8_NOP3 K8_NOP3
26182 -#define K8_NOP7 K8_NOP4 K8_NOP3
26183 -#define K8_NOP8 K8_NOP4 K8_NOP4
26186 -/* uses eax dependencies (arbitary choice) */
26187 -#define K7_NOP1 GENERIC_NOP1
26188 -#define K7_NOP2 ".byte 0x8b,0xc0\n"
26189 -#define K7_NOP3 ".byte 0x8d,0x04,0x20\n"
26190 -#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n"
26191 -#define K7_NOP5 K7_NOP4 ASM_NOP1
26192 -#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n"
26193 -#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n"
26194 -#define K7_NOP8 K7_NOP7 ASM_NOP1
26197 -/* uses eax dependencies (Intel-recommended choice) */
26198 -#define P6_NOP1 GENERIC_NOP1
26199 -#define P6_NOP2 ".byte 0x66,0x90\n"
26200 -#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n"
26201 -#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n"
26202 -#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n"
26203 -#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n"
26204 -#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n"
26205 -#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n"
26208 -#define ASM_NOP1 K8_NOP1
26209 -#define ASM_NOP2 K8_NOP2
26210 -#define ASM_NOP3 K8_NOP3
26211 -#define ASM_NOP4 K8_NOP4
26212 -#define ASM_NOP5 K8_NOP5
26213 -#define ASM_NOP6 K8_NOP6
26214 -#define ASM_NOP7 K8_NOP7
26215 -#define ASM_NOP8 K8_NOP8
26216 -#elif defined(CONFIG_MK7)
26217 -#define ASM_NOP1 K7_NOP1
26218 -#define ASM_NOP2 K7_NOP2
26219 -#define ASM_NOP3 K7_NOP3
26220 -#define ASM_NOP4 K7_NOP4
26221 -#define ASM_NOP5 K7_NOP5
26222 -#define ASM_NOP6 K7_NOP6
26223 -#define ASM_NOP7 K7_NOP7
26224 -#define ASM_NOP8 K7_NOP8
26225 -#elif defined(CONFIG_M686) || defined(CONFIG_MPENTIUMII) || \
26226 - defined(CONFIG_MPENTIUMIII) || defined(CONFIG_MPENTIUMM) || \
26227 - defined(CONFIG_MCORE2) || defined(CONFIG_PENTIUM4)
26228 -#define ASM_NOP1 P6_NOP1
26229 -#define ASM_NOP2 P6_NOP2
26230 -#define ASM_NOP3 P6_NOP3
26231 -#define ASM_NOP4 P6_NOP4
26232 -#define ASM_NOP5 P6_NOP5
26233 -#define ASM_NOP6 P6_NOP6
26234 -#define ASM_NOP7 P6_NOP7
26235 -#define ASM_NOP8 P6_NOP8
26237 -#define ASM_NOP1 GENERIC_NOP1
26238 -#define ASM_NOP2 GENERIC_NOP2
26239 -#define ASM_NOP3 GENERIC_NOP3
26240 -#define ASM_NOP4 GENERIC_NOP4
26241 -#define ASM_NOP5 GENERIC_NOP5
26242 -#define ASM_NOP6 GENERIC_NOP6
26243 -#define ASM_NOP7 GENERIC_NOP7
26244 -#define ASM_NOP8 GENERIC_NOP8
26247 -#define ASM_NOP_MAX 8
26249 -/* Prefetch instructions for Pentium III and AMD Athlon */
26250 -/* It's not worth to care about 3dnow! prefetches for the K6
26251 - because they are microcoded there and very slow.
26252 - However we don't do prefetches for pre XP Athlons currently
26253 - That should be fixed. */
26254 -#define ARCH_HAS_PREFETCH
26255 -static inline void prefetch(const void *x)
26257 - alternative_input(ASM_NOP4,
26258 - "prefetchnta (%1)",
26263 -#define ARCH_HAS_PREFETCH
26264 -#define ARCH_HAS_PREFETCHW
26265 -#define ARCH_HAS_SPINLOCK_PREFETCH
26267 -/* 3dnow! prefetch to get an exclusive cache line. Useful for
26268 - spinlocks to avoid one state transition in the cache coherency protocol. */
26269 -static inline void prefetchw(const void *x)
26271 - alternative_input(ASM_NOP4,
26272 - "prefetchw (%1)",
26273 - X86_FEATURE_3DNOW,
26276 -#define spin_lock_prefetch(x) prefetchw(x)
26278 -extern void select_idle_routine(const struct cpuinfo_x86 *c);
26280 -#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
26282 -extern unsigned long boot_option_idle_override;
26283 -extern void enable_sep_cpu(void);
26284 -extern int sysenter_setup(void);
26286 -/* Defined in head.S */
26287 -extern struct Xgt_desc_struct early_gdt_descr;
26289 -extern void cpu_set_gdt(int);
26290 -extern void switch_to_new_gdt(void);
26291 -extern void cpu_init(void);
26292 -extern void init_gdt(int cpu);
26294 -extern int force_mwait;
26296 -#endif /* __ASM_I386_PROCESSOR_H */
26297 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/processor_64.h 2009-02-16 16:18:36.000000000 +0100
26298 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
26301 - * include/asm-x86_64/processor.h
26303 - * Copyright (C) 1994 Linus Torvalds
26306 -#ifndef __ASM_X86_64_PROCESSOR_H
26307 -#define __ASM_X86_64_PROCESSOR_H
26309 -#include <asm/segment.h>
26310 -#include <asm/page.h>
26311 -#include <asm/types.h>
26312 -#include <asm/sigcontext.h>
26313 -#include <asm/cpufeature.h>
26314 -#include <linux/threads.h>
26315 -#include <asm/msr.h>
26316 -#include <asm/current.h>
26317 -#include <asm/system.h>
26318 -#include <asm/mmsegment.h>
26319 -#include <asm/percpu.h>
26320 -#include <linux/personality.h>
26321 -#include <linux/cpumask.h>
26322 -#include <asm/processor-flags.h>
26324 -#define TF_MASK 0x00000100
26325 -#define IF_MASK 0x00000200
26326 -#define IOPL_MASK 0x00003000
26327 -#define NT_MASK 0x00004000
26328 -#define VM_MASK 0x00020000
26329 -#define AC_MASK 0x00040000
26330 -#define VIF_MASK 0x00080000 /* virtual interrupt flag */
26331 -#define VIP_MASK 0x00100000 /* virtual interrupt pending */
26332 -#define ID_MASK 0x00200000
26334 -#define desc_empty(desc) \
26335 - (!((desc)->a | (desc)->b))
26337 -#define desc_equal(desc1, desc2) \
26338 - (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
26341 - * Default implementation of macro that returns current
26342 - * instruction pointer ("program counter").
26344 -#define current_text_addr() ({ void *pc; asm volatile("leaq 1f(%%rip),%0\n1:":"=r"(pc)); pc; })
26347 - * CPU type and hardware bug flags. Kept separately for each CPU.
26350 -struct cpuinfo_x86 {
26351 - __u8 x86; /* CPU family */
26352 - __u8 x86_vendor; /* CPU vendor */
26355 - int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
26356 - __u32 x86_capability[NCAPINTS];
26357 - char x86_vendor_id[16];
26358 - char x86_model_id[64];
26359 - int x86_cache_size; /* in KB */
26360 - int x86_clflush_size;
26361 - int x86_cache_alignment;
26362 - int x86_tlbsize; /* number of 4K pages in DTLB/ITLB combined(in pages)*/
26363 - __u8 x86_virt_bits, x86_phys_bits;
26364 - __u8 x86_max_cores; /* cpuid returned max cores value */
26366 - __u32 extended_cpuid_level; /* Max extended CPUID function supported */
26367 - unsigned long loops_per_jiffy;
26369 - cpumask_t llc_shared_map; /* cpus sharing the last level cache */
26373 - __u8 booted_cores; /* number of cores as seen by OS */
26374 - __u8 phys_proc_id; /* Physical Processor id. */
26375 - __u8 cpu_core_id; /* Core id. */
26376 - __u8 cpu_index; /* index into per_cpu list */
26378 -} ____cacheline_aligned;
26380 -#define X86_VENDOR_INTEL 0
26381 -#define X86_VENDOR_CYRIX 1
26382 -#define X86_VENDOR_AMD 2
26383 -#define X86_VENDOR_UMC 3
26384 -#define X86_VENDOR_NEXGEN 4
26385 -#define X86_VENDOR_CENTAUR 5
26386 -#define X86_VENDOR_TRANSMETA 7
26387 -#define X86_VENDOR_NUM 8
26388 -#define X86_VENDOR_UNKNOWN 0xff
26391 -DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
26392 -#define cpu_data(cpu) per_cpu(cpu_info, cpu)
26393 -#define current_cpu_data cpu_data(smp_processor_id())
26395 -#define cpu_data(cpu) boot_cpu_data
26396 -#define current_cpu_data boot_cpu_data
26399 -extern char ignore_irq13;
26401 -extern void identify_cpu(struct cpuinfo_x86 *);
26402 -extern void print_cpu_info(struct cpuinfo_x86 *);
26403 -extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
26404 -extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
26405 -extern unsigned short num_cache_leaves;
26408 - * Save the cr4 feature set we're using (ie
26409 - * Pentium 4MB enable and PPro Global page
26410 - * enable), so that any CPU's that boot up
26411 - * after us can get the correct flags.
26413 -extern unsigned long mmu_cr4_features;
26415 -static inline void set_in_cr4 (unsigned long mask)
26417 - mmu_cr4_features |= mask;
26418 - __asm__("movq %%cr4,%%rax\n\t"
26419 - "orq %0,%%rax\n\t"
26420 - "movq %%rax,%%cr4\n"
26425 -static inline void clear_in_cr4 (unsigned long mask)
26427 - mmu_cr4_features &= ~mask;
26428 - __asm__("movq %%cr4,%%rax\n\t"
26429 - "andq %0,%%rax\n\t"
26430 - "movq %%rax,%%cr4\n"
26431 - : : "irg" (~mask)
26437 - * User space process size. 47bits minus one guard page.
26439 -#define TASK_SIZE64 (0x800000000000UL - 4096)
26441 -/* This decides where the kernel will search for a free chunk of vm
26442 - * space during mmap's.
26444 -#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? 0xc0000000 : 0xFFFFe000)
26446 -#define TASK_SIZE (test_thread_flag(TIF_IA32) ? IA32_PAGE_OFFSET : TASK_SIZE64)
26447 -#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64)
26449 -#define TASK_UNMAPPED_BASE PAGE_ALIGN(TASK_SIZE/3)
26452 - * Size of io_bitmap.
26454 -#define IO_BITMAP_BITS 65536
26455 -#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
26456 -#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
26457 -#ifndef CONFIG_X86_NO_TSS
26458 -#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
26460 -#define INVALID_IO_BITMAP_OFFSET 0x8000
26462 -struct i387_fxsave_struct {
26471 - u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
26472 - u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
26474 -} __attribute__ ((aligned (16)));
26476 -union i387_union {
26477 - struct i387_fxsave_struct fxsave;
26480 -#ifndef CONFIG_X86_NO_TSS
26481 -struct tss_struct {
26491 - u16 io_bitmap_base;
26493 - * The extra 1 is there because the CPU will access an
26494 - * additional byte beyond the end of the IO permission
26495 - * bitmap. The extra byte must be all 1 bits, and must
26496 - * be within the limit. Thus we have:
26498 - * 128 bytes, the bitmap itself, for ports 0..0x3ff
26499 - * 8 bytes, for an extra "long" of ~0UL
26501 - unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
26502 -} __attribute__((packed)) ____cacheline_aligned;
26504 -DECLARE_PER_CPU(struct tss_struct,init_tss);
26508 -extern struct cpuinfo_x86 boot_cpu_data;
26509 -#ifndef CONFIG_X86_NO_TSS
26510 -/* Save the original ist values for checking stack pointers during debugging */
26512 - unsigned long ist[7];
26514 -DECLARE_PER_CPU(struct orig_ist, orig_ist);
26517 -#ifdef CONFIG_X86_VSMP
26518 -#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
26519 -#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
26521 -#define ARCH_MIN_TASKALIGN 16
26522 -#define ARCH_MIN_MMSTRUCT_ALIGN 0
26525 -struct thread_struct {
26526 - unsigned long rsp0;
26527 - unsigned long rsp;
26528 - unsigned long userrsp; /* Copy from PDA */
26529 - unsigned long fs;
26530 - unsigned long gs;
26531 - unsigned short es, ds, fsindex, gsindex;
26532 -/* Hardware debugging registers */
26533 - unsigned long debugreg0;
26534 - unsigned long debugreg1;
26535 - unsigned long debugreg2;
26536 - unsigned long debugreg3;
26537 - unsigned long debugreg6;
26538 - unsigned long debugreg7;
26540 - unsigned long cr2, trap_no, error_code;
26541 -/* floating point info */
26542 - union i387_union i387 __attribute__((aligned(16)));
26543 -/* IO permissions. the bitmap could be moved into the GDT, that would make
26544 - switch faster for a limited number of ioperm using tasks. -AK */
26546 - unsigned long *io_bitmap_ptr;
26547 - unsigned io_bitmap_max;
26548 -/* cached TLS descriptors. */
26549 - u64 tls_array[GDT_ENTRY_TLS_ENTRIES];
26550 - unsigned int iopl;
26551 -} __attribute__((aligned(16)));
26553 -#define INIT_THREAD { \
26554 - .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
26557 -#ifndef CONFIG_X86_NO_TSS
26558 -#define INIT_TSS { \
26559 - .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
26563 -#define INIT_MMAP \
26564 -{ &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
26566 -#define start_thread(regs,new_rip,new_rsp) do { \
26567 - asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \
26568 - load_gs_index(0); \
26569 - (regs)->rip = (new_rip); \
26570 - (regs)->rsp = (new_rsp); \
26571 - write_pda(oldrsp, (new_rsp)); \
26572 - (regs)->cs = __USER_CS; \
26573 - (regs)->ss = __USER_DS; \
26574 - (regs)->eflags = 0x200; \
26575 - set_fs(USER_DS); \
26578 -#define get_debugreg(var, register) \
26579 - var = HYPERVISOR_get_debugreg(register)
26580 -#define set_debugreg(value, register) do { \
26581 - if (HYPERVISOR_set_debugreg(register, value)) \
26585 -struct task_struct;
26588 -/* Free all resources held by a thread. */
26589 -extern void release_thread(struct task_struct *);
26591 -/* Prepare to copy thread state - unlazy all lazy status */
26592 -extern void prepare_to_copy(struct task_struct *tsk);
26595 - * create a kernel thread without removing it from tasklists
26597 -extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
26600 - * Return saved PC of a blocked thread.
26601 - * What is this good for? it will be always the scheduler or ret_from_fork.
26603 -#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.rsp - 8))
26605 -extern unsigned long get_wchan(struct task_struct *p);
26606 -#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.rsp0 - 1)
26607 -#define KSTK_EIP(tsk) (task_pt_regs(tsk)->rip)
26608 -#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
26611 -struct microcode_header {
26612 - unsigned int hdrver;
26613 - unsigned int rev;
26614 - unsigned int date;
26615 - unsigned int sig;
26616 - unsigned int cksum;
26617 - unsigned int ldrver;
26619 - unsigned int datasize;
26620 - unsigned int totalsize;
26621 - unsigned int reserved[3];
26624 -struct microcode {
26625 - struct microcode_header hdr;
26626 - unsigned int bits[0];
26629 -typedef struct microcode microcode_t;
26630 -typedef struct microcode_header microcode_header_t;
26632 -/* microcode format is extended from prescott processors */
26633 -struct extended_signature {
26634 - unsigned int sig;
26636 - unsigned int cksum;
26639 -struct extended_sigtable {
26640 - unsigned int count;
26641 - unsigned int cksum;
26642 - unsigned int reserved[3];
26643 - struct extended_signature sigs[0];
26647 -#if defined(CONFIG_MPSC) || defined(CONFIG_MCORE2)
26648 -#define ASM_NOP1 P6_NOP1
26649 -#define ASM_NOP2 P6_NOP2
26650 -#define ASM_NOP3 P6_NOP3
26651 -#define ASM_NOP4 P6_NOP4
26652 -#define ASM_NOP5 P6_NOP5
26653 -#define ASM_NOP6 P6_NOP6
26654 -#define ASM_NOP7 P6_NOP7
26655 -#define ASM_NOP8 P6_NOP8
26657 -#define ASM_NOP1 K8_NOP1
26658 -#define ASM_NOP2 K8_NOP2
26659 -#define ASM_NOP3 K8_NOP3
26660 -#define ASM_NOP4 K8_NOP4
26661 -#define ASM_NOP5 K8_NOP5
26662 -#define ASM_NOP6 K8_NOP6
26663 -#define ASM_NOP7 K8_NOP7
26664 -#define ASM_NOP8 K8_NOP8
26667 -/* Opteron nops */
26668 -#define K8_NOP1 ".byte 0x90\n"
26669 -#define K8_NOP2 ".byte 0x66,0x90\n"
26670 -#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
26671 -#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
26672 -#define K8_NOP5 K8_NOP3 K8_NOP2
26673 -#define K8_NOP6 K8_NOP3 K8_NOP3
26674 -#define K8_NOP7 K8_NOP4 K8_NOP3
26675 -#define K8_NOP8 K8_NOP4 K8_NOP4
26678 -/* uses eax dependencies (Intel-recommended choice) */
26679 -#define P6_NOP1 ".byte 0x90\n"
26680 -#define P6_NOP2 ".byte 0x66,0x90\n"
26681 -#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n"
26682 -#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n"
26683 -#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n"
26684 -#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n"
26685 -#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n"
26686 -#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n"
26688 -#define ASM_NOP_MAX 8
26690 -/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
26691 -static inline void rep_nop(void)
26693 - __asm__ __volatile__("rep;nop": : :"memory");
26696 -/* Stop speculative execution */
26697 -static inline void sync_core(void)
26700 - asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
26703 -#define ARCH_HAS_PREFETCHW 1
26704 -static inline void prefetchw(void *x)
26706 - alternative_input("prefetcht0 (%1)",
26707 - "prefetchw (%1)",
26708 - X86_FEATURE_3DNOW,
26712 -#define ARCH_HAS_SPINLOCK_PREFETCH 1
26714 -#define spin_lock_prefetch(x) prefetchw(x)
26716 -#define cpu_relax() rep_nop()
26718 -static inline void __monitor(const void *eax, unsigned long ecx,
26719 - unsigned long edx)
26721 - /* "monitor %eax,%ecx,%edx;" */
26723 - ".byte 0x0f,0x01,0xc8;"
26724 - : :"a" (eax), "c" (ecx), "d"(edx));
26727 -static inline void __mwait(unsigned long eax, unsigned long ecx)
26729 - /* "mwait %eax,%ecx;" */
26731 - ".byte 0x0f,0x01,0xc9;"
26732 - : :"a" (eax), "c" (ecx));
26735 -static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
26737 - /* "mwait %eax,%ecx;" */
26739 - "sti; .byte 0x0f,0x01,0xc9;"
26740 - : :"a" (eax), "c" (ecx));
26743 -extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
26745 -#define stack_current() \
26747 - struct thread_info *ti; \
26748 - asm("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
26752 -#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
26754 -extern unsigned long boot_option_idle_override;
26755 -/* Boot loader type from the setup header */
26756 -extern int bootloader_type;
26758 -#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
26760 -#endif /* __ASM_X86_64_PROCESSOR_H */
26761 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/segment.h 2009-02-16 16:18:36.000000000 +0100
26762 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/segment.h 2009-03-16 16:33:40.000000000 +0100
26764 +#ifndef _ASM_X86_SEGMENT_H_
26765 +#define _ASM_X86_SEGMENT_H_
26767 +/* Simple and small GDT entries for booting only */
26769 +#define GDT_ENTRY_BOOT_CS 2
26770 +#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8)
26772 +#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1)
26773 +#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8)
26775 +#define GDT_ENTRY_BOOT_TSS (GDT_ENTRY_BOOT_CS + 2)
26776 +#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS * 8)
26778 #ifdef CONFIG_X86_32
26779 -# include "segment_32.h"
26781 + * The layout of the per-CPU GDT under Linux:
26788 + * 4 - unused <==== new cacheline
26791 + * ------- start of TLS (Thread-Local Storage) segments:
26793 + * 6 - TLS segment #1 [ glibc's TLS segment ]
26794 + * 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
26795 + * 8 - TLS segment #3
26800 + * ------- start of kernel segments:
26802 + * 12 - kernel code segment <==== new cacheline
26803 + * 13 - kernel data segment
26804 + * 14 - default user CS
26805 + * 15 - default user DS
26808 + * 18 - PNPBIOS support (16->32 gate)
26809 + * 19 - PNPBIOS support
26810 + * 20 - PNPBIOS support
26811 + * 21 - PNPBIOS support
26812 + * 22 - PNPBIOS support
26813 + * 23 - APM BIOS support
26814 + * 24 - APM BIOS support
26815 + * 25 - APM BIOS support
26817 + * 26 - ESPFIX small SS
26818 + * 27 - per-cpu [ offset to per-cpu data area ]
26822 + * 31 - TSS for double fault handler
26824 +#define GDT_ENTRY_TLS_MIN 6
26825 +#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
26827 +#define GDT_ENTRY_DEFAULT_USER_CS 14
26828 +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
26830 +#define GDT_ENTRY_DEFAULT_USER_DS 15
26831 +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
26833 +#define GDT_ENTRY_KERNEL_BASE 12
26835 +#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
26836 +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
26838 +#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
26839 +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
26841 +#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
26842 +#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
26844 +#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6)
26845 +#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11)
26847 +#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
26848 +#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
26850 +#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15)
26852 +#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
26854 -# include "../../segment_64.h"
26855 +#define __KERNEL_PERCPU 0
26858 +#define GDT_ENTRY_DOUBLEFAULT_TSS 31
26861 + * The GDT has 32 entries
26863 +#define GDT_ENTRIES 32
26865 +/* The PnP BIOS entries in the GDT */
26866 +#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
26867 +#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
26868 +#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
26869 +#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
26870 +#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
26872 +/* The PnP BIOS selectors */
26873 +#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
26874 +#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
26875 +#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */
26876 +#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
26877 +#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
26879 +/* Bottom two bits of selector give the ring privilege level */
26880 +#define SEGMENT_RPL_MASK 0x3
26881 +/* Bit 2 is table indicator (LDT/GDT) */
26882 +#define SEGMENT_TI_MASK 0x4
26884 +/* User mode is privilege level 3 */
26885 +#define USER_RPL 0x3
26886 +/* LDT segment has TI set, GDT has it cleared */
26887 +#define SEGMENT_LDT 0x4
26888 +#define SEGMENT_GDT 0x0
26891 + * Matching rules for certain types of segments.
26894 +/* Matches only __KERNEL_CS, ignoring PnP / USER / APM segments */
26895 +#define SEGMENT_IS_KERNEL_CODE(x) (((x) & ~3) == GDT_ENTRY_KERNEL_CS * 8 \
26896 + || ((x) & ~3) == (FLAT_KERNEL_CS & ~3))
26898 +/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */
26899 +#define SEGMENT_IS_FLAT_CODE(x) (((x) & ~0x13) == GDT_ENTRY_KERNEL_CS * 8 \
26900 + || ((x) & ~3) == (FLAT_KERNEL_CS & ~3) \
26901 + || ((x) & ~3) == (FLAT_USER_CS & ~3))
26903 +/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
26904 +#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
26906 +#define get_kernel_rpl() (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1)
26909 +#include <asm/cache.h>
26911 +#define __KERNEL_CS 0x10
26912 +#define __KERNEL_DS 0x18
26914 +#define __KERNEL32_CS 0x08
26917 + * we cannot use the same code segment descriptor for user and kernel
26918 + * -- not even in the long flat mode, because of different DPL /kkeil
26919 + * The segment offset needs to contain a RPL. Grr. -AK
26920 + * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
26923 +#define __USER32_CS 0x23 /* 4*8+3 */
26924 +#define __USER_DS 0x2b /* 5*8+3 */
26925 +#define __USER_CS 0x33 /* 6*8+3 */
26926 +#define __USER32_DS __USER_DS
26928 +#define GDT_ENTRY_TSS 8 /* needs two entries */
26929 +#define GDT_ENTRY_LDT 10 /* needs two entries */
26930 +#define GDT_ENTRY_TLS_MIN 12
26931 +#define GDT_ENTRY_TLS_MAX 14
26933 +#define GDT_ENTRY_PER_CPU 15 /* Abused to load per CPU data from limit */
26934 +#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU * 8 + 3)
26936 +/* TLS indexes for 64bit - hardcoded in arch_prctl */
26940 +#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3)
26941 +#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
26943 +#define GDT_ENTRIES 16
26947 +/* User mode is privilege level 3 */
26948 +#define USER_RPL 0x3
26949 +/* LDT segment has TI set, GDT has it cleared */
26950 +#define SEGMENT_LDT 0x4
26951 +#define SEGMENT_GDT 0x0
26953 +/* Bottom two bits of selector give the ring privilege level */
26954 +#define SEGMENT_RPL_MASK 0x3
26955 +/* Bit 2 is table indicator (LDT/GDT) */
26956 +#define SEGMENT_TI_MASK 0x4
26958 +#define IDT_ENTRIES 256
26959 +#define GDT_SIZE (GDT_ENTRIES * 8)
26960 +#define GDT_ENTRY_TLS_ENTRIES 3
26961 +#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
26964 +#ifndef __ASSEMBLY__
26965 +extern const char early_idt_handlers[IDT_ENTRIES][10];
26970 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/segment_32.h 2008-12-15 11:27:22.000000000 +0100
26971 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
26973 -#ifndef _ASM_SEGMENT_H
26974 -#define _ASM_SEGMENT_H
26977 - * The layout of the per-CPU GDT under Linux:
26984 - * 4 - unused <==== new cacheline
26987 - * ------- start of TLS (Thread-Local Storage) segments:
26989 - * 6 - TLS segment #1 [ glibc's TLS segment ]
26990 - * 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
26991 - * 8 - TLS segment #3
26996 - * ------- start of kernel segments:
26998 - * 12 - kernel code segment <==== new cacheline
26999 - * 13 - kernel data segment
27000 - * 14 - default user CS
27001 - * 15 - default user DS
27004 - * 18 - PNPBIOS support (16->32 gate)
27005 - * 19 - PNPBIOS support
27006 - * 20 - PNPBIOS support
27007 - * 21 - PNPBIOS support
27008 - * 22 - PNPBIOS support
27009 - * 23 - APM BIOS support
27010 - * 24 - APM BIOS support
27011 - * 25 - APM BIOS support
27013 - * 26 - ESPFIX small SS
27014 - * 27 - per-cpu [ offset to per-cpu data area ]
27018 - * 31 - TSS for double fault handler
27020 -#define GDT_ENTRY_TLS_ENTRIES 3
27021 -#define GDT_ENTRY_TLS_MIN 6
27022 -#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
27024 -#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
27026 -#define GDT_ENTRY_DEFAULT_USER_CS 14
27027 -#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
27029 -#define GDT_ENTRY_DEFAULT_USER_DS 15
27030 -#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
27032 -#define GDT_ENTRY_KERNEL_BASE 12
27034 -#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
27035 -#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
27037 -#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
27038 -#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
27040 -#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
27041 -#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
27043 -#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6)
27044 -#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11)
27046 -#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
27047 -#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
27049 -#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15)
27051 -#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
27053 -#define __KERNEL_PERCPU 0
27056 -#define GDT_ENTRY_DOUBLEFAULT_TSS 31
27059 - * The GDT has 32 entries
27061 -#define GDT_ENTRIES 32
27062 -#define GDT_SIZE (GDT_ENTRIES * 8)
27064 -/* Simple and small GDT entries for booting only */
27066 -#define GDT_ENTRY_BOOT_CS 2
27067 -#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8)
27069 -#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1)
27070 -#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8)
27072 -/* The PnP BIOS entries in the GDT */
27073 -#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
27074 -#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
27075 -#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
27076 -#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
27077 -#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
27079 -/* The PnP BIOS selectors */
27080 -#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
27081 -#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
27082 -#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */
27083 -#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
27084 -#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
27087 - * The interrupt descriptor table has room for 256 idt's,
27088 - * the global descriptor table is dependent on the number
27089 - * of tasks we can have..
27091 -#define IDT_ENTRIES 256
27093 -/* Bottom two bits of selector give the ring privilege level */
27094 -#define SEGMENT_RPL_MASK 0x3
27095 -/* Bit 2 is table indicator (LDT/GDT) */
27096 -#define SEGMENT_TI_MASK 0x4
27098 -/* User mode is privilege level 3 */
27099 -#define USER_RPL 0x3
27100 -/* LDT segment has TI set, GDT has it cleared */
27101 -#define SEGMENT_LDT 0x4
27102 -#define SEGMENT_GDT 0x0
27104 -#define get_kernel_rpl() (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1)
27107 - * Matching rules for certain types of segments.
27110 -/* Matches only __KERNEL_CS, ignoring PnP / USER / APM segments */
27111 -#define SEGMENT_IS_KERNEL_CODE(x) (((x) & ~3) == GDT_ENTRY_KERNEL_CS * 8 \
27112 - || ((x) & ~3) == (FLAT_KERNEL_CS & ~3))
27114 -/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */
27115 -#define SEGMENT_IS_FLAT_CODE(x) (((x) & ~0x13) == GDT_ENTRY_KERNEL_CS * 8 \
27116 - || ((x) & ~3) == (FLAT_KERNEL_CS & ~3) \
27117 - || ((x) & ~3) == (FLAT_USER_CS & ~3))
27119 -/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
27120 -#define SEGMENT_IS_PNP_CODE(x) (((x) & ~0x0b) == GDT_ENTRY_PNPBIOS_BASE * 8)
27123 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/smp_32.h 2009-02-16 16:18:36.000000000 +0100
27124 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/smp_32.h 2009-03-16 16:33:40.000000000 +0100
27126 #ifndef __ASM_SMP_H
27127 #define __ASM_SMP_H
27129 +#ifndef __ASSEMBLY__
27130 +#include <linux/cpumask.h>
27131 +#include <linux/init.h>
27134 * We need the APIC definitions automatically as part of 'smp.h'
27136 -#ifndef __ASSEMBLY__
27137 -#include <linux/kernel.h>
27138 -#include <linux/threads.h>
27139 -#include <linux/cpumask.h>
27140 +#ifdef CONFIG_X86_LOCAL_APIC
27141 +# include <asm/mpspec.h>
27142 +# include <asm/apic.h>
27143 +# ifdef CONFIG_X86_IO_APIC
27144 +# include <asm/io_apic.h>
27148 -#if defined(CONFIG_X86_LOCAL_APIC) && !defined(__ASSEMBLY__)
27149 -#include <linux/bitops.h>
27150 -#include <asm/mpspec.h>
27151 -#include <asm/apic.h>
27152 -#ifdef CONFIG_X86_IO_APIC
27153 -#include <asm/io_apic.h>
27156 +#define cpu_callout_map cpu_possible_map
27157 +#define cpu_callin_map cpu_possible_map
27159 -#define BAD_APICID 0xFFu
27161 -#ifndef __ASSEMBLY__
27162 +extern int smp_num_siblings;
27163 +extern unsigned int num_processors;
27166 - * Private routines/data
27169 extern void smp_alloc_memory(void);
27170 -extern int pic_mode;
27171 -extern int smp_num_siblings;
27172 -DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
27173 -DECLARE_PER_CPU(cpumask_t, cpu_core_map);
27174 +extern void lock_ipi_call_lock(void);
27175 +extern void unlock_ipi_call_lock(void);
27177 extern void (*mtrr_hook) (void);
27178 extern void zap_low_mappings (void);
27179 -extern void lock_ipi_call_lock(void);
27180 -extern void unlock_ipi_call_lock(void);
27182 -#define MAX_APICID 256
27183 -extern u8 __initdata x86_cpu_to_apicid_init[];
27184 -extern void *x86_cpu_to_apicid_ptr;
27185 +DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
27186 +DECLARE_PER_CPU(cpumask_t, cpu_core_map);
27187 +DECLARE_PER_CPU(u8, cpu_llc_id);
27188 DECLARE_PER_CPU(u8, x86_cpu_to_apicid);
27190 -#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
27192 #ifdef CONFIG_HOTPLUG_CPU
27193 extern void cpu_exit_clear(void);
27194 extern void cpu_uninit(void);
27201 +/* Globals due to paravirt */
27202 +extern void set_cpu_sibling_map(int cpu);
27206 void (*smp_prepare_boot_cpu)(void);
27207 @@ -104,11 +99,11 @@ void native_smp_prepare_cpus(unsigned in
27208 int native_cpu_up(unsigned int cpunum);
27209 void native_smp_cpus_done(unsigned int max_cpus);
27211 -#define startup_ipi_hook(phys_apicid, start_eip, start_esp) \
27215 +#ifndef CONFIG_PARAVIRT
27216 +#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0)
27219 +#else /* CONFIG_XEN */
27221 void xen_smp_send_stop(void);
27222 void xen_smp_send_reschedule(int cpu);
27223 @@ -120,7 +115,12 @@ int xen_smp_call_function_mask(cpumask_t
27224 #define smp_send_reschedule xen_smp_send_reschedule
27225 #define smp_call_function_mask xen_smp_call_function_mask
27228 +extern void prefill_possible_map(void);
27230 +#endif /* CONFIG_XEN */
27232 +extern int __cpu_disable(void);
27233 +extern void __cpu_die(unsigned int cpu);
27236 * This function is needed by all SMP systems. It must _always_ be valid
27237 @@ -130,64 +130,49 @@ int xen_smp_call_function_mask(cpumask_t
27238 DECLARE_PER_CPU(int, cpu_number);
27239 #define raw_smp_processor_id() (x86_read_percpu(cpu_number))
27241 -extern cpumask_t cpu_possible_map;
27242 -#define cpu_callin_map cpu_possible_map
27243 +#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
27245 +#define safe_smp_processor_id() smp_processor_id()
27247 /* We don't mark CPUs online until __cpu_up(), so we need another measure */
27248 static inline int num_booting_cpus(void)
27250 - return cpus_weight(cpu_possible_map);
27251 + return cpus_weight(cpu_callout_map);
27254 -#define safe_smp_processor_id() smp_processor_id()
27255 -extern int __cpu_disable(void);
27256 -extern void __cpu_die(unsigned int cpu);
27257 -extern void prefill_possible_map(void);
27258 -extern unsigned int num_processors;
27260 -#endif /* !__ASSEMBLY__ */
27262 #else /* CONFIG_SMP */
27264 #define safe_smp_processor_id() 0
27265 #define cpu_physical_id(cpu) boot_cpu_physical_apicid
27267 -#define NO_PROC_ID 0xFF /* No processor magic marker */
27269 -#endif /* CONFIG_SMP */
27271 -#ifndef __ASSEMBLY__
27272 +#endif /* !CONFIG_SMP */
27274 #ifdef CONFIG_X86_LOCAL_APIC
27276 -#ifdef APIC_DEFINITION
27277 +static __inline int logical_smp_processor_id(void)
27279 + /* we don't want to mark this access volatile - bad code generation */
27280 + return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
27283 +# ifdef APIC_DEFINITION
27284 extern int hard_smp_processor_id(void);
27286 -#include <mach_apicdef.h>
27288 +# include <mach_apicdef.h>
27289 static inline int hard_smp_processor_id(void)
27291 /* we don't want to mark this access volatile - bad code generation */
27292 - return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
27293 + return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
27295 -#endif /* APIC_DEFINITION */
27296 +# endif /* APIC_DEFINITION */
27298 #else /* CONFIG_X86_LOCAL_APIC */
27300 -#ifndef CONFIG_SMP
27301 -#define hard_smp_processor_id() 0
27303 +# ifndef CONFIG_SMP
27304 +# define hard_smp_processor_id() 0
27307 #endif /* CONFIG_X86_LOCAL_APIC */
27309 -extern u8 apicid_2_node[];
27311 -#ifdef CONFIG_X86_LOCAL_APIC
27312 -static __inline int logical_smp_processor_id(void)
27314 - /* we don't want to mark this access volatile - bad code generation */
27315 - return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
27320 +#endif /* !__ASSEMBLY__ */
27322 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/smp_64.h 2009-02-16 16:18:36.000000000 +0100
27323 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/smp_64.h 2009-03-16 16:33:40.000000000 +0100
27324 @@ -1,139 +1,103 @@
27325 #ifndef __ASM_SMP_H
27326 #define __ASM_SMP_H
27329 - * We need the APIC definitions automatically as part of 'smp.h'
27331 -#include <linux/threads.h>
27332 #include <linux/cpumask.h>
27333 -#include <linux/bitops.h>
27334 #include <linux/init.h>
27335 -extern int disable_apic;
27337 #ifdef CONFIG_X86_LOCAL_APIC
27338 -#include <asm/mpspec.h>
27340 + * We need the APIC definitions automatically as part of 'smp.h'
27342 #include <asm/apic.h>
27343 #ifdef CONFIG_X86_IO_APIC
27344 #include <asm/io_apic.h>
27346 -#include <asm/thread_info.h>
27347 +#include <asm/mpspec.h>
27352 #include <asm/pda.h>
27353 +#include <asm/thread_info.h>
27357 -extern cpumask_t cpu_present_mask;
27358 -extern cpumask_t cpu_possible_map;
27359 -extern cpumask_t cpu_online_map;
27360 extern cpumask_t cpu_initialized;
27363 - * Private routines/data
27366 +extern int smp_num_siblings;
27367 +extern unsigned int num_processors;
27369 extern void smp_alloc_memory(void);
27370 -extern volatile unsigned long smp_invalidate_needed;
27371 extern void lock_ipi_call_lock(void);
27372 extern void unlock_ipi_call_lock(void);
27373 -extern int smp_num_siblings;
27374 -extern void smp_send_reschedule(int cpu);
27376 extern int smp_call_function_mask(cpumask_t mask, void (*func)(void *),
27377 void *info, int wait);
27380 - * cpu_sibling_map and cpu_core_map now live
27381 - * in the per cpu area
27383 - * extern cpumask_t cpu_sibling_map[NR_CPUS];
27384 - * extern cpumask_t cpu_core_map[NR_CPUS];
27386 DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
27387 DECLARE_PER_CPU(cpumask_t, cpu_core_map);
27388 -DECLARE_PER_CPU(u8, cpu_llc_id);
27390 -#define SMP_TRAMPOLINE_BASE 0x6000
27391 +DECLARE_PER_CPU(u16, cpu_llc_id);
27392 +DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
27393 +DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
27396 - * On x86 all CPUs are mapped 1:1 to the APIC space.
27397 - * This simplifies scheduling and IPI sending and
27398 - * compresses data structures.
27401 -static inline int num_booting_cpus(void)
27402 +#ifdef CONFIG_X86_LOCAL_APIC
27403 +static inline int cpu_present_to_apicid(int mps_cpu)
27405 - return cpus_weight(cpu_possible_map);
27406 + if (cpu_present(mps_cpu))
27407 + return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
27409 + return BAD_APICID;
27413 -#define raw_smp_processor_id() read_pda(cpunumber)
27416 +#define SMP_TRAMPOLINE_BASE 0x6000
27418 extern int __cpu_disable(void);
27419 extern void __cpu_die(unsigned int cpu);
27420 extern void prefill_possible_map(void);
27421 -extern unsigned num_processors;
27422 extern unsigned __cpuinitdata disabled_cpus;
27424 -#define NO_PROC_ID 0xFF /* No processor magic marker */
27426 -#endif /* CONFIG_SMP */
27427 +#define raw_smp_processor_id() read_pda(cpunumber)
27428 +#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
27430 -#define safe_smp_processor_id() smp_processor_id()
27432 -#ifdef CONFIG_X86_LOCAL_APIC
27433 -static inline int hard_smp_processor_id(void)
27435 - /* we don't want to mark this access volatile - bad code generation */
27436 - return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
27439 +#define stack_smp_processor_id() \
27441 + struct thread_info *ti; \
27442 + __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
27447 - * Some lowlevel functions might want to know about
27448 - * the real APIC ID <-> CPU # mapping.
27449 + * On x86 all CPUs are mapped 1:1 to the APIC space. This simplifies
27450 + * scheduling and IPI sending and compresses data structures.
27452 -extern u8 __initdata x86_cpu_to_apicid_init[];
27453 -extern void *x86_cpu_to_apicid_ptr;
27454 -DECLARE_PER_CPU(u8, x86_cpu_to_apicid); /* physical ID */
27455 -extern u8 bios_cpu_apicid[];
27457 -#ifdef CONFIG_X86_LOCAL_APIC
27458 -static inline int cpu_present_to_apicid(int mps_cpu)
27459 +static inline int num_booting_cpus(void)
27461 - if (mps_cpu < NR_CPUS)
27462 - return (int)bios_cpu_apicid[mps_cpu];
27464 - return BAD_APICID;
27465 + return cpus_weight(cpu_possible_map);
27469 -#ifndef CONFIG_SMP
27470 +extern void smp_send_reschedule(int cpu);
27472 +#else /* CONFIG_SMP */
27474 +extern unsigned int boot_cpu_id;
27475 +#define cpu_physical_id(cpu) boot_cpu_id
27476 #define stack_smp_processor_id() 0
27477 -#define cpu_logical_map(x) (x)
27479 -#include <asm/thread_info.h>
27480 -#define stack_smp_processor_id() \
27482 - struct thread_info *ti; \
27483 - __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
27488 +#endif /* !CONFIG_SMP */
27490 +#define safe_smp_processor_id() smp_processor_id()
27492 #ifdef CONFIG_X86_LOCAL_APIC
27493 static __inline int logical_smp_processor_id(void)
27495 /* we don't want to mark this access volatile - bad code generation */
27496 - return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
27497 + return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
27500 +static inline int hard_smp_processor_id(void)
27502 + /* we don't want to mark this access volatile - bad code generation */
27503 + return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
27508 -#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
27510 -extern unsigned int boot_cpu_id;
27511 -#define cpu_physical_id(cpu) boot_cpu_id
27512 -#endif /* !CONFIG_SMP */
27515 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
27516 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/spinlock.h 2009-03-16 16:33:40.000000000 +0100
27518 +#ifndef _X86_SPINLOCK_H_
27519 +#define _X86_SPINLOCK_H_
27521 +#include <asm/atomic.h>
27522 +#include <asm/rwlock.h>
27523 +#include <asm/page.h>
27524 +#include <asm/processor.h>
27525 +#include <linux/compiler.h>
27528 + * Your basic SMP spinlocks, allowing only a single CPU anywhere
27530 + * Simple spin lock operations. There are two variants, one clears IRQ's
27531 + * on the local processor, one does not.
27533 + * These are fair FIFO ticket locks, which are currently limited to 256
27536 + * (the type definitions are in asm/spinlock_types.h)
27539 +#ifdef CONFIG_X86_32
27540 +# define LOCK_PTR_REG "a"
27541 +# define REG_PTR_MODE "k"
27543 +# define LOCK_PTR_REG "D"
27544 +# define REG_PTR_MODE "q"
27547 +#if defined(CONFIG_X86_32) && \
27548 + (defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE))
27550 + * On PPro SMP or if we are using OOSTORE, we use a locked operation to unlock
27551 + * (PPro errata 66, 92)
27553 +# define UNLOCK_LOCK_PREFIX LOCK_PREFIX
27555 +# define UNLOCK_LOCK_PREFIX
27558 +int xen_spinlock_init(unsigned int cpu);
27559 +void xen_spinlock_cleanup(unsigned int cpu);
27560 +extern int xen_spin_wait(raw_spinlock_t *, unsigned int token);
27561 +extern int xen_spin_wait_flags(raw_spinlock_t *, unsigned int *token,
27562 + unsigned int flags);
27563 +extern unsigned int xen_spin_adjust(raw_spinlock_t *, unsigned int token);
27564 +extern void xen_spin_kick(raw_spinlock_t *, unsigned int token);
27567 + * Ticket locks are conceptually two parts, one indicating the current head of
27568 + * the queue, and the other indicating the current tail. The lock is acquired
27569 + * by atomically noting the tail and incrementing it by one (thus adding
27570 + * ourself to the queue and noting our position), then waiting until the head
27571 + * becomes equal to the initial value of the tail.
27573 + * We use an xadd covering *both* parts of the lock, to increment the tail and
27574 + * also load the position of the head, which takes care of memory ordering
27575 + * issues and should be optimal for the uncontended case. Note the tail must be
27576 + * in the high part, because a wide xadd increment of the low part would carry
27577 + * up and contaminate the high part.
27579 + * With fewer than 2^8 possible CPUs, we can use x86's partial registers to
27580 + * save some instructions and make the code more elegant. There really isn't
27581 + * much between them in performance though, especially as locks are out of line.
27583 +#if (NR_CPUS < 256)
27584 +#define TICKET_SHIFT 8
27585 +#define __raw_spin_lock_preamble \
27586 + asm(LOCK_PREFIX "xaddw %w0, %2\n\t" \
27587 + "cmpb %h0, %b0\n\t" \
27589 + : "=&Q" (token), "=qm" (free), "+m" (lock->slock) \
27591 + : "memory", "cc")
27592 +#define __raw_spin_lock_body \
27594 + "cmpb %h0, %b0\n\t" \
27598 + "rep ; nop\n\t" \
27599 + "movb %2, %b0\n\t" \
27600 + /* don't need lfence here, because loads are in-order */ \
27603 + : "+Q" (token), "+g" (count) \
27604 + : "m" (lock->slock) \
27605 + : "memory", "cc")
27608 +static inline int __raw_spin_trylock(raw_spinlock_t *lock)
27612 + asm("movzwl %2, %0\n\t"
27613 + "cmpb %h0, %b0\n\t"
27614 + "leal 0x100(%" REG_PTR_MODE "0), %1\n\t"
27616 + LOCK_PREFIX "cmpxchgw %w1, %2\n\t"
27619 + "movzbl %b1, %0\n\t"
27620 + : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
27622 + : "memory", "cc");
27627 +static inline void __raw_spin_unlock(raw_spinlock_t *lock)
27629 + unsigned int token;
27630 + unsigned char kick;
27632 + asm(UNLOCK_LOCK_PREFIX "incb %2\n\t"
27633 + "movzwl %2, %0\n\t"
27634 + "cmpb %h0, %b0\n\t"
27636 + : "=&Q" (token), "=qm" (kick), "+m" (lock->slock)
27638 + : "memory", "cc");
27640 + xen_spin_kick(lock, token);
27643 +#define TICKET_SHIFT 16
27644 +#define __raw_spin_lock_preamble \
27646 + unsigned int tmp; \
27647 + asm(LOCK_PREFIX "xaddl %0, %2\n\t" \
27648 + "shldl $16, %0, %3\n\t" \
27649 + "cmpw %w3, %w0\n\t" \
27651 + : "=&r" (token), "=qm" (free), "+m" (lock->slock), \
27653 + : "0" (0x00010000) \
27654 + : "memory", "cc"); \
27656 +#define __raw_spin_lock_body \
27658 + unsigned int tmp; \
27659 + asm("shldl $16, %0, %2\n" \
27661 + "cmpw %w2, %w0\n\t" \
27665 + "rep ; nop\n\t" \
27666 + "movw %3, %w0\n\t" \
27667 + /* don't need lfence here, because loads are in-order */ \
27670 + : "+r" (token), "+g" (count), "=&g" (tmp) \
27671 + : "m" (lock->slock) \
27672 + : "memory", "cc"); \
27675 +static inline int __raw_spin_trylock(raw_spinlock_t *lock)
27680 + asm("movl %2, %0\n\t"
27681 + "movl %0, %1\n\t"
27682 + "roll $16, %0\n\t"
27683 + "cmpl %0, %1\n\t"
27684 + "leal 0x00010000(%" REG_PTR_MODE "0), %1\n\t"
27686 + LOCK_PREFIX "cmpxchgl %1, %2\n"
27689 + "movzbl %b1, %0\n\t"
27690 + : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
27692 + : "memory", "cc");
27697 +static inline void __raw_spin_unlock(raw_spinlock_t *lock)
27699 + unsigned int token, tmp;
27702 + asm(UNLOCK_LOCK_PREFIX "incw %2\n\t"
27703 + "movl %2, %0\n\t"
27704 + "shldl $16, %0, %3\n\t"
27705 + "cmpw %w3, %w0\n\t"
27707 + : "=&r" (token), "=qm" (kick), "+m" (lock->slock), "=&r" (tmp)
27709 + : "memory", "cc");
27711 + xen_spin_kick(lock, token);
27715 +static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
27717 + int tmp = *(volatile signed int *)(&(lock)->slock);
27719 + return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
27722 +static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
27724 + int tmp = *(volatile signed int *)(&(lock)->slock);
27726 + return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
27729 +static inline void __raw_spin_lock(raw_spinlock_t *lock)
27731 + unsigned int token, count;
27734 + __raw_spin_lock_preamble;
27735 + if (unlikely(!free))
27736 + token = xen_spin_adjust(lock, token);
27739 + __raw_spin_lock_body;
27740 + } while (unlikely(!count) && !xen_spin_wait(lock, token));
27743 +static inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
27744 + unsigned long flags)
27746 + unsigned int token, count;
27749 + __raw_spin_lock_preamble;
27750 + if (unlikely(!free))
27751 + token = xen_spin_adjust(lock, token);
27754 + __raw_spin_lock_body;
27755 + } while (unlikely(!count) && !xen_spin_wait_flags(lock, &token, flags));
27758 +static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
27760 + while (__raw_spin_is_locked(lock))
27765 + * Read-write spinlocks, allowing multiple readers
27766 + * but only one writer.
27768 + * NOTE! it is quite common to have readers in interrupts
27769 + * but no interrupt writers. For those circumstances we
27770 + * can "mix" irq-safe locks - any writer needs to get an
27771 + * irq-safe write-lock, but readers can get non-irqsafe
27774 + * On x86, we implement read-write locks as a 32-bit counter
27775 + * with the high bit (sign) being the "contended" bit.
27779 + * read_can_lock - would read_trylock() succeed?
27780 + * @lock: the rwlock in question.
27782 +static inline int __raw_read_can_lock(raw_rwlock_t *lock)
27784 + return (int)(lock)->lock > 0;
27788 + * write_can_lock - would write_trylock() succeed?
27789 + * @lock: the rwlock in question.
27791 +static inline int __raw_write_can_lock(raw_rwlock_t *lock)
27793 + return (lock)->lock == RW_LOCK_BIAS;
27796 +static inline void __raw_read_lock(raw_rwlock_t *rw)
27798 + asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t"
27800 + "call __read_lock_failed\n\t"
27802 + ::LOCK_PTR_REG (rw) : "memory");
27805 +static inline void __raw_write_lock(raw_rwlock_t *rw)
27807 + asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t"
27809 + "call __write_lock_failed\n\t"
27811 + ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory");
27814 +static inline int __raw_read_trylock(raw_rwlock_t *lock)
27816 + atomic_t *count = (atomic_t *)lock;
27818 + atomic_dec(count);
27819 + if (atomic_read(count) >= 0)
27821 + atomic_inc(count);
27825 +static inline int __raw_write_trylock(raw_rwlock_t *lock)
27827 + atomic_t *count = (atomic_t *)lock;
27829 + if (atomic_sub_and_test(RW_LOCK_BIAS, count))
27831 + atomic_add(RW_LOCK_BIAS, count);
27835 +static inline void __raw_read_unlock(raw_rwlock_t *rw)
27837 + asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory");
27840 +static inline void __raw_write_unlock(raw_rwlock_t *rw)
27842 + asm volatile(LOCK_PREFIX "addl %1, %0"
27843 + : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory");
27846 +#define _raw_spin_relax(lock) cpu_relax()
27847 +#define _raw_read_relax(lock) cpu_relax()
27848 +#define _raw_write_relax(lock) cpu_relax()
27851 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/system.h 2009-02-16 16:18:36.000000000 +0100
27852 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/system.h 2009-03-16 16:33:40.000000000 +0100
27854 +#ifndef _ASM_X86_SYSTEM_H_
27855 +#define _ASM_X86_SYSTEM_H_
27857 +#include <asm/asm.h>
27858 +#include <asm/segment.h>
27859 +#include <asm/cpufeature.h>
27860 +#include <asm/cmpxchg.h>
27861 +#include <asm/nops.h>
27862 +#include <asm/hypervisor.h>
27864 +#include <linux/kernel.h>
27865 +#include <linux/irqflags.h>
27867 +/* entries in ARCH_DLINFO: */
27868 +#ifdef CONFIG_IA32_EMULATION
27869 +# define AT_VECTOR_SIZE_ARCH 2
27871 +# define AT_VECTOR_SIZE_ARCH 1
27874 +#ifdef CONFIG_X86_32
27876 +struct task_struct; /* one of the stranger aspects of C forward declarations */
27877 +struct task_struct *__switch_to(struct task_struct *prev,
27878 + struct task_struct *next);
27881 + * Saving eflags is important. It switches not only IOPL between tasks,
27882 + * it also protects other tasks from NT leaking through sysenter etc.
27884 +#define switch_to(prev, next, last) do { \
27885 + unsigned long esi, edi; \
27886 + asm volatile("pushfl\n\t" /* Save flags */ \
27887 + "pushl %%ebp\n\t" \
27888 + "movl %%esp,%0\n\t" /* save ESP */ \
27889 + "movl %5,%%esp\n\t" /* restore ESP */ \
27890 + "movl $1f,%1\n\t" /* save EIP */ \
27891 + "pushl %6\n\t" /* restore EIP */ \
27892 + "jmp __switch_to\n" \
27894 + "popl %%ebp\n\t" \
27896 + :"=m" (prev->thread.sp), "=m" (prev->thread.ip), \
27897 + "=a" (last), "=S" (esi), "=D" (edi) \
27898 + :"m" (next->thread.sp), "m" (next->thread.ip), \
27899 + "2" (prev), "d" (next)); \
27903 + * disable hlt during certain critical i/o operations
27905 +#define HAVE_DISABLE_HLT
27907 +#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
27908 +#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
27910 +/* frame pointer must be last for get_wchan */
27911 +#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
27912 +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t"
27914 +#define __EXTRA_CLOBBER \
27915 + , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
27916 + "r12", "r13", "r14", "r15"
27918 +/* Save and restore flags to clear (handle) a leaking NT flag */
27919 +#define switch_to(prev, next, last) \
27920 + asm volatile(SAVE_CONTEXT \
27921 + "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
27922 + "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
27923 + "call __switch_to\n\t" \
27924 + ".globl thread_return\n" \
27925 + "thread_return:\n\t" \
27926 + "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \
27927 + "movq %P[thread_info](%%rsi),%%r8\n\t" \
27928 + LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
27929 + "movq %%rax,%%rdi\n\t" \
27930 + "jc ret_from_fork\n\t" \
27931 + RESTORE_CONTEXT \
27933 + : [next] "S" (next), [prev] "D" (prev), \
27934 + [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
27935 + [ti_flags] "i" (offsetof(struct thread_info, flags)), \
27936 + [tif_fork] "i" (TIF_FORK), \
27937 + [thread_info] "i" (offsetof(struct task_struct, stack)), \
27938 + [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
27939 + : "memory", "cc" __EXTRA_CLOBBER)
27943 +#define _set_base(addr, base) do { unsigned long __pr; \
27944 +__asm__ __volatile__ ("movw %%dx,%1\n\t" \
27945 + "rorl $16,%%edx\n\t" \
27946 + "movb %%dl,%2\n\t" \
27949 + :"m" (*((addr)+2)), \
27950 + "m" (*((addr)+4)), \
27951 + "m" (*((addr)+7)), \
27955 +#define _set_limit(addr, limit) do { unsigned long __lr; \
27956 +__asm__ __volatile__ ("movw %%dx,%1\n\t" \
27957 + "rorl $16,%%edx\n\t" \
27958 + "movb %2,%%dh\n\t" \
27959 + "andb $0xf0,%%dh\n\t" \
27960 + "orb %%dh,%%dl\n\t" \
27963 + :"m" (*(addr)), \
27964 + "m" (*((addr)+6)), \
27968 +#define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base))
27969 +#define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1))
27971 +extern void load_gs_index(unsigned);
27974 + * Load a segment. Fall back on loading the zero
27975 + * segment if something goes wrong..
27977 +#define loadsegment(seg, value) \
27978 + asm volatile("\n" \
27980 + "movl %k0,%%" #seg "\n" \
27982 + ".section .fixup,\"ax\"\n" \
27984 + "movl %k1, %%" #seg "\n\t" \
27987 + _ASM_EXTABLE(1b,3b) \
27988 + : :"r" (value), "r" (0))
27992 + * Save a segment register away
27994 +#define savesegment(seg, value) \
27995 + asm volatile("mov %%" #seg ",%0":"=rm" (value))
27997 +static inline unsigned long get_limit(unsigned long segment)
27999 + unsigned long __limit;
28000 + __asm__("lsll %1,%0"
28001 + :"=r" (__limit):"r" (segment));
28002 + return __limit+1;
28005 +static inline void xen_clts(void)
28007 + HYPERVISOR_fpu_taskswitch(0);
28010 +static inline void xen_stts(void)
28012 + HYPERVISOR_fpu_taskswitch(1);
28016 + * Volatile isn't enough to prevent the compiler from reordering the
28017 + * read/write functions for the control registers and messing everything up.
28018 + * A memory clobber would solve the problem, but would prevent reordering of
28019 + * all loads and stores around it, which can hurt performance. Solution is to
28020 + * use a variable and mimic reads and writes to it to enforce serialization
28022 +static unsigned long __force_order;
28024 +static inline unsigned long xen_read_cr0(void)
28026 + unsigned long val;
28027 + asm volatile("mov %%cr0,%0\n\t" :"=r" (val), "=m" (__force_order));
28031 +static inline void xen_write_cr0(unsigned long val)
28033 + asm volatile("mov %0,%%cr0": :"r" (val), "m" (__force_order));
28036 +#define xen_read_cr2() (current_vcpu_info()->arch.cr2)
28037 +#define xen_write_cr2(val) ((void)(current_vcpu_info()->arch.cr2 = (val)))
28039 +static inline unsigned long xen_read_cr3(void)
28041 + unsigned long val;
28042 + asm volatile("mov %%cr3,%0\n\t" :"=r" (val), "=m" (__force_order));
28043 +#ifdef CONFIG_X86_32
28044 + return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT;
28046 + return machine_to_phys(val);
28050 +static inline void xen_write_cr3(unsigned long val)
28052 +#ifdef CONFIG_X86_32
28053 + val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT));
28055 + val = phys_to_machine(val);
28057 + asm volatile("mov %0,%%cr3": :"r" (val), "m" (__force_order));
28060 +static inline unsigned long xen_read_cr4(void)
28062 + unsigned long val;
28063 + asm volatile("mov %%cr4,%0\n\t" :"=r" (val), "=m" (__force_order));
28067 +#define xen_read_cr4_safe() xen_read_cr4()
28069 +static inline void xen_write_cr4(unsigned long val)
28071 + asm volatile("mov %0,%%cr4": :"r" (val), "m" (__force_order));
28074 +#ifdef CONFIG_X86_64
28075 +static inline unsigned long xen_read_cr8(void)
28080 +static inline void xen_write_cr8(unsigned long val)
28086 +static inline void xen_wbinvd(void)
28088 + asm volatile("wbinvd": : :"memory");
28090 +#define read_cr0() (xen_read_cr0())
28091 +#define write_cr0(x) (xen_write_cr0(x))
28092 +#define read_cr2() (xen_read_cr2())
28093 +#define write_cr2(x) (xen_write_cr2(x))
28094 +#define read_cr3() (xen_read_cr3())
28095 +#define write_cr3(x) (xen_write_cr3(x))
28096 +#define read_cr4() (xen_read_cr4())
28097 +#define read_cr4_safe() (xen_read_cr4_safe())
28098 +#define write_cr4(x) (xen_write_cr4(x))
28099 +#define wbinvd() (xen_wbinvd())
28100 +#ifdef CONFIG_X86_64
28101 +#define read_cr8() (xen_read_cr8())
28102 +#define write_cr8(x) (xen_write_cr8(x))
28105 +/* Clear the 'TS' bit */
28106 +#define clts() (xen_clts())
28107 +#define stts() (xen_stts())
28109 +#endif /* __KERNEL__ */
28111 +static inline void clflush(volatile void *__p)
28113 + asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
28116 +#define nop() __asm__ __volatile__ ("nop")
28118 +void disable_hlt(void);
28119 +void enable_hlt(void);
28121 +extern int es7000_plat;
28122 +void cpu_idle_wait(void);
28124 +extern unsigned long arch_align_stack(unsigned long sp);
28125 +extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
28127 +void default_idle(void);
28130 + * Force strict CPU ordering.
28131 + * And yes, this is required on UP too when we're talking
28134 #ifdef CONFIG_X86_32
28135 -# include "system_32.h"
28137 + * For now, "wmb()" doesn't actually do anything, as all
28138 + * Intel CPUs follow what Intel calls a *Processor Order*,
28139 + * in which all writes are seen in the program order even
28140 + * outside the CPU.
28142 + * I expect future Intel CPUs to have a weaker ordering,
28143 + * but I'd also expect them to finally get their act together
28144 + * and add some real memory barriers if so.
28146 + * Some non-Intel clones support out-of-order stores. wmb() ceases to be a
28149 +#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
28150 +#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
28151 +#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
28153 +#define mb() asm volatile("mfence":::"memory")
28154 +#define rmb() asm volatile("lfence":::"memory")
28155 +#define wmb() asm volatile("sfence" ::: "memory")
28159 + * read_barrier_depends - Flush all pending reads that subsequent reads
28162 + * No data-dependent reads from memory-like regions are ever reordered
28163 + * over this barrier. All reads preceding this primitive are guaranteed
28164 + * to access memory (but not necessarily other CPUs' caches) before any
28165 + * reads following this primitive that depend on the data returned by
28166 + * any of the preceding reads. This primitive is much lighter weight than
28167 + * rmb() on most CPUs, and is never heavier weight than is
28170 + * These ordering constraints are respected by both the local CPU
28171 + * and the compiler.
28173 + * Ordering is not guaranteed by anything other than these primitives,
28174 + * not even by data dependencies. See the documentation for
28175 + * memory_barrier() for examples and URLs to more information.
28177 + * For example, the following code would force ordering (the initial
28178 + * value of "a" is zero, "b" is one, and "p" is "&a"):
28180 + * <programlisting>
28184 + * memory_barrier();
28186 + * read_barrier_depends();
28188 + * </programlisting>
28190 + * because the read of "*q" depends on the read of "p" and these
28191 + * two reads are separated by a read_barrier_depends(). However,
28192 + * the following code, with the same initial values for "a" and "b":
28194 + * <programlisting>
28198 + * memory_barrier();
28200 + * read_barrier_depends();
28202 + * </programlisting>
28204 + * does not enforce ordering, since there is no data dependency between
28205 + * the read of "a" and the read of "b". Therefore, on some CPUs, such
28206 + * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb()
28207 + * in cases like this where there are no data dependencies.
28210 +#define read_barrier_depends() do { } while (0)
28213 +#define smp_mb() mb()
28214 +#ifdef CONFIG_X86_PPRO_FENCE
28215 +# define smp_rmb() rmb()
28217 -# include "system_64.h"
28218 +# define smp_rmb() barrier()
28220 +#ifdef CONFIG_X86_OOSTORE
28221 +# define smp_wmb() wmb()
28223 +# define smp_wmb() barrier()
28225 +#define smp_read_barrier_depends() read_barrier_depends()
28226 +#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
28228 +#define smp_mb() barrier()
28229 +#define smp_rmb() barrier()
28230 +#define smp_wmb() barrier()
28231 +#define smp_read_barrier_depends() do { } while (0)
28232 +#define set_mb(var, value) do { var = value; barrier(); } while (0)
28236 + * Stop RDTSC speculation. This is needed when you need to use RDTSC
28237 + * (or get_cycles or vread that possibly accesses the TSC) in a defined
28240 + * (Could use a three-way alternative for this if there were one.)
28242 +static inline void rdtsc_barrier(void)
28244 + alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
28245 + alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
28249 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/system_32.h 2009-02-16 16:18:36.000000000 +0100
28250 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
28252 -#ifndef __ASM_SYSTEM_H
28253 -#define __ASM_SYSTEM_H
28255 -#include <linux/kernel.h>
28256 -#include <asm/segment.h>
28257 -#include <asm/cpufeature.h>
28258 -#include <asm/cmpxchg.h>
28259 -#include <asm/synch_bitops.h>
28260 -#include <asm/hypervisor.h>
28263 -#define AT_VECTOR_SIZE_ARCH 2 /* entries in ARCH_DLINFO */
28265 -struct task_struct; /* one of the stranger aspects of C forward declarations.. */
28266 -extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next));
28269 - * Saving eflags is important. It switches not only IOPL between tasks,
28270 - * it also protects other tasks from NT leaking through sysenter etc.
28272 -#define switch_to(prev,next,last) do { \
28273 - unsigned long esi,edi; \
28274 - asm volatile("pushfl\n\t" /* Save flags */ \
28275 - "pushl %%ebp\n\t" \
28276 - "movl %%esp,%0\n\t" /* save ESP */ \
28277 - "movl %5,%%esp\n\t" /* restore ESP */ \
28278 - "movl $1f,%1\n\t" /* save EIP */ \
28279 - "pushl %6\n\t" /* restore EIP */ \
28280 - "jmp __switch_to\n" \
28282 - "popl %%ebp\n\t" \
28284 - :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \
28285 - "=a" (last),"=S" (esi),"=D" (edi) \
28286 - :"m" (next->thread.esp),"m" (next->thread.eip), \
28287 - "2" (prev), "d" (next)); \
28290 -#define _set_base(addr,base) do { unsigned long __pr; \
28291 -__asm__ __volatile__ ("movw %%dx,%1\n\t" \
28292 - "rorl $16,%%edx\n\t" \
28293 - "movb %%dl,%2\n\t" \
28296 - :"m" (*((addr)+2)), \
28297 - "m" (*((addr)+4)), \
28298 - "m" (*((addr)+7)), \
28302 -#define _set_limit(addr,limit) do { unsigned long __lr; \
28303 -__asm__ __volatile__ ("movw %%dx,%1\n\t" \
28304 - "rorl $16,%%edx\n\t" \
28305 - "movb %2,%%dh\n\t" \
28306 - "andb $0xf0,%%dh\n\t" \
28307 - "orb %%dh,%%dl\n\t" \
28310 - :"m" (*(addr)), \
28311 - "m" (*((addr)+6)), \
28315 -#define set_base(ldt,base) _set_base( ((char *)&(ldt)) , (base) )
28316 -#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1) )
28319 - * Load a segment. Fall back on loading the zero
28320 - * segment if something goes wrong..
28322 -#define loadsegment(seg,value) \
28323 - asm volatile("\n" \
28325 - "mov %0,%%" #seg "\n" \
28327 - ".section .fixup,\"ax\"\n" \
28330 - "popl %%" #seg "\n\t" \
28333 - ".section __ex_table,\"a\"\n\t" \
28335 - ".long 1b,3b\n" \
28340 - * Save a segment register away
28342 -#define savesegment(seg, value) \
28343 - asm volatile("mov %%" #seg ",%0":"=rm" (value))
28345 -static inline void xen_clts(void)
28347 - HYPERVISOR_fpu_taskswitch(0);
28350 -static inline unsigned long xen_read_cr0(void)
28352 - unsigned long val;
28353 - asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
28357 -static inline void xen_write_cr0(unsigned long val)
28359 - asm volatile("movl %0,%%cr0": :"r" (val));
28362 -#define xen_read_cr2() (current_vcpu_info()->arch.cr2)
28364 -static inline void xen_write_cr2(unsigned long val)
28366 - asm volatile("movl %0,%%cr2": :"r" (val));
28369 -static inline unsigned long xen_read_cr3(void)
28371 - unsigned long val;
28372 - asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
28373 - return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT;
28376 -static inline void xen_write_cr3(unsigned long val)
28378 - val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT));
28379 - asm volatile("movl %0,%%cr3": :"r" (val));
28382 -static inline unsigned long xen_read_cr4(void)
28384 - unsigned long val;
28385 - asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
28389 -static inline unsigned long xen_read_cr4_safe(void)
28391 - unsigned long val;
28392 - /* This could fault if %cr4 does not exist */
28393 - asm volatile("1: movl %%cr4, %0 \n"
28395 - ".section __ex_table,\"a\" \n"
28398 - : "=r" (val): "0" (0));
28402 -static inline void xen_write_cr4(unsigned long val)
28404 - asm volatile("movl %0,%%cr4": :"r" (val));
28407 -static inline void xen_wbinvd(void)
28409 - asm volatile("wbinvd": : :"memory");
28412 -static inline void clflush(volatile void *__p)
28414 - asm volatile("clflush %0" : "+m" (*(char __force *)__p));
28417 -#define read_cr0() (xen_read_cr0())
28418 -#define write_cr0(x) (xen_write_cr0(x))
28419 -#define read_cr2() (xen_read_cr2())
28420 -#define write_cr2(x) (xen_write_cr2(x))
28421 -#define read_cr3() (xen_read_cr3())
28422 -#define write_cr3(x) (xen_write_cr3(x))
28423 -#define read_cr4() (xen_read_cr4())
28424 -#define read_cr4_safe() (xen_read_cr4_safe())
28425 -#define write_cr4(x) (xen_write_cr4(x))
28426 -#define wbinvd() (xen_wbinvd())
28428 -/* Clear the 'TS' bit */
28429 -#define clts() (xen_clts())
28431 -/* Set the 'TS' bit */
28432 -#define stts() (HYPERVISOR_fpu_taskswitch(1))
28434 -#endif /* __KERNEL__ */
28436 -static inline unsigned long get_limit(unsigned long segment)
28438 - unsigned long __limit;
28439 - __asm__("lsll %1,%0"
28440 - :"=r" (__limit):"r" (segment));
28441 - return __limit+1;
28444 -#define nop() __asm__ __volatile__ ("nop")
28447 - * Force strict CPU ordering.
28448 - * And yes, this is required on UP too when we're talking
28451 - * For now, "wmb()" doesn't actually do anything, as all
28452 - * Intel CPU's follow what Intel calls a *Processor Order*,
28453 - * in which all writes are seen in the program order even
28454 - * outside the CPU.
28456 - * I expect future Intel CPU's to have a weaker ordering,
28457 - * but I'd also expect them to finally get their act together
28458 - * and add some real memory barriers if so.
28460 - * Some non intel clones support out of order store. wmb() ceases to be a
28465 -#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
28466 -#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
28467 -#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
28470 - * read_barrier_depends - Flush all pending reads that subsequents reads
28473 - * No data-dependent reads from memory-like regions are ever reordered
28474 - * over this barrier. All reads preceding this primitive are guaranteed
28475 - * to access memory (but not necessarily other CPUs' caches) before any
28476 - * reads following this primitive that depend on the data return by
28477 - * any of the preceding reads. This primitive is much lighter weight than
28478 - * rmb() on most CPUs, and is never heavier weight than is
28481 - * These ordering constraints are respected by both the local CPU
28482 - * and the compiler.
28484 - * Ordering is not guaranteed by anything other than these primitives,
28485 - * not even by data dependencies. See the documentation for
28486 - * memory_barrier() for examples and URLs to more information.
28488 - * For example, the following code would force ordering (the initial
28489 - * value of "a" is zero, "b" is one, and "p" is "&a"):
28491 - * <programlisting>
28495 - * memory_barrier();
28497 - * read_barrier_depends();
28499 - * </programlisting>
28501 - * because the read of "*q" depends on the read of "p" and these
28502 - * two reads are separated by a read_barrier_depends(). However,
28503 - * the following code, with the same initial values for "a" and "b":
28505 - * <programlisting>
28509 - * memory_barrier();
28511 - * read_barrier_depends();
28513 - * </programlisting>
28515 - * does not enforce ordering, since there is no data dependency between
28516 - * the read of "a" and the read of "b". Therefore, on some CPUs, such
28517 - * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb()
28518 - * in cases like this where there are no data dependencies.
28521 -#define read_barrier_depends() do { } while(0)
28524 -#define smp_mb() mb()
28525 -#ifdef CONFIG_X86_PPRO_FENCE
28526 -# define smp_rmb() rmb()
28528 -# define smp_rmb() barrier()
28530 -#ifdef CONFIG_X86_OOSTORE
28531 -# define smp_wmb() wmb()
28533 -# define smp_wmb() barrier()
28535 -#define smp_read_barrier_depends() read_barrier_depends()
28536 -#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
28538 -#define smp_mb() barrier()
28539 -#define smp_rmb() barrier()
28540 -#define smp_wmb() barrier()
28541 -#define smp_read_barrier_depends() do { } while(0)
28542 -#define set_mb(var, value) do { var = value; barrier(); } while (0)
28545 -#include <linux/irqflags.h>
28548 - * disable hlt during certain critical i/o operations
28550 -#define HAVE_DISABLE_HLT
28551 -void disable_hlt(void);
28552 -void enable_hlt(void);
28554 -extern int es7000_plat;
28555 -void cpu_idle_wait(void);
28557 -extern unsigned long arch_align_stack(unsigned long sp);
28558 -extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
28560 -void default_idle(void);
28561 -void __show_registers(struct pt_regs *, int all);
28564 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/system_64.h 2009-02-16 16:18:36.000000000 +0100
28565 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/system_64.h 2009-03-16 16:33:40.000000000 +0100
28567 #ifndef __ASM_SYSTEM_H
28568 #define __ASM_SYSTEM_H
28570 -#include <linux/kernel.h>
28571 #include <asm/segment.h>
28572 #include <asm/cmpxchg.h>
28574 -#include <asm/synch_bitops.h>
28575 -#include <asm/hypervisor.h>
28576 -#include <xen/interface/arch-x86_64.h>
28580 -/* entries in ARCH_DLINFO: */
28581 -#ifdef CONFIG_IA32_EMULATION
28582 -# define AT_VECTOR_SIZE_ARCH 2
28584 -# define AT_VECTOR_SIZE_ARCH 1
28587 -#define __SAVE(reg,offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
28588 -#define __RESTORE(reg,offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
28590 -/* frame pointer must be last for get_wchan */
28591 -#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
28592 -#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\n\t"
28594 -#define __EXTRA_CLOBBER \
28595 - ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15"
28597 -/* Save restore flags to clear handle leaking NT */
28598 -#define switch_to(prev,next,last) \
28599 - asm volatile(SAVE_CONTEXT \
28600 - "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
28601 - "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
28602 - "call __switch_to\n\t" \
28603 - ".globl thread_return\n" \
28604 - "thread_return:\n\t" \
28605 - "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \
28606 - "movq %P[thread_info](%%rsi),%%r8\n\t" \
28607 - LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
28608 - "movq %%rax,%%rdi\n\t" \
28609 - "jc ret_from_fork\n\t" \
28610 - RESTORE_CONTEXT \
28612 - : [next] "S" (next), [prev] "D" (prev), \
28613 - [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \
28614 - [ti_flags] "i" (offsetof(struct thread_info, flags)),\
28615 - [tif_fork] "i" (TIF_FORK), \
28616 - [thread_info] "i" (offsetof(struct task_struct, stack)), \
28617 - [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
28618 - : "memory", "cc" __EXTRA_CLOBBER)
28620 -extern void load_gs_index(unsigned);
28623 - * Load a segment. Fall back on loading the zero
28624 - * segment if something goes wrong..
28626 -#define loadsegment(seg,value) \
28627 - asm volatile("\n" \
28629 - "movl %k0,%%" #seg "\n" \
28631 - ".section .fixup,\"ax\"\n" \
28633 - "movl %1,%%" #seg "\n\t" \
28636 - ".section __ex_table,\"a\"\n\t" \
28638 - ".quad 1b,3b\n" \
28640 - : :"r" (value), "r" (0))
28643 - * Clear and set 'TS' bit respectively
28645 -#define clts() (HYPERVISOR_fpu_taskswitch(0))
28647 -static inline unsigned long read_cr0(void)
28649 - unsigned long cr0;
28650 - asm volatile("movq %%cr0,%0" : "=r" (cr0));
28654 -static inline void write_cr0(unsigned long val)
28656 - asm volatile("movq %0,%%cr0" :: "r" (val));
28659 -#define read_cr2() current_vcpu_info()->arch.cr2
28661 -#define write_cr2(val) ((void)(current_vcpu_info()->arch.cr2 = (val)))
28663 -#define read_cr3() ({ \
28664 - unsigned long __dummy; \
28665 - asm volatile("movq %%cr3,%0" : "=r" (__dummy)); \
28666 - machine_to_phys(__dummy); \
28669 -static inline void write_cr3(unsigned long val)
28671 - val = phys_to_machine(val);
28672 - asm volatile("movq %0,%%cr3" :: "r" (val) : "memory");
28675 -static inline unsigned long read_cr4(void)
28677 - unsigned long cr4;
28678 - asm volatile("movq %%cr4,%0" : "=r" (cr4));
28682 -static inline void write_cr4(unsigned long val)
28684 - asm volatile("movq %0,%%cr4" :: "r" (val) : "memory");
28687 static inline unsigned long read_cr8(void)
28689 @@ -128,52 +15,6 @@ static inline void write_cr8(unsigned lo
28693 -#define stts() (HYPERVISOR_fpu_taskswitch(1))
28695 -#define wbinvd() \
28696 - __asm__ __volatile__ ("wbinvd": : :"memory")
28698 -#endif /* __KERNEL__ */
28700 -static inline void clflush(volatile void *__p)
28702 - asm volatile("clflush %0" : "+m" (*(char __force *)__p));
28705 -#define nop() __asm__ __volatile__ ("nop")
28708 -#define smp_mb() mb()
28709 -#define smp_rmb() barrier()
28710 -#define smp_wmb() barrier()
28711 -#define smp_read_barrier_depends() do {} while(0)
28713 -#define smp_mb() barrier()
28714 -#define smp_rmb() barrier()
28715 -#define smp_wmb() barrier()
28716 -#define smp_read_barrier_depends() do {} while(0)
28721 - * Force strict CPU ordering.
28722 - * And yes, this is required on UP too when we're talking
28725 -#define mb() asm volatile("mfence":::"memory")
28726 -#define rmb() asm volatile("lfence":::"memory")
28727 -#define wmb() asm volatile("sfence" ::: "memory")
28729 -#define read_barrier_depends() do {} while(0)
28730 -#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
28732 -#define warn_if_not_ulong(x) do { unsigned long foo; (void) (&(x) == &foo); } while (0)
28734 #include <linux/irqflags.h>
28736 -void cpu_idle_wait(void);
28738 -extern unsigned long arch_align_stack(unsigned long sp);
28739 -extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
28742 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/tlbflush.h 2009-02-16 16:18:36.000000000 +0100
28743 +++ sle11-2009-10-16/include/asm-x86/mach-xen/asm/tlbflush.h 2009-03-16 16:33:40.000000000 +0100
28745 +#ifndef _ASM_X86_TLBFLUSH_H
28746 +#define _ASM_X86_TLBFLUSH_H
28748 +#include <linux/mm.h>
28749 +#include <linux/sched.h>
28751 +#include <asm/processor.h>
28752 +#include <asm/system.h>
28754 +#define __flush_tlb() xen_tlb_flush()
28755 +#define __flush_tlb_global() xen_tlb_flush()
28756 +#define __flush_tlb_single(addr) xen_invlpg(addr)
28757 +#define __flush_tlb_all() xen_tlb_flush()
28758 +#define __flush_tlb_one(addr) xen_invlpg(addr)
28760 #ifdef CONFIG_X86_32
28761 -# include "tlbflush_32.h"
28762 +# define TLB_FLUSH_ALL 0xffffffff
28764 -# include "tlbflush_64.h"
28765 +# define TLB_FLUSH_ALL -1ULL
28771 + * - flush_tlb() flushes the current mm struct TLBs
28772 + * - flush_tlb_all() flushes all processes' TLBs
28773 + * - flush_tlb_mm(mm) flushes the specified mm context's TLBs
28774 + * - flush_tlb_page(vma, vmaddr) flushes one page
28775 + * - flush_tlb_range(vma, start, end) flushes a range of pages
28776 + * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
28778 + * ..but the i386 has somewhat limited tlb flushing capabilities,
28779 + * and page-granular flushes are available only on i486 and up.
28781 + * x86-64 can only flush individual pages or full VMs. For a range flush
28782 + * we always do the full VM. Might be worth trying if for a small
28783 + * range a few INVLPGs in a row are a win.
28786 +#ifndef CONFIG_SMP
28788 +#define flush_tlb() __flush_tlb()
28789 +#define flush_tlb_all() __flush_tlb_all()
28790 +#define local_flush_tlb() __flush_tlb()
28792 +static inline void flush_tlb_mm(struct mm_struct *mm)
28794 + if (mm == current->active_mm)
28798 +static inline void flush_tlb_page(struct vm_area_struct *vma,
28799 + unsigned long addr)
28801 + if (vma->vm_mm == current->active_mm)
28802 + __flush_tlb_one(addr);
28805 +static inline void flush_tlb_range(struct vm_area_struct *vma,
28806 + unsigned long start, unsigned long end)
28808 + if (vma->vm_mm == current->active_mm)
28814 +#include <asm/smp.h>
28816 +#define local_flush_tlb() __flush_tlb()
28818 +#define flush_tlb_all xen_tlb_flush_all
28819 +#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
28820 +#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
28821 +#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
28823 +#define flush_tlb() flush_tlb_current_task()
28825 +static inline void flush_tlb_range(struct vm_area_struct *vma,
28826 + unsigned long start, unsigned long end)
28828 + flush_tlb_mm(vma->vm_mm);
28831 +#define TLBSTATE_OK 1
28832 +#define TLBSTATE_LAZY 2
28834 +#ifdef CONFIG_X86_32
28837 + struct mm_struct *active_mm;
28839 + char __cacheline_padding[L1_CACHE_BYTES-8];
28841 +DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
28846 +static inline void flush_tlb_kernel_range(unsigned long start,
28847 + unsigned long end)
28852 +#endif /* _ASM_X86_TLBFLUSH_H */
28853 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/tlbflush_32.h 2009-02-16 16:18:36.000000000 +0100
28854 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
28856 -#ifndef _I386_TLBFLUSH_H
28857 -#define _I386_TLBFLUSH_H
28859 -#include <linux/mm.h>
28860 -#include <asm/processor.h>
28862 -#define __flush_tlb() xen_tlb_flush()
28863 -#define __flush_tlb_global() xen_tlb_flush()
28864 -#define __flush_tlb_all() xen_tlb_flush()
28866 -#define cpu_has_invlpg (boot_cpu_data.x86 > 3)
28868 -#define __flush_tlb_single(addr) xen_invlpg(addr)
28870 -#define __flush_tlb_one(addr) __flush_tlb_single(addr)
28875 - * - flush_tlb() flushes the current mm struct TLBs
28876 - * - flush_tlb_all() flushes all processes TLBs
28877 - * - flush_tlb_mm(mm) flushes the specified mm context TLB's
28878 - * - flush_tlb_page(vma, vmaddr) flushes one page
28879 - * - flush_tlb_range(vma, start, end) flushes a range of pages
28880 - * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
28882 - * ..but the i386 has somewhat limited tlb flushing capabilities,
28883 - * and page-granular flushes are available only on i486 and up.
28886 -#define TLB_FLUSH_ALL 0xffffffff
28889 -#ifndef CONFIG_SMP
28891 -#include <linux/sched.h>
28893 -#define flush_tlb() __flush_tlb()
28894 -#define flush_tlb_all() __flush_tlb_all()
28895 -#define local_flush_tlb() __flush_tlb()
28897 -static inline void flush_tlb_mm(struct mm_struct *mm)
28899 - if (mm == current->active_mm)
28903 -static inline void flush_tlb_page(struct vm_area_struct *vma,
28904 - unsigned long addr)
28906 - if (vma->vm_mm == current->active_mm)
28907 - __flush_tlb_one(addr);
28910 -static inline void flush_tlb_range(struct vm_area_struct *vma,
28911 - unsigned long start, unsigned long end)
28913 - if (vma->vm_mm == current->active_mm)
28919 -#include <asm/smp.h>
28921 -#define local_flush_tlb() \
28924 -#define flush_tlb_all xen_tlb_flush_all
28925 -#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
28926 -#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
28927 -#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
28929 -#define flush_tlb() flush_tlb_current_task()
28931 -static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
28933 - flush_tlb_mm(vma->vm_mm);
28936 -#define TLBSTATE_OK 1
28937 -#define TLBSTATE_LAZY 2
28941 - struct mm_struct *active_mm;
28943 - char __cacheline_padding[L1_CACHE_BYTES-8];
28945 -DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
28948 -static inline void flush_tlb_kernel_range(unsigned long start,
28949 - unsigned long end)
28954 -#endif /* _I386_TLBFLUSH_H */
28955 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/asm/tlbflush_64.h 2009-02-16 16:18:36.000000000 +0100
28956 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000
28958 -#ifndef _X8664_TLBFLUSH_H
28959 -#define _X8664_TLBFLUSH_H
28961 -#include <linux/mm.h>
28962 -#include <linux/sched.h>
28963 -#include <asm/processor.h>
28964 -#include <asm/system.h>
28966 -#define __flush_tlb() xen_tlb_flush()
28969 - * Global pages have to be flushed a bit differently. Not a real
28970 - * performance problem because this does not happen often.
28972 -#define __flush_tlb_global() xen_tlb_flush()
28974 -#define __flush_tlb_all() __flush_tlb_global()
28976 -#define __flush_tlb_one(addr) xen_invlpg((unsigned long)addr)
28982 - * - flush_tlb() flushes the current mm struct TLBs
28983 - * - flush_tlb_all() flushes all processes TLBs
28984 - * - flush_tlb_mm(mm) flushes the specified mm context TLB's
28985 - * - flush_tlb_page(vma, vmaddr) flushes one page
28986 - * - flush_tlb_range(vma, start, end) flushes a range of pages
28987 - * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
28989 - * x86-64 can only flush individual pages or full VMs. For a range flush
28990 - * we always do the full VM. Might be worth trying if for a small
28991 - * range a few INVLPGs in a row are a win.
28994 -#ifndef CONFIG_SMP
28996 -#define flush_tlb() __flush_tlb()
28997 -#define flush_tlb_all() __flush_tlb_all()
28998 -#define local_flush_tlb() __flush_tlb()
29000 -static inline void flush_tlb_mm(struct mm_struct *mm)
29002 - if (mm == current->active_mm)
29006 -static inline void flush_tlb_page(struct vm_area_struct *vma,
29007 - unsigned long addr)
29009 - if (vma->vm_mm == current->active_mm)
29010 - __flush_tlb_one(addr);
29013 -static inline void flush_tlb_range(struct vm_area_struct *vma,
29014 - unsigned long start, unsigned long end)
29016 - if (vma->vm_mm == current->active_mm)
29022 -#include <asm/smp.h>
29024 -#define local_flush_tlb() \
29027 -#define flush_tlb_all xen_tlb_flush_all
29028 -#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
29029 -#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
29030 -#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
29032 -#define flush_tlb() flush_tlb_current_task()
29034 -static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
29036 - flush_tlb_mm(vma->vm_mm);
29039 -#define TLBSTATE_OK 1
29040 -#define TLBSTATE_LAZY 2
29042 -/* Roughly an IPI every 20MB with 4k pages for freeing page table
29043 - ranges. Cost is about 42k of memory for each CPU. */
29044 -#define ARCH_FREE_PTE_NR 5350
29048 -static inline void flush_tlb_kernel_range(unsigned long start,
29049 - unsigned long end)
29054 -#endif /* _X8664_TLBFLUSH_H */
29055 --- sle11-2009-10-16.orig/include/asm-x86/mach-xen/irq_vectors.h 2009-10-28 14:55:04.000000000 +0100
29056 +++ sle11-2009-10-16/include/asm-x86/mach-xen/irq_vectors.h 2009-03-16 16:33:40.000000000 +0100
29059 #define RESCHEDULE_VECTOR 0
29060 #define CALL_FUNCTION_VECTOR 1
29062 +#define SPIN_UNLOCK_VECTOR 2
29066 * The maximum number of vectors supported by i386 processors
29067 --- sle11-2009-10-16.orig/include/asm-x86/mmu.h 2009-02-16 16:18:36.000000000 +0100
29068 +++ sle11-2009-10-16/include/asm-x86/mmu.h 2009-03-16 16:33:40.000000000 +0100
29069 @@ -23,7 +23,7 @@ typedef struct {
29074 +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
29075 void leave_mm(int cpu);
29077 static inline void leave_mm(int cpu)
29078 --- sle11-2009-10-16.orig/include/asm-x86/ptrace.h 2009-10-28 14:55:04.000000000 +0100
29079 +++ sle11-2009-10-16/include/asm-x86/ptrace.h 2009-03-16 16:33:40.000000000 +0100
29080 @@ -249,7 +249,9 @@ extern void user_enable_single_step(stru
29081 extern void user_disable_single_step(struct task_struct *);
29083 extern void user_enable_block_step(struct task_struct *);
29084 -#ifdef CONFIG_X86_DEBUGCTLMSR
29085 +#if defined(CONFIG_XEN)
29086 +#define arch_has_block_step() (0)
29087 +#elif defined(CONFIG_X86_DEBUGCTLMSR)
29088 #define arch_has_block_step() (1)
29090 #define arch_has_block_step() (boot_cpu_data.x86 >= 6)
29091 --- sle11-2009-10-16.orig/include/asm-x86/thread_info.h 2009-02-16 16:17:21.000000000 +0100
29092 +++ sle11-2009-10-16/include/asm-x86/thread_info.h 2009-03-16 16:33:40.000000000 +0100
29093 @@ -94,6 +94,9 @@ struct thread_info {
29094 #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
29095 #define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */
29096 #define TIF_PERFMON_CTXSW 28 /* perfmon needs ctxsw calls */
29097 +#ifdef CONFIG_X86_XEN
29098 +#define TIF_CSTAR 31 /* cstar-based syscall (special handling) */
29101 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
29102 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
29103 @@ -118,6 +121,7 @@ struct thread_info {
29104 #define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS)
29105 #define _TIF_PERFMON_WORK (1 << TIF_PERFMON_WORK)
29106 #define _TIF_PERFMON_CTXSW (1 << TIF_PERFMON_CTXSW)
29107 +#define _TIF_CSTAR (1 << TIF_CSTAR)
29109 /* work to do in syscall_trace_enter() */
29110 #define _TIF_WORK_SYSCALL_ENTRY \
29111 @@ -147,12 +151,12 @@ struct thread_info {
29112 (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \
29113 _TIF_NOTSC|_TIF_PERFMON_CTXSW)
29115 -#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
29116 -#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
29118 -#define _TIF_WORK_CTXSW_NEXT (_TIF_NOTSC | _TIF_DEBUG)
29119 -#define _TIF_WORK_CTXSW_PREV (_TIF_NOTSC)
29120 +#define _TIF_WORK_CTXSW (_TIF_NOTSC \
29121 + /*todo | _TIF_DEBUGCTLMSR | _TIF_DS_AREA_MSR | _TIF_BTS_TRACE_TS*/)
29123 +#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
29124 +#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
29126 #define PREEMPT_ACTIVE 0x10000000
29128 --- sle11-2009-10-16.orig/include/asm-x86/time.h 2009-10-28 14:55:04.000000000 +0100
29129 +++ sle11-2009-10-16/include/asm-x86/time.h 2009-03-16 16:33:40.000000000 +0100
29130 @@ -58,4 +58,10 @@ static inline int native_set_wallclock(u
29132 extern unsigned long __init calibrate_cpu(void);
29135 +extern int xen_independent_wallclock(void);
29136 +extern unsigned long xen_read_persistent_clock(void);
29137 +extern int xen_update_persistent_clock(void);
29141 --- sle11-2009-10-16.orig/include/linux/page-flags.h 2009-02-16 16:17:21.000000000 +0100
29142 +++ sle11-2009-10-16/include/linux/page-flags.h 2009-03-16 16:33:40.000000000 +0100
29143 @@ -102,8 +102,8 @@ enum pageflags {
29144 PG_foreign, /* Page is owned by foreign allocator. */
29145 PG_pinned, /* Cannot alias with PG_owner_priv_1 since
29146 * bad_page() checks include this bit.
29147 - * Also cannot use PG_arch_1 since that now
29148 - * has a different purpose on x86. */
29149 + * Should not use PG_arch_1 as that may have
29150 + * a different purpose elsewhere. */
29154 --- sle11-2009-10-16.orig/include/linux/pci.h 2008-12-15 11:27:22.000000000 +0100
29155 +++ sle11-2009-10-16/include/linux/pci.h 2009-03-16 16:33:40.000000000 +0100
29156 @@ -644,6 +644,9 @@ int pcie_set_readrq(struct pci_dev *dev,
29157 void pci_update_resource(struct pci_dev *dev, struct resource *res, int resno);
29158 int __must_check pci_assign_resource(struct pci_dev *dev, int i);
29159 int pci_select_bars(struct pci_dev *dev, unsigned long flags);
29161 +void pci_restore_bars(struct pci_dev *);
29164 /* ROM control related routines */
29165 void __iomem __must_check *pci_map_rom(struct pci_dev *pdev, size_t *size);
29166 --- sle11-2009-10-16.orig/include/xen/evtchn.h 2009-03-04 11:28:34.000000000 +0100
29167 +++ sle11-2009-10-16/include/xen/evtchn.h 2009-03-16 16:33:40.000000000 +0100
29168 @@ -130,12 +130,37 @@ static inline void clear_evtchn(int port
29169 synch_clear_bit(port, s->evtchn_pending);
29172 +static inline void set_evtchn(int port)
29174 + shared_info_t *s = HYPERVISOR_shared_info;
29175 + synch_set_bit(port, s->evtchn_pending);
29178 +static inline int test_evtchn(int port)
29180 + shared_info_t *s = HYPERVISOR_shared_info;
29181 + return synch_test_bit(port, s->evtchn_pending);
29184 static inline void notify_remote_via_evtchn(int port)
29186 struct evtchn_send send = { .port = port };
29187 VOID(HYPERVISOR_event_channel_op(EVTCHNOP_send, &send));
29190 +/* Clear an irq's pending state, in preparation for polling on it. */
29191 +void xen_clear_irq_pending(int irq);
29193 +/* Set an irq's pending state, to avoid blocking on it. */
29194 +void xen_set_irq_pending(int irq);
29196 +/* Test an irq's pending state. */
29197 +int xen_test_irq_pending(int irq);
29199 +/* Poll waiting for an irq to become pending. In the usual case, the
29200 + irq will be disabled so it won't deliver an interrupt. */
29201 +void xen_poll_irq(int irq);
29204 * Use these to access the event channel underlying the IRQ handle returned
29205 * by bind_*_to_irqhandler().
29206 --- sle11-2009-10-16.orig/kernel/sysctl_check.c 2009-02-16 16:18:36.000000000 +0100
29207 +++ sle11-2009-10-16/kernel/sysctl_check.c 2009-03-16 16:33:40.000000000 +0100
29208 @@ -899,7 +899,7 @@ static const struct trans_ctl_table tran
29212 -static struct trans_ctl_table trans_xen_table[] = {
29213 +static const struct trans_ctl_table trans_xen_table[] = {
29214 { CTL_XEN_INDEPENDENT_WALLCLOCK, "independent_wallclock" },
29215 { CTL_XEN_PERMITTED_CLOCK_JITTER, "permitted_clock_jitter" },
29217 --- sle11-2009-10-16.orig/lib/swiotlb-xen.c 2009-02-16 16:18:36.000000000 +0100
29218 +++ sle11-2009-10-16/lib/swiotlb-xen.c 2009-03-16 16:33:40.000000000 +0100
29220 #include <asm/gnttab_dma.h>
29223 -EXPORT_SYMBOL(swiotlb);
29225 #define OFFSET(val,align) ((unsigned long)((val) & ( (align) - 1)))
29227 @@ -289,6 +288,15 @@ __sync_single(struct phys_addr buffer, c
29231 +static inline unsigned int is_span_boundary(unsigned int index,
29232 + unsigned int nslots,
29233 + unsigned long offset_slots,
29234 + unsigned long max_slots)
29236 + unsigned long offset = (offset_slots + index) & (max_slots - 1);
29237 + return offset + nslots > max_slots;
29241 * Allocates bounce buffer and returns its kernel virtual address.
29243 @@ -300,6 +308,15 @@ map_single(struct device *hwdev, struct
29244 unsigned int nslots, stride, index, wrap;
29245 struct phys_addr slot_buf;
29247 + unsigned long mask;
29248 + unsigned long offset_slots;
29249 + unsigned long max_slots;
29251 + mask = dma_get_seg_boundary(hwdev);
29252 + offset_slots = -IO_TLB_SEGSIZE;
29253 + max_slots = mask + 1
29254 + ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
29255 + : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
29258 * For mappings greater than a page, we limit the stride (and
29259 @@ -319,12 +336,21 @@ map_single(struct device *hwdev, struct
29261 spin_lock_irqsave(&io_tlb_lock, flags);
29263 - wrap = index = ALIGN(io_tlb_index, stride);
29265 + index = ALIGN(io_tlb_index, stride);
29266 if (index >= iotlb_nslabs)
29267 - wrap = index = 0;
29272 + while (is_span_boundary(index, nslots, offset_slots,
29275 + if (index >= iotlb_nslabs)
29277 + if (index == wrap)
29282 * If we find a slot that indicates we have 'nslots'
29283 * number of contiguous buffers, we allocate the
29284 @@ -359,6 +385,7 @@ map_single(struct device *hwdev, struct
29286 } while (index != wrap);
29289 spin_unlock_irqrestore(&io_tlb_lock, flags);