From: kernel.org Subject: 2.6.25 Patch-mainline: 2.6.25 Signed-off-by: Greg Kroah-Hartman Automatically created from "patches.kernel.org/patch-2.6.25" by xen-port-patches.py --- arch/x86/Kconfig | 18 arch/x86/Kconfig.debug | 1 arch/x86/ia32/ia32entry-xen.S | 12 arch/x86/kernel/Makefile | 3 arch/x86/kernel/acpi/boot.c | 3 arch/x86/kernel/acpi/sleep-xen.c | 95 + arch/x86/kernel/acpi/sleep_32-xen.c | 117 -- arch/x86/kernel/acpi/sleep_64-xen.c | 125 -- arch/x86/kernel/apic_32-xen.c | 2 arch/x86/kernel/apic_64-xen.c | 73 - arch/x86/kernel/asm-offsets_32.c | 2 arch/x86/kernel/cpu/common-xen.c | 214 +-- arch/x86/kernel/cpu/mtrr/main-xen.c | 19 arch/x86/kernel/e820_32-xen.c | 275 ----- arch/x86/kernel/e820_64-xen.c | 485 +++++--- arch/x86/kernel/early_printk-xen.c | 2 arch/x86/kernel/entry_32-xen.S | 195 +++ arch/x86/kernel/entry_64-xen.S | 91 - arch/x86/kernel/fixup.c | 2 arch/x86/kernel/genapic_64-xen.c | 15 arch/x86/kernel/head64-xen.c | 63 + arch/x86/kernel/head_32-xen.S | 3 arch/x86/kernel/init_task-xen.c | 2 arch/x86/kernel/io_apic_32-xen.c | 15 arch/x86/kernel/io_apic_64-xen.c | 110 +- arch/x86/kernel/ioport-xen.c | 112 ++ arch/x86/kernel/ioport_32-xen.c | 121 -- arch/x86/kernel/ioport_64-xen.c | 99 - arch/x86/kernel/irq_32-xen.c | 22 arch/x86/kernel/irq_64-xen.c | 43 arch/x86/kernel/ldt-xen.c | 272 +++++ arch/x86/kernel/ldt_32-xen.c | 265 ---- arch/x86/kernel/ldt_64-xen.c | 271 ---- arch/x86/kernel/machine_kexec_64.c | 2 arch/x86/kernel/microcode-xen.c | 2 arch/x86/kernel/mpparse_32-xen.c | 49 arch/x86/kernel/mpparse_64-xen.c | 30 arch/x86/kernel/pci-dma-xen.c | 20 arch/x86/kernel/process_32-xen.c | 438 ++------ arch/x86/kernel/process_64-xen.c | 303 ++--- arch/x86/kernel/quirks-xen.c | 82 - arch/x86/kernel/rtc.c | 8 arch/x86/kernel/setup64-xen.c | 70 + arch/x86/kernel/setup_32-xen.c | 311 ++++- arch/x86/kernel/setup_64-xen.c | 686 ++++++------ arch/x86/kernel/smp_32-xen.c | 5 arch/x86/kernel/smp_64-xen.c | 91 - arch/x86/kernel/time_32-xen.c | 136 -- arch/x86/kernel/traps_32-xen.c | 320 +++-- arch/x86/kernel/traps_64-xen.c | 371 +++--- arch/x86/kernel/vsyscall_64-xen.c | 60 - arch/x86/kernel/xen_entry_64.S | 36 arch/x86/mach-xen/setup.c | 11 arch/x86/mm/fault-xen.c | 1026 ++++++++++++++++++ arch/x86/mm/fault_32-xen.c | 757 ------------- arch/x86/mm/fault_64-xen.c | 686 ------------ arch/x86/mm/highmem_32-xen.c | 45 arch/x86/mm/hypervisor.c | 10 arch/x86/mm/init_32-xen.c | 464 +++----- arch/x86/mm/init_64-xen.c | 517 ++++----- arch/x86/mm/ioremap-xen.c | 685 ++++++++++++ arch/x86/mm/ioremap_32-xen.c | 445 -------- arch/x86/mm/pageattr-xen.c | 1412 ++++++++++++++++++++++++++ arch/x86/mm/pageattr_64-xen.c | 542 --------- arch/x86/mm/pgtable_32-xen.c | 672 ++---------- arch/x86/pci/irq-xen.c | 24 arch/x86/vdso/Makefile | 1 arch/x86/vdso/vdso32-setup-xen.c | 506 +++++++++ arch/x86/vdso/vdso32-setup.c | 34 arch/x86/vdso/vdso32.S | 12 arch/x86/vdso/vdso32/syscall.S | 2 drivers/pci/msi-xen.c | 98 - drivers/pci/pci.c | 5 drivers/xen/balloon/sysfs.c | 2 drivers/xen/blkback/blkback.c | 5 drivers/xen/blkfront/blkfront.c | 9 drivers/xen/blktap/blktap.c | 8 drivers/xen/core/Makefile | 1 drivers/xen/core/evtchn.c | 46 drivers/xen/core/hypervisor_sysfs.c | 2 drivers/xen/core/smpboot.c | 29 drivers/xen/core/spinlock.c | 161 ++ drivers/xen/core/xen_sysfs.c | 30 drivers/xen/gntdev/gntdev.c | 4 drivers/xen/scsifront/scsifront.c | 49 drivers/xen/xenoprof/xenoprofile.c | 2 include/asm-x86/mach-xen/asm/agp.h | 9 include/asm-x86/mach-xen/asm/desc.h | 403 +++++++ include/asm-x86/mach-xen/asm/desc_32.h | 262 ---- 
include/asm-x86/mach-xen/asm/desc_64.h | 228 ---- include/asm-x86/mach-xen/asm/dma-mapping_32.h | 18 include/asm-x86/mach-xen/asm/fixmap_32.h | 24 include/asm-x86/mach-xen/asm/fixmap_64.h | 25 include/asm-x86/mach-xen/asm/highmem.h | 10 include/asm-x86/mach-xen/asm/hypervisor.h | 19 include/asm-x86/mach-xen/asm/io_32.h | 69 - include/asm-x86/mach-xen/asm/io_64.h | 62 - include/asm-x86/mach-xen/asm/irqflags.h | 248 ++++ include/asm-x86/mach-xen/asm/irqflags_32.h | 212 --- include/asm-x86/mach-xen/asm/irqflags_64.h | 178 --- include/asm-x86/mach-xen/asm/maddr_32.h | 21 include/asm-x86/mach-xen/asm/maddr_64.h | 19 include/asm-x86/mach-xen/asm/mmu_context_32.h | 2 include/asm-x86/mach-xen/asm/mmu_context_64.h | 12 include/asm-x86/mach-xen/asm/page.h | 238 ++++ include/asm-x86/mach-xen/asm/page_64.h | 196 --- include/asm-x86/mach-xen/asm/pci.h | 17 include/asm-x86/mach-xen/asm/pci_64.h | 1 include/asm-x86/mach-xen/asm/pgalloc_32.h | 116 +- include/asm-x86/mach-xen/asm/pgalloc_64.h | 87 - include/asm-x86/mach-xen/asm/pgtable-3level.h | 107 - include/asm-x86/mach-xen/asm/pgtable.h | 449 ++++++++ include/asm-x86/mach-xen/asm/pgtable_32.h | 361 ------ include/asm-x86/mach-xen/asm/pgtable_64.h | 400 +------ include/asm-x86/mach-xen/asm/processor.h | 792 ++++++++++++++ include/asm-x86/mach-xen/asm/processor_32.h | 751 ------------- include/asm-x86/mach-xen/asm/processor_64.h | 461 -------- include/asm-x86/mach-xen/asm/segment.h | 203 +++ include/asm-x86/mach-xen/asm/segment_32.h | 150 -- include/asm-x86/mach-xen/asm/smp_32.h | 125 +- include/asm-x86/mach-xen/asm/smp_64.h | 138 -- include/asm-x86/mach-xen/asm/spinlock.h | 333 ++++++ include/asm-x86/mach-xen/asm/system.h | 392 +++++++ include/asm-x86/mach-xen/asm/system_32.h | 312 ----- include/asm-x86/mach-xen/asm/system_64.h | 159 -- include/asm-x86/mach-xen/asm/tlbflush.h | 105 + include/asm-x86/mach-xen/asm/tlbflush_32.h | 99 - include/asm-x86/mach-xen/asm/tlbflush_64.h | 97 - include/asm-x86/mach-xen/irq_vectors.h | 3 include/asm-x86/mmu.h | 2 include/asm-x86/ptrace.h | 4 include/asm-x86/thread_info.h | 12 include/asm-x86/time.h | 6 include/linux/page-flags.h | 4 include/linux/pci.h | 3 include/xen/evtchn.h | 25 kernel/sysctl_check.c | 2 lib/swiotlb-xen.c | 35 138 files changed, 11322 insertions(+), 11153 deletions(-) --- a/arch/x86/ia32/ia32entry-xen.S +++ b/arch/x86/ia32/ia32entry-xen.S @@ -12,7 +12,6 @@ #include #include #include -#include #include #include @@ -99,10 +98,11 @@ ENTRY(ia32_sysenter_target) CFI_RESTORE rcx movl %ebp,%ebp /* zero extension */ movl %eax,%eax + movl 48-THREAD_SIZE+threadinfo_sysenter_return(%rsp),%r10d movl $__USER32_DS,40(%rsp) movq %rbp,32(%rsp) movl $__USER32_CS,16(%rsp) - movl $VSYSCALL32_SYSEXIT,8(%rsp) + movq %r10,8(%rsp) movq %rax,(%rsp) cld SAVE_ARGS 0,0,1 @@ -582,8 +582,8 @@ ia32_sys_call_table: .quad compat_sys_futex /* 240 */ .quad compat_sys_sched_setaffinity .quad compat_sys_sched_getaffinity - .quad sys32_set_thread_area - .quad sys32_get_thread_area + .quad sys_set_thread_area + .quad sys_get_thread_area .quad compat_sys_io_setup /* 245 */ .quad sys_io_destroy .quad compat_sys_io_getevents @@ -661,7 +661,9 @@ ia32_sys_call_table: .quad sys_epoll_pwait .quad compat_sys_utimensat /* 320 */ .quad compat_sys_signalfd - .quad compat_sys_timerfd + .quad sys_timerfd_create .quad sys_eventfd .quad sys32_fallocate + .quad compat_sys_timerfd_settime /* 325 */ + .quad compat_sys_timerfd_gettime ia32_syscall_end: --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -27,7 +27,7 @@ config X86 select HAVE_KRETPROBES 
select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE - select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) + select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) && !XEN select HAVE_ARCH_KGDB if !X86_VOYAGER select HAVE_ARCH_TRACEHOOK select HAVE_GENERIC_DMA_COHERENT if X86_32 @@ -208,14 +208,12 @@ config X86_TRAMPOLINE default y config X86_NO_TSS - bool + def_bool y depends on XEN - default y config X86_NO_IDT - bool + def_bool y depends on XEN - default y config KTIME_SCALAR def_bool X86_32 @@ -724,9 +722,8 @@ config X86_VISWS_APIC depends on X86_32 && X86_VISWS config X86_XEN_GENAPIC - bool + def_bool y depends on X86_64_XEN - default y config X86_MCE bool "Machine Check Exception" @@ -1113,7 +1110,7 @@ config ARCH_DISCONTIGMEM_DEFAULT config ARCH_SPARSEMEM_DEFAULT def_bool y - depends on X86_64 + depends on X86_64 && !X86_64_XEN config ARCH_SPARSEMEM_ENABLE def_bool y @@ -1743,10 +1740,10 @@ config PCI_MMCONFIG depends on X86_64 && PCI && ACPI config XEN_PCIDEV_FRONTEND - bool "Xen PCI Frontend" if X86_64 + def_bool y + prompt "Xen PCI Frontend" if X86_64 depends on PCI && XEN && (PCI_GOXEN_FE || PCI_GOANY || X86_64) select HOTPLUG - default y help The PCI device frontend driver allows the kernel to import arbitrary PCI devices from a PCI backend to support PCI driver domains. @@ -1754,7 +1751,6 @@ config XEN_PCIDEV_FRONTEND config XEN_PCIDEV_FE_DEBUG bool "Xen PCI Frontend Debugging" depends on XEN_PCIDEV_FRONTEND - default n help Enables some debug statements within the PCI Frontend. --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -266,6 +266,7 @@ config DEBUG_BOOT_PARAMS bool "Debug boot parameters" depends on DEBUG_KERNEL depends on DEBUG_FS + depends on !XEN help This option will cause struct boot_params to be exported via debugfs. --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -133,6 +133,9 @@ char *__init __acpi_map_table(unsigned l #ifndef CONFIG_XEN if (phys+size <= (max_low_pfn_mapped << PAGE_SHIFT)) return __va(phys); +#else + if (phys + size <= (NR_FIX_ISAMAPS << PAGE_SHIFT)) + return isa_bus_to_virt(phys); #endif offset = phys & (PAGE_SIZE - 1); --- a/arch/x86/kernel/acpi/sleep_32-xen.c +++ /dev/null @@ -1,117 +0,0 @@ -/* - * sleep.c - x86-specific ACPI sleep support. - * - * Copyright (C) 2001-2003 Patrick Mochel - * Copyright (C) 2001-2003 Pavel Machek - */ - -#include -#include -#include -#include - -#include - -#ifndef CONFIG_ACPI_PV_SLEEP -/* address in low memory of the wakeup routine. */ -unsigned long acpi_wakeup_address = 0; -unsigned long acpi_realmode_flags; -extern char wakeup_start, wakeup_end; - -extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long)); -#endif - -/** - * acpi_save_state_mem - save kernel state - * - * Create an identity mapped page table and copy the wakeup routine to - * low memory. - */ -int acpi_save_state_mem(void) -{ -#ifndef CONFIG_ACPI_PV_SLEEP - if (!acpi_wakeup_address) - return 1; - memcpy((void *)acpi_wakeup_address, &wakeup_start, - &wakeup_end - &wakeup_start); - acpi_copy_wakeup_routine(acpi_wakeup_address); -#endif - return 0; -} - -/* - * acpi_restore_state - undo effects of acpi_save_state_mem - */ -void acpi_restore_state_mem(void) -{ -} - -/** - * acpi_reserve_bootmem - do _very_ early ACPI initialisation - * - * We allocate a page from the first 1MB of memory for the wakeup - * routine for when we come back from a sleep state. The - * runtime allocator allows specification of <16MB pages, but not - * <1MB pages. 
- */ -void __init acpi_reserve_bootmem(void) -{ -#ifndef CONFIG_ACPI_PV_SLEEP - if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) { - printk(KERN_ERR - "ACPI: Wakeup code way too big, S3 disabled.\n"); - return; - } - - acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE); - if (!acpi_wakeup_address) - printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); -#endif -} - -#ifndef CONFIG_ACPI_PV_SLEEP -static int __init acpi_sleep_setup(char *str) -{ - while ((str != NULL) && (*str != '\0')) { - if (strncmp(str, "s3_bios", 7) == 0) - acpi_realmode_flags |= 1; - if (strncmp(str, "s3_mode", 7) == 0) - acpi_realmode_flags |= 2; - if (strncmp(str, "s3_beep", 7) == 0) - acpi_realmode_flags |= 4; - str = strchr(str, ','); - if (str != NULL) - str += strspn(str, ", \t"); - } - return 1; -} - -__setup("acpi_sleep=", acpi_sleep_setup); - -/* Ouch, we want to delete this. We already have better version in userspace, in - s2ram from suspend.sf.net project */ -static __init int reset_videomode_after_s3(const struct dmi_system_id *d) -{ - acpi_realmode_flags |= 2; - return 0; -} - -static __initdata struct dmi_system_id acpisleep_dmi_table[] = { - { /* Reset video mode after returning from ACPI S3 sleep */ - .callback = reset_videomode_after_s3, - .ident = "Toshiba Satellite 4030cdt", - .matches = { - DMI_MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"), - }, - }, - {} -}; - -static int __init acpisleep_dmi_init(void) -{ - dmi_check_system(acpisleep_dmi_table); - return 0; -} - -core_initcall(acpisleep_dmi_init); -#endif /* CONFIG_ACPI_PV_SLEEP */ --- a/arch/x86/kernel/acpi/sleep_64-xen.c +++ /dev/null @@ -1,125 +0,0 @@ -/* - * acpi.c - Architecture-Specific Low-Level ACPI Support - * - * Copyright (C) 2001, 2002 Paul Diefenbaugh - * Copyright (C) 2001 Jun Nakajima - * Copyright (C) 2001 Patrick Mochel - * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port) - * Copyright (C) 2003 Pavel Machek, SuSE Labs - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* -------------------------------------------------------------------------- - Low-Level Sleep Support - -------------------------------------------------------------------------- */ - -#ifndef CONFIG_ACPI_PV_SLEEP -/* address in low memory of the wakeup routine. 
*/ -unsigned long acpi_wakeup_address = 0; -unsigned long acpi_realmode_flags; -extern char wakeup_start, wakeup_end; - -extern unsigned long acpi_copy_wakeup_routine(unsigned long); -#endif - -/** - * acpi_save_state_mem - save kernel state - * - * Create an identity mapped page table and copy the wakeup routine to - * low memory. - */ -int acpi_save_state_mem(void) -{ -#ifndef CONFIG_ACPI_PV_SLEEP - memcpy((void *)acpi_wakeup_address, &wakeup_start, - &wakeup_end - &wakeup_start); - acpi_copy_wakeup_routine(acpi_wakeup_address); -#endif - return 0; -} - -/* - * acpi_restore_state - */ -void acpi_restore_state_mem(void) -{ -} - -/** - * acpi_reserve_bootmem - do _very_ early ACPI initialisation - * - * We allocate a page in low memory for the wakeup - * routine for when we come back from a sleep state. The - * runtime allocator allows specification of <16M pages, but not - * <1M pages. - */ -void __init acpi_reserve_bootmem(void) -{ -#ifndef CONFIG_ACPI_PV_SLEEP - acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2); - if ((&wakeup_end - &wakeup_start) > (PAGE_SIZE*2)) - printk(KERN_CRIT - "ACPI: Wakeup code way too big, will crash on attempt" - " to suspend\n"); -#endif -} - -#ifndef CONFIG_ACPI_PV_SLEEP -static int __init acpi_sleep_setup(char *str) -{ - while ((str != NULL) && (*str != '\0')) { - if (strncmp(str, "s3_bios", 7) == 0) - acpi_realmode_flags |= 1; - if (strncmp(str, "s3_mode", 7) == 0) - acpi_realmode_flags |= 2; - if (strncmp(str, "s3_beep", 7) == 0) - acpi_realmode_flags |= 4; - str = strchr(str, ','); - if (str != NULL) - str += strspn(str, ", \t"); - } - - return 1; -} - -__setup("acpi_sleep=", acpi_sleep_setup); -#endif /* CONFIG_ACPI_PV_SLEEP */ - --- /dev/null +++ b/arch/x86/kernel/acpi/sleep-xen.c @@ -0,0 +1,95 @@ +/* + * sleep.c - x86-specific ACPI sleep support. + * + * Copyright (C) 2001-2003 Patrick Mochel + * Copyright (C) 2001-2003 Pavel Machek + */ + +#include +#include +#include +#include + +#include + +#ifndef CONFIG_ACPI_PV_SLEEP +/* address in low memory of the wakeup routine. */ +unsigned long acpi_wakeup_address = 0; +unsigned long acpi_realmode_flags; +extern char wakeup_start, wakeup_end; + +extern unsigned long acpi_copy_wakeup_routine(unsigned long); +#endif + +/** + * acpi_save_state_mem - save kernel state + * + * Create an identity mapped page table and copy the wakeup routine to + * low memory. + */ +int acpi_save_state_mem(void) +{ +#ifndef CONFIG_ACPI_PV_SLEEP + if (!acpi_wakeup_address) { + printk(KERN_ERR "Could not allocate memory during boot, S3 disabled\n"); + return -ENOMEM; + } + memcpy((void *)acpi_wakeup_address, &wakeup_start, + &wakeup_end - &wakeup_start); + acpi_copy_wakeup_routine(acpi_wakeup_address); +#endif + + return 0; +} + +/* + * acpi_restore_state - undo effects of acpi_save_state_mem + */ +void acpi_restore_state_mem(void) +{ +} + + +/** + * acpi_reserve_bootmem - do _very_ early ACPI initialisation + * + * We allocate a page from the first 1MB of memory for the wakeup + * routine for when we come back from a sleep state. The + * runtime allocator allows specification of <16MB pages, but not + * <1MB pages. 
+ */ +void __init acpi_reserve_bootmem(void) +{ +#ifndef CONFIG_ACPI_PV_SLEEP + if ((&wakeup_end - &wakeup_start) > PAGE_SIZE*2) { + printk(KERN_ERR + "ACPI: Wakeup code way too big, S3 disabled.\n"); + return; + } + + acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2); + if (!acpi_wakeup_address) + printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); +#endif +} + + +#ifndef CONFIG_ACPI_PV_SLEEP +static int __init acpi_sleep_setup(char *str) +{ + while ((str != NULL) && (*str != '\0')) { + if (strncmp(str, "s3_bios", 7) == 0) + acpi_realmode_flags |= 1; + if (strncmp(str, "s3_mode", 7) == 0) + acpi_realmode_flags |= 2; + if (strncmp(str, "s3_beep", 7) == 0) + acpi_realmode_flags |= 4; + str = strchr(str, ','); + if (str != NULL) + str += strspn(str, ", \t"); + } + return 1; +} + +__setup("acpi_sleep=", acpi_sleep_setup); +#endif /* CONFIG_ACPI_PV_SLEEP */ --- a/arch/x86/kernel/apic_32-xen.c +++ b/arch/x86/kernel/apic_32-xen.c @@ -86,7 +86,7 @@ int setup_profiling_timer(unsigned int m * This initializes the IO-APIC and APIC hardware if this is * a UP kernel. */ -int __init APIC_init_uniprocessor (void) +int __init APIC_init_uniprocessor(void) { #ifdef CONFIG_X86_IO_APIC if (smp_found_config) --- a/arch/x86/kernel/apic_64-xen.c +++ b/arch/x86/kernel/apic_64-xen.c @@ -34,34 +34,17 @@ #include #include -int apic_verbosity; +int disable_apic; /* - * 'what should we do if we get a hw irq event on an illegal vector'. - * each architecture has to answer this themselves. + * Debug level, exported for io_apic.c */ -void ack_bad_irq(unsigned int irq) -{ - printk("unexpected IRQ trap at irq %02x\n", irq); - /* - * Currently unexpected vectors happen only on SMP and APIC. - * We _must_ ack these because every local APIC has only N - * irq slots per priority level, and a 'hanging, unacked' IRQ - * holds up an irq slot - in excessive cases (when multiple - * unexpected vectors occur) that might lock up the APIC - * completely. - * But don't ack when the APIC is disabled. -AK - */ - if (!disable_apic) - ack_APIC_irq(); -} - -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} +int apic_verbosity; -void smp_local_timer_interrupt(void) +/* + * The guts of the apic timer interrupt + */ +static void local_apic_timer_interrupt(void) { #ifndef CONFIG_XEN int cpu = smp_processor_id(); @@ -121,11 +104,34 @@ void smp_apic_timer_interrupt(struct pt_ */ exit_idle(); irq_enter(); - smp_local_timer_interrupt(); + local_apic_timer_interrupt(); irq_exit(); set_irq_regs(old_regs); } +int setup_profiling_timer(unsigned int multiplier) +{ + return -EINVAL; +} + +/* + * This initializes the IO-APIC and APIC hardware if this is + * a UP kernel. + */ +int __init APIC_init_uniprocessor(void) +{ +#ifdef CONFIG_X86_IO_APIC + if (smp_found_config && !skip_ioapic_setup && nr_ioapics) + setup_IO_APIC(); +#endif + + return 1; +} + +/* + * Local APIC interrupts + */ + /* * This interrupt should _never_ happen with our APIC/SMP architecture */ @@ -150,7 +156,6 @@ asmlinkage void smp_spurious_interrupt(v /* * This interrupt should never happen with our APIC/SMP architecture */ - asmlinkage void smp_error_interrupt(void) { unsigned int v, v1; @@ -178,19 +183,3 @@ asmlinkage void smp_error_interrupt(void smp_processor_id(), v , v1); irq_exit(); } - -int disable_apic; - -/* - * This initializes the IO-APIC and APIC hardware if this is - * a UP kernel. 
- */ -int __init APIC_init_uniprocessor (void) -{ -#ifdef CONFIG_X86_IO_APIC - if (smp_found_config && !skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); -#endif - - return 1; -} --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -23,8 +23,10 @@ #include #endif +#ifdef CONFIG_LGUEST_GUEST #include #include "../../../drivers/lguest/lg.h" +#endif /* workaround for a warning with -Wmissing-prototypes */ void foo(void); --- a/arch/x86/kernel/cpu/common-xen.c +++ b/arch/x86/kernel/cpu/common-xen.c @@ -27,45 +27,50 @@ #include "cpu.h" DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { - [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 }, - [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 }, - [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 }, - [GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 }, + [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, + [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, + [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, + [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } }, #ifndef CONFIG_XEN /* * Segments used for calling PnP BIOS have byte granularity. * They code segments and data segments have fixed 64k limits, * the transfer segment sizes are set at run time. */ - [GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */ - [GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */ - [GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */ + /* 32-bit code */ + [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } }, + /* 16-bit code */ + [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } }, + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } }, + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } }, + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } }, /* * The APM segments have byte granularity and their bases * are set at run time. All have 64k limits. */ - [GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */ + /* 32-bit code */ + [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } }, /* 16-bit code */ - [GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 }, - [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */ + [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } }, + /* data */ + [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, - [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 }, + [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, #endif - [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 }, + [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } }, } }; EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); +__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; + static int cachesize_override __cpuinitdata = -1; -static int disable_x86_fxsr __cpuinitdata; static int disable_x86_serial_nr __cpuinitdata = 1; -static int disable_x86_sep __cpuinitdata; struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {}; -extern int disable_pse; - static void __cpuinit default_init(struct cpuinfo_x86 * c) { /* Not much we can do here... */ @@ -214,16 +219,8 @@ static void __cpuinit get_cpu_vendor(str static int __init x86_fxsr_setup(char * s) { - /* Tell all the other CPUs to not use it... */ - disable_x86_fxsr = 1; - - /* - * ... 
and clear the bits early in the boot_cpu_data - * so that the bootup process doesn't try to do this - * either. - */ - clear_bit(X86_FEATURE_FXSR, boot_cpu_data.x86_capability); - clear_bit(X86_FEATURE_XMM, boot_cpu_data.x86_capability); + setup_clear_cpu_cap(X86_FEATURE_FXSR); + setup_clear_cpu_cap(X86_FEATURE_XMM); return 1; } __setup("nofxsr", x86_fxsr_setup); @@ -231,7 +228,7 @@ __setup("nofxsr", x86_fxsr_setup); static int __init x86_sep_setup(char * s) { - disable_x86_sep = 1; + setup_clear_cpu_cap(X86_FEATURE_SEP); return 1; } __setup("nosep", x86_sep_setup); @@ -268,10 +265,10 @@ static int __cpuinit have_cpuid_p(void) void __init cpu_detect(struct cpuinfo_x86 *c) { /* Get vendor name */ - cpuid(0x00000000, &c->cpuid_level, - (int *)&c->x86_vendor_id[0], - (int *)&c->x86_vendor_id[8], - (int *)&c->x86_vendor_id[4]); + cpuid(0x00000000, (unsigned int *)&c->cpuid_level, + (unsigned int *)&c->x86_vendor_id[0], + (unsigned int *)&c->x86_vendor_id[8], + (unsigned int *)&c->x86_vendor_id[4]); c->x86 = 4; if (c->cpuid_level >= 0x00000001) { @@ -284,9 +281,38 @@ void __init cpu_detect(struct cpuinfo_x8 if (c->x86 >= 0x6) c->x86_model += ((tfms >> 16) & 0xF) << 4; c->x86_mask = tfms & 15; - if (cap0 & (1<<19)) + if (cap0 & (1<<19)) { c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8; + c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; + } + } +} +static void __cpuinit early_get_cap(struct cpuinfo_x86 *c) +{ + u32 tfms, xlvl; + unsigned int ebx; + + memset(&c->x86_capability, 0, sizeof c->x86_capability); + if (have_cpuid_p()) { + /* Intel-defined flags: level 0x00000001 */ + if (c->cpuid_level >= 0x00000001) { + u32 capability, excap; + cpuid(0x00000001, &tfms, &ebx, &excap, &capability); + c->x86_capability[0] = capability; + c->x86_capability[4] = excap; + } + + /* AMD-defined flags: level 0x80000001 */ + xlvl = cpuid_eax(0x80000000); + if ((xlvl & 0xffff0000) == 0x80000000) { + if (xlvl >= 0x80000001) { + c->x86_capability[1] = cpuid_edx(0x80000001); + c->x86_capability[6] = cpuid_ecx(0x80000001); + } + } + } + } /* Do minimum CPU detection early. @@ -300,6 +326,7 @@ static void __init early_cpu_detect(void struct cpuinfo_x86 *c = &boot_cpu_data; c->x86_cache_alignment = 32; + c->x86_clflush_size = 32; if (!have_cpuid_p()) return; @@ -307,19 +334,30 @@ static void __init early_cpu_detect(void cpu_detect(c); get_cpu_vendor(c, 1); + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + early_init_amd(c); + break; + case X86_VENDOR_INTEL: + early_init_intel(c); + break; + } + + early_get_cap(c); } static void __cpuinit generic_identify(struct cpuinfo_x86 * c) { u32 tfms, xlvl; - int ebx; + unsigned int ebx; if (have_cpuid_p()) { /* Get vendor name */ - cpuid(0x00000000, &c->cpuid_level, - (int *)&c->x86_vendor_id[0], - (int *)&c->x86_vendor_id[8], - (int *)&c->x86_vendor_id[4]); + cpuid(0x00000000, (unsigned int *)&c->cpuid_level, + (unsigned int *)&c->x86_vendor_id[0], + (unsigned int *)&c->x86_vendor_id[8], + (unsigned int *)&c->x86_vendor_id[4]); get_cpu_vendor(c, 0); /* Initialize the standard set of capabilities */ @@ -364,8 +402,6 @@ static void __cpuinit generic_identify(s init_scattered_cpuid_features(c); } - early_intel_workaround(c); - #ifdef CONFIG_X86_HT c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; #endif @@ -399,7 +435,7 @@ __setup("serialnumber", x86_serial_nr_se /* * This does the hard work of actually picking apart the CPU stuff... 
*/ -static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) +void __cpuinit identify_cpu(struct cpuinfo_x86 *c) { int i; @@ -425,20 +461,9 @@ static void __cpuinit identify_cpu(struc generic_identify(c); - printk(KERN_DEBUG "CPU: After generic identify, caps:"); - for (i = 0; i < NCAPINTS; i++) - printk(" %08lx", c->x86_capability[i]); - printk("\n"); - - if (this_cpu->c_identify) { + if (this_cpu->c_identify) this_cpu->c_identify(c); - printk(KERN_DEBUG "CPU: After vendor identify, caps:"); - for (i = 0; i < NCAPINTS; i++) - printk(" %08lx", c->x86_capability[i]); - printk("\n"); - } - /* * Vendor-specific initialization. In this section we * canonicalize the feature flags, meaning if there are @@ -460,23 +485,6 @@ static void __cpuinit identify_cpu(struc * we do "generic changes." */ - /* TSC disabled? */ - if ( tsc_disable ) - clear_bit(X86_FEATURE_TSC, c->x86_capability); - - /* FXSR disabled? */ - if (disable_x86_fxsr) { - clear_bit(X86_FEATURE_FXSR, c->x86_capability); - clear_bit(X86_FEATURE_XMM, c->x86_capability); - } - - /* SEP disabled? */ - if (disable_x86_sep) - clear_bit(X86_FEATURE_SEP, c->x86_capability); - - if (disable_pse) - clear_bit(X86_FEATURE_PSE, c->x86_capability); - /* If the model name is still unset, do table lookup. */ if ( !c->x86_model_id[0] ) { char *p; @@ -489,13 +497,6 @@ static void __cpuinit identify_cpu(struc c->x86, c->x86_model); } - /* Now the feature flags better reflect actual CPU features! */ - - printk(KERN_DEBUG "CPU: After all inits, caps:"); - for (i = 0; i < NCAPINTS; i++) - printk(" %08lx", c->x86_capability[i]); - printk("\n"); - /* * On SMP, boot_cpu_data holds the common feature set between * all CPUs; so make sure that we indicate which features are @@ -508,8 +509,14 @@ static void __cpuinit identify_cpu(struc boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; } + /* Clear all flags overriden by options */ + for (i = 0; i < NCAPINTS; i++) + c->x86_capability[i] &= ~cleared_cpu_caps[i]; + /* Init Machine Check Exception if available. */ mcheck_init(c); + + select_idle_routine(c); } void __init identify_boot_cpu(void) @@ -517,7 +524,6 @@ void __init identify_boot_cpu(void) identify_cpu(&boot_cpu_data); sysenter_setup(); enable_sep_cpu(); - mtrr_bp_init(); } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) @@ -574,6 +580,13 @@ void __cpuinit detect_ht(struct cpuinfo_ } #endif +static __init int setup_noclflush(char *arg) +{ + setup_clear_cpu_cap(X86_FEATURE_CLFLSH); + return 1; +} +__setup("noclflush", setup_noclflush); + void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) { char *vendor = NULL; @@ -597,6 +610,17 @@ void __cpuinit print_cpu_info(struct cpu printk("\n"); } +static __init int setup_disablecpuid(char *arg) +{ + int bit; + if (get_option(&arg, &bit) && bit < NCAPINTS*32) + setup_clear_cpu_cap(bit); + else + return 0; + return 1; +} +__setup("clearcpuid=", setup_disablecpuid); + cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; /* This is hacky. :) @@ -606,16 +630,6 @@ cpumask_t cpu_initialized __cpuinitdata * They will insert themselves into the cpu_devs structure. * Then, when cpu_init() is called, we can just iterate over that array. 
*/ - -extern int intel_cpu_init(void); -extern int cyrix_init_cpu(void); -extern int nsc_init_cpu(void); -extern int amd_init_cpu(void); -extern int centaur_init_cpu(void); -extern int transmeta_init_cpu(void); -extern int nexgen_init_cpu(void); -extern int umc_init_cpu(void); - void __init early_cpu_init(void) { intel_cpu_init(); @@ -627,21 +641,13 @@ void __init early_cpu_init(void) nexgen_init_cpu(); umc_init_cpu(); early_cpu_detect(); - -#ifdef CONFIG_DEBUG_PAGEALLOC - /* pse is not compatible with on-the-fly unmapping, - * disable it even if the cpus claim to support it. - */ - clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); - disable_pse = 1; -#endif } /* Make sure %fs is initialized properly in idle threads */ -struct pt_regs * __devinit idle_regs(struct pt_regs *regs) +struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) { memset(regs, 0, sizeof(struct pt_regs)); - regs->xfs = __KERNEL_PERCPU; + regs->fs = __KERNEL_PERCPU; return regs; } @@ -649,7 +655,7 @@ struct pt_regs * __devinit idle_regs(str * it's on the real one. */ void switch_to_new_gdt(void) { - struct Xgt_desc_struct gdt_descr; + struct desc_ptr gdt_descr; unsigned long va, frames[16]; int f; @@ -692,12 +698,6 @@ void __cpuinit cpu_init(void) if (cpu_has_vme || cpu_has_de) clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); - if (tsc_disable && cpu_has_tsc) { - printk(KERN_NOTICE "Disabling TSC...\n"); - /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/ - clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability); - set_in_cr4(X86_CR4_TSD); - } switch_to_new_gdt(); @@ -710,7 +710,7 @@ void __cpuinit cpu_init(void) BUG(); enter_lazy_tlb(&init_mm, curr); - load_esp0(t, thread); + load_sp0(t, thread); load_LDT(&init_mm.context); --- a/arch/x86/kernel/cpu/mtrr/main-xen.c +++ b/arch/x86/kernel/cpu/mtrr/main-xen.c @@ -33,7 +33,7 @@ struct mtrr_ops generic_mtrr_ops = { struct mtrr_ops *mtrr_if = &generic_mtrr_ops; unsigned int num_var_ranges; -unsigned int *usage_table; +unsigned int mtrr_usage_table[MAX_VAR_RANGES]; static void __init set_num_var_ranges(void) { @@ -52,17 +52,12 @@ static void __init init_table(void) int i, max; max = num_var_ranges; - if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL)) - == NULL) { - printk(KERN_ERR "mtrr: could not allocate\n"); - return; - } for (i = 0; i < max; i++) - usage_table[i] = 0; + mtrr_usage_table[i] = 0; } int mtrr_add_page(unsigned long base, unsigned long size, - unsigned int type, char increment) + unsigned int type, bool increment) { int error; struct xen_platform_op op; @@ -81,7 +76,7 @@ int mtrr_add_page(unsigned long base, un } if (increment) - ++usage_table[op.u.add_memtype.reg]; + ++mtrr_usage_table[op.u.add_memtype.reg]; mutex_unlock(&mtrr_mutex); @@ -103,7 +98,7 @@ static int mtrr_check(unsigned long base int mtrr_add(unsigned long base, unsigned long size, unsigned int type, - char increment) + bool increment) { if (mtrr_check(base, size)) return -EINVAL; @@ -136,11 +131,11 @@ int mtrr_del_page(int reg, unsigned long goto out; } } - if (usage_table[reg] < 1) { + if (mtrr_usage_table[reg] < 1) { printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg); goto out; } - if (--usage_table[reg] < 1) { + if (--mtrr_usage_table[reg] < 1) { op.cmd = XENPF_del_memtype; op.u.del_memtype.handle = 0; op.u.del_memtype.reg = reg; --- a/arch/x86/kernel/e820_32-xen.c +++ b/arch/x86/kernel/e820_32-xen.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include @@ -18,11 +17,6 @@ #include #include -#ifdef CONFIG_EFI -int 
efi_enabled = 0; -EXPORT_SYMBOL(efi_enabled); -#endif - struct e820map e820; struct change_member { struct e820entry *pbios; /* pointer to original bios entry */ @@ -38,26 +32,6 @@ unsigned long pci_mem_start = 0x10000000 EXPORT_SYMBOL(pci_mem_start); #endif extern int user_defined_memmap; -struct resource data_resource = { - .name = "Kernel data", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -struct resource code_resource = { - .name = "Kernel code", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -struct resource bss_resource = { - .name = "Kernel bss", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; static struct resource system_rom_resource = { .name = "System ROM", @@ -112,60 +86,6 @@ static struct resource video_rom_resourc .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM }; -static struct resource video_ram_resource = { - .name = "Video RAM area", - .start = 0xa0000, - .end = 0xbffff, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -static struct resource standard_io_resources[] = { { - .name = "dma1", - .start = 0x0000, - .end = 0x001f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "pic1", - .start = 0x0020, - .end = 0x0021, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "timer0", - .start = 0x0040, - .end = 0x0043, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "timer1", - .start = 0x0050, - .end = 0x0053, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "keyboard", - .start = 0x0060, - .end = 0x006f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "dma page reg", - .start = 0x0080, - .end = 0x008f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "pic2", - .start = 0x00a0, - .end = 0x00a1, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "dma2", - .start = 0x00c0, - .end = 0x00df, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "fpu", - .start = 0x00f0, - .end = 0x00ff, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -} }; - #define ROMSIGNATURE 0xaa55 static int __init romsignature(const unsigned char *rom) @@ -272,10 +192,9 @@ static struct e820map machine_e820; * Request address space for all standard RAM and ROM resources * and also for regions reported as reserved by the e820. */ -static void __init -legacy_init_iomem_resources(struct resource *code_resource, - struct resource *data_resource, - struct resource *bss_resource) +void __init init_iomem_resources(struct resource *code_resource, + struct resource *data_resource, + struct resource *bss_resource) { int i; @@ -324,39 +243,6 @@ legacy_init_iomem_resources(struct resou #undef e820 -/* - * Request address space for all standard resources - * - * This is called just before pcibios_init(), which is also a - * subsys_initcall, but is linked in later (in arch/i386/pci/common.c). - */ -static int __init request_standard_resources(void) -{ - int i; - - /* Nothing to do if not running in dom0. 
*/ - if (!is_initial_xendomain()) - return 0; - - printk("Setting up standard PCI resources\n"); - if (efi_enabled) - efi_initialize_iomem_resources(&code_resource, - &data_resource, &bss_resource); - else - legacy_init_iomem_resources(&code_resource, - &data_resource, &bss_resource); - - /* EFI systems may still have VGA */ - request_resource(&iomem_resource, &video_ram_resource); - - /* request I/O space for devices used on all i[345]86 PCs */ - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) - request_resource(&ioport_resource, &standard_io_resources[i]); - return 0; -} - -subsys_initcall(request_standard_resources); - #if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION) /** * e820_mark_nosave_regions - Find the ranges of physical addresses that do not @@ -393,19 +279,17 @@ void __init add_memory_region(unsigned l { int x; - if (!efi_enabled) { - x = e820.nr_map; - - if (x == E820MAX) { - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); - return; - } + x = e820.nr_map; - e820.map[x].addr = start; - e820.map[x].size = size; - e820.map[x].type = type; - e820.nr_map++; + if (x == E820MAX) { + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); + return; } + + e820.map[x].addr = start; + e820.map[x].size = size; + e820.map[x].type = type; + e820.nr_map++; } /* add_memory_region */ /* @@ -642,29 +526,6 @@ int __init copy_e820_map(struct e820entr } /* - * Callback for efi_memory_walk. - */ -static int __init -efi_find_max_pfn(unsigned long start, unsigned long end, void *arg) -{ - unsigned long *max_pfn = arg, pfn; - - if (start < end) { - pfn = PFN_UP(end -1); - if (pfn > *max_pfn) - *max_pfn = pfn; - } - return 0; -} - -static int __init -efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg) -{ - memory_present(0, PFN_UP(start), PFN_DOWN(end)); - return 0; -} - -/* * Find the highest page frame number we have available */ void __init find_max_pfn(void) @@ -672,11 +533,6 @@ void __init find_max_pfn(void) int i; max_pfn = 0; - if (efi_enabled) { - efi_memmap_walk(efi_find_max_pfn, &max_pfn); - efi_memmap_walk(efi_memory_present_wrapper, NULL); - return; - } for (i = 0; i < e820.nr_map; i++) { unsigned long start, end; @@ -694,34 +550,12 @@ void __init find_max_pfn(void) } /* - * Free all available memory for boot time allocation. Used - * as a callback function by efi_memory_walk() - */ - -static int __init -free_available_memory(unsigned long start, unsigned long end, void *arg) -{ - /* check max_low_pfn */ - if (start >= (max_low_pfn << PAGE_SHIFT)) - return 0; - if (end >= (max_low_pfn << PAGE_SHIFT)) - end = max_low_pfn << PAGE_SHIFT; - if (start < end) - free_bootmem(start, end - start); - - return 0; -} -/* * Register fully available low RAM pages with the bootmem allocator. 
*/ void __init register_bootmem_low_pages(unsigned long max_low_pfn) { int i; - if (efi_enabled) { - efi_memmap_walk(free_available_memory, NULL); - return; - } for (i = 0; i < e820.nr_map; i++) { unsigned long curr_pfn, last_pfn, size; /* @@ -855,56 +689,12 @@ void __init print_memory_map(char *who) } } -static __init __always_inline void efi_limit_regions(unsigned long long size) -{ - unsigned long long current_addr = 0; - efi_memory_desc_t *md, *next_md; - void *p, *p1; - int i, j; - - j = 0; - p1 = memmap.map; - for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) { - md = p; - next_md = p1; - current_addr = md->phys_addr + - PFN_PHYS(md->num_pages); - if (is_available_memory(md)) { - if (md->phys_addr >= size) continue; - memcpy(next_md, md, memmap.desc_size); - if (current_addr >= size) { - next_md->num_pages -= - PFN_UP(current_addr-size); - } - p1 += memmap.desc_size; - next_md = p1; - j++; - } else if ((md->attribute & EFI_MEMORY_RUNTIME) == - EFI_MEMORY_RUNTIME) { - /* In order to make runtime services - * available we have to include runtime - * memory regions in memory map */ - memcpy(next_md, md, memmap.desc_size); - p1 += memmap.desc_size; - next_md = p1; - j++; - } - } - memmap.nr_map = j; - memmap.map_end = memmap.map + - (memmap.nr_map * memmap.desc_size); -} - void __init limit_regions(unsigned long long size) { unsigned long long current_addr = 0; int i; print_memory_map("limit_regions start"); - if (efi_enabled) { - efi_limit_regions(size); - return; - } for (i = 0; i < e820.nr_map; i++) { current_addr = e820.map[i].addr + e820.map[i].size; if (current_addr < size) @@ -1056,3 +846,44 @@ static int __init parse_memmap(char *arg return 0; } early_param("memmap", parse_memmap); + +#ifndef CONFIG_XEN +void __init update_memory_range(u64 start, u64 size, unsigned old_type, + unsigned new_type) +{ + int i; + + BUG_ON(old_type == new_type); + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + u64 final_start, final_end; + if (ei->type != old_type) + continue; + /* totally covered? */ + if (ei->addr >= start && ei->size <= size) { + ei->type = new_type; + continue; + } + /* partially covered */ + final_start = max(start, ei->addr); + final_end = min(start + size, ei->addr + ei->size); + if (final_start >= final_end) + continue; + add_memory_region(final_start, final_end - final_start, + new_type); + } +} + +void __init update_e820(void) +{ + u8 nr_map; + + nr_map = e820.nr_map; + if (sanitize_e820_map(e820.map, &nr_map)) + return; + e820.nr_map = nr_map; + printk(KERN_INFO "modified physical RAM map:\n"); + print_memory_map("modified"); +} +#endif --- a/arch/x86/kernel/e820_64-xen.c +++ b/arch/x86/kernel/e820_64-xen.c @@ -1,4 +1,4 @@ -/* +/* * Handle the memory map. * The functions here do the job until bootmem takes over. * @@ -26,6 +26,7 @@ #include #include #include +#include #include struct e820map e820 __initdata; @@ -33,96 +34,103 @@ struct e820map e820 __initdata; struct e820map machine_e820; #endif -/* +/* * PFN of last memory page. */ -unsigned long end_pfn; -EXPORT_SYMBOL(end_pfn); +unsigned long end_pfn; -/* +/* * end_pfn only includes RAM, while end_pfn_map includes all e820 entries. * The direct mapping extends to end_pfn_map, so that we can directly access * apertures, ACPI and other tables without having to play with fixmaps. - */ -unsigned long end_pfn_map; + */ +unsigned long end_pfn_map; -/* +/* * Last pfn which the user wants to use. 
*/ static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT; -extern struct resource code_resource, data_resource, bss_resource; - -/* Check for some hardcoded bad areas that early boot is not allowed to touch */ -static inline int bad_addr(unsigned long *addrp, unsigned long size) -{ - unsigned long addr = *addrp, last = addr + size; +/* + * Early reserved memory areas. + */ +#define MAX_EARLY_RES 20 +struct early_res { + unsigned long start, end; + char name[16]; +}; +static struct early_res early_res[MAX_EARLY_RES] __initdata = { #ifndef CONFIG_XEN - /* various gunk below that needed for SMP startup */ - if (addr < 0x8000) { - *addrp = PAGE_ALIGN(0x8000); - return 1; - } - - /* direct mapping tables of the kernel */ - if (last >= table_start<= ramdisk_image && addr < ramdisk_end) { - *addrp = PAGE_ALIGN(ramdisk_end); - return 1; - } - } + { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ +#ifdef CONFIG_SMP + { SMP_TRAMPOLINE_BASE, SMP_TRAMPOLINE_BASE + 2*PAGE_SIZE, "SMP_TRAMPOLINE" }, #endif - /* kernel code */ - if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) { - *addrp = PAGE_ALIGN(__pa_symbol(&_end)); - return 1; - } +#endif + {} +}; - if (last >= ebda_addr && addr < ebda_addr + ebda_size) { - *addrp = PAGE_ALIGN(ebda_addr + ebda_size); - return 1; +void __init reserve_early(unsigned long start, unsigned long end, char *name) +{ + int i; + struct early_res *r; + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + r = &early_res[i]; + if (end > r->start && start < r->end) + panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n", + start, end - 1, name?name:"", r->start, r->end - 1, r->name); } + if (i >= MAX_EARLY_RES) + panic("Too many early reservations"); + r = &early_res[i]; + r->start = start; + r->end = end; + if (name) + strncpy(r->name, name, sizeof(r->name) - 1); +} -#ifdef CONFIG_NUMA - /* NUMA memory to node map */ - if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) { - *addrp = nodemap_addr + nodemap_size; - return 1; +void __init early_res_to_bootmem(void) +{ + int i; + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + struct early_res *r = &early_res[i]; + printk(KERN_INFO "early res: %d [%lx-%lx] %s\n", i, + r->start, r->end - 1, r->name); + reserve_bootmem_generic(r->start, r->end - r->start); } -#endif - /* XXX ramdisk image here? */ -#else - if (last < (table_end<= r->start && addr < r->end) { + *addrp = addr = r->end; + changed = 1; + goto again; + } } -#endif - return 0; -} + return changed; +} /* * This function checks if any part of the range is mapped * with type. 
*/ -int e820_any_mapped(unsigned long start, unsigned long end, unsigned type) -{ +int +e820_any_mapped(unsigned long start, unsigned long end, unsigned type) +{ int i; #ifndef CONFIG_XEN - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; #else if (!is_initial_xendomain()) return 0; @@ -130,12 +138,12 @@ int e820_any_mapped(unsigned long start, const struct e820entry *ei = &machine_e820.map[i]; #endif - if (type && ei->type != type) + if (type && ei->type != type) continue; if (ei->addr >= end || ei->addr + ei->size <= start) - continue; - return 1; - } + continue; + return 1; + } return 0; } EXPORT_SYMBOL_GPL(e820_any_mapped); @@ -146,7 +154,8 @@ EXPORT_SYMBOL_GPL(e820_any_mapped); * Note: this function only works correct if the e820 table is sorted and * not-overlapping, which is the case */ -int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type) +int __init e820_all_mapped(unsigned long start, unsigned long end, + unsigned type) { int i; @@ -171,65 +180,77 @@ int __init e820_all_mapped(unsigned long */ if (ei->addr <= start) start = ei->addr + ei->size; - /* if start is now at or beyond end, we're done, full coverage */ + /* + * if start is now at or beyond end, we're done, full + * coverage + */ if (start >= end) - return 1; /* we're done */ + return 1; } return 0; } -/* - * Find a free area in a specific range. - */ -unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size) -{ - int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long addr = ei->addr, last; - if (ei->type != E820_RAM) - continue; - if (addr < start) +/* + * Find a free area with specified alignment in a specific range. + */ +unsigned long __init find_e820_area(unsigned long start, unsigned long end, + unsigned size, unsigned long align) +{ + int i; + unsigned long mask = ~(align - 1); + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + unsigned long addr = ei->addr, last; + + if (ei->type != E820_RAM) + continue; + if (addr < start) addr = start; - if (addr > ei->addr + ei->size) - continue; + if (addr > ei->addr + ei->size) + continue; while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size) ; - last = PAGE_ALIGN(addr) + size; + addr = (addr + align - 1) & mask; + last = addr + size; if (last > ei->addr + ei->size) continue; - if (last > end) + if (last > end) continue; - return addr; - } - return -1UL; -} + return addr; + } + return -1UL; +} /* * Find the highest page frame number we have available */ unsigned long __init e820_end_of_ram(void) { - unsigned long end_pfn = 0; + unsigned long end_pfn; + end_pfn = find_max_pfn_with_active_regions(); - - if (end_pfn > end_pfn_map) + + if (end_pfn > end_pfn_map) end_pfn_map = end_pfn; if (end_pfn_map > MAXMEM>>PAGE_SHIFT) end_pfn_map = MAXMEM>>PAGE_SHIFT; if (end_pfn > end_user_pfn) end_pfn = end_user_pfn; - if (end_pfn > end_pfn_map) - end_pfn = end_pfn_map; + if (end_pfn > end_pfn_map) + end_pfn = end_pfn_map; - printk("end_pfn_map = %lu\n", end_pfn_map); - return end_pfn; + printk(KERN_INFO "end_pfn_map = %lu\n", end_pfn_map); + return end_pfn; } /* * Mark e820 reserved areas as busy for the resource manager. 
*/ -void __init e820_reserve_resources(struct e820entry *e820, int nr_map) +void __init e820_reserve_resources(struct e820entry *e820, int nr_map, + struct resource *code_resource, + struct resource *data_resource, + struct resource *bss_resource) { int i; for (i = 0; i < nr_map; i++) { @@ -247,14 +268,14 @@ void __init e820_reserve_resources(struc request_resource(&iomem_resource, res); if (e820[i].type == E820_RAM) { /* - * We don't know which RAM region contains kernel data, - * so we try it repeatedly and let the resource manager - * test it. + * We don't know which RAM region contains kernel data, + * so we try it repeatedly and let the resource manager + * test it. */ #ifndef CONFIG_XEN - request_resource(res, &code_resource); - request_resource(res, &data_resource); - request_resource(res, &bss_resource); + request_resource(res, code_resource); + request_resource(res, data_resource); + request_resource(res, bss_resource); #endif #ifdef CONFIG_KEXEC if (crashk_res.start != crashk_res.end) @@ -357,9 +378,9 @@ e820_register_active_regions(int nid, un add_active_range(nid, ei_startpfn, ei_endpfn); } -/* +/* * Add a memory region to the kernel e820 map. - */ + */ void __init add_memory_region(unsigned long start, unsigned long size, int type) { int x = e820.nr_map; @@ -384,9 +405,7 @@ unsigned long __init e820_hole_size(unsi { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long end_pfn = end >> PAGE_SHIFT; - unsigned long ei_startpfn; - unsigned long ei_endpfn; - unsigned long ram = 0; + unsigned long ei_startpfn, ei_endpfn, ram = 0; int i; for (i = 0; i < e820.nr_map; i++) { @@ -398,28 +417,31 @@ unsigned long __init e820_hole_size(unsi return end - start - (ram << PAGE_SHIFT); } -void __init e820_print_map(char *who) +static void __init e820_print_map(char *who) { int i; for (i = 0; i < e820.nr_map; i++) { printk(KERN_INFO " %s: %016Lx - %016Lx ", who, - (unsigned long long) e820.map[i].addr, - (unsigned long long) (e820.map[i].addr + e820.map[i].size)); + (unsigned long long) e820.map[i].addr, + (unsigned long long) + (e820.map[i].addr + e820.map[i].size)); switch (e820.map[i].type) { - case E820_RAM: printk("(usable)\n"); - break; + case E820_RAM: + printk(KERN_CONT "(usable)\n"); + break; case E820_RESERVED: - printk("(reserved)\n"); - break; + printk(KERN_CONT "(reserved)\n"); + break; case E820_ACPI: - printk("(ACPI data)\n"); - break; + printk(KERN_CONT "(ACPI data)\n"); + break; case E820_NVS: - printk("(ACPI NVS)\n"); - break; - default: printk("type %u\n", e820.map[i].type); - break; + printk(KERN_CONT "(ACPI NVS)\n"); + break; + default: + printk(KERN_CONT "type %u\n", e820.map[i].type); + break; } } } @@ -427,11 +449,11 @@ void __init e820_print_map(char *who) /* * Sanitize the BIOS e820 map. * - * Some e820 responses include overlapping entries. The following + * Some e820 responses include overlapping entries. The following * replaces the original e820 map with a new one, removing overlaps. * */ -static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) +static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map) { struct change_member { struct e820entry *pbios; /* pointer to original bios entry */ @@ -451,7 +473,8 @@ static int __init sanitize_e820_map(stru int i; /* - Visually we're performing the following (1,2,3,4 = memory types)... + Visually we're performing the following + (1,2,3,4 = memory types)... 
Sample memory map (w/overlaps): ____22__________________ @@ -493,22 +516,23 @@ static int __init sanitize_e820_map(stru old_nr = *pnr_map; /* bail out if we find any unreasonable addresses in bios map */ - for (i=0; iaddr = biosmap[i].addr; change_point[chgidx++]->pbios = &biosmap[i]; - change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; + change_point[chgidx]->addr = biosmap[i].addr + + biosmap[i].size; change_point[chgidx++]->pbios = &biosmap[i]; } } @@ -518,75 +542,106 @@ static int __init sanitize_e820_map(stru still_changing = 1; while (still_changing) { still_changing = 0; - for (i=1; i < chg_nr; i++) { - /* if > , swap */ - /* or, if current= & last=, swap */ - if ((change_point[i]->addr < change_point[i-1]->addr) || - ((change_point[i]->addr == change_point[i-1]->addr) && - (change_point[i]->addr == change_point[i]->pbios->addr) && - (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) - ) - { + for (i = 1; i < chg_nr; i++) { + unsigned long long curaddr, lastaddr; + unsigned long long curpbaddr, lastpbaddr; + + curaddr = change_point[i]->addr; + lastaddr = change_point[i - 1]->addr; + curpbaddr = change_point[i]->pbios->addr; + lastpbaddr = change_point[i - 1]->pbios->addr; + + /* + * swap entries, when: + * + * curaddr > lastaddr or + * curaddr == lastaddr and curaddr == curpbaddr and + * lastaddr != lastpbaddr + */ + if (curaddr < lastaddr || + (curaddr == lastaddr && curaddr == curpbaddr && + lastaddr != lastpbaddr)) { change_tmp = change_point[i]; change_point[i] = change_point[i-1]; change_point[i-1] = change_tmp; - still_changing=1; + still_changing = 1; } } } /* create a new bios memory map, removing overlaps */ - overlap_entries=0; /* number of entries in the overlap table */ - new_bios_entry=0; /* index for creating new bios map entries */ + overlap_entries = 0; /* number of entries in the overlap table */ + new_bios_entry = 0; /* index for creating new bios map entries */ last_type = 0; /* start with undefined memory type */ last_addr = 0; /* start with 0 as last starting address */ + /* loop through change-points, determining affect on the new bios map */ - for (chgidx=0; chgidx < chg_nr; chgidx++) - { + for (chgidx = 0; chgidx < chg_nr; chgidx++) { /* keep track of all overlapping bios entries */ - if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) - { - /* add map entry to overlap list (> 1 entry implies an overlap) */ - overlap_list[overlap_entries++]=change_point[chgidx]->pbios; - } - else - { - /* remove entry from list (order independent, so swap with last) */ - for (i=0; ipbios) - overlap_list[i] = overlap_list[overlap_entries-1]; + if (change_point[chgidx]->addr == + change_point[chgidx]->pbios->addr) { + /* + * add map entry to overlap list (> 1 entry + * implies an overlap) + */ + overlap_list[overlap_entries++] = + change_point[chgidx]->pbios; + } else { + /* + * remove entry from list (order independent, + * so swap with last) + */ + for (i = 0; i < overlap_entries; i++) { + if (overlap_list[i] == + change_point[chgidx]->pbios) + overlap_list[i] = + overlap_list[overlap_entries-1]; } overlap_entries--; } - /* if there are overlapping entries, decide which "type" to use */ - /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ + /* + * if there are overlapping entries, decide which + * "type" to use (larger value takes precedence -- + * 1=usable, 2,3,4,4+=unusable) + */ current_type = 0; - for (i=0; itype > current_type) current_type = overlap_list[i]->type; - /* continue building up new bios map based on 
this information */ + /* + * continue building up new bios map based on this + * information + */ if (current_type != last_type) { if (last_type != 0) { new_bios[new_bios_entry].size = change_point[chgidx]->addr - last_addr; - /* move forward only if the new size was non-zero */ + /* + * move forward only if the new size + * was non-zero + */ if (new_bios[new_bios_entry].size != 0) + /* + * no more space left for new + * bios entries ? + */ if (++new_bios_entry >= E820MAX) - break; /* no more space left for new bios entries */ + break; } if (current_type != 0) { - new_bios[new_bios_entry].addr = change_point[chgidx]->addr; + new_bios[new_bios_entry].addr = + change_point[chgidx]->addr; new_bios[new_bios_entry].type = current_type; - last_addr=change_point[chgidx]->addr; + last_addr = change_point[chgidx]->addr; } last_type = current_type; } } - new_nr = new_bios_entry; /* retain count for new bios entries */ + /* retain count for new bios entries */ + new_nr = new_bios_entry; /* copy new bios mapping into original location */ - memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); + memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry)); *pnr_map = new_nr; return 0; @@ -601,7 +656,7 @@ static int __init sanitize_e820_map(stru * will have given us a memory map that we can use to properly * set up memory. If we aren't, we'll fake a memory map. */ -static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) +static int __init copy_e820_map(struct e820entry *biosmap, int nr_map) { #ifndef CONFIG_XEN /* Only one memory region (or negative)? Ignore it */ @@ -622,7 +677,7 @@ static int __init copy_e820_map(struct e return -1; add_memory_region(start, size, type); - } while (biosmap++,--nr_map); + } while (biosmap++, --nr_map); #ifdef CONFIG_XEN if (is_initial_xendomain()) { @@ -641,15 +696,17 @@ static int __init copy_e820_map(struct e return 0; } -void early_panic(char *msg) +static void early_panic(char *msg) { early_printk(msg); panic(msg); } -#ifndef CONFIG_XEN -void __init setup_memory_region(void) +/* We're not void only for x86 32-bit compat */ +char * __init machine_specific_memory_setup(void) { +#ifndef CONFIG_XEN + char *who = "BIOS-e820"; /* * Try to copy the BIOS-supplied E820-map. * @@ -659,14 +716,8 @@ void __init setup_memory_region(void) sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries); if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) early_panic("Cannot find a valid memory map"); - printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - e820_print_map("BIOS-e820"); -} - #else /* CONFIG_XEN */ - -void __init setup_memory_region(void) -{ + char *who = "Xen"; int rc; struct xen_memory_map memmap; /* @@ -694,11 +745,13 @@ void __init setup_memory_region(void) if (copy_e820_map(map, (char)memmap.nr_entries) < 0) early_panic("Cannot find a valid memory map"); - +#endif printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - e820_print_map("Xen"); + e820_print_map(who); + + /* In case someone cares... */ + return who; } -#endif static int __init parse_memopt(char *p) { @@ -709,7 +762,7 @@ static int __init parse_memopt(char *p) if (!p) return -EINVAL; end_user_pfn = memparse(p, &p); - end_user_pfn >>= PAGE_SHIFT; + end_user_pfn >>= PAGE_SHIFT; end = end_user_pfn<type != old_type) + continue; + /* totally covered? 
*/ + if (ei->addr >= start && ei->size <= size) { + ei->type = new_type; + continue; + } + /* partially covered */ + final_start = max(start, ei->addr); + final_end = min(start + size, ei->addr + ei->size); + if (final_start >= final_end) + continue; + add_memory_region(final_start, final_end - final_start, + new_type); + } +} + +void __init update_e820(void) +{ + u8 nr_map; + + nr_map = e820.nr_map; + if (sanitize_e820_map(e820.map, &nr_map)) + return; + e820.nr_map = nr_map; + printk(KERN_INFO "modified physical RAM map:\n"); + e820_print_map("modified"); +} +#endif + unsigned long pci_mem_start = 0xaeedbabe; EXPORT_SYMBOL(pci_mem_start); @@ -825,8 +927,10 @@ __init void e820_setup_gap(struct e820en if (!found) { gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024; - printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n" - KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n"); + printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit " + "address range\n" + KERN_ERR "PCI: Unassigned devices with 32bit resource " + "registers may break!\n"); } /* @@ -839,8 +943,9 @@ __init void e820_setup_gap(struct e820en /* Fun with two's complement */ pci_mem_start = (gapstart + round) & -round; - printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", - pci_mem_start, gapstart, gapsize); + printk(KERN_INFO + "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", + pci_mem_start, gapstart, gapsize); } int __init arch_get_ram_range(int slot, u64 *addr, u64 *size) --- a/arch/x86/kernel/early_printk-xen.c +++ b/arch/x86/kernel/early_printk-xen.c @@ -222,7 +222,7 @@ static struct console simnow_console = { }; /* Direct interface for emergencies */ -struct console *early_console = &early_vga_console; +static struct console *early_console = &early_vga_console; static int early_console_initialized = 0; void early_printk(const char *fmt, ...) --- a/arch/x86/kernel/entry_32-xen.S +++ b/arch/x86/kernel/entry_32-xen.S @@ -59,7 +59,7 @@ * for paravirtualization. The following will never clobber any registers: * INTERRUPT_RETURN (aka. "iret") * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") - * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). + * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit"). * * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). @@ -282,16 +282,21 @@ END(resume_kernel) #endif CFI_ENDPROC + .macro test_tif ti_reg # system call tracing in operation / emulation + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(\ti_reg) + .endm + /* SYSENTER_RETURN points to after the "sysenter" instruction in the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. 
*/ # sysenter call handler stub -ENTRY(sysenter_entry) +ENTRY(ia32_sysenter_target) CFI_STARTPROC simple CFI_SIGNAL_FRAME CFI_DEF_CFA esp, 0 CFI_REGISTER esp, ebp - movl SYSENTER_stack_esp0(%esp),%esp + movl SYSENTER_stack_sp0(%esp),%esp sysenter_past_esp: /* * No need to follow this irqs on/off section: the syscall @@ -334,9 +339,7 @@ sysenter_past_esp: CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL GET_THREAD_INFO(%ebp) - - /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ - testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + test_tif %ebp jnz syscall_trace_entry cmpl $(nr_syscalls), %eax jae syscall_badsys @@ -354,7 +357,7 @@ sysenter_past_esp: xorl %ebp,%ebp TRACE_IRQS_ON 1: mov PT_FS(%esp), %fs - ENABLE_INTERRUPTS_SYSEXIT + ENABLE_INTERRUPTS_SYSCALL_RET CFI_ENDPROC .pushsection .fixup,"ax" 2: movl $0,PT_FS(%esp) @@ -363,10 +366,10 @@ sysenter_past_esp: .align 4 .long 1b,2b .popsection -ENDPROC(sysenter_entry) +ENDPROC(ia32_sysenter_target) # pv sysenter call handler stub -ENTRY(sysenter_entry_pv) +ENTRY(ia32pv_sysenter_target) RING0_INT_FRAME movl $__USER_DS,16(%esp) movl %ebp,12(%esp) @@ -389,7 +392,7 @@ ENTRY(sysenter_entry_pv) .previous /* fall through */ CFI_ENDPROC -ENDPROC(sysenter_entry_pv) +ENDPROC(ia32pv_sysenter_target) # system call handler stub ENTRY(system_call) @@ -398,9 +401,7 @@ ENTRY(system_call) CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL GET_THREAD_INFO(%ebp) - # system call tracing in operation / emulation - /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ - testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + test_tif %ebp jnz syscall_trace_entry cmpl $(nr_syscalls), %eax jae syscall_badsys @@ -452,7 +453,8 @@ restore_nocheck_notrace: RESTORE_REGS addl $4, %esp # skip orig_eax/error_code CFI_ADJUST_CFA_OFFSET -4 -1: INTERRUPT_RETURN +irq_return: + INTERRUPT_RETURN .section .fixup,"ax" iret_exc: pushl $0 # no error code @@ -461,7 +463,7 @@ iret_exc: .previous .section __ex_table,"a" .align 4 - .long 1b,iret_exc + .long irq_return,iret_exc .previous CFI_RESTORE_STATE @@ -657,7 +659,7 @@ END(syscall_badsys) * Build the entry stubs and pointer table with * some assembler magic. */ -.data +.section .rodata,"a" ENTRY(interrupt) .text @@ -959,7 +961,7 @@ END(device_not_available) * that sets up the real kernel stack. Check here, since we can't * allow the wrong stack to be used. * - * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have + * "SYSENTER_stack_sp0+12" is because the NMI/debug handler will have * already pushed 3 words if it hits on the sysenter instruction: * eflags, cs and eip. 
* @@ -971,7 +973,7 @@ END(device_not_available) cmpw $__KERNEL_CS,4(%esp); \ jne ok; \ label: \ - movl SYSENTER_stack_esp0+offset(%esp),%esp; \ + movl SYSENTER_stack_sp0+offset(%esp),%esp; \ CFI_DEF_CFA esp, 0; \ CFI_UNDEFINED eip; \ pushfl; \ @@ -986,7 +988,7 @@ label: \ KPROBE_ENTRY(debug) RING0_INT_FRAME #ifndef CONFIG_XEN - cmpl $sysenter_entry,(%esp) + cmpl $ia32_sysenter_target,(%esp) jne debug_stack_correct FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) debug_stack_correct: @@ -1019,7 +1021,7 @@ KPROBE_ENTRY(nmi) popl %eax CFI_ADJUST_CFA_OFFSET -4 je nmi_espfix_stack - cmpl $sysenter_entry,(%esp) + cmpl $ia32_sysenter_target,(%esp) je nmi_stack_fixup pushl %eax CFI_ADJUST_CFA_OFFSET 4 @@ -1032,7 +1034,7 @@ KPROBE_ENTRY(nmi) popl %eax CFI_ADJUST_CFA_OFFSET -4 jae nmi_stack_correct - cmpl $sysenter_entry,12(%esp) + cmpl $ia32_sysenter_target,12(%esp) je nmi_debug_stack_check nmi_stack_correct: /* We have a RING0_INT_FRAME here */ @@ -1085,12 +1087,8 @@ nmi_espfix_stack: RESTORE_REGS lss 12+4(%esp), %esp # back to espfix stack CFI_ADJUST_CFA_OFFSET -24 -1: INTERRUPT_RETURN + jmp irq_return CFI_ENDPROC -.section __ex_table,"a" - .align 4 - .long 1b,iret_exc -.previous #else KPROBE_ENTRY(nmi) RING0_INT_FRAME @@ -1108,17 +1106,17 @@ KPROBE_END(nmi) #ifdef CONFIG_PARAVIRT ENTRY(native_iret) -1: iret + iret .section __ex_table,"a" .align 4 - .long 1b,iret_exc + .long native_iret, iret_exc .previous END(native_iret) -ENTRY(native_irq_enable_sysexit) +ENTRY(native_irq_enable_syscall_ret) sti sysexit -END(native_irq_enable_sysexit) +END(native_irq_enable_syscall_ret) #endif KPROBE_ENTRY(int3) @@ -1267,7 +1265,144 @@ ENTRY(kernel_thread_helper) CFI_ENDPROC ENDPROC(kernel_thread_helper) +#include + + # pv syscall call handler stub +ENTRY(ia32pv_cstar_target) + RING0_INT_FRAME + movl $__USER_DS,16(%esp) + movl %ebp,%ecx + movl $__USER_CS,4(%esp) + movl 12(%esp),%ebp + pushl %eax # save orig_eax + CFI_ADJUST_CFA_OFFSET 4 +/* + * Load the potential sixth argument from user stack. + * Careful about security. 
+ */ + cmpl $__PAGE_OFFSET-4,%ebp + CFI_REMEMBER_STATE + ja cstar_fault +1: movl (%ebp),%ebp +.section __ex_table,"a" + .align 4 + .long 1b,cstar_fault +.previous + SAVE_ALL + GET_THREAD_INFO(%ebp) + test_tif %ebp + jnz cstar_trace_entry + cmpl $nr_syscalls,%eax + jae cstar_badsys +.Lcstar_call: + btl %eax,cstar_special + jc .Lcstar_special + call *cstar_call_table(,%eax,4) + movl %eax,PT_EAX(%esp) # store the return value +.Lcstar_exit: + movl PT_ECX(%esp),%ecx + movl %ecx,PT_EBP(%esp) # put user EBP back in place + jmp syscall_exit +.Lcstar_special: + movl PT_ECX(%esp),%ecx + movl %ecx,PT_EBP(%esp) # put user EBP back in place + jmp syscall_call +cstar_set_tif: + movl $cstar_clear_tif,(%esp) # replace return address + LOCK_PREFIX + orl $_TIF_CSTAR,TI_flags(%ebp) + jmp *sys_call_table(,%eax,4) +cstar_clear_tif: + movl %eax,PT_EAX(%esp) # store the return value + LOCK_PREFIX + andl $~_TIF_CSTAR,TI_flags(%ebp) + jmp .Lcstar_exit +cstar_trace_entry: + movl $-ENOSYS,PT_EAX(%esp) + cmpl $nr_syscalls,%eax + jae 1f + btl %eax,cstar_special + jc .Lcstar_trace_special +1: movl %esp,%eax + xorl %edx,%edx + LOCK_PREFIX + orl $_TIF_CSTAR,TI_flags(%ebp) + call do_syscall_trace + LOCK_PREFIX + andl $~_TIF_CSTAR,TI_flags(%ebp) + testl %eax,%eax + jne .Lcstar_resume # ret != 0 -> running under PTRACE_SYSEMU, + # so must skip actual syscall + movl PT_ORIG_EAX(%esp),%eax + cmpl $nr_syscalls,%eax + jb .Lcstar_call + jmp .Lcstar_exit +.Lcstar_trace_special: + movl PT_ECX(%esp),%ecx + movl %esp,%eax + xorl %edx,%edx + movl %ecx,PT_EBP(%esp) # put user EBP back in place + call do_syscall_trace + testl %eax,%eax + jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU, + # so must skip actual syscall + movl PT_ORIG_EAX(%esp),%eax + cmpl $nr_syscalls,%eax + jb syscall_call + jmp syscall_exit +cstar_badsys: + movl $-ENOSYS,PT_EAX(%esp) +.Lcstar_resume: + movl PT_ECX(%esp),%ecx + movl %ecx,PT_EBP(%esp) # put user EBP back in place + jmp resume_userspace + CFI_RESTORE_STATE +cstar_fault: + movl $-EFAULT,%eax + SAVE_ALL + GET_THREAD_INFO(%ebp) + jmp .Lcstar_resume + CFI_ENDPROC +ENDPROC(ia32pv_cstar_target) + +ENTRY(cstar_ret_from_fork) + CFI_STARTPROC + movl PT_ECX(%esp),%ecx + GET_THREAD_INFO(%ebp) + movl %ecx,PT_EBP(%esp) # put user EBP back in place + LOCK_PREFIX + andl $~_TIF_CSTAR,TI_flags(%ebp) + jmp ret_from_fork + CFI_ENDPROC +END(ret_from_fork) + .section .rodata,"a" #include "syscall_table_32.S" syscall_table_size=(.-sys_call_table) + +#include +cstar_special: +nr=0 +mask=0 +.rept nr_syscalls+31 + .irp n, __NR_sigreturn, __NR_rt_sigreturn + .if nr == \n + mask = mask | (1 << (\n & 31)) + .endif + .endr + nr = nr + 1 + .if (nr & 31) == 0 + .long mask + mask = 0 + .endif +.endr +#define sys_call_table cstar_call_table +#define sys_fork cstar_set_tif +#define sys_clone cstar_set_tif +#define sys_vfork cstar_set_tif +#include "syscall_table_32.S" +#undef sys_call_table +#undef sys_fork +#undef sys_clone +#undef sys_vfork --- a/arch/x86/kernel/entry_64-xen.S +++ b/arch/x86/kernel/entry_64-xen.S @@ -54,17 +54,22 @@ #include #include #include -#include +#include #include -#include "xen_entry_64.S" - .code64 #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args #endif +#ifdef CONFIG_PARAVIRT +ENTRY(native_irq_enable_syscall_ret) + movq %gs:pda_oldrsp,%rsp + swapgs + sysretq +#endif /* CONFIG_PARAVIRT */ + .macro TRACE_IRQS_IRETQ offset=ARGOFFSET #ifdef CONFIG_TRACE_IRQFLAGS @@ -277,7 +282,7 @@ ret_from_sys_call: sysret_check: LOCKDEP_SYS_EXIT GET_THREAD_INFO(%rcx) - XEN_BLOCK_EVENTS(%rsi) + 
DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF movl threadinfo_flags(%rcx),%edx andl %edi,%edx @@ -287,7 +292,7 @@ sysret_check: * sysretq will re-enable interrupts: */ TRACE_IRQS_ON - XEN_UNBLOCK_EVENTS(%rsi) + ENABLE_INTERRUPTS(CLBR_NONE) RESTORE_ARGS 0,8,0 HYPERVISOR_IRET VGCF_IN_SYSCALL @@ -298,7 +303,7 @@ sysret_careful: bt $TIF_NEED_RESCHED,%edx jnc sysret_signal TRACE_IRQS_ON - XEN_UNBLOCK_EVENTS(%rsi) + ENABLE_INTERRUPTS(CLBR_NONE) pushq %rdi CFI_ADJUST_CFA_OFFSET 8 call schedule @@ -309,9 +314,8 @@ sysret_careful: /* Handle a signal */ sysret_signal: TRACE_IRQS_ON -/* sti */ - XEN_UNBLOCK_EVENTS(%rsi) - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx + ENABLE_INTERRUPTS(CLBR_NONE) + testl $_TIF_DO_NOTIFY_MASK,%edx jz 1f /* Really a signal */ @@ -323,7 +327,7 @@ sysret_signal: 1: movl $_TIF_NEED_RESCHED,%edi /* Use IRET because user could have changed frame. This works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ - XEN_BLOCK_EVENTS(%rsi) + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check @@ -355,7 +359,7 @@ tracesys: */ .globl int_ret_from_sys_call int_ret_from_sys_call: - XEN_BLOCK_EVENTS(%rsi) + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testb $3,CS-ARGOFFSET(%rsp) jnz 1f @@ -381,22 +385,20 @@ int_careful: bt $TIF_NEED_RESCHED,%edx jnc int_very_careful TRACE_IRQS_ON -/* sti */ - XEN_UNBLOCK_EVENTS(%rsi) + ENABLE_INTERRUPTS(CLBR_NONE) pushq %rdi CFI_ADJUST_CFA_OFFSET 8 call schedule popq %rdi CFI_ADJUST_CFA_OFFSET -8 - XEN_BLOCK_EVENTS(%rsi) + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check /* handle signals and tracing -- both require a full stack frame */ int_very_careful: TRACE_IRQS_ON -/* sti */ - XEN_UNBLOCK_EVENTS(%rsi) + ENABLE_INTERRUPTS(CLBR_NONE) SAVE_REST /* Check for syscall exit trace */ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx @@ -411,7 +413,7 @@ int_very_careful: jmp int_restore_rest int_signal: - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx + testl $_TIF_DO_NOTIFY_MASK,%edx jz 1f movq %rsp,%rdi # &ptregs -> arg1 xorl %esi,%esi # oldset -> arg2 @@ -419,7 +421,7 @@ int_signal: 1: movl $_TIF_NEED_RESCHED,%edi int_restore_rest: RESTORE_REST - XEN_BLOCK_EVENTS(%rsi) + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check CFI_ENDPROC @@ -474,6 +476,7 @@ ENTRY(stub_execve) CFI_REGISTER rip, r11 SAVE_REST FIXUP_TOP_OF_STACK %r11 + movq %rsp, %rcx call sys_execve RESTORE_TOP_OF_STACK %r11 movq %rax,RAX(%rsp) @@ -526,11 +529,10 @@ retint_check: retint_restore_args: /* return to kernel space */ movl EFLAGS-REST_SKIP(%rsp), %eax shr $9, %eax # EAX[0] == IRET_EFLAGS.IF - XEN_GET_VCPU_INFO(%rsi) + GET_VCPU_INFO andb evtchn_upcall_mask(%rsi),%al andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask jnz restore_all_enable_events # != 0 => enable event delivery - XEN_PUT_VCPU_INFO(%rsi) RESTORE_ARGS 0,8,0 HYPERVISOR_IRET 0 @@ -541,31 +543,29 @@ retint_careful: bt $TIF_NEED_RESCHED,%edx jnc retint_signal TRACE_IRQS_ON - XEN_UNBLOCK_EVENTS(%rsi) -/* sti */ + ENABLE_INTERRUPTS(CLBR_NONE) pushq %rdi CFI_ADJUST_CFA_OFFSET 8 call schedule popq %rdi CFI_ADJUST_CFA_OFFSET -8 GET_THREAD_INFO(%rcx) - XEN_BLOCK_EVENTS(%rsi) -/* cli */ + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp retint_check retint_signal: - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx + testl $_TIF_DO_NOTIFY_MASK,%edx jz retint_restore_args TRACE_IRQS_ON - XEN_UNBLOCK_EVENTS(%rsi) + ENABLE_INTERRUPTS(CLBR_NONE) SAVE_REST movq $-1,ORIG_RAX(%rsp) xorl %esi,%esi # oldset movq %rsp,%rdi # 
&pt_regs call do_notify_resume RESTORE_REST - XEN_BLOCK_EVENTS(%rsi) + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF movl $_TIF_NEED_RESCHED,%edi GET_THREAD_INFO(%rcx) @@ -702,7 +702,7 @@ END(spurious_interrupt) rdmsr testl %edx,%edx js 1f - swapgs + SWAPGS xorl %ebx,%ebx 1: #endif @@ -719,8 +719,7 @@ END(spurious_interrupt) .if \ist addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) .endif -/* cli */ - XEN_BLOCK_EVENTS(%rsi) + DISABLE_INTERRUPTS(CLBR_NONE) .if \irqtrace TRACE_IRQS_OFF .endif @@ -749,10 +748,10 @@ paranoid_swapgs\trace: .if \trace TRACE_IRQS_IRETQ 0 .endif - swapgs + SWAPGS_UNSAFE_STACK paranoid_restore\trace: RESTORE_ALL 8 - iretq + jmp irq_return paranoid_userspace\trace: GET_THREAD_INFO(%rcx) movl threadinfo_flags(%rcx),%ebx @@ -767,11 +766,11 @@ paranoid_userspace\trace: .if \trace TRACE_IRQS_ON .endif - sti + ENABLE_INTERRUPTS(CLBR_NONE) xorl %esi,%esi /* arg2: oldset */ movq %rsp,%rdi /* arg1: &pt_regs */ call do_notify_resume - cli + DISABLE_INTERRUPTS(CLBR_NONE) .if \trace TRACE_IRQS_OFF .endif @@ -780,9 +779,9 @@ paranoid_schedule\trace: .if \trace TRACE_IRQS_ON .endif - sti + ENABLE_INTERRUPTS(CLBR_ANY) call schedule - cli + DISABLE_INTERRUPTS(CLBR_ANY) .if \trace TRACE_IRQS_OFF .endif @@ -846,8 +845,7 @@ error_call_handler: call *%rax error_exit: RESTORE_REST -/* cli */ - XEN_BLOCK_EVENTS(%rsi) + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) testb $3,CS-ARGOFFSET(%rsp) @@ -875,7 +873,7 @@ error_kernelspace: iret run with kernel gs again, so don't set the user space flag. B stepping K8s sometimes report an truncated RIP for IRET exceptions returning to compat mode. Check for these here too. */ - leaq iret_label(%rip),%rbp + leaq irq_return(%rip),%rbp cmpq %rbp,RIP(%rsp) je error_swapgs movl %ebp,%ebp /* zero extend */ @@ -930,19 +928,17 @@ END(do_hypervisor_callback) restore_all_enable_events: CFI_DEFAULT_STACK adj=1 TRACE_IRQS_ON - XEN_UNBLOCK_EVENTS(%rsi) # %rsi is already set up... + __ENABLE_INTERRUPTS scrit: /**** START OF CRITICAL REGION ****/ - XEN_TEST_PENDING(%rsi) + __TEST_PENDING CFI_REMEMBER_STATE jnz 14f # process more events if necessary... - XEN_PUT_VCPU_INFO(%rsi) RESTORE_ARGS 0,8,0 HYPERVISOR_IRET 0 CFI_RESTORE_STATE -14: XEN_LOCKED_BLOCK_EVENTS(%rsi) - XEN_PUT_VCPU_INFO(%rsi) +14: __DISABLE_INTERRUPTS SAVE_REST movq %rsp,%rdi # set the argument again jmp 11b @@ -1086,15 +1082,16 @@ ENDPROC(child_rip) * rdi: name, rsi: argv, rdx: envp * * We want to fallback into: - * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs) + * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs) * * do_sys_execve asm fallback arguments: - * rdi: name, rsi: argv, rdx: envp, fake frame on the stack + * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack */ ENTRY(kernel_execve) CFI_STARTPROC FAKE_STACK_FRAME $0 SAVE_ALL + movq %rsp,%rcx call sys_execve movq %rax, RAX(%rsp) RESTORE_REST @@ -1144,7 +1141,7 @@ do_nmi_callback: call do_nmi orl $NMI_MASK,EFLAGS(%rsp) RESTORE_REST - XEN_BLOCK_EVENTS(%rsi) + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) jmp retint_restore_args --- a/arch/x86/kernel/fixup.c +++ b/arch/x86/kernel/fixup.c @@ -36,7 +36,7 @@ #define DP(_f, _args...) 
printk(KERN_ALERT " " _f "\n" , ## _args ) -fastcall void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) +void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) { static unsigned long printed = 0; char info[100]; --- a/arch/x86/kernel/genapic_64-xen.c +++ b/arch/x86/kernel/genapic_64-xen.c @@ -24,20 +24,13 @@ #include #endif -/* - * which logical CPU number maps to which CPU (physical APIC ID) - * - * The following static array is used during kernel startup - * and the x86_cpu_to_apicid_ptr contains the address of the - * array during this time. Is it zeroed when the per_cpu - * data area is removed. - */ +/* which logical CPU number maps to which CPU (physical APIC ID) */ #ifndef CONFIG_XEN -u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata +u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata = { [0 ... NR_CPUS-1] = BAD_APICID }; -void *x86_cpu_to_apicid_ptr; +void *x86_cpu_to_apicid_early_ptr; #endif -DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID; +DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID; EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid); #ifndef CONFIG_XEN --- a/arch/x86/kernel/head_32-xen.S +++ b/arch/x86/kernel/head_32-xen.S @@ -3,6 +3,7 @@ .text #include #include +#include #include #include #include @@ -88,7 +89,7 @@ ENTRY(_stext) */ .section ".bss.page_aligned","wa" .align PAGE_SIZE_asm -ENTRY(swapper_pg_pmd) +ENTRY(swapper_pg_fixmap) .fill 1024,4,0 ENTRY(empty_zero_page) .fill 4096,1,0 --- a/arch/x86/kernel/head64-xen.c +++ b/arch/x86/kernel/head64-xen.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -26,6 +27,8 @@ #include #include #include +#include +#include unsigned long start_pfn; @@ -34,7 +37,7 @@ static void __init zap_identity_mappings { pgd_t *pgd = pgd_offset_k(0UL); pgd_clear(pgd); - __flush_tlb(); + __flush_tlb_all(); } /* Don't add a printk in there. 
printk relies on the PDA which is not initialized @@ -72,6 +75,37 @@ EXPORT_SYMBOL(machine_to_phys_mapping); unsigned int machine_to_phys_order; EXPORT_SYMBOL(machine_to_phys_order); +#define EBDA_ADDR_POINTER 0x40E + +static __init void reserve_ebda(void) +{ +#ifndef CONFIG_XEN + unsigned ebda_addr, ebda_size; + + /* + * there is a real-mode segmented pointer pointing to the + * 4K EBDA area at 0x40E + */ + ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER); + ebda_addr <<= 4; + + if (!ebda_addr) + return; + + ebda_size = *(unsigned short *)__va(ebda_addr); + + /* Round EBDA up to pages */ + if (ebda_size == 0) + ebda_size = 1; + ebda_size <<= 10; + ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE); + if (ebda_size > 64*1024) + ebda_size = 64*1024; + + reserve_early(ebda_addr, ebda_addr + ebda_size, "EBDA"); +#endif +} + void __init x86_64_start_kernel(char * real_mode_data) { struct xen_machphys_mapping mapping; @@ -103,8 +137,16 @@ void __init x86_64_start_kernel(char * r /* Make NULL pointers segfault */ zap_identity_mappings(); - for (i = 0; i < IDT_ENTRIES; i++) + /* Cleanup the over mapped high alias */ + cleanup_highmap(); + + for (i = 0; i < IDT_ENTRIES; i++) { +#ifdef CONFIG_EARLY_PRINTK + set_intr_gate(i, &early_idt_handlers[i]); +#else set_intr_gate(i, early_idt_handler); +#endif + } load_idt((const struct desc_ptr *)&idt_descr); #endif @@ -115,8 +157,19 @@ void __init x86_64_start_kernel(char * r pda_init(0); copy_bootdata(__va(real_mode_data)); -#ifdef CONFIG_SMP - cpu_set(0, cpu_online_map); -#endif + + reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); + + reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE), + start_pfn << PAGE_SHIFT, "Xen provided"); + + reserve_ebda(); + + /* + * At this point everything still needed from the boot loader + * or BIOS or kernel text should be early reserved or marked not + * RAM in e820. All other memory is free game. + */ + start_kernel(); } --- a/arch/x86/kernel/init_task-xen.c +++ b/arch/x86/kernel/init_task-xen.c @@ -19,7 +19,7 @@ static struct sighand_struct init_sighan #endif struct mm_struct init_mm = INIT_MM(init_mm); #undef swapper_pg_dir -EXPORT_SYMBOL(init_mm); +EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */ /* * Initial thread structure. --- a/arch/x86/kernel/io_apic_32-xen.c +++ b/arch/x86/kernel/io_apic_32-xen.c @@ -35,6 +35,7 @@ #include #include #include +#include /* time_after() */ #include #include @@ -48,8 +49,6 @@ #include #include -#include "io_ports.h" - #ifdef CONFIG_XEN #include #include @@ -400,7 +399,7 @@ static void set_ioapic_affinity_irq(unsi # include /* kernel_thread() */ # include /* kstat */ # include /* kmalloc() */ -# include /* time_after() */ +# include #define IRQBALANCE_CHECK_ARCH -999 #define MAX_BALANCED_IRQ_INTERVAL (5*HZ) @@ -777,7 +776,7 @@ late_initcall(balanced_irq_init); #endif #ifndef CONFIG_SMP -void fastcall send_IPI_self(int vector) +void send_IPI_self(int vector) { #ifndef CONFIG_XEN unsigned int cfg; @@ -1959,7 +1958,7 @@ static int __init timer_irq_works(void) * might have cached one ExtINT interrupt. Finally, at * least one tick may be lost due to delays. */ - if (jiffies - t1 > 4) + if (time_after(jiffies, t1 + 4)) return 1; return 0; @@ -2142,7 +2141,7 @@ static struct irq_chip lapic_chip __read .eoi = ack_apic, }; -static void setup_nmi (void) +static void __init setup_nmi(void) { /* * Dirty trick to enable the NMI watchdog ... 
@@ -2155,7 +2154,7 @@ static void setup_nmi (void) */ apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); - on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1); + enable_NMI_through_LVT0(); apic_printk(APIC_VERBOSE, " done.\n"); } @@ -2479,7 +2478,7 @@ static int ioapic_resume(struct sys_devi } static struct sysdev_class ioapic_sysdev_class = { - set_kset_name("ioapic"), + .name = "ioapic", .suspend = ioapic_suspend, .resume = ioapic_resume, }; --- a/arch/x86/kernel/io_apic_64-xen.c +++ b/arch/x86/kernel/io_apic_64-xen.c @@ -32,9 +32,11 @@ #include #include #include +#include #ifdef CONFIG_ACPI #include #endif +#include #include #include @@ -1064,7 +1066,7 @@ void __apicdebuginit print_local_APIC(vo v = apic_read(APIC_LVR); printk(KERN_INFO "... APIC VERSION: %08x\n", v); ver = GET_APIC_VERSION(v); - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); v = apic_read(APIC_TASKPRI); printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); @@ -1165,7 +1167,7 @@ void __apicdebuginit print_PIC(void) } #endif /* !CONFIG_XEN */ -static void __init enable_IO_APIC(void) +void __init enable_IO_APIC(void) { union IO_APIC_reg_01 reg_01; #ifndef CONFIG_XEN @@ -1299,7 +1301,7 @@ static int __init timer_irq_works(void) */ /* jiffies wrap? */ - if (jiffies - t1 > 4) + if (time_after(jiffies, t1 + 4)) return 1; return 0; } @@ -1412,7 +1414,7 @@ static void irq_complete_move(unsigned i if (likely(!cfg->move_in_progress)) return; - vector = ~get_irq_regs()->orig_rax; + vector = ~get_irq_regs()->orig_ax; me = smp_processor_id(); if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { cpumask_t cleanup_mask; @@ -1439,7 +1441,7 @@ static void ack_apic_level(unsigned int int do_unmask_irq = 0; irq_complete_move(irq); -#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE) +#ifdef CONFIG_GENERIC_PENDING_IRQ /* If we are moving the irq we need to mask it */ if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { do_unmask_irq = 1; @@ -1570,7 +1572,7 @@ static struct hw_interrupt_type lapic_ir .end = end_lapic_irq, }; -static void setup_nmi (void) +static void __init setup_nmi(void) { /* * Dirty trick to enable the NMI watchdog ... @@ -1583,7 +1585,7 @@ static void setup_nmi (void) */ printk(KERN_INFO "activating NMI Watchdog ..."); - enable_NMI_through_LVT0(NULL); + enable_NMI_through_LVT0(); printk(" done.\n"); } @@ -1659,7 +1661,7 @@ static inline void unlock_ExtINT_logic(v * * FIXME: really need to revamp this for modern platforms only. 
*/ -static inline void check_timer(void) +static inline void __init check_timer(void) { struct irq_cfg *cfg = irq_cfg + 0; int apic1, pin1, apic2, pin2; @@ -1863,7 +1865,7 @@ static int ioapic_resume(struct sys_devi } static struct sysdev_class ioapic_sysdev_class = { - set_kset_name("ioapic"), + .name = "ioapic", .suspend = ioapic_suspend, .resume = ioapic_resume, }; @@ -2303,5 +2305,93 @@ void __init setup_ioapic_dest(void) } } #endif -#endif /* !CONFIG_XEN */ +#define IOAPIC_RESOURCE_NAME_SIZE 11 + +static struct resource *ioapic_resources; + +static struct resource * __init ioapic_setup_resources(void) +{ + unsigned long n; + struct resource *res; + char *mem; + int i; + + if (nr_ioapics <= 0) + return NULL; + + n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); + n *= nr_ioapics; + + mem = alloc_bootmem(n); + res = (void *)mem; + + if (mem != NULL) { + memset(mem, 0, n); + mem += sizeof(struct resource) * nr_ioapics; + + for (i = 0; i < nr_ioapics; i++) { + res[i].name = mem; + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + sprintf(mem, "IOAPIC %u", i); + mem += IOAPIC_RESOURCE_NAME_SIZE; + } + } + + ioapic_resources = res; + + return res; +} + +void __init ioapic_init_mappings(void) +{ + unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; + struct resource *ioapic_res; + int i; + + ioapic_res = ioapic_setup_resources(); + for (i = 0; i < nr_ioapics; i++) { + if (smp_found_config) { + ioapic_phys = mp_ioapics[i].mpc_apicaddr; + } else { + ioapic_phys = (unsigned long) + alloc_bootmem_pages(PAGE_SIZE); + ioapic_phys = __pa(ioapic_phys); + } + set_fixmap_nocache(idx, ioapic_phys); + apic_printk(APIC_VERBOSE, + "mapped IOAPIC to %016lx (%016lx)\n", + __fix_to_virt(idx), ioapic_phys); + idx++; + + if (ioapic_res != NULL) { + ioapic_res->start = ioapic_phys; + ioapic_res->end = ioapic_phys + (4 * 1024) - 1; + ioapic_res++; + } + } +} + +static int __init ioapic_insert_resources(void) +{ + int i; + struct resource *r = ioapic_resources; + + if (!r) { + printk(KERN_ERR + "IO APIC resources could be not be allocated.\n"); + return -1; + } + + for (i = 0; i < nr_ioapics; i++) { + insert_resource(&iomem_resource, r); + r++; + } + + return 0; +} + +/* Insert the IO APIC resources after PCI initialization has occured to handle + * IO APICS that are mapped in on a BAR in PCI space. */ +late_initcall(ioapic_insert_resources); +#endif /* !CONFIG_XEN */ --- a/arch/x86/kernel/ioport_32-xen.c +++ /dev/null @@ -1,121 +0,0 @@ -/* - * This contains the io-permission bitmap code - written by obz, with changes - * by Linus. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ -static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) -{ - unsigned long mask; - unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG); - unsigned int low_index = base & (BITS_PER_LONG-1); - int length = low_index + extent; - - if (low_index != 0) { - mask = (~0UL << low_index); - if (length < BITS_PER_LONG) - mask &= ~(~0UL << length); - if (new_value) - *bitmap_base++ |= mask; - else - *bitmap_base++ &= ~mask; - length -= BITS_PER_LONG; - } - - mask = (new_value ? 
~0UL : 0UL); - while (length >= BITS_PER_LONG) { - *bitmap_base++ = mask; - length -= BITS_PER_LONG; - } - - if (length > 0) { - mask = ~(~0UL << length); - if (new_value) - *bitmap_base++ |= mask; - else - *bitmap_base++ &= ~mask; - } -} - - -/* - * this changes the io permissions bitmap in the current task. - */ -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) -{ - struct thread_struct * t = ¤t->thread; - unsigned long *bitmap; - struct physdev_set_iobitmap set_iobitmap; - - if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) - return -EINVAL; - if (turn_on && !capable(CAP_SYS_RAWIO)) - return -EPERM; - - /* - * If it's the first ioperm() call in this thread's lifetime, set the - * IO bitmap up. ioperm() is much less timing critical than clone(), - * this is why we delay this operation until now: - */ - if (!t->io_bitmap_ptr) { - bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); - if (!bitmap) - return -ENOMEM; - - memset(bitmap, 0xff, IO_BITMAP_BYTES); - t->io_bitmap_ptr = bitmap; - set_thread_flag(TIF_IO_BITMAP); - - set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap); - set_iobitmap.nr_ports = IO_BITMAP_BITS; - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, - &set_iobitmap)); - } - - set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); - - return 0; -} - -/* - * sys_iopl has to be used when you want to access the IO ports - * beyond the 0x3ff range: to get the full 65536 ports bitmapped - * you'd need 8kB of bitmaps/process, which is a bit excessive. - * - * Here we just change the eflags value on the stack: we allow - * only the super-user to do it. This depends on the stack-layout - * on system-call entry - see also fork() and the signal handling - * code. - */ - -asmlinkage long sys_iopl(unsigned long unused) -{ - volatile struct pt_regs * regs = (struct pt_regs *) &unused; - unsigned int level = regs->ebx; - struct thread_struct *t = ¤t->thread; - unsigned int old = (t->iopl >> 12) & 3; - - if (level > 3) - return -EINVAL; - /* Trying to gain more privileges? */ - if (level > old) { - if (!capable(CAP_SYS_RAWIO)) - return -EPERM; - } - t->iopl = level << 12; - set_iopl_mask(t->iopl); - return 0; -} --- a/arch/x86/kernel/ioport_64-xen.c +++ /dev/null @@ -1,99 +0,0 @@ -/* - * This contains the io-permission bitmap code - written by obz, with changes - * by Linus. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ -static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) -{ - int i; - - if (new_value) - for (i = base; i < base + extent; i++) - __set_bit(i, bitmap); - else - for (i = base; i < base + extent; i++) - clear_bit(i, bitmap); -} - -/* - * this changes the io permissions bitmap in the current task. - */ -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) -{ - struct thread_struct * t = ¤t->thread; - unsigned long *bitmap; - struct physdev_set_iobitmap set_iobitmap; - - if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) - return -EINVAL; - if (turn_on && !capable(CAP_SYS_RAWIO)) - return -EPERM; - - /* - * If it's the first ioperm() call in this thread's lifetime, set the - * IO bitmap up. 
ioperm() is much less timing critical than clone(), - * this is why we delay this operation until now: - */ - if (!t->io_bitmap_ptr) { - bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); - if (!bitmap) - return -ENOMEM; - - memset(bitmap, 0xff, IO_BITMAP_BYTES); - t->io_bitmap_ptr = bitmap; - set_thread_flag(TIF_IO_BITMAP); - - set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap); - set_iobitmap.nr_ports = IO_BITMAP_BITS; - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, - &set_iobitmap)); - } - - set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); - - return 0; -} - -/* - * sys_iopl has to be used when you want to access the IO ports - * beyond the 0x3ff range: to get the full 65536 ports bitmapped - * you'd need 8kB of bitmaps/process, which is a bit excessive. - * - */ - -asmlinkage long sys_iopl(unsigned int new_iopl, struct pt_regs *regs) -{ - unsigned int old_iopl = current->thread.iopl; - struct physdev_set_iopl set_iopl; - - if (new_iopl > 3) - return -EINVAL; - - /* Need "raw I/O" privileges for direct port access. */ - if ((new_iopl > old_iopl) && !capable(CAP_SYS_RAWIO)) - return -EPERM; - - /* Change our version of the privilege levels. */ - current->thread.iopl = new_iopl; - - /* Force the change at ring 0. */ - set_iopl.iopl = (new_iopl == 0) ? 1 : new_iopl; - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl)); - - return 0; -} --- /dev/null +++ b/arch/x86/kernel/ioport-xen.c @@ -0,0 +1,112 @@ +/* + * This contains the io-permission bitmap code - written by obz, with changes + * by Linus. 32/64 bits code unification by Miguel Botón. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ +static void set_bitmap(unsigned long *bitmap, unsigned int base, + unsigned int extent, int new_value) +{ + unsigned int i; + + for (i = base; i < base + extent; i++) { + if (new_value) + __set_bit(i, bitmap); + else + __clear_bit(i, bitmap); + } +} + +/* + * this changes the io permissions bitmap in the current task. + */ +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) +{ + struct thread_struct * t = ¤t->thread; + struct physdev_set_iobitmap set_iobitmap; + + if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) + return -EINVAL; + if (turn_on && !capable(CAP_SYS_RAWIO)) + return -EPERM; + + /* + * If it's the first ioperm() call in this thread's lifetime, set the + * IO bitmap up. ioperm() is much less timing critical than clone(), + * this is why we delay this operation until now: + */ + if (!t->io_bitmap_ptr) { + unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + + if (!bitmap) + return -ENOMEM; + + memset(bitmap, 0xff, IO_BITMAP_BYTES); + t->io_bitmap_ptr = bitmap; + set_thread_flag(TIF_IO_BITMAP); + + set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap); + set_iobitmap.nr_ports = IO_BITMAP_BITS; + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, + &set_iobitmap)); + } + + set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); + + return 0; +} + +/* + * sys_iopl has to be used when you want to access the IO ports + * beyond the 0x3ff range: to get the full 65536 ports bitmapped + * you'd need 8kB of bitmaps/process, which is a bit excessive. + */ +static int do_iopl(unsigned int level, struct thread_struct *t) +{ + unsigned int old = t->iopl >> 12; + + if (level > 3) + return -EINVAL; + /* Trying to gain more privileges? 
*/ + if (level > old) { + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + } + + return 0; +} + +#ifdef CONFIG_X86_32 +asmlinkage long sys_iopl(unsigned long regsp) +{ + struct pt_regs *regs = (struct pt_regs *)®sp; + unsigned int level = regs->bx; +#else +asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) +{ +#endif + struct thread_struct *t = ¤t->thread; + int rc; + + rc = do_iopl(level, t); + if (rc < 0) + goto out; + + t->iopl = level << 12; + set_iopl_mask(t->iopl); +out: + return rc; +} --- a/arch/x86/kernel/irq_32-xen.c +++ b/arch/x86/kernel/irq_32-xen.c @@ -66,11 +66,11 @@ static union irq_ctx *softirq_ctx[NR_CPU * SMP cross-CPU interrupts have their own specific * handlers). */ -fastcall unsigned int do_IRQ(struct pt_regs *regs) +unsigned int do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs; /* high bit used in ret_from_ code */ - int irq = ~regs->orig_eax; + int irq = ~regs->orig_ax; struct irq_desc *desc = irq_desc + irq; #ifdef CONFIG_4KSTACKS union irq_ctx *curctx, *irqctx; @@ -88,13 +88,13 @@ fastcall unsigned int do_IRQ(struct pt_r #ifdef CONFIG_DEBUG_STACKOVERFLOW /* Debugging check for stack overflow: is there less than 1KB free? */ { - long esp; + long sp; __asm__ __volatile__("andl %%esp,%0" : - "=r" (esp) : "0" (THREAD_SIZE - 1)); - if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { + "=r" (sp) : "0" (THREAD_SIZE - 1)); + if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) { printk("do_IRQ: stack overflow: %ld\n", - esp - sizeof(struct thread_info)); + sp - sizeof(struct thread_info)); dump_stack(); } } @@ -112,7 +112,7 @@ fastcall unsigned int do_IRQ(struct pt_r * current stack (which is the irq stack already after all) */ if (curctx != irqctx) { - int arg1, arg2, ebx; + int arg1, arg2, bx; /* build the stack frame on the IRQ stack */ isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); @@ -128,10 +128,10 @@ fastcall unsigned int do_IRQ(struct pt_r (curctx->tinfo.preempt_count & SOFTIRQ_MASK); asm volatile( - " xchgl %%ebx,%%esp \n" - " call *%%edi \n" - " movl %%ebx,%%esp \n" - : "=a" (arg1), "=d" (arg2), "=b" (ebx) + " xchgl %%ebx,%%esp \n" + " call *%%edi \n" + " movl %%ebx,%%esp \n" + : "=a" (arg1), "=d" (arg2), "=b" (bx) : "0" (irq), "1" (desc), "2" (isp), "D" (desc->handle_irq) : "memory", "cc" --- a/arch/x86/kernel/irq_64-xen.c +++ b/arch/x86/kernel/irq_64-xen.c @@ -20,6 +20,28 @@ atomic_t irq_err_count; +/* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. + */ +void ack_bad_irq(unsigned int irq) +{ + printk(KERN_WARNING "unexpected IRQ trap at irq %02x\n", irq); +#ifdef CONFIG_X86_LOCAL_APIC + /* + * Currently unexpected vectors happen only on SMP and APIC. + * We _must_ ack these because every local APIC has only N + * irq slots per priority level, and a 'hanging, unacked' IRQ + * holds up an irq slot - in excessive cases (when multiple + * unexpected vectors occur) that might lock up the APIC + * completely. + * But don't ack when the APIC is disabled. 
-AK + */ + if (!disable_apic) + ack_APIC_irq(); +#endif +} + #ifdef CONFIG_DEBUG_STACKOVERFLOW /* * Probabilistic stack overflow check: @@ -33,11 +55,11 @@ static inline void stack_overflow_check( u64 curbase = (u64)task_stack_page(current); static unsigned long warned = -60*HZ; - if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE && - regs->rsp < curbase + sizeof(struct thread_info) + 128 && + if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE && + regs->sp < curbase + sizeof(struct thread_info) + 128 && time_after(jiffies, warned + 60*HZ)) { - printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n", - current->comm, curbase, regs->rsp); + printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", + current->comm, curbase, regs->sp); show_stack(NULL,NULL); warned = jiffies; } @@ -150,7 +172,7 @@ asmlinkage unsigned int do_IRQ(struct pt struct pt_regs *old_regs = set_irq_regs(regs); /* high bit used in ret_from_ code */ - unsigned irq = ~regs->orig_rax; + unsigned irq = ~regs->orig_ax; /*exit_idle();*/ /*irq_enter();*/ @@ -251,14 +273,3 @@ asmlinkage void do_softirq(void) } local_irq_restore(flags); } - -#ifndef CONFIG_X86_LOCAL_APIC -/* - * 'what should we do if we get a hw irq event on an illegal vector'. - * each architecture has to answer this themselves. - */ -void ack_bad_irq(unsigned int irq) -{ - printk("unexpected IRQ trap at irq %02x\n", irq); -} -#endif --- a/arch/x86/kernel/ldt_32-xen.c +++ /dev/null @@ -1,265 +0,0 @@ -/* - * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds - * Copyright (C) 1999 Ingo Molnar - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ -static void flush_ldt(void *null) -{ - if (current->active_mm) - load_LDT(¤t->active_mm->context); -} -#endif - -static int alloc_ldt(mm_context_t *pc, int mincount, int reload) -{ - void *oldldt; - void *newldt; - int oldsize; - - if (mincount <= pc->size) - return 0; - oldsize = pc->size; - mincount = (mincount+511)&(~511); - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount*LDT_ENTRY_SIZE); - else - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); - - if (!newldt) - return -ENOMEM; - - if (oldsize) - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); - oldldt = pc->ldt; - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); - pc->ldt = newldt; - wmb(); - pc->size = mincount; - wmb(); - - if (reload) { -#ifdef CONFIG_SMP - cpumask_t mask; - preempt_disable(); -#endif - make_pages_readonly( - pc->ldt, - (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE, - XENFEAT_writable_descriptor_tables); - load_LDT(pc); -#ifdef CONFIG_SMP - mask = cpumask_of_cpu(smp_processor_id()); - if (!cpus_equal(current->mm->cpu_vm_mask, mask)) - smp_call_function(flush_ldt, NULL, 1, 1); - preempt_enable(); -#endif - } - if (oldsize) { - make_pages_writable( - oldldt, - (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE, - XENFEAT_writable_descriptor_tables); - if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(oldldt); - else - kfree(oldldt); - } - return 0; -} - -static inline int copy_ldt(mm_context_t *new, mm_context_t *old) -{ - int err = alloc_ldt(new, old->size, 0); - if (err < 0) - return err; - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); - make_pages_readonly( - new->ldt, - (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE, - XENFEAT_writable_descriptor_tables); - return 0; -} - -/* - * we do not have to muck with descriptors here, that 
is - * done in switch_mm() as needed. - */ -int init_new_context(struct task_struct *tsk, struct mm_struct *mm) -{ - struct mm_struct * old_mm; - int retval = 0; - - mutex_init(&mm->context.lock); - mm->context.size = 0; - mm->context.has_foreign_mappings = 0; - old_mm = current->mm; - if (old_mm && old_mm->context.size > 0) { - mutex_lock(&old_mm->context.lock); - retval = copy_ldt(&mm->context, &old_mm->context); - mutex_unlock(&old_mm->context.lock); - } - return retval; -} - -/* - * No need to lock the MM as we are the last user - */ -void destroy_context(struct mm_struct *mm) -{ - if (mm->context.size) { - if (mm == current->active_mm) - clear_LDT(); - make_pages_writable( - mm->context.ldt, - (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE, - XENFEAT_writable_descriptor_tables); - if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(mm->context.ldt); - else - kfree(mm->context.ldt); - mm->context.size = 0; - } -} - -static int read_ldt(void __user * ptr, unsigned long bytecount) -{ - int err; - unsigned long size; - struct mm_struct * mm = current->mm; - - if (!mm->context.size) - return 0; - if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) - bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; - - mutex_lock(&mm->context.lock); - size = mm->context.size*LDT_ENTRY_SIZE; - if (size > bytecount) - size = bytecount; - - err = 0; - if (copy_to_user(ptr, mm->context.ldt, size)) - err = -EFAULT; - mutex_unlock(&mm->context.lock); - if (err < 0) - goto error_return; - if (size != bytecount) { - /* zero-fill the rest */ - if (clear_user(ptr+size, bytecount-size) != 0) { - err = -EFAULT; - goto error_return; - } - } - return bytecount; -error_return: - return err; -} - -static int read_default_ldt(void __user * ptr, unsigned long bytecount) -{ - int err; - unsigned long size; - - err = 0; - size = 5*sizeof(struct desc_struct); - if (size > bytecount) - size = bytecount; - - err = size; - if (clear_user(ptr, size)) - err = -EFAULT; - - return err; -} - -static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) -{ - struct mm_struct * mm = current->mm; - __u32 entry_1, entry_2; - int error; - struct user_desc ldt_info; - - error = -EINVAL; - if (bytecount != sizeof(ldt_info)) - goto out; - error = -EFAULT; - if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) - goto out; - - error = -EINVAL; - if (ldt_info.entry_number >= LDT_ENTRIES) - goto out; - if (ldt_info.contents == 3) { - if (oldmode) - goto out; - if (ldt_info.seg_not_present == 0) - goto out; - } - - mutex_lock(&mm->context.lock); - if (ldt_info.entry_number >= mm->context.size) { - error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); - if (error < 0) - goto out_unlock; - } - - /* Allow LDTs to be cleared by the user. */ - if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { - if (oldmode || LDT_empty(&ldt_info)) { - entry_1 = 0; - entry_2 = 0; - goto install; - } - } - - entry_1 = LDT_entry_a(&ldt_info); - entry_2 = LDT_entry_b(&ldt_info); - if (oldmode) - entry_2 &= ~(1 << 20); - - /* Install the new entry ... 
*/ -install: - error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number, - entry_1, entry_2); - -out_unlock: - mutex_unlock(&mm->context.lock); -out: - return error; -} - -asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) -{ - int ret = -ENOSYS; - - switch (func) { - case 0: - ret = read_ldt(ptr, bytecount); - break; - case 1: - ret = write_ldt(ptr, bytecount, 1); - break; - case 2: - ret = read_default_ldt(ptr, bytecount); - break; - case 0x11: - ret = write_ldt(ptr, bytecount, 0); - break; - } - return ret; -} --- a/arch/x86/kernel/ldt_64-xen.c +++ /dev/null @@ -1,271 +0,0 @@ -/* - * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds - * Copyright (C) 1999 Ingo Molnar - * Copyright (C) 2002 Andi Kleen - * - * This handles calls from both 32bit and 64bit mode. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ -static void flush_ldt(void *null) -{ - if (current->active_mm) - load_LDT(¤t->active_mm->context); -} -#endif - -static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload) -{ - void *oldldt; - void *newldt; - unsigned oldsize; - - if (mincount <= (unsigned)pc->size) - return 0; - oldsize = pc->size; - mincount = (mincount+511)&(~511); - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount*LDT_ENTRY_SIZE); - else - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); - - if (!newldt) - return -ENOMEM; - - if (oldsize) - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); - oldldt = pc->ldt; - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); - wmb(); - pc->ldt = newldt; - wmb(); - pc->size = mincount; - wmb(); - if (reload) { -#ifdef CONFIG_SMP - cpumask_t mask; - - preempt_disable(); -#endif - make_pages_readonly( - pc->ldt, - (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE, - XENFEAT_writable_descriptor_tables); - load_LDT(pc); -#ifdef CONFIG_SMP - mask = cpumask_of_cpu(smp_processor_id()); - if (!cpus_equal(current->mm->cpu_vm_mask, mask)) - smp_call_function(flush_ldt, NULL, 1, 1); - preempt_enable(); -#endif - } - if (oldsize) { - make_pages_writable( - oldldt, - (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE, - XENFEAT_writable_descriptor_tables); - if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(oldldt); - else - kfree(oldldt); - } - return 0; -} - -static inline int copy_ldt(mm_context_t *new, mm_context_t *old) -{ - int err = alloc_ldt(new, old->size, 0); - if (err < 0) - return err; - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); - make_pages_readonly( - new->ldt, - (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE, - XENFEAT_writable_descriptor_tables); - return 0; -} - -/* - * we do not have to muck with descriptors here, that is - * done in switch_mm() as needed. - */ -int init_new_context(struct task_struct *tsk, struct mm_struct *mm) -{ - struct mm_struct * old_mm; - int retval = 0; - - memset(&mm->context, 0, sizeof(mm->context)); - mutex_init(&mm->context.lock); - old_mm = current->mm; - if (old_mm) - mm->context.vdso = old_mm->context.vdso; - if (old_mm && old_mm->context.size > 0) { - mutex_lock(&old_mm->context.lock); - retval = copy_ldt(&mm->context, &old_mm->context); - mutex_unlock(&old_mm->context.lock); - } - return retval; -} - -/* - * - * Don't touch the LDT register - we're already in the next thread. 
- */ -void destroy_context(struct mm_struct *mm) -{ - if (mm->context.size) { - if (mm == current->active_mm) - clear_LDT(); - make_pages_writable( - mm->context.ldt, - (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE, - XENFEAT_writable_descriptor_tables); - if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(mm->context.ldt); - else - kfree(mm->context.ldt); - mm->context.size = 0; - } -} - -static int read_ldt(void __user * ptr, unsigned long bytecount) -{ - int err; - unsigned long size; - struct mm_struct * mm = current->mm; - - if (!mm->context.size) - return 0; - if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) - bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; - - mutex_lock(&mm->context.lock); - size = mm->context.size*LDT_ENTRY_SIZE; - if (size > bytecount) - size = bytecount; - - err = 0; - if (copy_to_user(ptr, mm->context.ldt, size)) - err = -EFAULT; - mutex_unlock(&mm->context.lock); - if (err < 0) - goto error_return; - if (size != bytecount) { - /* zero-fill the rest */ - if (clear_user(ptr+size, bytecount-size) != 0) { - err = -EFAULT; - goto error_return; - } - } - return bytecount; -error_return: - return err; -} - -static int read_default_ldt(void __user * ptr, unsigned long bytecount) -{ - /* Arbitrary number */ - /* x86-64 default LDT is all zeros */ - if (bytecount > 128) - bytecount = 128; - if (clear_user(ptr, bytecount)) - return -EFAULT; - return bytecount; -} - -static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) -{ - struct task_struct *me = current; - struct mm_struct * mm = me->mm; - __u32 entry_1, entry_2, *lp; - unsigned long mach_lp; - int error; - struct user_desc ldt_info; - - error = -EINVAL; - - if (bytecount != sizeof(ldt_info)) - goto out; - error = -EFAULT; - if (copy_from_user(&ldt_info, ptr, bytecount)) - goto out; - - error = -EINVAL; - if (ldt_info.entry_number >= LDT_ENTRIES) - goto out; - if (ldt_info.contents == 3) { - if (oldmode) - goto out; - if (ldt_info.seg_not_present == 0) - goto out; - } - - mutex_lock(&mm->context.lock); - if (ldt_info.entry_number >= (unsigned)mm->context.size) { - error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); - if (error < 0) - goto out_unlock; - } - - lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); - mach_lp = arbitrary_virt_to_machine(lp); - - /* Allow LDTs to be cleared by the user. */ - if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { - if (oldmode || LDT_empty(&ldt_info)) { - entry_1 = 0; - entry_2 = 0; - goto install; - } - } - - entry_1 = LDT_entry_a(&ldt_info); - entry_2 = LDT_entry_b(&ldt_info); - if (oldmode) - entry_2 &= ~(1 << 20); - - /* Install the new entry ... */ -install: - error = HYPERVISOR_update_descriptor(mach_lp, (unsigned long)((entry_1 | (unsigned long) entry_2 << 32))); - -out_unlock: - mutex_unlock(&mm->context.lock); -out: - return error; -} - -asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) -{ - int ret = -ENOSYS; - - switch (func) { - case 0: - ret = read_ldt(ptr, bytecount); - break; - case 1: - ret = write_ldt(ptr, bytecount, 1); - break; - case 2: - ret = read_default_ldt(ptr, bytecount); - break; - case 0x11: - ret = write_ldt(ptr, bytecount, 0); - break; - } - return ret; -} --- /dev/null +++ b/arch/x86/kernel/ldt-xen.c @@ -0,0 +1,272 @@ +/* + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds + * Copyright (C) 1999 Ingo Molnar + * Copyright (C) 2002 Andi Kleen + * + * This handles calls from both 32bit and 64bit mode. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifdef CONFIG_SMP +static void flush_ldt(void *null) +{ + if (current->active_mm) + load_LDT(¤t->active_mm->context); +} +#endif + +static int alloc_ldt(mm_context_t *pc, int mincount, int reload) +{ + void *oldldt, *newldt; + int oldsize; + + if (mincount <= pc->size) + return 0; + oldsize = pc->size; + mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) & + (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1)); + if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE) + newldt = vmalloc(mincount * LDT_ENTRY_SIZE); + else + newldt = (void *)__get_free_page(GFP_KERNEL); + + if (!newldt) + return -ENOMEM; + + if (oldsize) + memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE); + oldldt = pc->ldt; + memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, + (mincount - oldsize) * LDT_ENTRY_SIZE); + +#ifdef CONFIG_X86_64 + /* CHECKME: Do we really need this ? */ + wmb(); +#endif + pc->ldt = newldt; + wmb(); + pc->size = mincount; + wmb(); + + if (reload) { +#ifdef CONFIG_SMP + cpumask_t mask; + + preempt_disable(); +#endif + make_pages_readonly(newldt, + (mincount * LDT_ENTRY_SIZE) / PAGE_SIZE, + XENFEAT_writable_descriptor_tables); + load_LDT(pc); +#ifdef CONFIG_SMP + mask = cpumask_of_cpu(smp_processor_id()); + if (!cpus_equal(current->mm->cpu_vm_mask, mask)) + smp_call_function(flush_ldt, NULL, 1, 1); + preempt_enable(); +#endif + } + if (oldsize) { + make_pages_writable(oldldt, + (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE, + XENFEAT_writable_descriptor_tables); + if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(oldldt); + else + put_page(virt_to_page(oldldt)); + } + return 0; +} + +static inline int copy_ldt(mm_context_t *new, mm_context_t *old) +{ + int err = alloc_ldt(new, old->size, 0); + + if (err < 0) + return err; + memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE); + make_pages_readonly(new->ldt, + (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE, + XENFEAT_writable_descriptor_tables); + return 0; +} + +/* + * we do not have to muck with descriptors here, that is + * done in switch_mm() as needed. + */ +int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +{ + struct mm_struct *old_mm; + int retval = 0; + + memset(&mm->context, 0, sizeof(mm->context)); + mutex_init(&mm->context.lock); + old_mm = current->mm; + if (old_mm) + mm->context.vdso = old_mm->context.vdso; + if (old_mm && old_mm->context.size > 0) { + mutex_lock(&old_mm->context.lock); + retval = copy_ldt(&mm->context, &old_mm->context); + mutex_unlock(&old_mm->context.lock); + } + return retval; +} + +/* + * No need to lock the MM as we are the last user + * + * 64bit: Don't touch the LDT register - we're already in the next thread. + */ +void destroy_context(struct mm_struct *mm) +{ + if (mm->context.size) { + /* CHECKME: Can this ever happen ? 
*/ + if (mm == current->active_mm) + clear_LDT(); + make_pages_writable(mm->context.ldt, + (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE, + XENFEAT_writable_descriptor_tables); + if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(mm->context.ldt); + else + put_page(virt_to_page(mm->context.ldt)); + mm->context.size = 0; + } +} + +static int read_ldt(void __user *ptr, unsigned long bytecount) +{ + int err; + unsigned long size; + struct mm_struct *mm = current->mm; + + if (!mm->context.size) + return 0; + if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES) + bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES; + + mutex_lock(&mm->context.lock); + size = mm->context.size * LDT_ENTRY_SIZE; + if (size > bytecount) + size = bytecount; + + err = 0; + if (copy_to_user(ptr, mm->context.ldt, size)) + err = -EFAULT; + mutex_unlock(&mm->context.lock); + if (err < 0) + goto error_return; + if (size != bytecount) { + /* zero-fill the rest */ + if (clear_user(ptr + size, bytecount - size) != 0) { + err = -EFAULT; + goto error_return; + } + } + return bytecount; +error_return: + return err; +} + +static int read_default_ldt(void __user *ptr, unsigned long bytecount) +{ + /* CHECKME: Can we use _one_ random number ? */ +#ifdef CONFIG_X86_32 + unsigned long size = 5 * sizeof(struct desc_struct); +#else + unsigned long size = 128; +#endif + if (bytecount > size) + bytecount = size; + if (clear_user(ptr, bytecount)) + return -EFAULT; + return bytecount; +} + +static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) +{ + struct mm_struct *mm = current->mm; + struct desc_struct ldt; + int error; + struct user_desc ldt_info; + + error = -EINVAL; + if (bytecount != sizeof(ldt_info)) + goto out; + error = -EFAULT; + if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) + goto out; + + error = -EINVAL; + if (ldt_info.entry_number >= LDT_ENTRIES) + goto out; + if (ldt_info.contents == 3) { + if (oldmode) + goto out; + if (ldt_info.seg_not_present == 0) + goto out; + } + + mutex_lock(&mm->context.lock); + if (ldt_info.entry_number >= mm->context.size) { + error = alloc_ldt(¤t->mm->context, + ldt_info.entry_number + 1, 1); + if (error < 0) + goto out_unlock; + } + + /* Allow LDTs to be cleared by the user. */ + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { + if (oldmode || LDT_empty(&ldt_info)) { + memset(&ldt, 0, sizeof(ldt)); + goto install; + } + } + + fill_ldt(&ldt, &ldt_info); + if (oldmode) + ldt.avl = 0; + + /* Install the new entry ... 
*/ +install: + error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt); + +out_unlock: + mutex_unlock(&mm->context.lock); +out: + return error; +} + +asmlinkage int sys_modify_ldt(int func, void __user *ptr, + unsigned long bytecount) +{ + int ret = -ENOSYS; + + switch (func) { + case 0: + ret = read_ldt(ptr, bytecount); + break; + case 1: + ret = write_ldt(ptr, bytecount, 1); + break; + case 2: + ret = read_default_ldt(ptr, bytecount); + break; + case 0x11: + ret = write_ldt(ptr, bytecount, 0); + break; + } + return ret; +} --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -300,7 +300,9 @@ void machine_kexec(struct kimage *image) void arch_crash_save_vmcoreinfo(void) { +#ifndef CONFIG_XEN /* could really be CONFIG_RELOCATABLE */ VMCOREINFO_SYMBOL(phys_base); +#endif VMCOREINFO_SYMBOL(init_level4_pgt); #ifdef CONFIG_NUMA --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -120,11 +120,10 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o + obj-$(CONFIG_XEN) += nmi_64.o time_64-$(CONFIG_XEN) += time_32.o pci-dma_64-$(CONFIG_XEN) += pci-dma_32.o endif disabled-obj-$(CONFIG_XEN) := early-quirks.o hpet.o i8253.o i8259_$(BITS).o reboot.o \ smpboot_$(BITS).o tsc_$(BITS).o tsc_sync.o -disabled-obj-$(CONFIG_XEN_UNPRIVILEGED_GUEST) += mpparse_64.o -%/head_64.o %/head_64.s: asflags-$(CONFIG_XEN) := --- a/arch/x86/kernel/microcode-xen.c +++ b/arch/x86/kernel/microcode-xen.c @@ -167,7 +167,7 @@ static int request_microcode(void) } op.cmd = XENPF_microcode_update; - set_xen_guest_handle(op.u.microcode.data, (void *)firmware->data); + set_xen_guest_handle(op.u.microcode.data, firmware->data); op.u.microcode.length = firmware->size; error = HYPERVISOR_platform_op(&op); --- a/arch/x86/kernel/mpparse_32-xen.c +++ b/arch/x86/kernel/mpparse_32-xen.c @@ -68,7 +68,7 @@ unsigned int def_to_bigsmp = 0; /* Processor that is doing the boot up */ unsigned int boot_cpu_physical_apicid = -1U; /* Internal processor count */ -unsigned int __cpuinitdata num_processors; +unsigned int num_processors; /* Bitmask of physically existing CPUs */ physid_mask_t phys_cpu_present_map; @@ -265,7 +265,7 @@ static void __init MP_ioapic_info (struc if (!(m->mpc_flags & MPC_APIC_USABLE)) return; - printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n", + printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n", m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); if (nr_ioapics >= MAX_IO_APICS) { printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n", @@ -412,9 +412,9 @@ static int __init smp_read_mpc(struct mp mps_oem_check(mpc, oem, str); - printk("APIC at: 0x%lX\n",mpc->mpc_lapic); + printk("APIC at: 0x%X\n", mpc->mpc_lapic); - /* + /* * Save the local APIC address (it might be non-default) -- but only * if we're not using ACPI. 
*/ @@ -728,7 +728,7 @@ static int __init smp_scan_config (unsig unsigned long *bp = isa_bus_to_virt(base); struct intel_mp_floating *mpf; - Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); + printk(KERN_INFO "Scan SMP from %p for %ld bytes.\n", bp,length); if (sizeof(*mpf) != 16) printk("Error: MPF size\n"); @@ -742,9 +742,10 @@ static int __init smp_scan_config (unsig smp_found_config = 1; #ifndef CONFIG_XEN - printk(KERN_INFO "found SMP MP-table at %08lx\n", - virt_to_phys(mpf)); - reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE); + printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", + mpf, virt_to_phys(mpf)); + reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE, + BOOTMEM_DEFAULT); if (mpf->mpf_physptr) { /* * We cannot access to MPC table to compute @@ -759,11 +760,12 @@ static int __init smp_scan_config (unsig unsigned long end = max_low_pfn * PAGE_SIZE; if (mpf->mpf_physptr + size > end) size = end - mpf->mpf_physptr; - reserve_bootmem(mpf->mpf_physptr, size); + reserve_bootmem(mpf->mpf_physptr, size, + BOOTMEM_DEFAULT); } #else - printk(KERN_INFO "found SMP MP-table at %08lx\n", - ((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base); + printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", + mpf, ((void *)bp - isa_bus_to_virt(base)) + base); #endif mpf_found = mpf; @@ -940,14 +942,14 @@ void __init mp_register_ioapic(u8 id, u3 */ mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; mp_ioapic_routing[idx].gsi_base = gsi_base; - mp_ioapic_routing[idx].gsi_end = gsi_base + + mp_ioapic_routing[idx].gsi_end = gsi_base + io_apic_get_redir_entries(idx); - printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, " - "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, - mp_ioapic_routing[idx].gsi_base, - mp_ioapic_routing[idx].gsi_end); + printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, + mp_ioapic_routing[idx].gsi_base, + mp_ioapic_routing[idx].gsi_end); } void __init @@ -1063,15 +1065,16 @@ void __init mp_config_acpi_legacy_irqs ( } #define MAX_GSI_NUM 4096 +#define IRQ_COMPRESSION_START 64 int mp_register_gsi(u32 gsi, int triggering, int polarity) { int ioapic = -1; int ioapic_pin = 0; int idx, bit = 0; - static int pci_irq = 16; + static int pci_irq = IRQ_COMPRESSION_START; /* - * Mapping between Global System Interrups, which + * Mapping between Global System Interrupts, which * represent all possible interrupts, and IRQs * assigned to actual devices. */ @@ -1108,12 +1111,16 @@ int mp_register_gsi(u32 gsi, int trigger if ((1<= 64, use IRQ compression + */ + if ((gsi >= IRQ_COMPRESSION_START) + && (triggering == ACPI_LEVEL_SENSITIVE)) { /* * For PCI devices assign IRQs in order, avoiding gaps * due to unused I/O APIC pins. --- a/arch/x86/kernel/mpparse_64-xen.c +++ b/arch/x86/kernel/mpparse_64-xen.c @@ -60,14 +60,20 @@ unsigned int boot_cpu_id = -1U; EXPORT_SYMBOL(boot_cpu_id); /* Internal processor count */ -unsigned int num_processors __cpuinitdata = 0; +unsigned int num_processors; unsigned disabled_cpus __cpuinitdata; /* Bitmask of physically existing CPUs */ physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; -u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; +#ifndef CONFIG_XEN +u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata + = { [0 ... 
NR_CPUS-1] = BAD_APICID }; +void *x86_bios_cpu_apicid_early_ptr; +#endif +DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID; +EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid); /* @@ -119,24 +125,22 @@ static void __cpuinit MP_processor_info( physid_set(m->mpc_apicid, phys_cpu_present_map); if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { /* - * bios_cpu_apicid is required to have processors listed + * x86_bios_cpu_apicid is required to have processors listed * in same order as logical cpu numbers. Hence the first * entry is BSP, and so on. */ cpu = 0; } - bios_cpu_apicid[cpu] = m->mpc_apicid; - /* - * We get called early in the the start_kernel initialization - * process when the per_cpu data area is not yet setup, so we - * use a static array that is removed after the per_cpu data - * area is created. - */ - if (x86_cpu_to_apicid_ptr) { - u8 *x86_cpu_to_apicid = (u8 *)x86_cpu_to_apicid_ptr; - x86_cpu_to_apicid[cpu] = m->mpc_apicid; + /* are we being called early in kernel startup? */ + if (x86_cpu_to_apicid_early_ptr) { + u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr; + u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr; + + cpu_to_apicid[cpu] = m->mpc_apicid; + bios_cpu_apicid[cpu] = m->mpc_apicid; } else { per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid; + per_cpu(x86_bios_cpu_apicid, cpu) = m->mpc_apicid; } cpu_set(cpu, cpu_possible_map); --- a/arch/x86/kernel/pci-dma-xen.c +++ b/arch/x86/kernel/pci-dma-xen.c @@ -434,3 +434,23 @@ dma_sync_single_for_device(struct device swiotlb_sync_single_for_device(dev, dma_handle, size, direction); } EXPORT_SYMBOL(dma_sync_single_for_device); + +void +dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, + enum dma_data_direction direction) +{ + if (swiotlb) + swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction); + flush_write_buffers(); +} +EXPORT_SYMBOL(dma_sync_sg_for_cpu); + +void +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, + enum dma_data_direction direction) +{ + if (swiotlb) + swiotlb_sync_sg_for_device(dev,sg,nelems,direction); + flush_write_buffers(); +} +EXPORT_SYMBOL(dma_sync_sg_for_device); --- a/arch/x86/kernel/process_32-xen.c +++ b/arch/x86/kernel/process_32-xen.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -59,8 +58,10 @@ #include #include +#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); +asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork"); static int hlt_counter; @@ -78,7 +79,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number); */ unsigned long thread_saved_pc(struct task_struct *tsk) { - return ((unsigned long *)tsk->thread.esp)[3]; + return ((unsigned long *)tsk->thread.sp)[3]; } /* @@ -86,7 +87,6 @@ unsigned long thread_saved_pc(struct tas */ void (*pm_idle)(void); EXPORT_SYMBOL(pm_idle); -static DEFINE_PER_CPU(unsigned int, cpu_idle_state); void disable_hlt(void) { @@ -107,7 +107,7 @@ EXPORT_SYMBOL(enable_hlt); * to poll the ->work.need_resched flag instead of waiting for the * cross-CPU IPI to arrive. Use this option with caution. 
*/ -static void poll_idle (void) +static void poll_idle(void) { cpu_relax(); } @@ -122,10 +122,19 @@ static void xen_idle(void) smp_mb(); local_irq_disable(); - if (!need_resched()) + if (!need_resched()) { + ktime_t t0, t1; + u64 t0n, t1n; + + t0 = ktime_get(); + t0n = ktime_to_ns(t0); safe_halt(); /* enables interrupts racelessly */ - else - local_irq_enable(); + local_irq_disable(); + t1 = ktime_get(); + t1n = ktime_to_ns(t1); + sched_clock_idle_wakeup_event(t1n - t0n); + } + local_irq_enable(); current_thread_info()->status |= TS_POLLING; } #ifdef CONFIG_APM_MODULE @@ -168,13 +177,13 @@ void cpu_idle(void) while (!need_resched()) { void (*idle)(void); - if (__get_cpu_var(cpu_idle_state)) - __get_cpu_var(cpu_idle_state) = 0; - check_pgt_cache(); rmb(); idle = xen_idle; /* no alternatives */ + if (rcu_pending(cpu)) + rcu_check_callbacks(cpu, 0); + if (cpu_is_offline(cpu)) play_dead(); @@ -192,40 +201,19 @@ static void do_nothing(void *unused) { } +/* + * cpu_idle_wait - Used to ensure that all the CPUs discard old value of + * pm_idle and update to new pm_idle value. Required while changing pm_idle + * handler on SMP systems. + * + * Caller must have changed pm_idle to the new value before the call. Old + * pm_idle value will not be used by any CPU after the return of this function. + */ void cpu_idle_wait(void) { - unsigned int cpu, this_cpu = get_cpu(); - cpumask_t map, tmp = current->cpus_allowed; - - set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); - put_cpu(); - - cpus_clear(map); - for_each_online_cpu(cpu) { - per_cpu(cpu_idle_state, cpu) = 1; - cpu_set(cpu, map); - } - - __get_cpu_var(cpu_idle_state) = 0; - - wmb(); - do { - ssleep(1); - for_each_online_cpu(cpu) { - if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) - cpu_clear(cpu, map); - } - cpus_and(map, map, cpu_online_map); - /* - * We waited 1 sec, if a CPU still did not call idle - * it may be because it is in idle and not waking up - * because it has nothing to do. - * Give all the remaining CPUS a kick. 
- */ - smp_call_function_mask(map, do_nothing, 0, 0); - } while (!cpus_empty(map)); - - set_cpus_allowed(current, tmp); + smp_mb(); + /* kick all the CPUs so that they exit out of pm_idle */ + smp_call_function(do_nothing, NULL, 0, 1); } EXPORT_SYMBOL_GPL(cpu_idle_wait); @@ -251,15 +239,15 @@ void __show_registers(struct pt_regs *re { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; unsigned long d0, d1, d2, d3, d6, d7; - unsigned long esp; + unsigned long sp; unsigned short ss, gs; if (user_mode_vm(regs)) { - esp = regs->esp; - ss = regs->xss & 0xffff; + sp = regs->sp; + ss = regs->ss & 0xffff; savesegment(gs, gs); } else { - esp = (unsigned long) (&regs->esp); + sp = (unsigned long) (&regs->sp); savesegment(ss, ss); savesegment(gs, gs); } @@ -272,17 +260,17 @@ void __show_registers(struct pt_regs *re init_utsname()->version); printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", - 0xffff & regs->xcs, regs->eip, regs->eflags, + 0xffff & regs->cs, regs->ip, regs->flags, smp_processor_id()); - print_symbol("EIP is at %s\n", regs->eip); + print_symbol("EIP is at %s\n", regs->ip); printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", - regs->eax, regs->ebx, regs->ecx, regs->edx); + regs->ax, regs->bx, regs->cx, regs->dx); printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", - regs->esi, regs->edi, regs->ebp, esp); + regs->si, regs->di, regs->bp, sp); printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", - regs->xds & 0xffff, regs->xes & 0xffff, - regs->xfs & 0xffff, gs, ss); + regs->ds & 0xffff, regs->es & 0xffff, + regs->fs & 0xffff, gs, ss); if (!all) return; @@ -310,12 +298,12 @@ void __show_registers(struct pt_regs *re void show_regs(struct pt_regs *regs) { __show_registers(regs, 1); - show_trace(NULL, regs, &regs->esp); + show_trace(NULL, regs, &regs->sp, regs->bp); } /* - * This gets run with %ebx containing the - * function to call, and %edx containing + * This gets run with %bx containing the + * function to call, and %dx containing * the "args". */ extern void kernel_thread_helper(void); @@ -329,16 +317,16 @@ int kernel_thread(int (*fn)(void *), voi memset(&regs, 0, sizeof(regs)); - regs.ebx = (unsigned long) fn; - regs.edx = (unsigned long) arg; + regs.bx = (unsigned long) fn; + regs.dx = (unsigned long) arg; - regs.xds = __USER_DS; - regs.xes = __USER_DS; - regs.xfs = __KERNEL_PERCPU; - regs.orig_eax = -1; - regs.eip = (unsigned long) kernel_thread_helper; - regs.xcs = __KERNEL_CS | get_kernel_rpl(); - regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; + regs.ds = __USER_DS; + regs.es = __USER_DS; + regs.fs = __KERNEL_PERCPU; + regs.orig_ax = -1; + regs.ip = (unsigned long) kernel_thread_helper; + regs.cs = __KERNEL_CS | get_kernel_rpl(); + regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; /* Ok, create the new process..
*/ return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL); @@ -368,7 +356,12 @@ void flush_thread(void) { struct task_struct *tsk = current; - memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); + tsk->thread.debugreg0 = 0; + tsk->thread.debugreg1 = 0; + tsk->thread.debugreg2 = 0; + tsk->thread.debugreg3 = 0; + tsk->thread.debugreg6 = 0; + tsk->thread.debugreg7 = 0; memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); clear_tsk_thread_flag(tsk, TIF_DEBUG); /* @@ -393,7 +386,7 @@ void prepare_to_copy(struct task_struct unlazy_fpu(tsk); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, +int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct * p, struct pt_regs * regs) { @@ -403,17 +396,19 @@ int copy_thread(int nr, unsigned long cl childregs = task_pt_regs(p); *childregs = *regs; - childregs->eax = 0; - childregs->esp = esp; + childregs->ax = 0; + childregs->sp = sp; - p->thread.esp = (unsigned long) childregs; - p->thread.esp0 = (unsigned long) (childregs+1); + p->thread.sp = (unsigned long) childregs; + p->thread.sp0 = (unsigned long) (childregs+1); - p->thread.eip = (unsigned long) ret_from_fork; + p->thread.ip = (unsigned long) ret_from_fork; - savesegment(gs,p->thread.gs); + savesegment(gs, p->thread.gs); tsk = current; + if (test_tsk_thread_flag(tsk, TIF_CSTAR)) + p->thread.ip = (unsigned long) cstar_ret_from_fork; if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, IO_BITMAP_BYTES, GFP_KERNEL); @@ -424,34 +419,17 @@ int copy_thread(int nr, unsigned long cl set_tsk_thread_flag(p, TIF_IO_BITMAP); } + err = 0; + /* * Set a new TLS for the child thread? */ - if (clone_flags & CLONE_SETTLS) { - struct desc_struct *desc; - struct user_desc info; - int idx; - - err = -EFAULT; - if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info))) - goto out; - err = -EINVAL; - if (LDT_empty(&info)) - goto out; - - idx = info.entry_number; - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - goto out; - - desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; - desc->a = LDT_entry_a(&info); - desc->b = LDT_entry_b(&info); - } + if (clone_flags & CLONE_SETTLS) + err = do_set_thread_area(p, -1, + (struct user_desc __user *)childregs->si, 0); p->thread.iopl = current->thread.iopl; - err = 0; - out: if (err && p->thread.io_bitmap_ptr) { kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; @@ -459,67 +437,8 @@ int copy_thread(int nr, unsigned long cl return err; } -/* - * fill in the user structure for a core dump.. - */ -void dump_thread(struct pt_regs * regs, struct user * dump) -{ - int i; - -/* changed the size calculations - should hopefully work better.
lbt */ - dump->magic = CMAGIC; - dump->start_code = 0; - dump->start_stack = regs->esp & ~(PAGE_SIZE - 1); - dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; - dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; - dump->u_dsize -= dump->u_tsize; - dump->u_ssize = 0; - for (i = 0; i < 8; i++) - dump->u_debugreg[i] = current->thread.debugreg[i]; - - if (dump->start_stack < TASK_SIZE) - dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; - - dump->regs.ebx = regs->ebx; - dump->regs.ecx = regs->ecx; - dump->regs.edx = regs->edx; - dump->regs.esi = regs->esi; - dump->regs.edi = regs->edi; - dump->regs.ebp = regs->ebp; - dump->regs.eax = regs->eax; - dump->regs.ds = regs->xds; - dump->regs.es = regs->xes; - dump->regs.fs = regs->xfs; - savesegment(gs,dump->regs.gs); - dump->regs.orig_eax = regs->orig_eax; - dump->regs.eip = regs->eip; - dump->regs.cs = regs->xcs; - dump->regs.eflags = regs->eflags; - dump->regs.esp = regs->esp; - dump->regs.ss = regs->xss; - - dump->u_fpvalid = dump_fpu (regs, &dump->i387); -} -EXPORT_SYMBOL(dump_thread); - -/* - * Capture the user space registers if the task is not running (in user space) - */ -int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) -{ - struct pt_regs ptregs = *task_pt_regs(tsk); - ptregs.xcs &= 0xffff; - ptregs.xds &= 0xffff; - ptregs.xes &= 0xffff; - ptregs.xss &= 0xffff; - - elf_core_copy_regs(regs, &ptregs); - - return 1; -} - #ifdef CONFIG_SECCOMP -void hard_disable_TSC(void) +static void hard_disable_TSC(void) { write_cr4(read_cr4() | X86_CR4_TSD); } @@ -534,7 +453,7 @@ void disable_TSC(void) hard_disable_TSC(); preempt_enable(); } -void hard_enable_TSC(void) +static void hard_enable_TSC(void) { write_cr4(read_cr4() & ~X86_CR4_TSD); } @@ -543,18 +462,32 @@ void hard_enable_TSC(void) static noinline void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) { - struct thread_struct *next; + struct thread_struct *prev, *next; + unsigned long debugctl; + prev = &prev_p->thread; next = &next_p->thread; + debugctl = prev->debugctlmsr; + if (next->ds_area_msr != prev->ds_area_msr) { + /* we clear debugctl to make sure DS + * is not in use when we change it */ + debugctl = 0; + wrmsrl(MSR_IA32_DEBUGCTLMSR, 0); + wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0); + } + + if (next->debugctlmsr != debugctl) + wrmsr(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr, 0); + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { - set_debugreg(next->debugreg[0], 0); - set_debugreg(next->debugreg[1], 1); - set_debugreg(next->debugreg[2], 2); - set_debugreg(next->debugreg[3], 3); + set_debugreg(next->debugreg0, 0); + set_debugreg(next->debugreg1, 1); + set_debugreg(next->debugreg2, 2); + set_debugreg(next->debugreg3, 3); /* no 4 and 5 */ - set_debugreg(next->debugreg[6], 6); - set_debugreg(next->debugreg[7], 7); + set_debugreg(next->debugreg6, 6); + set_debugreg(next->debugreg7, 7); } #ifdef CONFIG_SECCOMP @@ -567,6 +500,14 @@ __switch_to_xtra(struct task_struct *pre hard_enable_TSC(); } #endif + +#ifdef X86_BTS + if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) + ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); + + if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) + ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); +#endif } /* @@ -592,11 +533,11 @@ __switch_to_xtra(struct task_struct *pre * More important, however, is the fact that this allows us much * more flexibility. 
* - * The return value (in %eax) will be the "prev" task after + * The return value (in %ax) will be the "prev" task after * the task-switch, and shows up in ret_from_fork in entry.S, * for example. */ -struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) +struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; @@ -632,12 +573,12 @@ struct task_struct fastcall * __switch_t #endif /* - * Reload esp0. - * This is load_esp0(tss, next) with a multicall. + * Reload sp0. + * This is load_sp0(tss, next) with a multicall. */ mcl->op = __HYPERVISOR_stack_switch; mcl->args[0] = __KERNEL_DS; - mcl->args[1] = next->esp0; + mcl->args[1] = next->sp0; mcl++; /* @@ -734,7 +675,7 @@ struct task_struct fastcall * __switch_t asmlinkage int sys_fork(struct pt_regs regs) { - return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL); + return do_fork(SIGCHLD, regs.sp, &regs, 0, NULL, NULL); } asmlinkage int sys_clone(struct pt_regs regs) @@ -743,12 +684,12 @@ asmlinkage int sys_clone(struct pt_regs unsigned long newsp; int __user *parent_tidptr, *child_tidptr; - clone_flags = regs.ebx; - newsp = regs.ecx; - parent_tidptr = (int __user *)regs.edx; - child_tidptr = (int __user *)regs.edi; + clone_flags = regs.bx; + newsp = regs.cx; + parent_tidptr = (int __user *)regs.dx; + child_tidptr = (int __user *)regs.di; if (!newsp) - newsp = regs.esp; + newsp = regs.sp; return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr); } @@ -764,7 +705,7 @@ asmlinkage int sys_clone(struct pt_regs */ asmlinkage int sys_vfork(struct pt_regs regs) { - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL); + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, &regs, 0, NULL, NULL); } /* @@ -775,18 +716,15 @@ asmlinkage int sys_execve(struct pt_regs int error; char * filename; - filename = getname((char __user *) regs.ebx); + filename = getname((char __user *) regs.bx); error = PTR_ERR(filename); if (IS_ERR(filename)) goto out; error = do_execve(filename, - (char __user * __user *) regs.ecx, - (char __user * __user *) regs.edx, + (char __user * __user *) regs.cx, + (char __user * __user *) regs.dx, &regs); if (error == 0) { - task_lock(current); - current->ptrace &= ~PT_DTRACE; - task_unlock(current); /* Make sure we don't return using sysenter.. */ set_thread_flag(TIF_IRET); } @@ -800,145 +738,37 @@ out: unsigned long get_wchan(struct task_struct *p) { - unsigned long ebp, esp, eip; + unsigned long bp, sp, ip; unsigned long stack_page; int count = 0; if (!p || p == current || p->state == TASK_RUNNING) return 0; stack_page = (unsigned long)task_stack_page(p); - esp = p->thread.esp; - if (!stack_page || esp < stack_page || esp > top_esp+stack_page) + sp = p->thread.sp; + if (!stack_page || sp < stack_page || sp > top_esp+stack_page) return 0; - /* include/asm-i386/system.h:switch_to() pushes ebp last. */ - ebp = *(unsigned long *) esp; + /* include/asm-i386/system.h:switch_to() pushes bp last.
*/ + bp = *(unsigned long *) sp; do { - if (ebp < stack_page || ebp > top_ebp+stack_page) + if (bp < stack_page || bp > top_ebp+stack_page) return 0; - eip = *(unsigned long *) (ebp+4); - if (!in_sched_functions(eip)) - return eip; - ebp = *(unsigned long *) ebp; + ip = *(unsigned long *) (bp+4); + if (!in_sched_functions(ip)) + return ip; + bp = *(unsigned long *) bp; } while (count++ < 16); return 0; } -/* - * sys_alloc_thread_area: get a yet unused TLS descriptor index. - */ -static int get_free_idx(void) -{ - struct thread_struct *t = &current->thread; - int idx; - - for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) - if (desc_empty(t->tls_array + idx)) - return idx + GDT_ENTRY_TLS_MIN; - return -ESRCH; -} - -/* - * Set a given TLS descriptor: - */ -asmlinkage int sys_set_thread_area(struct user_desc __user *u_info) -{ - struct thread_struct *t = &current->thread; - struct user_desc info; - struct desc_struct *desc; - int cpu, idx; - - if (copy_from_user(&info, u_info, sizeof(info))) - return -EFAULT; - idx = info.entry_number; - - /* - * index -1 means the kernel should try to find and - * allocate an empty descriptor: - */ - if (idx == -1) { - idx = get_free_idx(); - if (idx < 0) - return idx; - if (put_user(idx, &u_info->entry_number)) - return -EFAULT; - } - - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - return -EINVAL; - - desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN; - - /* - * We must not get preempted while modifying the TLS. - */ - cpu = get_cpu(); - - if (LDT_empty(&info)) { - desc->a = 0; - desc->b = 0; - } else { - desc->a = LDT_entry_a(&info); - desc->b = LDT_entry_b(&info); - } - load_TLS(t, cpu); - - put_cpu(); - - return 0; -} - -/* - * Get the current Thread-Local Storage area: - */ - -#define GET_BASE(desc) ( \ - (((desc)->a >> 16) & 0x0000ffff) | \ - (((desc)->b << 16) & 0x00ff0000) | \ - ( (desc)->b & 0xff000000) ) - -#define GET_LIMIT(desc) ( \ - ((desc)->a & 0x0ffff) | \ - ((desc)->b & 0xf0000) ) - -#define GET_32BIT(desc) (((desc)->b >> 22) & 1) -#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) -#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) -#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) -#define GET_PRESENT(desc) (((desc)->b >> 15) & 1) -#define GET_USEABLE(desc) (((desc)->b >> 20) & 1) - -asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) -{ - struct user_desc info; - struct desc_struct *desc; - int idx; - - if (get_user(idx, &u_info->entry_number)) - return -EFAULT; - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - return -EINVAL; - - memset(&info, 0, sizeof(info)); - - desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; - - info.entry_number = idx; - info.base_addr = GET_BASE(desc); - info.limit = GET_LIMIT(desc); - info.seg_32bit = GET_32BIT(desc); - info.contents = GET_CONTENTS(desc); - info.read_exec_only = !GET_WRITABLE(desc); - info.limit_in_pages = GET_LIMIT_PAGES(desc); - info.seg_not_present = !GET_PRESENT(desc); - info.useable = GET_USEABLE(desc); - - if (copy_to_user(u_info, &info, sizeof(info))) - return -EFAULT; - return 0; -} - unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) sp -= get_random_int() % 8192; return sp & ~0xf; } + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + unsigned long range_end = mm->brk + 0x02000000; + return randomize_range(mm->brk, range_end, 0) ?
: mm->brk; +} --- a/arch/x86/kernel/process_64-xen.c +++ b/arch/x86/kernel/process_64-xen.c @@ -3,7 +3,7 @@ * * Pentium III FXSR, SSE support * Gareth Hughes , May 2000 - * + * * X86-64 port * Andi Kleen. * @@ -22,19 +22,18 @@ #include #include #include +#include #include #include -#include #include #include #include #include -#include -#include #include +#include #include +#include #include -#include #include #include #include @@ -73,7 +72,6 @@ EXPORT_SYMBOL(boot_option_idle_override) */ void (*pm_idle)(void); EXPORT_SYMBOL(pm_idle); -static DEFINE_PER_CPU(unsigned int, cpu_idle_state); static ATOMIC_NOTIFIER_HEAD(idle_notifier); @@ -81,13 +79,6 @@ void idle_notifier_register(struct notif { atomic_notifier_chain_register(&idle_notifier, n); } -EXPORT_SYMBOL_GPL(idle_notifier_register); - -void idle_notifier_unregister(struct notifier_block *n) -{ - atomic_notifier_chain_unregister(&idle_notifier, n); -} -EXPORT_SYMBOL(idle_notifier_unregister); void enter_idle(void) { @@ -116,7 +107,7 @@ void exit_idle(void) * to poll the ->need_resched flag instead of waiting for the * cross-CPU IPI to arrive. Use this option with caution. */ -static void poll_idle (void) +static void poll_idle(void) { local_irq_enable(); cpu_relax(); @@ -131,10 +122,19 @@ static void xen_idle(void) */ smp_mb(); local_irq_disable(); - if (!need_resched()) - safe_halt(); - else - local_irq_enable(); + if (!need_resched()) { + ktime_t t0, t1; + u64 t0n, t1n; + + t0 = ktime_get(); + t0n = ktime_to_ns(t0); + safe_halt(); /* enables interrupts racelessly */ + local_irq_disable(); + t1 = ktime_get(); + t1n = ktime_to_ns(t1); + sched_clock_idle_wakeup_event(t1n - t0n); + } + local_irq_enable(); current_thread_info()->status |= TS_POLLING; } @@ -161,19 +161,15 @@ static inline void play_dead(void) * low exit latency (ie sit in a loop waiting for * somebody to say that they'd like to reschedule) */ -void cpu_idle (void) +void cpu_idle(void) { current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ while (1) { + tick_nohz_stop_sched_tick(); while (!need_resched()) { void (*idle)(void); - if (__get_cpu_var(cpu_idle_state)) - __get_cpu_var(cpu_idle_state) = 0; - - tick_nohz_stop_sched_tick(); - rmb(); idle = xen_idle; /* no alternatives */ if (cpu_is_offline(smp_processor_id())) @@ -203,49 +199,27 @@ static void do_nothing(void *unused) { } +/* + * cpu_idle_wait - Used to ensure that all the CPUs discard old value of + * pm_idle and update to new pm_idle value. Required while changing pm_idle + * handler on SMP systems. + * + * Caller must have changed pm_idle to the new value before the call. Old + * pm_idle value will not be used by any CPU after the return of this function. + */ void cpu_idle_wait(void) { - unsigned int cpu, this_cpu = get_cpu(); - cpumask_t map, tmp = current->cpus_allowed; - - set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); - put_cpu(); - - cpus_clear(map); - for_each_online_cpu(cpu) { - per_cpu(cpu_idle_state, cpu) = 1; - cpu_set(cpu, map); - } - - __get_cpu_var(cpu_idle_state) = 0; - - wmb(); - do { - ssleep(1); - for_each_online_cpu(cpu) { - if (cpu_isset(cpu, map) && - !per_cpu(cpu_idle_state, cpu)) - cpu_clear(cpu, map); - } - cpus_and(map, map, cpu_online_map); - /* - * We waited 1 sec, if a CPU still did not call idle - * it may be because it is in idle and not waking up - * because it has nothing to do. - * Give all the remaining CPUS a kick. 
- */ - smp_call_function_mask(map, do_nothing, 0, 0); - } while (!cpus_empty(map)); - - set_cpus_allowed(current, tmp); + smp_mb(); + /* kick all the CPUs so that they exit out of pm_idle */ + smp_call_function(do_nothing, NULL, 0, 1); } EXPORT_SYMBOL_GPL(cpu_idle_wait); -void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) +void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { } -static int __init idle_setup (char *str) +static int __init idle_setup(char *str) { if (!strcmp(str, "poll")) { printk("using polling idle threads.\n"); @@ -260,13 +234,13 @@ static int __init idle_setup (char *str) } early_param("idle", idle_setup); -/* Prints also some state that isn't saved in the pt_regs */ +/* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs * regs) { unsigned long fs, gs, shadowgs; unsigned long d0, d1, d2, d3, d6, d7; - unsigned int fsindex,gsindex; - unsigned int ds,cs,es; + unsigned int fsindex, gsindex; + unsigned int ds, cs, es; printk("\n"); print_modules(); @@ -275,16 +249,16 @@ void __show_regs(struct pt_regs * regs) init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); - printk_address(regs->rip); - printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, - regs->eflags); + printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); + printk_address(regs->ip, 1); + printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, + regs->flags); printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", - regs->rax, regs->rbx, regs->rcx); + regs->ax, regs->bx, regs->cx); printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", - regs->rdx, regs->rsi, regs->rdi); + regs->dx, regs->si, regs->di); printk("RBP: %016lx R08: %016lx R09: %016lx\n", - regs->rbp, regs->r8, regs->r9); + regs->bp, regs->r8, regs->r9); printk("R10: %016lx R11: %016lx R12: %016lx\n", regs->r10, regs->r11, regs->r12); printk("R13: %016lx R14: %016lx R15: %016lx\n", @@ -318,7 +292,7 @@ void show_regs(struct pt_regs *regs) { printk("CPU %d:", smp_processor_id()); __show_regs(regs); - show_trace(NULL, regs, (void *)(regs + 1)); + show_trace(NULL, regs, (void *)(regs + 1), regs->bp); } /* @@ -329,7 +303,7 @@ void exit_thread(void) struct task_struct *me = current; struct thread_struct *t = &me->thread; - if (me->thread.io_bitmap_ptr) { + if (me->thread.io_bitmap_ptr) { #ifndef CONFIG_X86_NO_TSS struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); #endif @@ -382,7 +356,7 @@ void flush_thread(void) tsk->thread.debugreg3 = 0; tsk->thread.debugreg6 = 0; tsk->thread.debugreg7 = 0; - memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); /* * Forget coprocessor state.. 
*/ @@ -405,26 +379,21 @@ void release_thread(struct task_struct * static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) { - struct user_desc ud = { + struct user_desc ud = { .base_addr = addr, .limit = 0xfffff, .seg_32bit = 1, .limit_in_pages = 1, .useable = 1, }; - struct n_desc_struct *desc = (void *)t->thread.tls_array; + struct desc_struct *desc = t->thread.tls_array; desc += tls; - desc->a = LDT_entry_a(&ud); - desc->b = LDT_entry_b(&ud); + fill_ldt(desc, &ud); } static inline u32 read_32bit_tls(struct task_struct *t, int tls) { - struct desc_struct *desc = (void *)t->thread.tls_array; - desc += tls; - return desc->base0 | - (((u32)desc->base1) << 16) | - (((u32)desc->base2) << 24); + return get_desc_base(&t->thread.tls_array[tls]); } /* @@ -436,7 +405,7 @@ void prepare_to_copy(struct task_struct unlazy_fpu(tsk); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, +int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct * p, struct pt_regs * regs) { @@ -448,14 +417,14 @@ int copy_thread(int nr, unsigned long cl (THREAD_SIZE + task_stack_page(p))) - 1; *childregs = *regs; - childregs->rax = 0; - childregs->rsp = rsp; - if (rsp == ~0UL) - childregs->rsp = (unsigned long)childregs; - - p->thread.rsp = (unsigned long) childregs; - p->thread.rsp0 = (unsigned long) (childregs+1); - p->thread.userrsp = me->thread.userrsp; + childregs->ax = 0; + childregs->sp = sp; + if (sp == ~0UL) + childregs->sp = (unsigned long)childregs; + + p->thread.sp = (unsigned long) childregs; + p->thread.sp0 = (unsigned long) (childregs+1); + p->thread.usersp = me->thread.usersp; set_tsk_thread_flag(p, TIF_FORK); @@ -476,7 +445,7 @@ int copy_thread(int nr, unsigned long cl memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES); set_tsk_thread_flag(p, TIF_IO_BITMAP); - } + } /* * Set a new TLS for the child thread? 
@@ -484,7 +453,8 @@ int copy_thread(int nr, unsigned long cl if (clone_flags & CLONE_SETTLS) { #ifdef CONFIG_IA32_EMULATION if (test_thread_flag(TIF_IA32)) - err = ia32_child_tls(p, childregs); + err = do_set_thread_area(p, -1, + (struct user_desc __user *)childregs->si, 0); else #endif err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); @@ -502,26 +472,32 @@ out: return err; } -static inline void __save_init_fpu( struct task_struct *tsk ) -{ - asm volatile( "rex64 ; fxsave %0 ; fnclex" - : "=m" (tsk->thread.i387.fxsave)); - tsk->thread_info->status &= ~TS_USEDFPU; -} - /* * This special macro can be used to load a debugging register */ -#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r) +#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r) static inline void __switch_to_xtra(struct task_struct *prev_p, - struct task_struct *next_p) + struct task_struct *next_p) { struct thread_struct *prev, *next; + unsigned long debugctl; prev = &prev_p->thread, next = &next_p->thread; + debugctl = prev->debugctlmsr; + if (next->ds_area_msr != prev->ds_area_msr) { + /* we clear debugctl to make sure DS + * is not in use when we change it */ + debugctl = 0; + wrmsrl(MSR_IA32_DEBUGCTLMSR, 0); + wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr); + } + + if (next->debugctlmsr != debugctl) + wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr); + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { loaddebug(next, 0); loaddebug(next, 1); @@ -531,12 +507,20 @@ static inline void __switch_to_xtra(stru loaddebug(next, 6); loaddebug(next, 7); } + +#ifdef X86_BTS + if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) + ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); + + if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) + ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); +#endif } /* * switch_to(x,y) should switch tasks from x to y. * - * This could still be optimized: + * This could still be optimized: * - fold all the options into a flag word and test it with a single test. * - could test fs/gs bitsliced * @@ -547,7 +531,7 @@ __switch_to(struct task_struct *prev_p, { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; - int cpu = smp_processor_id(); + int cpu = smp_processor_id(); #ifndef CONFIG_X86_NO_TSS struct tss_struct *tss = &per_cpu(init_tss, cpu); #endif @@ -581,11 +565,12 @@ __switch_to(struct task_struct *prev_p, prev_p->fpu_counter = 0; /* - * Reload esp0, LDT and the page table pointer: + * Reload sp0. + * This is load_sp0(tss, next) with a multicall. */ mcl->op = __HYPERVISOR_stack_switch; mcl->args[0] = __KERNEL_DS; - mcl->args[1] = next->rsp0; + mcl->args[1] = next->sp0; mcl++; /* @@ -593,11 +578,12 @@ __switch_to(struct task_struct *prev_p, * This is load_TLS(next, cpu) with multicalls. */ #define C(i) do { \ - if (unlikely(next->tls_array[i] != prev->tls_array[i])) { \ + if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \ + next->tls_array[i].b != prev->tls_array[i].b)) { \ mcl->op = __HYPERVISOR_update_descriptor; \ mcl->args[0] = virt_to_machine( \ - &cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]); \ - mcl->args[1] = next->tls_array[i]; \ + &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\ + mcl->args[1] = *(u64 *)&next->tls_array[i]; \ mcl++; \ } \ } while (0) @@ -605,7 +591,7 @@ __switch_to(struct task_struct *prev_p, #undef C if (unlikely(prev->iopl != next->iopl)) { - iopl_op.iopl = (next->iopl == 0) ? 1 : next->iopl; + iopl_op.iopl = (next->iopl == 0) ? 
1 : (next->iopl >> 12) & 3; #if CONFIG_XEN_COMPAT > 0x030002 mcl->op = __HYPERVISOR_physdev_op; mcl->args[0] = PHYSDEVOP_set_iopl; @@ -669,8 +655,8 @@ __switch_to(struct task_struct *prev_p, /* * Switch the PDA context. */ - prev->userrsp = read_pda(oldrsp); - write_pda(oldrsp, next->userrsp); + prev->usersp = read_pda(oldrsp); + write_pda(oldrsp, next->usersp); write_pda(pcurrent, next_p); write_pda(kernelstack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); @@ -687,7 +673,8 @@ __switch_to(struct task_struct *prev_p, /* * Now maybe reload the debug registers */ - if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)) + if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT || + task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) __switch_to_xtra(prev_p, next_p); /* If the task has used fpu the last 5 timeslices, just do a full @@ -702,23 +689,18 @@ __switch_to(struct task_struct *prev_p, /* * sys_execve() executes a new program. */ -asmlinkage +asmlinkage long sys_execve(char __user *name, char __user * __user *argv, - char __user * __user *envp, struct pt_regs regs) + char __user * __user *envp, struct pt_regs *regs) { long error; char * filename; filename = getname(name); error = PTR_ERR(filename); - if (IS_ERR(filename)) + if (IS_ERR(filename)) return error; - error = do_execve(filename, argv, envp, &regs); - if (error == 0) { - task_lock(current); - current->ptrace &= ~PT_DTRACE; - task_unlock(current); - } + error = do_execve(filename, argv, envp, regs); putname(filename); return error; } @@ -728,18 +710,18 @@ void set_personality_64bit(void) /* inherit personality from parent */ /* Make sure to be in 64bit mode */ - clear_thread_flag(TIF_IA32); + clear_thread_flag(TIF_IA32); /* TBD: overwrites user setup. Should have two bits. But 64bit processes have always behaved this way, so it's not too bad. The main problem is just that - 32bit childs are affected again.
*/ current->personality &= ~READ_IMPLIES_EXEC; } asmlinkage long sys_fork(struct pt_regs *regs) { - return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL); + return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL); } asmlinkage long @@ -747,7 +729,7 @@ sys_clone(unsigned long clone_flags, uns void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) { if (!newsp) - newsp = regs->rsp; + newsp = regs->sp; return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); } @@ -763,29 +745,29 @@ sys_clone(unsigned long clone_flags, uns */ asmlinkage long sys_vfork(struct pt_regs *regs) { - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0, + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0, NULL, NULL); } unsigned long get_wchan(struct task_struct *p) { unsigned long stack; - u64 fp,rip; + u64 fp,ip; int count = 0; if (!p || p == current || p->state==TASK_RUNNING) return 0; stack = (unsigned long)task_stack_page(p); - if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE) + if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE) return 0; - fp = *(u64 *)(p->thread.rsp); + fp = *(u64 *)(p->thread.sp); do { if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE) return 0; - rip = *(u64 *)(fp+8); - if (!in_sched_functions(rip)) - return rip; + ip = *(u64 *)(fp+8); + if (!in_sched_functions(ip)) + return ip; fp = *(u64 *)fp; } while (count++ < 16); return 0; @@ -827,19 +809,19 @@ long do_arch_prctl(struct task_struct *t /* Not strictly needed for fs, but do it for symmetry with gs */ if (addr >= TASK_SIZE_OF(task)) - return -EPERM; + return -EPERM; cpu = get_cpu(); - /* handle small bases via the GDT because that's faster to + /* handle small bases via the GDT because that's faster to switch. 
*/ - if (addr <= 0xffffffff) { + if (addr <= 0xffffffff) { set_32bit_tls(task, FS_TLS, addr); - if (doit) { - load_TLS(&task->thread, cpu); + if (doit) { + load_TLS(&task->thread, cpu); asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL)); } task->thread.fsindex = FS_TLS_SEL; task->thread.fs = 0; - } else { + } else { task->thread.fsindex = 0; task->thread.fs = addr; if (doit) { @@ -852,24 +834,24 @@ long do_arch_prctl(struct task_struct *t } put_cpu(); break; - case ARCH_GET_FS: { - unsigned long base; + case ARCH_GET_FS: { + unsigned long base; if (task->thread.fsindex == FS_TLS_SEL) base = read_32bit_tls(task, FS_TLS); else if (doit) rdmsrl(MSR_FS_BASE, base); else base = task->thread.fs; - ret = put_user(base, (unsigned long __user *)addr); - break; + ret = put_user(base, (unsigned long __user *)addr); + break; } - case ARCH_GET_GS: { + case ARCH_GET_GS: { unsigned long base; unsigned gsindex; if (task->thread.gsindex == GS_TLS_SEL) base = read_32bit_tls(task, GS_TLS); else if (doit) { - asm("movl %%gs,%0" : "=r" (gsindex)); + asm("movl %%gs,%0" : "=r" (gsindex)); if (gsindex) rdmsrl(MSR_KERNEL_GS_BASE, base); else @@ -877,40 +859,21 @@ long do_arch_prctl(struct task_struct *t } else base = task->thread.gs; - ret = put_user(base, (unsigned long __user *)addr); + ret = put_user(base, (unsigned long __user *)addr); break; } default: ret = -EINVAL; break; - } + } - return ret; -} + return ret; +} long sys_arch_prctl(int code, unsigned long addr) { return do_arch_prctl(current, code, addr); -} - -/* - * Capture the user space registers if the task is not running (in user space) - */ -int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) -{ - struct pt_regs *pp, ptregs; - - pp = task_pt_regs(tsk); - - ptregs = *pp; - ptregs.cs &= 0xffff; - ptregs.ss &= 0xffff; - - elf_core_copy_regs(regs, &ptregs); - - boot_option_idle_override = 1; - return 1; } unsigned long arch_align_stack(unsigned long sp) @@ -919,3 +882,9 @@ unsigned long arch_align_stack(unsigned sp -= get_random_int() % 8192; return sp & ~0xf; } + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + unsigned long range_end = mm->brk + 0x02000000; + return randomize_range(mm->brk, range_end, 0) ? : mm->brk; +} --- a/arch/x86/kernel/quirks-xen.c +++ b/arch/x86/kernel/quirks-xen.c @@ -9,7 +9,7 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) { u8 config, rev; - u32 word; + u16 word; /* BIOS may enable hardware IRQ balancing for * E7520/E7320/E7525(revision ID 0x9 and below) @@ -24,14 +24,17 @@ static void __devinit quirk_intel_irqbal pci_read_config_byte(dev, 0xf4, &config); pci_write_config_byte(dev, 0xf4, config|0x2); - /* read xTPR register */ - raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); + /* + * read xTPR register. We may not have a pci_dev for device 8 + * because it might be hidden until the above write. + */ + pci_bus_read_config_word(dev->bus, PCI_DEVFN(8, 0), 0x4c, &word); if (!(word & (1 << 13))) { struct xen_platform_op op; - printk(KERN_INFO "Intel E7520/7320/7525 detected. " - "Disabling irq balancing and affinity\n"); + dev_info(&dev->dev, "Intel E7520/7320/7525 detected; " + "disabling irq balancing and affinity\n"); op.cmd = XENPF_platform_quirk; op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING; WARN_ON(HYPERVISOR_platform_op(&op)); @@ -102,14 +105,16 @@ static void ich_force_enable_hpet(struct pci_read_config_dword(dev, 0xF0, &rcba); rcba &= 0xFFFFC000; if (rcba == 0) { - printk(KERN_DEBUG "RCBA disabled. 
Cannot force enable HPET\n"); + dev_printk(KERN_DEBUG, &dev->dev, "RCBA disabled; " + "cannot force enable HPET\n"); return; } /* use bits 31:14, 16 kB aligned */ rcba_base = ioremap_nocache(rcba, 0x4000); if (rcba_base == NULL) { - printk(KERN_DEBUG "ioremap failed. Cannot force enable HPET\n"); + dev_printk(KERN_DEBUG, &dev->dev, "ioremap failed; " + "cannot force enable HPET\n"); return; } @@ -120,8 +125,8 @@ static void ich_force_enable_hpet(struct /* HPET is enabled in HPTC. Just not reported by BIOS */ val = val & 0x3; force_hpet_address = 0xFED00000 | (val << 12); - printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", - force_hpet_address); + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " + "0x%lx\n", force_hpet_address); iounmap(rcba_base); return; } @@ -140,11 +145,12 @@ static void ich_force_enable_hpet(struct if (err) { force_hpet_address = 0; iounmap(rcba_base); - printk(KERN_DEBUG "Failed to force enable HPET\n"); + dev_printk(KERN_DEBUG, &dev->dev, + "Failed to force enable HPET\n"); } else { force_hpet_resume_type = ICH_FORCE_HPET_RESUME; - printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", - force_hpet_address); + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " + "0x%lx\n", force_hpet_address); } } @@ -160,6 +166,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_I ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1, ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7, + ich_force_enable_hpet); static struct pci_dev *cached_dev; @@ -204,8 +212,8 @@ static void old_ich_force_enable_hpet(st if (val & 0x4) { val &= 0x3; force_hpet_address = 0xFED00000 | (val << 12); - printk(KERN_DEBUG "HPET at base address 0x%lx\n", - force_hpet_address); + dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n", + force_hpet_address); return; } @@ -225,14 +233,14 @@ static void old_ich_force_enable_hpet(st /* HPET is enabled in HPTC. 
Just not reported by BIOS */ val &= 0x3; force_hpet_address = 0xFED00000 | (val << 12); - printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", - force_hpet_address); + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " + "0x%lx\n", force_hpet_address); cached_dev = dev; force_hpet_resume_type = OLD_ICH_FORCE_HPET_RESUME; return; } - printk(KERN_DEBUG "Failed to force enable HPET\n"); + dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n"); } /* @@ -290,8 +298,8 @@ static void vt8237_force_enable_hpet(str */ if (val & 0x80) { force_hpet_address = (val & ~0x3ff); - printk(KERN_DEBUG "HPET at base address 0x%lx\n", - force_hpet_address); + dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n", + force_hpet_address); return; } @@ -305,14 +313,14 @@ static void vt8237_force_enable_hpet(str pci_read_config_dword(dev, 0x68, &val); if (val & 0x80) { force_hpet_address = (val & ~0x3ff); - printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", - force_hpet_address); + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " + "0x%lx\n", force_hpet_address); cached_dev = dev; force_hpet_resume_type = VT8237_FORCE_HPET_RESUME; return; } - printk(KERN_DEBUG "Failed to force enable HPET\n"); + dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n"); } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, @@ -340,7 +348,7 @@ static void nvidia_force_enable_hpet(str pci_read_config_dword(dev, 0x44, &val); force_hpet_address = val & 0xfffffffe; force_hpet_resume_type = NVIDIA_FORCE_HPET_RESUME; - printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", force_hpet_address); cached_dev = dev; return; @@ -353,6 +361,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_N nvidia_force_enable_hpet); /* LPC bridges */ +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0260, + nvidia_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0360, nvidia_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0361, @@ -373,19 +383,19 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_N void force_hpet_resume(void) { switch (force_hpet_resume_type) { - case ICH_FORCE_HPET_RESUME: - return ich_force_hpet_resume(); - - case OLD_ICH_FORCE_HPET_RESUME: - return old_ich_force_hpet_resume(); - - case VT8237_FORCE_HPET_RESUME: - return vt8237_force_hpet_resume(); - - case NVIDIA_FORCE_HPET_RESUME: - return nvidia_force_hpet_resume(); - - default: + case ICH_FORCE_HPET_RESUME: + ich_force_hpet_resume(); + return; + case OLD_ICH_FORCE_HPET_RESUME: + old_ich_force_hpet_resume(); + return; + case VT8237_FORCE_HPET_RESUME: + vt8237_force_hpet_resume(); + return; + case NVIDIA_FORCE_HPET_RESUME: + nvidia_force_hpet_resume(); + return; + default: break; } } --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -181,6 +181,10 @@ unsigned long read_persistent_clock(void { unsigned long retval, flags; +#ifdef CONFIG_XEN + if (!is_initial_xendomain()) + return xen_read_persistent_clock(); +#endif spin_lock_irqsave(&rtc_lock, flags); retval = get_wallclock(); spin_unlock_irqrestore(&rtc_lock, flags); @@ -190,6 +194,10 @@ unsigned long read_persistent_clock(void int update_persistent_clock(struct timespec now) { +#ifdef CONFIG_XEN + if (xen_update_persistent_clock() < 0 || xen_independent_wallclock()) + return 0; +#endif return set_rtc_mmss(now.tv_sec); } --- a/arch/x86/kernel/setup_32-xen.c +++ b/arch/x86/kernel/setup_32-xen.c @@ -47,9 +47,12 @@ #include #include #include +#include 
+#include #include