From: www.kernel.org
Subject: Update to 2.6.22
Patch-mainline: 2.6.22

Automatically created from "patches.kernel.org/patch-2.6.22" by xen-port-patches.py

Acked-by: jbeulich@novell.com

---
 arch/x86/Kconfig | 4
 arch/x86/ia32/ia32entry-xen.S | 18 -
 arch/x86/kernel/Makefile | 2
 arch/x86/kernel/acpi/sleep_64-xen.c | 26 -
 arch/x86/kernel/apic_32-xen.c | 1
 arch/x86/kernel/apic_64-xen.c | 1
 arch/x86/kernel/asm-offsets_32.c | 5
 arch/x86/kernel/cpu/common-xen.c | 224 ++++---------
 arch/x86/kernel/cpu/mtrr/main-xen.c | 2
 arch/x86/kernel/e820_32-xen.c | 46 +-
 arch/x86/kernel/e820_64-xen.c | 28 -
 arch/x86/kernel/early_printk-xen.c | 27 -
 arch/x86/kernel/entry_32-xen.S | 30 -
 arch/x86/kernel/entry_64-xen.S | 7
 arch/x86/kernel/genapic_64-xen.c | 108 +-----
 arch/x86/kernel/genapic_xen_64.c | 3
 arch/x86/kernel/head64-xen.c | 32 +
 arch/x86/kernel/head_32-xen.S | 101 ------
 arch/x86/kernel/head_64-xen.S | 52 ---
 arch/x86/kernel/io_apic_32-xen.c | 43 --
 arch/x86/kernel/io_apic_64-xen.c | 39 --
 arch/x86/kernel/ioport_32-xen.c | 2
 arch/x86/kernel/ioport_64-xen.c | 2
 arch/x86/kernel/irq_32-xen.c | 3
 arch/x86/kernel/irq_64-xen.c | 34 +-
 arch/x86/kernel/ldt_32-xen.c | 1
 arch/x86/kernel/ldt_64-xen.c | 1
 arch/x86/kernel/microcode-xen.c | 2
 arch/x86/kernel/mpparse_32-xen.c | 3
 arch/x86/kernel/mpparse_64-xen.c | 3
 arch/x86/kernel/pci-dma-xen.c | 29 +
 arch/x86/kernel/process_32-xen.c | 27 +
 arch/x86/kernel/process_64-xen.c | 16
 arch/x86/kernel/quirks-xen.c | 63 ---
 arch/x86/kernel/setup64-xen.c | 17 -
 arch/x86/kernel/setup_64-xen.c | 30 -
 arch/x86/kernel/smp_32-xen.c | 191 ++++-------
 arch/x86/kernel/smp_64-xen.c | 29 -
 arch/x86/kernel/time_32-xen.c | 165 ++++----
 arch/x86/kernel/traps_32-xen.c | 46 +-
 arch/x86/kernel/traps_64-xen.c | 55 +--
 arch/x86/kernel/vsyscall_64-xen.c | 73 +++-
 arch/x86/mm/fault_32-xen.c | 42 +-
 arch/x86/mm/fault_64-xen.c | 15
 arch/x86/mm/highmem_32-xen.c | 14
 arch/x86/mm/init_32-xen.c | 157 ++++---
 arch/x86/mm/init_64-xen.c | 132 ++++--
 arch/x86/mm/ioremap_32-xen.c | 1
 arch/x86/mm/pageattr_64-xen.c | 27 +
 arch/x86/mm/pgtable_32-xen.c | 206 +++++-----
 drivers/char/tpm/tpm_xen.c | 2
 drivers/pci/msi-xen.c | 127 +++++--
 drivers/xen/blkfront/blkfront.c | 2
 drivers/xen/char/mem.c | 1
 drivers/xen/core/hypervisor_sysfs.c | 2
 drivers/xen/core/smpboot.c | 45 +-
 drivers/xen/core/xen_sysfs.c | 24 -
 drivers/xen/netback/netback.c | 14
 drivers/xen/netfront/netfront.c | 2
 drivers/xen/pciback/xenbus.c | 2
 drivers/xen/pcifront/xenbus.c | 4
 drivers/xen/scsifront/xenbus.c | 2
 drivers/xen/sfc_netback/accel_fwd.c | 7
 drivers/xen/sfc_netback/accel_solarflare.c | 2
 drivers/xen/sfc_netfront/accel_tso.c | 28 -
 drivers/xen/sfc_netfront/accel_vi.c | 4
 drivers/xen/sfc_netfront/accel_xenbus.c | 4
 fs/aio.c | 7
 include/asm-x86/mach-xen/asm/desc_32.h | 119 ++++---
 include/asm-x86/mach-xen/asm/desc_64.h | 30 -
 include/asm-x86/mach-xen/asm/dma-mapping_64.h | 2
 include/asm-x86/mach-xen/asm/fixmap_32.h | 9
 include/asm-x86/mach-xen/asm/fixmap_64.h | 1
 include/asm-x86/mach-xen/asm/highmem.h | 6
 include/asm-x86/mach-xen/asm/io_32.h | 13
 include/asm-x86/mach-xen/asm/irqflags_32.h | 75 ++--
 include/asm-x86/mach-xen/asm/irqflags_64.h | 19 -
 include/asm-x86/mach-xen/asm/mmu_context_32.h | 29 +
 include/asm-x86/mach-xen/asm/mmu_context_64.h | 3
 include/asm-x86/mach-xen/asm/page_64.h | 61 +--
 include/asm-x86/mach-xen/asm/pgalloc_32.h | 3
 include/asm-x86/mach-xen/asm/pgalloc_64.h | 15
 include/asm-x86/mach-xen/asm/pgtable-3level-defs.h | 2
 include/asm-x86/mach-xen/asm/pgtable-3level.h | 61 ++-
 include/asm-x86/mach-xen/asm/pgtable_32.h | 80 ++--
 include/asm-x86/mach-xen/asm/pgtable_64.h | 83 ++---
 include/asm-x86/mach-xen/asm/processor_32.h | 141 +++-----
 include/asm-x86/mach-xen/asm/processor_64.h | 55 ---
 include/asm-x86/mach-xen/asm/segment_32.h | 10
 include/asm-x86/mach-xen/asm/smp_32.h | 117 +++++--
 include/asm-x86/mach-xen/asm/smp_64.h | 20 -
 include/asm-x86/mach-xen/asm/system_32.h | 342 ++++-----------------
 include/asm-x86/mach-xen/asm/system_64.h | 106 ------
 include/asm-x86/mach-xen/asm/tlbflush_32.h | 11
 include/asm-x86/mach-xen/asm/tlbflush_64.h | 2
 include/linux/pci.h | 2
 lib/swiotlb-xen.c | 1
 net/core/dev.c | 15
 scripts/Makefile.xen.awk | 2
 99 files changed, 1771 insertions(+), 2128 deletions(-)

--- a/arch/x86/ia32/ia32entry-xen.S
+++ b/arch/x86/ia32/ia32entry-xen.S
@@ -431,11 +431,7 @@ ia32_sys_call_table:
 	.quad sys_symlink
 	.quad sys_lstat
 	.quad sys_readlink		/* 85 */
-#ifdef CONFIG_IA32_AOUT
 	.quad sys_uselib
-#else
-	.quad quiet_ni_syscall
-#endif
 	.quad sys_swapon
 	.quad sys_reboot
 	.quad compat_sys_old_readdir
@@ -574,7 +570,7 @@ ia32_sys_call_table:
 	.quad quiet_ni_syscall		/* tux */
 	.quad quiet_ni_syscall		/* security */
 	.quad sys_gettid
-	.quad sys_readahead		/* 225 */
+	.quad sys32_readahead		/* 225 */
 	.quad sys_setxattr
 	.quad sys_lsetxattr
 	.quad sys_fsetxattr
@@ -599,7 +595,7 @@ ia32_sys_call_table:
 	.quad compat_sys_io_getevents
 	.quad compat_sys_io_submit
 	.quad sys_io_cancel
-	.quad sys_fadvise64		/* 250 */
+	.quad sys32_fadvise64		/* 250 */
 	.quad quiet_ni_syscall		/* free_huge_pages */
 	.quad sys_exit_group
 	.quad sys32_lookup_dcookie
@@ -663,10 +659,14 @@ ia32_sys_call_table:
 	.quad compat_sys_set_robust_list
 	.quad compat_sys_get_robust_list
 	.quad sys_splice
-	.quad sys_sync_file_range
-	.quad sys_tee
+	.quad sys32_sync_file_range
+	.quad sys_tee			/* 315 */
 	.quad compat_sys_vmsplice
 	.quad compat_sys_move_pages
 	.quad sys_getcpu
 	.quad sys_epoll_pwait
-ia32_syscall_end:
+	.quad compat_sys_utimensat	/* 320 */
+	.quad compat_sys_signalfd
+	.quad compat_sys_timerfd
+	.quad sys_eventfd
+ia32_syscall_end:
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1429,7 +1429,7 @@ config PHYSICAL_START
 
 config RELOCATABLE
 	bool "Build a relocatable kernel (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && !X86_XEN
+	depends on EXPERIMENTAL && !X86_XEN && !X86_64_XEN
 	help
 	  This builds a kernel image that retains relocation information
 	  so it can be loaded someplace besides the default 1MB.
@@ -1483,7 +1483,6 @@ config COMPAT_VDSO
 	def_bool y
 	prompt "Compat VDSO support"
 	depends on X86_32 || IA32_EMULATION
-	depends on !X86_XEN
 	help
 	  Map the 32-bit VDSO to the predictable old-style address too.
 	---help---
@@ -1662,6 +1661,7 @@ config PCI
 	bool "PCI support"
 	default y
 	select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC)
+	select ARCH_SUPPORTS_MSI if (XEN_UNPRIVILEGED_GUEST && XEN_PCIDEV_FRONTEND)
 	help
 	  Find out whether you have a PCI motherboard. PCI is the name of a
 	  bus system, i.e. the way the CPU talks to the other stuff inside
--- a/arch/x86/kernel/acpi/sleep_64-xen.c
+++ b/arch/x86/kernel/acpi/sleep_64-xen.c
@@ -60,19 +60,6 @@ unsigned long acpi_video_flags;
 extern char wakeup_start, wakeup_end;
 
 extern unsigned long acpi_copy_wakeup_routine(unsigned long);
-
-static pgd_t low_ptr;
-
-static void init_low_mapping(void)
-{
-	pgd_t *slot0 = pgd_offset(current->mm, 0UL);
-	low_ptr = *slot0;
-	/* FIXME: We're playing with the current task's page tables here, which
-	 * is potentially dangerous on SMP systems.
-	 */
-	set_pgd(slot0, *pgd_offset(current->mm, PAGE_OFFSET));
-	local_flush_tlb();
-}
 #endif
 
 /**
@@ -84,8 +71,6 @@ static void init_low_mapping(void)
 int acpi_save_state_mem(void)
 {
 #ifndef CONFIG_ACPI_PV_SLEEP
-	init_low_mapping();
-
 	memcpy((void *)acpi_wakeup_address, &wakeup_start,
 	       &wakeup_end - &wakeup_start);
 	acpi_copy_wakeup_routine(acpi_wakeup_address);
@@ -98,10 +83,6 @@ int acpi_save_state_mem(void)
  */
 void acpi_restore_state_mem(void)
 {
-#ifndef CONFIG_ACPI_PV_SLEEP
-	set_pgd(pgd_offset(current->mm, 0UL), low_ptr);
-	local_flush_tlb();
-#endif
 }
 
 /**
@@ -115,10 +96,11 @@ void acpi_restore_state_mem(void)
 void __init acpi_reserve_bootmem(void)
 {
 #ifndef CONFIG_ACPI_PV_SLEEP
-	acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
-	if ((&wakeup_end - &wakeup_start) > PAGE_SIZE)
+	acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
+	if ((&wakeup_end - &wakeup_start) > (PAGE_SIZE*2))
 		printk(KERN_CRIT
-		       "ACPI: Wakeup code way too big, will crash on attempt to suspend\n");
+		       "ACPI: Wakeup code way too big, will crash on attempt"
+		       " to suspend\n");
 #endif
 }
 
--- a/arch/x86/kernel/apic_32-xen.c
+++ b/arch/x86/kernel/apic_32-xen.c
@@ -19,7 +19,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
--- a/arch/x86/kernel/apic_64-xen.c
+++ b/arch/x86/kernel/apic_64-xen.c
@@ -19,7 +19,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -109,11 +109,6 @@ void foo(void)
 
 	OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
 
-#ifdef CONFIG_XEN
-	BLANK();
-	OFFSET(XEN_START_mfn_list, start_info, mfn_list);
-#endif
-
 #ifdef CONFIG_PARAVIRT
 	BLANK();
 	OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
--- a/arch/x86/kernel/cpu/common-xen.c
+++ b/arch/x86/kernel/cpu/common-xen.c
@@ -22,16 +22,40 @@
 #define phys_pkg_id(a,b) a
 #endif
 #endif
-#include
 #include
 
 #include "cpu.h"
 
-DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
-EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
+DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
+	[GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 },
+	[GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 },
+	[GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 },
+	[GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 },
+#ifndef CONFIG_XEN
+	/*
+	 * Segments used for calling PnP BIOS have byte granularity.
+	 * They code segments and data segments have fixed 64k limits,
+	 * the transfer segment sizes are set at run time.
+	 */
+	[GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
+	[GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */
+	[GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */
+	[GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */
+	[GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */
+	/*
+	 * The APM segments have byte granularity and their bases
+	 * are set at run time. All have 64k limits.
+ */ + [GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */ + /* 16-bit code */ + [GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 }, + [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */ -struct i386_pda *_cpu_pda[NR_CPUS] __read_mostly; -EXPORT_SYMBOL(_cpu_pda); + [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 }, +#endif + [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 }, +} }; +EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); static int cachesize_override __cpuinitdata = -1; static int disable_x86_fxsr __cpuinitdata; @@ -373,7 +397,7 @@ __setup("serialnumber", x86_serial_nr_se /* * This does the hard work of actually picking apart the CPU stuff... */ -void __cpuinit identify_cpu(struct cpuinfo_x86 *c) +static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) { int i; @@ -484,15 +508,22 @@ void __cpuinit identify_cpu(struct cpuin /* Init Machine Check Exception if available. */ mcheck_init(c); +} - if (c == &boot_cpu_data) - sysenter_setup(); +void __init identify_boot_cpu(void) +{ + identify_cpu(&boot_cpu_data); + sysenter_setup(); enable_sep_cpu(); + mtrr_bp_init(); +} - if (c == &boot_cpu_data) - mtrr_bp_init(); - else - mtrr_ap_init(); +void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) +{ + BUG_ON(c == &boot_cpu_data); + identify_cpu(c); + enable_sep_cpu(); + mtrr_ap_init(); } #ifdef CONFIG_X86_HT @@ -606,136 +637,47 @@ void __init early_cpu_init(void) #endif } -/* Make sure %gs is initialized properly in idle threads */ +/* Make sure %fs is initialized properly in idle threads */ struct pt_regs * __devinit idle_regs(struct pt_regs *regs) { memset(regs, 0, sizeof(struct pt_regs)); - regs->xfs = __KERNEL_PDA; + regs->xfs = __KERNEL_PERCPU; return regs; } -static __cpuinit int alloc_gdt(int cpu) +/* Current gdt points %fs at the "master" per-cpu area: after this, + * it's on the real one. */ +void switch_to_new_gdt(void) { - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); - struct desc_struct *gdt; - struct i386_pda *pda; - - gdt = (struct desc_struct *)cpu_gdt_descr->address; - pda = cpu_pda(cpu); - - /* - * This is a horrible hack to allocate the GDT. The problem - * is that cpu_init() is called really early for the boot CPU - * (and hence needs bootmem) but much later for the secondary - * CPUs, when bootmem will have gone away - */ - if (NODE_DATA(0)->bdata->node_bootmem_map) { - BUG_ON(gdt != NULL || pda != NULL); - - gdt = alloc_bootmem_pages(PAGE_SIZE); - pda = alloc_bootmem(sizeof(*pda)); - /* alloc_bootmem(_pages) panics on failure, so no check */ - - memset(gdt, 0, PAGE_SIZE); - memset(pda, 0, sizeof(*pda)); - } else { - /* GDT and PDA might already have been allocated if - this is a CPU hotplug re-insertion. */ - if (gdt == NULL) - gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL); - - if (pda == NULL) - pda = kmalloc_node(sizeof(*pda), GFP_KERNEL, cpu_to_node(cpu)); - - if (unlikely(!gdt || !pda)) { - free_pages((unsigned long)gdt, 0); - kfree(pda); - return 0; - } - } - - cpu_gdt_descr->address = (unsigned long)gdt; - cpu_pda(cpu) = pda; - - return 1; -} - -/* Initial PDA used by boot CPU */ -struct i386_pda boot_pda = { - ._pda = &boot_pda, - .cpu_number = 0, - .pcurrent = &init_task, -}; - -static inline void set_kernel_fs(void) -{ - /* Set %fs for this CPU's PDA. Memory clobber is to create a - barrier with respect to any PDA operations, so the compiler - doesn't move any before here. */ - asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory"); -} - -/* Initialize the CPU's GDT and PDA. 
The boot CPU does this for - itself, but secondaries find this done for them. */ -__cpuinit int init_gdt(int cpu, struct task_struct *idle) -{ - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); - struct desc_struct *gdt; - struct i386_pda *pda; - - /* For non-boot CPUs, the GDT and PDA should already have been - allocated. */ - if (!alloc_gdt(cpu)) { - printk(KERN_CRIT "CPU%d failed to allocate GDT or PDA\n", cpu); - return 0; - } - - gdt = (struct desc_struct *)cpu_gdt_descr->address; - pda = cpu_pda(cpu); - - BUG_ON(gdt == NULL || pda == NULL); - - /* - * Initialize the per-CPU GDT with the boot GDT, - * and set up the GDT descriptor: - */ - memcpy(gdt, cpu_gdt_table, GDT_SIZE); - cpu_gdt_descr->size = GDT_SIZE - 1; - - pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a, - (u32 *)&gdt[GDT_ENTRY_PDA].b, - (unsigned long)pda, sizeof(*pda) - 1, - 0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */ - - memset(pda, 0, sizeof(*pda)); - pda->_pda = pda; - pda->cpu_number = cpu; - pda->pcurrent = idle; - - return 1; -} - -void __cpuinit cpu_set_gdt(int cpu) -{ - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); + struct Xgt_desc_struct gdt_descr; unsigned long va, frames[16]; int f; - for (va = cpu_gdt_descr->address, f = 0; - va < cpu_gdt_descr->address + cpu_gdt_descr->size; + gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); + gdt_descr.size = GDT_SIZE - 1; + + for (va = gdt_descr.address, f = 0; + va < gdt_descr.address + gdt_descr.size; va += PAGE_SIZE, f++) { frames[f] = virt_to_mfn(va); make_lowmem_page_readonly( (void *)va, XENFEAT_writable_descriptor_tables); } - BUG_ON(HYPERVISOR_set_gdt(frames, (cpu_gdt_descr->size + 1) / 8)); - - set_kernel_fs(); + if (HYPERVISOR_set_gdt(frames, (gdt_descr.size + 1) / 8)) + BUG(); + asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); } -/* Common CPU init for both boot and secondary CPUs */ -static void __cpuinit _cpu_init(int cpu, struct task_struct *curr) +/* + * cpu_init() initializes state that is per-CPU. Some data is already + * initialized (naturally) in the bootstrap process, such as the GDT + * and IDT. We reload them nevertheless, this function acts as a + * 'CPU state barrier', nothing should get across. + */ +void __cpuinit cpu_init(void) { + int cpu = smp_processor_id(); + struct task_struct *curr = current; #ifndef CONFIG_X86_NO_TSS struct tss_struct * t = &per_cpu(init_tss, cpu); #endif @@ -757,6 +699,8 @@ static void __cpuinit _cpu_init(int cpu, set_in_cr4(X86_CR4_TSD); } + switch_to_new_gdt(); + /* * Set up and load the per-CPU TSS and LDT */ @@ -794,38 +738,6 @@ static void __cpuinit _cpu_init(int cpu, mxcsr_feature_mask_init(); } -/* Entrypoint to initialize secondary CPU */ -void __cpuinit secondary_cpu_init(void) -{ - int cpu = smp_processor_id(); - struct task_struct *curr = current; - - _cpu_init(cpu, curr); -} - -/* - * cpu_init() initializes state that is per-CPU. Some data is already - * initialized (naturally) in the bootstrap process, such as the GDT - * and IDT. We reload them nevertheless, this function acts as a - * 'CPU state barrier', nothing should get across. - */ -void __cpuinit cpu_init(void) -{ - int cpu = smp_processor_id(); - struct task_struct *curr = current; - - /* Set up the real GDT and PDA, so we can transition from the - boot versions. */ - if (!init_gdt(cpu, curr)) { - /* failed to allocate something; not much we can do... 
*/ - for (;;) - local_irq_enable(); - } - - cpu_set_gdt(cpu); - _cpu_init(cpu, curr); -} - #ifdef CONFIG_HOTPLUG_CPU void __cpuinit cpu_uninit(void) { --- a/arch/x86/kernel/cpu/mtrr/main-xen.c +++ b/arch/x86/kernel/cpu/mtrr/main-xen.c @@ -167,7 +167,7 @@ mtrr_del(int reg, unsigned long base, un EXPORT_SYMBOL(mtrr_add); EXPORT_SYMBOL(mtrr_del); -void __init mtrr_bp_init(void) +__init void mtrr_bp_init(void) { } --- a/arch/x86/kernel/e820_32-xen.c +++ b/arch/x86/kernel/e820_32-xen.c @@ -162,26 +162,27 @@ static struct resource standard_io_resou static int __init romsignature(const unsigned char *rom) { + const unsigned short * const ptr = (const unsigned short *)rom; unsigned short sig; - return probe_kernel_address((const unsigned short *)rom, sig) == 0 && - sig == ROMSIGNATURE; + return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE; } -static int __init romchecksum(unsigned char *rom, unsigned long length) +static int __init romchecksum(const unsigned char *rom, unsigned long length) { - unsigned char sum; + unsigned char sum, c; - for (sum = 0; length; length--) - sum += *rom++; - return sum == 0; + for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--) + sum += c; + return !length && !sum; } static void __init probe_roms(void) { + const unsigned char *rom; unsigned long start, length, upper; - unsigned char *rom; - int i; + unsigned char c; + int i; #ifdef CONFIG_XEN /* Nothing to do if not running in dom0. */ @@ -198,8 +199,11 @@ static void __init probe_roms(void) video_rom_resource.start = start; + if (probe_kernel_address(rom + 2, c) != 0) + continue; + /* 0 < length <= 0x7f * 512, historically */ - length = rom[2] * 512; + length = c * 512; /* if checksum okay, trust length byte */ if (length && romchecksum(rom, length)) @@ -233,8 +237,11 @@ static void __init probe_roms(void) if (!romsignature(rom)) continue; + if (probe_kernel_address(rom + 2, c) != 0) + continue; + /* 0 < length <= 0x7f * 512, historically */ - length = rom[2] * 512; + length = c * 512; /* but accept any length that fits if checksum okay */ if (!length || start + length > upper || !romchecksum(rom, length)) @@ -249,7 +256,7 @@ static void __init probe_roms(void) } #ifdef CONFIG_XEN -static struct e820map machine_e820 __initdata; +static struct e820map machine_e820; #define e820 machine_e820 #endif @@ -409,10 +416,8 @@ int __init sanitize_e820_map(struct e820 ____________________33__ ______________________4_ */ - printk("sanitize start\n"); /* if there's only one memory region, don't bother */ if (*pnr_map < 2) { - printk("sanitize bail 0\n"); return -1; } @@ -421,7 +426,6 @@ int __init sanitize_e820_map(struct e820 /* bail out if we find any unreasonable addresses in bios map */ for (i=0; isize; unsigned long long end = start + size; unsigned long type = biosmap->type; - printk("copy_e820_map() start: %016Lx size: %016Lx end: %016Lx type: %ld\n", start, size, end, type); /* Overflow in 64 bits? Ignore the memory map. */ if (start > end) @@ -564,17 +566,11 @@ int __init copy_e820_map(struct e820entr * Not right. Fix it up. 
*/ if (type == E820_RAM) { - printk("copy_e820_map() type is E820_RAM\n"); if (start < 0x100000ULL && end > 0xA0000ULL) { - printk("copy_e820_map() lies in range...\n"); - if (start < 0xA0000ULL) { - printk("copy_e820_map() start < 0xA0000ULL\n"); + if (start < 0xA0000ULL) add_memory_region(start, 0xA0000ULL-start, type); - } - if (end <= 0x100000ULL) { - printk("copy_e820_map() end <= 0x100000ULL\n"); + if (end <= 0x100000ULL) continue; - } start = 0x100000ULL; size = end - start; } --- a/arch/x86/kernel/e820_64-xen.c +++ b/arch/x86/kernel/e820_64-xen.c @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include #include @@ -28,7 +30,7 @@ struct e820map e820 __initdata; #ifdef CONFIG_XEN -struct e820map machine_e820 __initdata; +struct e820map machine_e820; #endif /* @@ -291,22 +293,6 @@ void __init e820_reserve_resources(struc } #ifndef CONFIG_XEN -/* Mark pages corresponding to given address range as nosave */ -static void __init -e820_mark_nosave_range(unsigned long start, unsigned long end) -{ - unsigned long pfn, max_pfn; - - if (start >= end) - return; - - printk("Nosave address range: %016lx - %016lx\n", start, end); - max_pfn = end >> PAGE_SHIFT; - for (pfn = start >> PAGE_SHIFT; pfn < max_pfn; pfn++) - if (pfn_valid(pfn)) - SetPageNosave(pfn_to_page(pfn)); -} - /* * Find the ranges of physical addresses that do not correspond to * e820 RAM areas and mark the corresponding pages as nosave for software @@ -325,13 +311,13 @@ void __init e820_mark_nosave_regions(voi struct e820entry *ei = &e820.map[i]; if (paddr < ei->addr) - e820_mark_nosave_range(paddr, - round_up(ei->addr, PAGE_SIZE)); + register_nosave_region(PFN_DOWN(paddr), + PFN_UP(ei->addr)); paddr = round_down(ei->addr + ei->size, PAGE_SIZE); if (ei->type != E820_RAM) - e820_mark_nosave_range(round_up(ei->addr, PAGE_SIZE), - paddr); + register_nosave_region(PFN_UP(ei->addr), + PFN_DOWN(paddr)); if (paddr >= (end_pfn << PAGE_SHIFT)) break; --- a/arch/x86/kernel/early_printk-xen.c +++ b/arch/x86/kernel/early_printk-xen.c @@ -11,11 +11,10 @@ #ifdef __i386__ #include -#define VGABASE (__ISA_IO_base + 0xb8000) #else #include -#define VGABASE ((void __iomem *)0xffffffff800b8000UL) #endif +#define VGABASE (__ISA_IO_base + 0xb8000) #ifndef CONFIG_XEN static int max_ypos = 25, max_xpos = 80; @@ -93,9 +92,9 @@ static int early_serial_putc(unsigned ch static void early_serial_write(struct console *con, const char *s, unsigned n) { while (*s && n-- > 0) { - early_serial_putc(*s); if (*s == '\n') early_serial_putc('\r'); + early_serial_putc(*s); s++; } } @@ -205,7 +204,7 @@ static noinline long simnow(long cmd, lo return ret; } -void __init simnow_init(char *str) +static void __init simnow_init(char *str) { char *fn = "klog"; if (*str == '=') @@ -277,22 +276,12 @@ static int __init setup_early_printk(cha early_console = &simnow_console; keep_early = 1; } + + if (keep_early) + early_console->flags &= ~CON_BOOT; + else + early_console->flags |= CON_BOOT; register_console(early_console); return 0; } - early_param("earlyprintk", setup_early_printk); - -void __init disable_early_printk(void) -{ - if (!early_console_initialized || !early_console) - return; - if (!keep_early) { - printk("disabling early console\n"); - unregister_console(early_console); - early_console_initialized = 0; - } else { - printk("keeping early console\n"); - } -} - --- a/arch/x86/kernel/entry_32-xen.S +++ b/arch/x86/kernel/entry_32-xen.S @@ -15,7 +15,7 @@ * I changed all the .align's to 4 (16 byte alignment), as that's faster * on a 486. 
* - * Stack layout in 'ret_from_system_call': + * Stack layout in 'syscall_exit': * ptrace needs to have all regs on the stack. * if the order here is changed, it needs to be * updated in fork.c:copy_process, signal.c:do_signal, @@ -135,7 +135,7 @@ NMI_MASK = 0x80000000 movl $(__USER_DS), %edx; \ movl %edx, %ds; \ movl %edx, %es; \ - movl $(__KERNEL_PDA), %edx; \ + movl $(__KERNEL_PERCPU), %edx; \ movl %edx, %fs #define RESTORE_INT_REGS \ @@ -308,16 +308,12 @@ sysenter_past_esp: pushl $(__USER_CS) CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET cs, 0*/ -#ifndef CONFIG_COMPAT_VDSO /* * Push current_thread_info()->sysenter_return to the stack. * A tiny bit of offset fixup is necessary - 4*4 means the 4 words * pushed above; +8 corresponds to copy_thread's esp0 setting. */ pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) -#else - pushl $SYSENTER_RETURN -#endif CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET eip, 0 @@ -345,7 +341,7 @@ sysenter_past_esp: jae syscall_badsys call *sys_call_table(,%eax,4) movl %eax,PT_EAX(%esp) - DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX) + DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx @@ -400,10 +396,6 @@ ENTRY(system_call) CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL GET_THREAD_INFO(%ebp) - testl $TF_MASK,PT_EFLAGS(%esp) - jz no_singlestep - orl $_TIF_SINGLESTEP,TI_flags(%ebp) -no_singlestep: # system call tracing in operation / emulation /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) @@ -418,6 +410,10 @@ syscall_exit: # setting need_resched or sigpending # between sampling and the iret TRACE_IRQS_OFF + testl $TF_MASK,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit + jz no_singlestep + orl $_TIF_SINGLESTEP,TI_flags(%ebp) +no_singlestep: movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx # current->work jne syscall_exit_work @@ -635,9 +631,7 @@ END(syscall_badsys) #ifndef CONFIG_XEN #define FIXUP_ESPFIX_STACK \ /* since we are on a wrong stack, we cant make it a C code :( */ \ - movl %fs:PDA_cpu, %ebx; \ - PER_CPU(cpu_gdt_descr, %ebx); \ - movl GDS_address(%ebx), %ebx; \ + PER_CPU(gdt_page, %ebx); \ GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ addl %esp, %eax; \ pushl $__KERNEL_DS; \ @@ -710,7 +704,7 @@ ENTRY(name) \ SAVE_ALL; \ TRACE_IRQS_OFF \ movl %esp,%eax; \ - call smp_/**/name; \ + call smp_##name; \ jmp ret_from_intr; \ CFI_ENDPROC; \ ENDPROC(name) @@ -718,10 +712,6 @@ ENDPROC(name) /* The include is where all of the SMP etc. 
interrupts come from */ #include "entry_arch.h" -/* This alternate entry is needed because we hijack the apic LVTT */ -#if defined(CONFIG_VMI) && defined(CONFIG_X86_LOCAL_APIC) -BUILD_INTERRUPT(apic_vmi_timer_interrupt,LOCAL_TIMER_VECTOR) -#endif #else #define UNWIND_ESPFIX_STACK #endif @@ -764,7 +754,7 @@ error_code: pushl %fs CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET fs, 0*/ - movl $(__KERNEL_PDA), %ecx + movl $(__KERNEL_PERCPU), %ecx movl %ecx, %fs UNWIND_ESPFIX_STACK popl %ecx --- a/arch/x86/kernel/entry_64-xen.S +++ b/arch/x86/kernel/entry_64-xen.S @@ -1254,3 +1254,10 @@ ENTRY(call_softirq) ret CFI_ENDPROC ENDPROC(call_softirq) + +KPROBE_ENTRY(ignore_sysret) + CFI_STARTPROC + mov $-ENOSYS,%eax + HYPERVISOR_IRET 0 + CFI_ENDPROC +ENDPROC(ignore_sysret) --- a/arch/x86/kernel/genapic_64-xen.c +++ b/arch/x86/kernel/genapic_64-xen.c @@ -11,123 +11,57 @@ #include #include #include +#include #include #include #include -#include #include #include +#include -#if defined(CONFIG_ACPI) +#ifdef CONFIG_ACPI #include #endif /* which logical CPU number maps to which CPU (physical APIC ID) */ -u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; +u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly + = { [0 ... NR_CPUS-1] = BAD_APICID }; EXPORT_SYMBOL(x86_cpu_to_apicid); -u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; -extern struct genapic apic_cluster; -extern struct genapic apic_flat; -extern struct genapic apic_physflat; +u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; #ifndef CONFIG_XEN -struct genapic *genapic = &apic_flat; -struct genapic *genapic_force; +struct genapic __read_mostly *genapic = &apic_flat; #else extern struct genapic apic_xen; -struct genapic *genapic = &apic_xen; +struct genapic __read_mostly *genapic = &apic_xen; #endif /* * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. */ -void __init clustered_apic_check(void) +void __init setup_apic_routing(void) { #ifndef CONFIG_XEN - long i; - u8 clusters, max_cluster; - u8 id; - u8 cluster_cnt[NUM_APIC_CLUSTERS]; - int max_apic = 0; - - /* genapic selection can be forced because of certain quirks. - */ - if (genapic_force) { - genapic = genapic_force; - goto print; - } - -#if defined(CONFIG_ACPI) +#ifdef CONFIG_ACPI /* - * Some x86_64 machines use physical APIC mode regardless of how many - * procs/clusters are present (x86_64 ES7000 is an example). + * Quirk: some x86_64 machines can only use physical APIC mode + * regardless of how many processors are present (x86_64 ES7000 + * is an example). */ - if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID) - if (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL) { - genapic = &apic_cluster; - goto print; - } -#endif - - memset(cluster_cnt, 0, sizeof(cluster_cnt)); - for (i = 0; i < NR_CPUS; i++) { - id = bios_cpu_apicid[i]; - if (id == BAD_APICID) - continue; - if (id > max_apic) - max_apic = id; - cluster_cnt[APIC_CLUSTERID(id)]++; - } - - /* Don't use clustered mode on AMD platforms. */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && + (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) genapic = &apic_physflat; -#ifndef CONFIG_HOTPLUG_CPU - /* In the CPU hotplug case we cannot use broadcast mode - because that opens a race when a CPU is removed. - Stay at physflat mode in this case. - It is bad to do this unconditionally though. 
Once - we have ACPI platform support for CPU hotplug - we should detect hotplug capablity from ACPI tables and - only do this when really needed. -AK */ - if (max_apic <= 8) - genapic = &apic_flat; -#endif - goto print; - } - - clusters = 0; - max_cluster = 0; - - for (i = 0; i < NUM_APIC_CLUSTERS; i++) { - if (cluster_cnt[i] > 0) { - ++clusters; - if (cluster_cnt[i] > max_cluster) - max_cluster = cluster_cnt[i]; - } - } + else +#endif - /* - * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode, - * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical - * else physical mode. - * (We don't use lowest priority delivery + HW APIC IRQ steering, so - * can ignore the clustered logical case and go straight to physical.) - */ - if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) { -#ifdef CONFIG_HOTPLUG_CPU - /* Don't use APIC shortcuts in CPU hotplug to avoid races */ - genapic = &apic_physflat; -#else + if (cpus_weight(cpu_possible_map) <= 8) genapic = &apic_flat; -#endif - } else - genapic = &apic_cluster; + else + genapic = &apic_physflat; -print: #else /* hardcode to xen apic functions */ genapic = &apic_xen; @@ -135,7 +69,7 @@ print: printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); } -/* Same for both flat and clustered. */ +/* Same for both flat and physical. */ #ifdef CONFIG_XEN extern void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest); --- a/arch/x86/kernel/genapic_xen_64.c +++ b/arch/x86/kernel/genapic_xen_64.c @@ -21,9 +21,8 @@ #include #else #include -#include -#include #endif +#include #include DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]); --- a/arch/x86/kernel/head_32-xen.S +++ b/arch/x86/kernel/head_32-xen.S @@ -37,7 +37,8 @@ ENTRY(startup_32) /* Set up the stack pointer */ movl $(init_thread_union+THREAD_SIZE),%esp - call setup_pda + movl %ss,%eax + movl %eax,%fs # gets reset once there's real percpu /* get vendor info */ xorl %eax,%eax # call CPUID with 0 -> return vendor ID @@ -64,55 +65,11 @@ ENTRY(startup_32) xorl %eax,%eax # Clear GS movl %eax,%gs - movl $(__KERNEL_PDA),%eax - mov %eax,%fs - cld # gcc2 wants the direction flag cleared at all times pushl $0 # fake return address for unwinder jmp start_kernel -/* - * Point the GDT at this CPU's PDA. This will be - * cpu_gdt_table and boot_pda. - */ -ENTRY(setup_pda) - /* get the PDA pointer */ - movl $boot_pda, %eax - - /* slot the PDA address into the GDT */ - mov $cpu_gdt_table, %ecx - mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */ - shr $16, %eax - mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */ - mov %ah, (__KERNEL_PDA+4+3)(%ecx) /* base & 0xff000000 */ - - # %esi still points to start_info, and no registers - # need to be preserved. - - movl XEN_START_mfn_list(%esi), %ebx - movl $(cpu_gdt_table - __PAGE_OFFSET), %eax - shrl $PAGE_SHIFT, %eax - movl (%ebx,%eax,4), %ecx - pushl %ecx # frame number for set_gdt below - - xorl %esi, %esi - xorl %edx, %edx - shldl $PAGE_SHIFT, %ecx, %edx - shll $PAGE_SHIFT, %ecx - orl $0x61, %ecx - movl $cpu_gdt_table, %ebx - movl $__HYPERVISOR_update_va_mapping, %eax - int $0x82 - - movl $(PAGE_SIZE_asm / 8), %ecx - movl %esp, %ebx - movl $__HYPERVISOR_set_gdt, %eax - int $0x82 - - popl %ecx - ret - #define HYPERCALL_PAGE_OFFSET 0x1000 .org HYPERCALL_PAGE_OFFSET ENTRY(hypercall_page) @@ -138,60 +95,6 @@ ENTRY(empty_zero_page) */ .data -/* - * The Global Descriptor Table contains 28 quadwords, per-CPU. 
- */ - .section .data.page_aligned, "aw" - .align PAGE_SIZE_asm -ENTRY(cpu_gdt_table) - .quad 0x0000000000000000 /* NULL descriptor */ - .quad 0x0000000000000000 /* 0x0b reserved */ - .quad 0x0000000000000000 /* 0x13 reserved */ - .quad 0x0000000000000000 /* 0x1b reserved */ - .quad 0x0000000000000000 /* 0x20 unused */ - .quad 0x0000000000000000 /* 0x28 unused */ - .quad 0x0000000000000000 /* 0x33 TLS entry 1 */ - .quad 0x0000000000000000 /* 0x3b TLS entry 2 */ - .quad 0x0000000000000000 /* 0x43 TLS entry 3 */ - .quad 0x0000000000000000 /* 0x4b reserved */ - .quad 0x0000000000000000 /* 0x53 reserved */ - .quad 0x0000000000000000 /* 0x5b reserved */ - - .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */ - .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */ - .quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */ - .quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */ - - .quad 0x0000000000000000 /* 0x80 TSS descriptor */ - .quad 0x0000000000000000 /* 0x88 LDT descriptor */ - - /* - * Segments used for calling PnP BIOS have byte granularity. - * They code segments and data segments have fixed 64k limits, - * the transfer segment sizes are set at run time. - */ - .quad 0x0000000000000000 /* 0x90 32-bit code */ - .quad 0x0000000000000000 /* 0x98 16-bit code */ - .quad 0x0000000000000000 /* 0xa0 16-bit data */ - .quad 0x0000000000000000 /* 0xa8 16-bit data */ - .quad 0x0000000000000000 /* 0xb0 16-bit data */ - - /* - * The APM segments have byte granularity and their bases - * are set at run time. All have 64k limits. - */ - .quad 0x0000000000000000 /* 0xb8 APM CS code */ - .quad 0x0000000000000000 /* 0xc0 APM CS 16 code (16 bit) */ - .quad 0x0000000000000000 /* 0xc8 APM DS data */ - - .quad 0x0000000000000000 /* 0xd0 - ESPFIX SS */ - .quad 0x00cf92000000ffff /* 0xd8 - PDA */ - .quad 0x0000000000000000 /* 0xe0 - unused */ - .quad 0x0000000000000000 /* 0xe8 - unused */ - .quad 0x0000000000000000 /* 0xf0 - unused */ - .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */ - .align PAGE_SIZE_asm - #if CONFIG_XEN_COMPAT <= 0x030002 /* * __xen_guest information --- a/arch/x86/kernel/head64-xen.c +++ b/arch/x86/kernel/head64-xen.c @@ -25,13 +25,21 @@ #include #include #include +#include #include unsigned long start_pfn; +#ifndef CONFIG_XEN +static void __init zap_identity_mappings(void) +{ + pgd_t *pgd = pgd_offset_k(0UL); + pgd_clear(pgd); + __flush_tlb(); +} + /* Don't add a printk in there. printk relies on the PDA which is not initialized yet. 
*/ -#if 0 static void __init clear_bss(void) { memset(__bss_start, 0, @@ -40,26 +48,25 @@ static void __init clear_bss(void) #endif #define NEW_CL_POINTER 0x228 /* Relative to real mode data */ -#define OLD_CL_MAGIC_ADDR 0x90020 +#define OLD_CL_MAGIC_ADDR 0x20 #define OLD_CL_MAGIC 0xA33F -#define OLD_CL_BASE_ADDR 0x90000 -#define OLD_CL_OFFSET 0x90022 +#define OLD_CL_OFFSET 0x22 static void __init copy_bootdata(char *real_mode_data) { #ifndef CONFIG_XEN - int new_data; + unsigned long new_data; char * command_line; memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE); - new_data = *(int *) (x86_boot_params + NEW_CL_POINTER); + new_data = *(u32 *) (x86_boot_params + NEW_CL_POINTER); if (!new_data) { - if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) { + if (OLD_CL_MAGIC != *(u16 *)(real_mode_data + OLD_CL_MAGIC_ADDR)) { return; } - new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET; + new_data = __pa(real_mode_data) + *(u16 *)(real_mode_data + OLD_CL_OFFSET); } - command_line = (char *) ((u64)(new_data)); + command_line = __va(new_data); memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); #else int max_cmdline; @@ -101,10 +108,13 @@ void __init x86_64_start_kernel(char * r while ((1UL << machine_to_phys_order) < machine_to_phys_nr_ents ) machine_to_phys_order++; -#if 0 +#ifndef CONFIG_XEN /* clear bss before set_intr_gate with early_idt_handler */ clear_bss(); + /* Make NULL pointers segfault */ + zap_identity_mappings(); + for (i = 0; i < IDT_ENTRIES; i++) set_intr_gate(i, early_idt_handler); asm volatile("lidt %0" :: "m" (idt_descr)); @@ -116,7 +126,7 @@ void __init x86_64_start_kernel(char * r cpu_pda(i) = &boot_cpu_pda[i]; pda_init(0); - copy_bootdata(real_mode_data); + copy_bootdata(__va(real_mode_data)); #ifdef CONFIG_SMP cpu_set(0, cpu_online_map); #endif --- a/arch/x86/kernel/head_64-xen.S +++ b/arch/x86/kernel/head_64-xen.S @@ -5,6 +5,7 @@ * Copyright (C) 2000 Pavel Machek * Copyright (C) 2000 Karsten Keil * Copyright (C) 2001,2002 Andi Kleen + * Copyright (C) 2005 Eric Biederman * Jun Nakajima * Modified for Xen */ @@ -34,27 +35,15 @@ startup_64: pushq $0 # fake return address jmp x86_64_start_kernel -#ifdef CONFIG_ACPI_SLEEP -.org 0xf00 - .globl pGDT32 -pGDT32: - .word gdt_end-cpu_gdt_table-1 - .long cpu_gdt_table-__START_KERNEL_map -#endif -ENTRY(stext) -ENTRY(_stext) +.balign PAGE_SIZE - $page = 0 #define NEXT_PAGE(name) \ - $page = $page + 1; \ - .org $page * 0x1000; \ - phys_##name = $page * 0x1000 + __PHYSICAL_START; \ + .balign PAGE_SIZE; \ + phys_##name = . - .bootstrap.text; \ ENTRY(name) NEXT_PAGE(init_level4_pgt) - /* This gets initialized in x86_64_start_kernel */ .fill 512,8,0 -NEXT_PAGE(init_level4_user_pgt) /* * We update two pgd entries to make kernel and user pgd consistent * at pgd_populate(). It can be used for kernel modules. So we place @@ -101,14 +90,6 @@ NEXT_PAGE(hypercall_page) #undef NEXT_PAGE .data -/* Just dummy symbol to allow compilation. 
Not used in sleep path */ -#ifdef CONFIG_ACPI_SLEEP - .align PAGE_SIZE -ENTRY(wakeup_level4_pgt) - .fill 512,8,0 -#endif - - .data .align 16 .globl cpu_gdt_descr @@ -136,13 +117,13 @@ gdt: ENTRY(cpu_gdt_table) .quad 0x0000000000000000 /* NULL descriptor */ + .quad 0x00cf9b000000ffff /* __KERNEL32_CS */ + .quad 0x00af9b000000ffff /* __KERNEL_CS */ + .quad 0x00cf93000000ffff /* __KERNEL_DS */ + .quad 0x00cffb000000ffff /* __USER32_CS */ + .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */ + .quad 0x00affb000000ffff /* __USER_CS */ .quad 0x0 /* unused */ - .quad 0x00af9a000000ffff /* __KERNEL_CS */ - .quad 0x00cf92000000ffff /* __KERNEL_DS */ - .quad 0x00cffa000000ffff /* __USER32_CS */ - .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */ - .quad 0x00affa000000ffff /* __USER_CS */ - .quad 0x00cf9a000000ffff /* __KERNEL32_CS */ .quad 0,0 /* TSS */ .quad 0,0 /* LDT */ .quad 0,0,0 /* three TLS descriptors */ @@ -165,14 +146,11 @@ ENTRY(empty_zero_page) * __xen_guest information */ .macro utoh value - .if (\value) < 0 || (\value) >= 0x10 - utoh (((\value)>>4)&0x0fffffffffffffff) - .endif - .if ((\value) & 0xf) < 10 - .byte '0' + ((\value) & 0xf) - .else - .byte 'A' + ((\value) & 0xf) - 10 - .endif + i = 64 + .rept 16 + i = i - 4 + .byte '0' + ((((\value) >> i) & 0xf) > 9) * ('0' - 'A' + 10) + (((\value) >> i) & 0xf) + .endr .endm .section __xen_guest --- a/arch/x86/kernel/io_apic_32-xen.c +++ b/arch/x86/kernel/io_apic_32-xen.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -35,6 +34,7 @@ #include #include #include +#include #include #include @@ -710,8 +710,6 @@ static int balanced_irq(void *unused) unsigned long prev_balance_time = jiffies; long time_remaining = balanced_irq_interval; - daemonize("kirqd"); - /* push everything to CPU 0 to give us a starting point. */ for (i = 0 ; i < NR_IRQS ; i++) { irq_desc[i].pending_mask = cpumask_of_cpu(0); @@ -771,10 +769,9 @@ static int __init balanced_irq_init(void } printk(KERN_INFO "Starting balanced_irq\n"); - if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0) + if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd"))) return 0; - else - printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq"); + printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq"); failed: for_each_possible_cpu(i) { kfree(irq_cpu_data[i].irq_delta); @@ -1455,10 +1452,6 @@ static void __init setup_ExtINT_IRQ0_pin enable_8259A_irq(0); } -static inline void UNEXPECTED_IO_APIC(void) -{ -} - void __init print_IO_APIC(void) { int apic, i; @@ -1498,34 +1491,12 @@ void __init print_IO_APIC(void) printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); - if (reg_00.bits.ID >= get_physical_broadcast()) - UNEXPECTED_IO_APIC(); - if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2) - UNEXPECTED_IO_APIC(); printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw); printk(KERN_DEBUG "....... 
: max redirection entries: %04X\n", reg_01.bits.entries); - if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */ - (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */ - (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */ - (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */ - (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */ - (reg_01.bits.entries != 0x2E) && - (reg_01.bits.entries != 0x3F) - ) - UNEXPECTED_IO_APIC(); printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); - if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */ - (reg_01.bits.version != 0x10) && /* oldest IO-APICs */ - (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */ - (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */ - (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */ - ) - UNEXPECTED_IO_APIC(); - if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2) - UNEXPECTED_IO_APIC(); /* * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, @@ -1535,8 +1506,6 @@ void __init print_IO_APIC(void) if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); - if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2) - UNEXPECTED_IO_APIC(); } /* @@ -1548,8 +1517,6 @@ void __init print_IO_APIC(void) reg_03.raw != reg_01.raw) { printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); - if (reg_03.bits.__reserved_1) - UNEXPECTED_IO_APIC(); } printk(KERN_DEBUG ".... IRQ redirection table:\n"); @@ -2686,19 +2653,19 @@ int arch_setup_msi_irq(struct pci_dev *d if (irq < 0) return irq; - set_irq_msi(irq, desc); ret = msi_compose_msg(dev, irq, &msg); if (ret < 0) { destroy_irq(irq); return ret; } + set_irq_msi(irq, desc); write_msi_msg(irq, &msg); set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); - return irq; + return 0; } void arch_teardown_msi_irq(unsigned int irq) --- a/arch/x86/kernel/io_apic_64-xen.c +++ b/arch/x86/kernel/io_apic_64-xen.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -904,10 +903,6 @@ static void __init setup_ExtINT_IRQ0_pin enable_8259A_irq(0); } -void __init UNEXPECTED_IO_APIC(void) -{ -} - void __apicdebuginit print_IO_APIC(void) { int apic, i; @@ -943,40 +938,16 @@ void __apicdebuginit print_IO_APIC(void) printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); - if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2) - UNEXPECTED_IO_APIC(); printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); - if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */ - (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */ - (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */ - (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */ - (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */ - (reg_01.bits.entries != 0x2E) && - (reg_01.bits.entries != 0x3F) && - (reg_01.bits.entries != 0x03) - ) - UNEXPECTED_IO_APIC(); printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); printk(KERN_DEBUG "....... 
: IO APIC version: %04X\n", reg_01.bits.version); - if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */ - (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */ - (reg_01.bits.version != 0x10) && /* oldest IO-APICs */ - (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */ - (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */ - (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */ - ) - UNEXPECTED_IO_APIC(); - if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2) - UNEXPECTED_IO_APIC(); if (reg_01.bits.version >= 0x10) { printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); - if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2) - UNEXPECTED_IO_APIC(); } printk(KERN_DEBUG ".... IRQ redirection table:\n"); @@ -1408,8 +1379,7 @@ static void irq_complete_move(unsigned i vector = ~get_irq_regs()->orig_rax; me = smp_processor_id(); - if ((vector == cfg->vector) && - cpu_isset(smp_processor_id(), cfg->domain)) { + if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { cpumask_t cleanup_mask; cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); @@ -1444,7 +1414,7 @@ static void ack_apic_level(unsigned int /* * We must acknowledge the irq before we move it or the acknowledge will - * not propogate properly. + * not propagate properly. */ ack_APIC_irq(); @@ -1527,6 +1497,7 @@ static void ack_lapic_irq (unsigned int static void end_lapic_irq (unsigned int i) { /* nothing */ } static struct hw_interrupt_type lapic_irq_type __read_mostly = { + .name = "local-APIC", .typename = "local-APIC-edge", .startup = NULL, /* startup_irq() not used for IRQ0 */ .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ @@ -1998,18 +1969,18 @@ int arch_setup_msi_irq(struct pci_dev *d if (irq < 0) return irq; - set_irq_msi(irq, desc); ret = msi_compose_msg(dev, irq, &msg); if (ret < 0) { destroy_irq(irq); return ret; } + set_irq_msi(irq, desc); write_msi_msg(irq, &msg); set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); - return irq; + return 0; } void arch_teardown_msi_irq(unsigned int irq) --- a/arch/x86/kernel/ioport_32-xen.c +++ b/arch/x86/kernel/ioport_32-xen.c @@ -12,10 +12,10 @@ #include #include #include -#include #include #include #include +#include #include /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ --- a/arch/x86/kernel/ioport_64-xen.c +++ b/arch/x86/kernel/ioport_64-xen.c @@ -13,10 +13,10 @@ #include #include #include -#include #include #include #include +#include #include /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ --- a/arch/x86/kernel/irq_32-xen.c +++ b/arch/x86/kernel/irq_32-xen.c @@ -24,6 +24,9 @@ DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp; EXPORT_PER_CPU_SYMBOL(irq_stat); +DEFINE_PER_CPU(struct pt_regs *, irq_regs); +EXPORT_PER_CPU_SYMBOL(irq_regs); + /* * 'what should we do if we get a hw irq event on an illegal vector'. * each architecture has to answer this themselves. 
--- a/arch/x86/kernel/irq_64-xen.c +++ b/arch/x86/kernel/irq_64-xen.c @@ -32,7 +32,7 @@ atomic_t irq_err_count; */ static inline void stack_overflow_check(struct pt_regs *regs) { - u64 curbase = (u64) current->thread_info; + u64 curbase = (u64)task_stack_page(current); static unsigned long warned = -60*HZ; if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE && @@ -145,17 +145,43 @@ void fixup_irqs(cpumask_t map) for (irq = 0; irq < NR_IRQS; irq++) { cpumask_t mask; + int break_affinity = 0; + int set_affinity = 1; + if (irq == 2) continue; + /* interrupt's are disabled at this point */ + spin_lock(&irq_desc[irq].lock); + + if (!irq_has_action(irq) || + cpus_equal(irq_desc[irq].affinity, map)) { + spin_unlock(&irq_desc[irq].lock); + continue; + } + cpus_and(mask, irq_desc[irq].affinity, map); - if (any_online_cpu(mask) == NR_CPUS) { - /*printk("Breaking affinity for irq %i\n", irq);*/ + if (cpus_empty(mask)) { + break_affinity = 1; mask = map; } + + if (irq_desc[irq].chip->mask) + irq_desc[irq].chip->mask(irq); + if (irq_desc[irq].chip->set_affinity) irq_desc[irq].chip->set_affinity(irq, mask); - else if (irq_desc[irq].action && !(warned++)) + else if (!(warned++)) + set_affinity = 0; + + if (irq_desc[irq].chip->unmask) + irq_desc[irq].chip->unmask(irq); + + spin_unlock(&irq_desc[irq].lock); + + if (break_affinity && set_affinity) + /*printk("Broke affinity for irq %i\n", irq)*/; + else if (!set_affinity) printk("Cannot set affinity for irq %i\n", irq); } --- a/arch/x86/kernel/ldt_32-xen.c +++ b/arch/x86/kernel/ldt_32-xen.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include --- a/arch/x86/kernel/ldt_64-xen.c +++ b/arch/x86/kernel/ldt_64-xen.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -127,4 +127,4 @@ endif disabled-obj-$(CONFIG_XEN) := early-quirks.o hpet.o i8253.o i8259_$(BITS).o reboot.o \ smpboot_$(BITS).o tsc_$(BITS).o tsc_sync.o disabled-obj-$(CONFIG_XEN_UNPRIVILEGED_GUEST) += mpparse_64.o -%/head_$(BITS).o %/head_$(BITS).s: $(if $(CONFIG_XEN),EXTRA_AFLAGS,dummy) := +%/head_64.o %/head_64.s: $(if $(CONFIG_XEN),EXTRA_AFLAGS,dummy) := --- a/arch/x86/kernel/microcode-xen.c +++ b/arch/x86/kernel/microcode-xen.c @@ -135,7 +135,7 @@ static int __init microcode_dev_init (vo return 0; } -static void __exit microcode_dev_exit (void) +static void microcode_dev_exit (void) { misc_deregister(µcode_dev); } --- a/arch/x86/kernel/mpparse_32-xen.c +++ b/arch/x86/kernel/mpparse_32-xen.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include @@ -484,7 +483,7 @@ static int __init smp_read_mpc(struct mp } ++mpc_record; } - clustered_apic_check(); + setup_apic_routing(); if (!num_processors) printk(KERN_ERR "SMP mptable: no processors registered!\n"); return num_processors; --- a/arch/x86/kernel/mpparse_64-xen.c +++ b/arch/x86/kernel/mpparse_64-xen.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -307,7 +306,7 @@ static int __init smp_read_mpc(struct mp } } } - clustered_apic_check(); + setup_apic_routing(); if (!num_processors) printk(KERN_ERR "MPTABLE: no processors registered!\n"); return num_processors; --- a/arch/x86/kernel/pci-dma-xen.c +++ b/arch/x86/kernel/pci-dma-xen.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -278,7 +279,7 @@ int dma_declare_coherent_memory(struct d { void __iomem *mem_base = NULL; int pages = size >> PAGE_SHIFT; - int bitmap_size = (pages + 
31)/32; + int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) goto out; @@ -351,6 +352,32 @@ void *dma_mark_declared_memory_occupied( EXPORT_SYMBOL(dma_mark_declared_memory_occupied); #endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */ +#if defined(CONFIG_PCI) && !defined(CONFIG_XEN) +/* Many VIA bridges seem to corrupt data for DAC. Disable it here */ + +int forbid_dac; +EXPORT_SYMBOL(forbid_dac); + +static __devinit void via_no_dac(struct pci_dev *dev) +{ + if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { + printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n"); + forbid_dac = 1; + } +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac); + +static int check_iommu(char *s) +{ + if (!strcmp(s, "usedac")) { + forbid_dac = -1; + return 1; + } + return 0; +} +__setup("iommu=", check_iommu); +#endif + dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size, enum dma_data_direction direction) --- a/arch/x86/kernel/process_32-xen.c +++ b/arch/x86/kernel/process_32-xen.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -39,6 +38,7 @@ #include #include #include +#include #include #include @@ -61,7 +61,6 @@ #include #include -#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -70,6 +69,12 @@ static int hlt_counter; unsigned long boot_option_idle_override = 0; EXPORT_SYMBOL(boot_option_idle_override); +DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); + +DEFINE_PER_CPU(int, cpu_number); +EXPORT_PER_CPU_SYMBOL(cpu_number); + /* * Return saved PC of a blocked thread. */ @@ -168,6 +173,7 @@ void cpu_idle(void) if (__get_cpu_var(cpu_idle_state)) __get_cpu_var(cpu_idle_state) = 0; + check_pgt_cache(); rmb(); idle = xen_idle; /* no alternatives */ @@ -218,18 +224,19 @@ void __devinit select_idle_routine(const { } -static int __init idle_setup (char *str) +static int __init idle_setup(char *str) { - if (!strncmp(str, "poll", 4)) { + if (!strcmp(str, "poll")) { printk("using polling idle threads.\n"); pm_idle = poll_idle; } + else + return -1; boot_option_idle_override = 1; - return 1; + return 0; } - -__setup("idle=", idle_setup); +early_param("idle", idle_setup); void show_regs(struct pt_regs * regs) { @@ -282,7 +289,7 @@ int kernel_thread(int (*fn)(void *), voi regs.xds = __USER_DS; regs.xes = __USER_DS; - regs.xfs = __KERNEL_PDA; + regs.xfs = __KERNEL_PERCPU; regs.orig_eax = -1; regs.eip = (unsigned long) kernel_thread_helper; regs.xcs = __KERNEL_CS | get_kernel_rpl(); @@ -562,7 +569,7 @@ struct task_struct fastcall * __switch_t * multicall to indicate FPU task switch, rather than * synchronously trapping to Xen. 
*/ - if (prev_p->thread_info->status & TS_USEDFPU) { + if (task_thread_info(prev_p)->status & TS_USEDFPU) { __save_init_fpu(prev_p); /* _not_ save_init_fpu() */ mcl->op = __HYPERVISOR_fpu_taskswitch; mcl->args[0] = 1; @@ -669,7 +676,7 @@ struct task_struct fastcall * __switch_t if (prev->gs | next->gs) loadsegment(gs, next->gs); - write_pda(pcurrent, next_p); + x86_write_percpu(current_task, next_p); return prev_p; } --- a/arch/x86/kernel/process_64-xen.c +++ b/arch/x86/kernel/process_64-xen.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -49,7 +50,6 @@ #include #include #include -#include #include #include #include @@ -232,16 +232,18 @@ void __cpuinit select_idle_routine(const static int __init idle_setup (char *str) { - if (!strncmp(str, "poll", 4)) { + if (!strcmp(str, "poll")) { printk("using polling idle threads.\n"); pm_idle = poll_idle; - } + } else if (!strcmp(str, "mwait")) + force_mwait = 1; + else + return -1; boot_option_idle_override = 1; - return 1; + return 0; } - -__setup("idle=", idle_setup); +early_param("idle", idle_setup); /* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs * regs) @@ -546,7 +548,7 @@ __switch_to(struct task_struct *prev_p, * The AMD workaround requires it to be after DS reload, or * after DS has been cleared, which we do in __prepare_arch_switch. */ - if (prev_p->thread_info->status & TS_USEDFPU) { + if (task_thread_info(prev_p)->status & TS_USEDFPU) { __save_init_fpu(prev_p); /* _not_ save_init_fpu() */ mcl->op = __HYPERVISOR_fpu_taskswitch; mcl->args[0] = 1; --- a/arch/x86/kernel/quirks-xen.c +++ b/arch/x86/kernel/quirks-xen.c @@ -3,12 +3,10 @@ */ #include #include -#include -#include -#include #if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI) -static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev) + +static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) { u8 config, rev; u32 word; @@ -16,7 +14,7 @@ static void __devinit verify_quirk_intel /* BIOS may enable hardware IRQ balancing for * E7520/E7320/E7525(revision ID 0x9 and below) * based platforms. - * For those platforms, make sure that the genapic is set to 'flat' + * Disable SW irqbalance/affinity on those platforms. */ pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); if (rev > 0x9) @@ -30,59 +28,20 @@ static void __devinit verify_quirk_intel raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); if (!(word & (1 << 13))) { -#ifndef CONFIG_XEN -#ifdef CONFIG_X86_64 - if (genapic != &apic_flat) - panic("APIC mode must be flat on this system\n"); -#elif defined(CONFIG_X86_GENERICARCH) - if (genapic != &apic_default) - panic("APIC mode must be default(flat) on this system. Use apic=default\n"); -#endif -#endif - } - - /* put back the original value for config space*/ - if (!(config & 0x2)) - pci_write_config_byte(dev, 0xf4, config); -} - -void __init quirk_intel_irqbalance(void) -{ - u8 config, rev; - u32 word; - - /* BIOS may enable hardware IRQ balancing for - * E7520/E7320/E7525(revision ID 0x9 and below) - * based platforms. - * Disable SW irqbalance/affinity on those platforms. 
- */ - rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION); - if (rev > 0x9) - return; - - printk(KERN_INFO "Intel E7520/7320/7525 detected."); - - /* enable access to config space */ - config = read_pci_config_byte(0, 0, 0, 0xf4); - write_pci_config_byte(0, 0, 0, 0xf4, config|0x2); - - /* read xTPR register */ - word = read_pci_config_16(0, 0, 0x40, 0x4c); - - if (!(word & (1 << 13))) { struct xen_platform_op op; - printk(KERN_INFO "Disabling irq balancing and affinity\n"); + + printk(KERN_INFO "Intel E7520/7320/7525 detected. " + "Disabling irq balancing and affinity\n"); op.cmd = XENPF_platform_quirk; op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING; WARN_ON(HYPERVISOR_platform_op(&op)); } - /* put back the original value for config space */ + /* put back the original value for config space*/ if (!(config & 0x2)) - write_pci_config_byte(0, 0, 0, 0xf4, config); + pci_write_config_byte(dev, 0xf4, config); } -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, verify_quirk_intel_irqbalance); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, verify_quirk_intel_irqbalance); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, verify_quirk_intel_irqbalance); - +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance); #endif --- a/arch/x86/kernel/setup_64-xen.c +++ b/arch/x86/kernel/setup_64-xen.c @@ -120,6 +120,8 @@ int bootloader_type; unsigned long saved_video_mode; +int force_mwait __cpuinitdata; + /* * Early DMI memory */ @@ -253,10 +255,10 @@ static void discover_ebda(void) * there is a real-mode segmented pointer pointing to the * 4K EBDA area at 0x40E */ - ebda_addr = *(unsigned short *)EBDA_ADDR_POINTER; + ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER); ebda_addr <<= 4; - ebda_size = *(unsigned short *)(unsigned long)ebda_addr; + ebda_size = *(unsigned short *)__va(ebda_addr); /* Round EBDA up to pages */ if (ebda_size == 0) @@ -410,15 +412,8 @@ void __init setup_arch(char **cmdline_p) #endif #ifdef CONFIG_SMP - /* - * But first pinch a few for the stack/trampoline stuff - * FIXME: Don't need the extra page at 4K, but need to fix - * trampoline before removing it. 
(see the GDT stuff) - */ - reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE); - /* Reserve SMP trampoline */ - reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE); + reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE); #endif #endif @@ -570,8 +565,6 @@ void __init setup_arch(char **cmdline_p) early_quirks(); #endif - zap_low_mappings(0); - /* * set this early, so we dont allocate cpu0 * if MADT list doesnt list BSP first @@ -864,6 +857,10 @@ static void __cpuinit init_amd(struct cp /* RDTSC can be speculated around */ clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); + + /* Family 10 doesn't support C states in MWAIT so don't use it */ + if (c->x86 == 0x10 && !force_mwait) + clear_bit(X86_FEATURE_MWAIT, &c->x86_capability); } static void __cpuinit detect_ht(struct cpuinfo_x86 *c) @@ -1146,9 +1143,7 @@ void __cpuinit identify_cpu(struct cpuin #ifdef CONFIG_X86_MCE mcheck_init(c); #endif - if (c == &boot_cpu_data) - mtrr_bp_init(); - else + if (c != &boot_cpu_data) mtrr_ap_init(); #ifdef CONFIG_NUMA numa_add_cpu(smp_processor_id()); @@ -1239,9 +1234,8 @@ static int show_cpuinfo(struct seq_file "stc", "100mhzsteps", "hwpstate", - NULL, /* tsc invariant mapped to constant_tsc */ - NULL, - /* nothing */ /* constant_tsc - moved to flags */ + "", /* tsc invariant mapped to constant_tsc */ + /* nothing */ }; --- a/arch/x86/kernel/setup64-xen.c +++ b/arch/x86/kernel/setup64-xen.c @@ -113,9 +113,9 @@ void __init setup_per_cpu_areas(void) if (!NODE_DATA(cpu_to_node(i))) { printk("cpu with no node %d, num_online_nodes %d\n", i, num_online_nodes()); - ptr = alloc_bootmem(size); + ptr = alloc_bootmem_pages(size); } else { - ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size); + ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size); } if (!ptr) panic("Cannot allocate cpu data for CPU %d\n", i); @@ -208,6 +208,8 @@ char boot_exception_stacks[(N_EXCEPTION_ __attribute__((section(".bss.page_aligned"))); #endif +extern asmlinkage void ignore_sysret(void); + /* May not be marked __init: used by software suspend */ void syscall_init(void) { @@ -219,12 +221,22 @@ void syscall_init(void) */ wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); wrmsrl(MSR_LSTAR, system_call); + wrmsrl(MSR_CSTAR, ignore_sysret); /* Flags to clear on syscall */ wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); #endif #ifdef CONFIG_IA32_EMULATION syscall32_cpu_init (); +#else + { + static const struct callback_register cstar = { + .type = CALLBACKTYPE_syscall32, + .address = (unsigned long)ignore_sysret + }; + if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar)) + printk(KERN_WARN "Unable to register CSTAR callback\n"); + } #endif } @@ -262,7 +274,6 @@ void __cpuinit cpu_init (void) /* CPU 0 is initialised in head64.c */ if (cpu != 0) { pda_init(cpu); - zap_low_mappings(cpu); } #ifndef CONFIG_X86_NO_TSS else --- a/arch/x86/kernel/smp_32-xen.c +++ b/arch/x86/kernel/smp_32-xen.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include @@ -216,7 +215,6 @@ static cpumask_t flush_cpumask; static struct mm_struct * flush_mm; static unsigned long flush_va; static DEFINE_SPINLOCK(tlbstate_lock); -#define FLUSH_ALL 0xffffffff /* * We cannot call mmdrop() because we are in interrupt context, @@ -298,7 +296,7 @@ irqreturn_t smp_invalidate_interrupt(int if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) { if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) { - if (flush_va == FLUSH_ALL) + if (flush_va == TLB_FLUSH_ALL) local_flush_tlb(); else __flush_tlb_one(flush_va); 
@@ -314,9 +312,11 @@ out: return IRQ_HANDLED; } -static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, - unsigned long va) +void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, + unsigned long va) { + cpumask_t cpumask = *cpumaskp; + /* * A couple of (to be removed) sanity checks: * @@ -327,10 +327,12 @@ static void flush_tlb_others(cpumask_t c BUG_ON(cpu_isset(smp_processor_id(), cpumask)); BUG_ON(!mm); +#ifdef CONFIG_HOTPLUG_CPU /* If a CPU which we ran on has gone down, OK. */ cpus_and(cpumask, cpumask, cpu_online_map); - if (cpus_empty(cpumask)) + if (unlikely(cpus_empty(cpumask))) return; +#endif /* * i'm not happy about this global shared spinlock in the @@ -341,17 +343,7 @@ static void flush_tlb_others(cpumask_t c flush_mm = mm; flush_va = va; -#if NR_CPUS <= BITS_PER_LONG - atomic_set_mask(cpumask, &flush_cpumask); -#else - { - int k; - unsigned long *flush_mask = (unsigned long *)&flush_cpumask; - unsigned long *cpu_mask = (unsigned long *)&cpumask; - for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k) - atomic_set_mask(cpu_mask[k], &flush_mask[k]); - } -#endif + cpus_or(flush_cpumask, cpumask, flush_cpumask); /* * We have to send the IPI only to * CPUs affected. @@ -378,7 +370,7 @@ void flush_tlb_current_task(void) local_flush_tlb(); if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); preempt_enable(); } @@ -397,7 +389,7 @@ void flush_tlb_mm (struct mm_struct * mm leave_mm(smp_processor_id()); } if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); preempt_enable(); } @@ -446,7 +438,7 @@ void flush_tlb_all(void) * it goes straight through and wastes no time serializing * anything. Worst case is that we lose a reschedule ... */ -void smp_send_reschedule(int cpu) +void xen_smp_send_reschedule(int cpu) { WARN_ON(cpu_is_offline(cpu)); send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); @@ -478,36 +470,79 @@ void unlock_ipi_call_lock(void) static struct call_data_struct *call_data; +static void __smp_call_function(void (*func) (void *info), void *info, + int nonatomic, int wait) +{ + struct call_data_struct data; + int cpus = num_online_cpus() - 1; + + if (!cpus) + return; + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + call_data = &data; + mb(); + + /* Send a message to all other CPUs and wait for them to respond */ + send_IPI_allbutself(CALL_FUNCTION_VECTOR); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus) + cpu_relax(); + + if (wait) + while (atomic_read(&data.finished) != cpus) + cpu_relax(); +} + + /** - * smp_call_function(): Run a function on all other CPUs. + * smp_call_function_mask(): Run a function on a set of other CPUs. + * @mask: The set of cpus to run on. Must not include the current cpu. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. - * @nonatomic: currently unused. * @wait: If true, wait (atomically) until function has completed on other CPUs. * - * Returns 0 on success, else a negative status code. Does not return until - * remote CPUs are nearly ready to execute <> or are or have executed. + * Returns 0 on success, else a negative status code. + * + * If @wait is true, then returns once @func has returned; otherwise + * it returns just before the target cpu calls @func. 
* * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. */ -int smp_call_function (void (*func) (void *info), void *info, int nonatomic, - int wait) +int +xen_smp_call_function_mask(cpumask_t mask, + void (*func)(void *), void *info, + int wait) { struct call_data_struct data; + cpumask_t allbutself; int cpus; + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + /* Holding any lock stops cpus from going down. */ spin_lock(&call_lock); - cpus = num_online_cpus() - 1; + + allbutself = cpu_online_map; + cpu_clear(smp_processor_id(), allbutself); + + cpus_and(mask, mask, allbutself); + cpus = cpus_weight(mask); + if (!cpus) { spin_unlock(&call_lock); return 0; } - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); - data.func = func; data.info = info; atomic_set(&data.started, 0); @@ -517,9 +552,12 @@ int smp_call_function (void (*func) (voi call_data = &data; mb(); - - /* Send a message to all other CPUs and wait for them to respond */ - send_IPI_allbutself(CALL_FUNCTION_VECTOR); + + /* Send a message to other CPUs */ + if (cpus_equal(mask, allbutself)) + send_IPI_allbutself(CALL_FUNCTION_VECTOR); + else + send_IPI_mask(mask, CALL_FUNCTION_VECTOR); /* Wait for response */ while (atomic_read(&data.started) != cpus) @@ -532,15 +570,14 @@ int smp_call_function (void (*func) (voi return 0; } -EXPORT_SYMBOL(smp_call_function); static void stop_this_cpu (void * dummy) { + local_irq_disable(); /* * Remove this CPU: */ cpu_clear(smp_processor_id(), cpu_online_map); - local_irq_disable(); disable_all_local_evtchn(); if (cpu_data[smp_processor_id()].hlt_works_ok) for(;;) halt(); @@ -551,13 +588,18 @@ static void stop_this_cpu (void * dummy) * this function calls the 'stop' function on all other CPUs in the system. */ -void smp_send_stop(void) +void xen_smp_send_stop(void) { - smp_call_function(stop_this_cpu, NULL, 1, 0); + /* Don't deadlock on the call lock in panic */ + int nolock = !spin_trylock(&call_lock); + unsigned long flags; - local_irq_disable(); + local_irq_save(flags); + __smp_call_function(stop_this_cpu, NULL, 0, 0); + if (!nolock) + spin_unlock(&call_lock); disable_all_local_evtchn(); - local_irq_enable(); + local_irq_restore(flags); } /* @@ -598,74 +640,3 @@ irqreturn_t smp_call_function_interrupt( return IRQ_HANDLED; } - -/* - * this function sends a 'generic call function' IPI to one other CPU - * in the system. - * - * cpu is a standard Linux logical CPU number. - */ -static void -__smp_call_function_single(int cpu, void (*func) (void *info), void *info, - int nonatomic, int wait) -{ - struct call_data_struct data; - int cpus = 1; - - data.func = func; - data.info = info; - atomic_set(&data.started, 0); - data.wait = wait; - if (wait) - atomic_set(&data.finished, 0); - - call_data = &data; - wmb(); - /* Send a message to all other CPUs and wait for them to respond */ - send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR); - - /* Wait for response */ - while (atomic_read(&data.started) != cpus) - cpu_relax(); - - if (!wait) - return; - - while (atomic_read(&data.finished) != cpus) - cpu_relax(); -} - -/* - * smp_call_function_single - Run a function on another CPU - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @nonatomic: Currently unused. - * @wait: If true, wait until function has completed on other CPUs. 
- * - * Retrurns 0 on success, else a negative status code. - * - * Does not return until the remote CPU is nearly ready to execute - * or is or has executed. - */ - -int smp_call_function_single(int cpu, void (*func) (void *info), void *info, - int nonatomic, int wait) -{ - /* prevent preemption and reschedule on another processor */ - int me = get_cpu(); - if (cpu == me) { - WARN_ON(1); - put_cpu(); - return -EBUSY; - } - - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); - - spin_lock_bh(&call_lock); - __smp_call_function_single(cpu, func, info, nonatomic, wait); - spin_unlock_bh(&call_lock); - put_cpu(); - return 0; -} -EXPORT_SYMBOL(smp_call_function_single); --- a/arch/x86/kernel/smp_64-xen.c +++ b/arch/x86/kernel/smp_64-xen.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -457,44 +456,36 @@ int smp_call_function (void (*func) (voi } EXPORT_SYMBOL(smp_call_function); -void smp_stop_cpu(void) +static void stop_this_cpu(void *dummy) { - unsigned long flags; + local_irq_disable(); /* * Remove this CPU: */ cpu_clear(smp_processor_id(), cpu_online_map); - local_irq_save(flags); disable_all_local_evtchn(); - local_irq_restore(flags); -} - -static void smp_really_stop_cpu(void *dummy) -{ - smp_stop_cpu(); for (;;) halt(); } void smp_send_stop(void) { - int nolock = 0; + int nolock; + unsigned long flags; + #ifndef CONFIG_XEN if (reboot_force) return; #endif + /* Don't deadlock on the call lock in panic */ - if (!spin_trylock(&call_lock)) { - /* ignore locking because we have panicked anyways */ - nolock = 1; - } - __smp_call_function(smp_really_stop_cpu, NULL, 0, 0); + nolock = !spin_trylock(&call_lock); + local_irq_save(flags); + __smp_call_function(stop_this_cpu, NULL, 0, 0); if (!nolock) spin_unlock(&call_lock); - - local_irq_disable(); disable_all_local_evtchn(); - local_irq_enable(); + local_irq_restore(flags); } /* --- a/arch/x86/kernel/time_32-xen.c +++ b/arch/x86/kernel/time_32-xen.c @@ -80,7 +80,6 @@ #include DEFINE_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); -int pit_latch_buggy; /* extern */ #else volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; #endif @@ -218,6 +217,26 @@ static inline u64 scale_delta(u64 delta, return product; } +static inline u64 get64(volatile u64 *ptr) +{ +#ifndef CONFIG_64BIT + return cmpxchg64(ptr, 0, 0); +#else + return *ptr; +#define cmpxchg64 cmpxchg +#endif +} + +static inline u64 get64_local(volatile u64 *ptr) +{ +#ifndef CONFIG_64BIT + return cmpxchg64_local(ptr, 0, 0); +#else + return *ptr; +#define cmpxchg64_local cmpxchg_local +#endif +} + static void init_cpu_khz(void) { u64 __cpu_khz = 1000000ULL << 32; @@ -399,7 +418,7 @@ static int set_rtc_mmss(unsigned long no return retval; } -unsigned long long sched_clock(void) +static unsigned long long local_clock(void) { unsigned int cpu = get_cpu(); struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu); @@ -420,6 +439,61 @@ unsigned long long sched_clock(void) return time; } +/* + * Runstate accounting + */ +static void get_runstate_snapshot(struct vcpu_runstate_info *res) +{ + u64 state_time; + struct vcpu_runstate_info *state; + + BUG_ON(preemptible()); + + state = &__get_cpu_var(runstate); + + do { + state_time = get64_local(&state->state_entry_time); + *res = *state; + } while (get64_local(&state->state_entry_time) != state_time); + + WARN_ON_ONCE(res->state != RUNSTATE_running); +} + +/* + * Xen sched_clock implementation. 
Returns the number of unstolen + * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED + * states. + */ +unsigned long long sched_clock(void) +{ + struct vcpu_runstate_info runstate; + cycle_t now; + u64 ret; + s64 offset; + + /* + * Ideally sched_clock should be called on a per-cpu basis + * anyway, so preempt should already be disabled, but that's + * not current practice at the moment. + */ + preempt_disable(); + + now = local_clock(); + + get_runstate_snapshot(&runstate); + + offset = now - runstate.state_entry_time; + if (offset < 0) + offset = 0; + + ret = offset + runstate.time[RUNSTATE_running] + + runstate.time[RUNSTATE_blocked]; + + preempt_enable(); + + return ret; +} + unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); @@ -467,10 +541,9 @@ EXPORT_SYMBOL(profile_pc); irqreturn_t timer_interrupt(int irq, void *dev_id) { s64 delta, delta_cpu, stolen, blocked; - u64 sched_time; unsigned int i, cpu = smp_processor_id(); struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu); - struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu); + struct vcpu_runstate_info runstate; /* * Here we are in the timer irq handler. We just have irqs locally @@ -490,20 +563,7 @@ irqreturn_t timer_interrupt(int irq, voi delta -= processed_system_time; delta_cpu -= per_cpu(processed_system_time, cpu); - /* - * Obtain a consistent snapshot of stolen/blocked cycles. We - * can use state_entry_time to detect if we get preempted here. - */ - do { - sched_time = runstate->state_entry_time; - barrier(); - stolen = runstate->time[RUNSTATE_runnable] + - runstate->time[RUNSTATE_offline] - - per_cpu(processed_stolen_time, cpu); - blocked = runstate->time[RUNSTATE_blocked] - - per_cpu(processed_blocked_time, cpu); - barrier(); - } while (sched_time != runstate->state_entry_time); + get_runstate_snapshot(&runstate); } while (!time_values_up_to_date(cpu)); if ((unlikely(delta < -(s64)permitted_clock_jitter) || @@ -545,6 +605,9 @@ irqreturn_t timer_interrupt(int irq, voi * HACK: Passing NULL to account_steal_time() * ensures that the ticks are accounted as stolen. */ + stolen = runstate.time[RUNSTATE_runnable] + + runstate.time[RUNSTATE_offline] + - per_cpu(processed_stolen_time, cpu); if ((stolen > 0) && (delta_cpu > 0)) { delta_cpu -= stolen; if (unlikely(delta_cpu < 0)) @@ -560,6 +623,8 @@ irqreturn_t timer_interrupt(int irq, voi * HACK: Passing idle_task to account_steal_time() * ensures that the ticks are accounted as idle/wait. */ + blocked = runstate.time[RUNSTATE_blocked] + - per_cpu(processed_blocked_time, cpu); if ((blocked > 0) && (delta_cpu > 0)) { delta_cpu -= blocked; if (unlikely(delta_cpu < 0)) @@ -596,7 +661,7 @@ irqreturn_t timer_interrupt(int irq, voi return IRQ_HANDLED; } -void mark_tsc_unstable(void) +void mark_tsc_unstable(char *reason) { #ifndef CONFIG_XEN /* XXX Should tell the hypervisor about this fact. 
*/ tsc_unstable = 1; @@ -604,17 +669,13 @@ void mark_tsc_unstable(void) } EXPORT_SYMBOL_GPL(mark_tsc_unstable); +static cycle_t cs_last; + static cycle_t xen_clocksource_read(void) { #ifdef CONFIG_SMP - static cycle_t last_ret; -#ifndef CONFIG_64BIT - cycle_t last = cmpxchg64(&last_ret, 0, 0); -#else - cycle_t last = last_ret; -#define cmpxchg64 cmpxchg -#endif - cycle_t ret = sched_clock(); + cycle_t last = get64(&cs_last); + cycle_t ret = local_clock(); if (unlikely((s64)(ret - last) < 0)) { if (last - ret > permitted_clock_jitter @@ -633,17 +694,25 @@ static cycle_t xen_clocksource_read(void } for (;;) { - cycle_t cur = cmpxchg64(&last_ret, last, ret); + cycle_t cur = cmpxchg64(&cs_last, last, ret); if (cur == last || (s64)(ret - cur) < 0) return ret; last = cur; } #else - return sched_clock(); + return local_clock(); #endif } +static void xen_clocksource_resume(void) +{ + extern void time_resume(void); + + time_resume(); + cs_last = local_clock(); +} + static struct clocksource clocksource_xen = { .name = "xen", .rating = 400, @@ -652,6 +721,7 @@ static struct clocksource clocksource_xe .mult = 1 << XEN_SHIFT, /* time directly in nanoseconds */ .shift = XEN_SHIFT, .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .resume = xen_clocksource_resume, }; static void init_missing_ticks_accounting(unsigned int cpu) @@ -740,35 +810,6 @@ void notify_arch_cmos_timer(void) mod_timer(&sync_xen_wallclock_timer, jiffies + 1); } -static int timer_resume(struct sys_device *dev) -{ - extern void time_resume(void); - time_resume(); - return 0; -} - -static struct sysdev_class timer_sysclass = { - .resume = timer_resume, - set_kset_name("timer"), -}; - - -/* XXX this driverfs stuff should probably go elsewhere later -john */ -static struct sys_device device_timer = { - .id = 0, - .cls = &timer_sysclass, -}; - -static int time_init_device(void) -{ - int error = sysdev_class_register(&timer_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; -} - -device_initcall(time_init_device); - extern void (*late_time_init)(void); /* Dynamically-mapped IRQ. */ @@ -899,21 +940,21 @@ static void start_hz_timer(void) cpu_clear(smp_processor_id(), nohz_cpu_mask); } -void raw_safe_halt(void) +void xen_safe_halt(void) { stop_hz_timer(); /* Blocking includes an implicit local_irq_enable(). */ HYPERVISOR_block(); start_hz_timer(); } -EXPORT_SYMBOL(raw_safe_halt); +EXPORT_SYMBOL(xen_safe_halt); -void halt(void) +void xen_halt(void) { if (irqs_disabled()) VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL)); } -EXPORT_SYMBOL(halt); +EXPORT_SYMBOL(xen_halt); /* No locking required. Interrupts are disabled on all CPUs. 
*/ void time_resume(void) --- a/arch/x86/kernel/traps_32-xen.c +++ b/arch/x86/kernel/traps_32-xen.c @@ -52,7 +52,7 @@ #include #include #include -#include +#include #include #include @@ -101,20 +101,6 @@ asmlinkage void machine_check(void); int kstack_depth_to_print = 24; static unsigned int code_bytes = 64; -ATOMIC_NOTIFIER_HEAD(i386die_chain); - -int register_die_notifier(struct notifier_block *nb) -{ - vmalloc_sync_all(); - return atomic_notifier_chain_register(&i386die_chain, nb); -} -EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */ - -int unregister_die_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_unregister(&i386die_chain, nb); -} -EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */ static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) { @@ -325,7 +311,7 @@ void show_registers(struct pt_regs *regs regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss); printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", TASK_COMM_LEN, current->comm, current->pid, - current_thread_info(), current, current->thread_info); + current_thread_info(), current, task_thread_info(current)); /* * When in-kernel, we also print out the stack and code at the * time of the fault.. @@ -482,8 +468,6 @@ static void __kprobes do_trap(int trapnr siginfo_t *info) { struct task_struct *tsk = current; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; if (regs->eflags & VM_MASK) { if (vm86) @@ -495,6 +479,18 @@ static void __kprobes do_trap(int trapnr goto kernel_trap; trap_signal: { + /* + * We want error_code and trap_no set for userspace faults and + * kernelspace faults which result in die(), but not + * kernelspace faults which are fixed up. die() gives the + * process no chance to handle the signal and notice the + * kernel fault information, so that won't result in polluting + * the information about previously queued, but not yet + * delivered, faults. See also do_general_protection below. 
+ */ + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; + if (info) force_sig_info(signr, info, tsk); else @@ -503,8 +499,11 @@ static void __kprobes do_trap(int trapnr } kernel_trap: { - if (!fixup_exception(regs)) + if (!fixup_exception(regs)) { + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; die(str, regs, error_code); + } return; } @@ -578,9 +577,6 @@ DO_ERROR_INFO(32, SIGSEGV, "iret excepti fastcall void __kprobes do_general_protection(struct pt_regs * regs, long error_code) { - current->thread.error_code = error_code; - current->thread.trap_no = 13; - if (regs->eflags & VM_MASK) goto gp_in_vm86; @@ -599,6 +595,8 @@ gp_in_vm86: gp_in_kernel: if (!fixup_exception(regs)) { + current->thread.error_code = error_code; + current->thread.trap_no = 13; if (notify_die(DIE_GPF, "general protection fault", regs, error_code, 13, SIGSEGV) == NOTIFY_STOP) return; @@ -987,9 +985,7 @@ fastcall void do_spurious_interrupt_bug( fastcall unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) { - int cpu = smp_processor_id(); - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); - struct desc_struct *gdt = (struct desc_struct *)cpu_gdt_descr->address; + struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt; unsigned long base = (kesp - uesp) & -THREAD_SIZE; unsigned long new_kesp = kesp - base; unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; --- a/arch/x86/kernel/traps_64-xen.c +++ b/arch/x86/kernel/traps_64-xen.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -39,7 +40,6 @@ #include #include #include -#include #include #include #include @@ -71,22 +71,6 @@ asmlinkage void alignment_check(void); asmlinkage void machine_check(void); asmlinkage void spurious_interrupt_bug(void); -ATOMIC_NOTIFIER_HEAD(die_chain); -EXPORT_SYMBOL(die_chain); - -int register_die_notifier(struct notifier_block *nb) -{ - vmalloc_sync_all(); - return atomic_notifier_chain_register(&die_chain, nb); -} -EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */ - -int unregister_die_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_unregister(&die_chain, nb); -} -EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */ - static inline void conditional_sti(struct pt_regs *regs) { if (regs->eflags & X86_EFLAGS_IF) @@ -428,8 +412,7 @@ void show_registers(struct pt_regs *regs const int cpu = smp_processor_id(); struct task_struct *cur = cpu_pda(cpu)->pcurrent; - rsp = regs->rsp; - + rsp = regs->rsp; printk("CPU %d ", cpu); __show_regs(regs); printk("Process %s (pid: %d, threadinfo %p, task %p)\n", @@ -440,7 +423,6 @@ void show_registers(struct pt_regs *regs * time of the fault.. */ if (in_kernel) { - printk("Stack: "); _show_stack(NULL, regs, (unsigned long*)rsp); @@ -485,13 +467,14 @@ static unsigned int die_nest_count; unsigned __kprobes long oops_begin(void) { - int cpu = smp_processor_id(); + int cpu; unsigned long flags; oops_enter(); /* racy, but better than risking deadlock. */ local_irq_save(flags); + cpu = smp_processor_id(); if (!spin_trylock(&die_lock)) { if (cpu == die_owner) /* nested oops. 
should stop eventually */; @@ -585,10 +568,20 @@ static void __kprobes do_trap(int trapnr { struct task_struct *tsk = current; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - if (user_mode(regs)) { + /* + * We want error_code and trap_no set for userspace + * faults and kernelspace faults which result in + * die(), but not kernelspace faults which are fixed + * up. die() gives the process no chance to handle + * the signal and notice the kernel fault information, + * so that won't result in polluting the information + * about previously queued, but not yet delivered, + * faults. See also do_general_protection below. + */ + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; + if (exception_trace && unhandled_signal(tsk, signr)) printk(KERN_INFO "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", @@ -609,8 +602,11 @@ static void __kprobes do_trap(int trapnr fixup = search_exception_tables(regs->rip); if (fixup) regs->rip = fixup->fixup; - else + else { + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; die(str, regs, error_code); + } return; } } @@ -686,10 +682,10 @@ asmlinkage void __kprobes do_general_pro conditional_sti(regs); - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - if (user_mode(regs)) { + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; + if (exception_trace && unhandled_signal(tsk, SIGSEGV)) printk(KERN_INFO "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", @@ -708,6 +704,9 @@ asmlinkage void __kprobes do_general_pro regs->rip = fixup->fixup; return; } + + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; if (notify_die(DIE_GPF, "general protection fault", regs, error_code, 13, SIGSEGV) == NOTIFY_STOP) return; --- a/arch/x86/kernel/vsyscall_64-xen.c +++ b/arch/x86/kernel/vsyscall_64-xen.c @@ -45,14 +45,34 @@ #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) #define __syscall_clobber "r11","rcx","memory" +#define __pa_vsymbol(x) \ + ({unsigned long v; \ + extern char __vsyscall_0; \ + asm("" : "=r" (v) : "0" (x)); \ + ((v - VSYSCALL_FIRST_PAGE) + __pa_symbol(&__vsyscall_0)); }) +/* + * vsyscall_gtod_data contains data that is : + * - readonly from vsyscalls + * - writen by timer interrupt or systcl (/proc/sys/kernel/vsyscall64) + * Try to keep this structure as small as possible to avoid cache line ping pongs + */ struct vsyscall_gtod_data_t { - seqlock_t lock; - int sysctl_enabled; - struct timeval wall_time_tv; + seqlock_t lock; + + /* open coded 'struct timespec' */ + time_t wall_time_sec; + u32 wall_time_nsec; + + int sysctl_enabled; struct timezone sys_tz; - cycle_t offset_base; - struct clocksource clock; + struct { /* extract of a clocksource struct */ + cycle_t (*vread)(void); + cycle_t cycle_last; + cycle_t mask; + u32 mult; + u32 shift; + } clock; }; int __vgetcpu_mode __section_vgetcpu_mode; @@ -68,9 +88,13 @@ void update_vsyscall(struct timespec *wa write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); /* copy vsyscall data */ - vsyscall_gtod_data.clock = *clock; - vsyscall_gtod_data.wall_time_tv.tv_sec = wall_time->tv_sec; - vsyscall_gtod_data.wall_time_tv.tv_usec = wall_time->tv_nsec/1000; + vsyscall_gtod_data.clock.vread = clock->vread; + vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; + vsyscall_gtod_data.clock.mask = clock->mask; + vsyscall_gtod_data.clock.mult = clock->mult; + vsyscall_gtod_data.clock.shift = clock->shift; + vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; + 
vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; vsyscall_gtod_data.sys_tz = sys_tz; write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } @@ -105,7 +129,8 @@ static __always_inline long time_syscall static __always_inline void do_vgettimeofday(struct timeval * tv) { cycle_t now, base, mask, cycle_delta; - unsigned long seq, mult, shift, nsec_delta; + unsigned seq; + unsigned long mult, shift, nsec; cycle_t (*vread)(void); do { seq = read_seqbegin(&__vsyscall_gtod_data.lock); @@ -121,21 +146,20 @@ static __always_inline void do_vgettimeo mult = __vsyscall_gtod_data.clock.mult; shift = __vsyscall_gtod_data.clock.shift; - *tv = __vsyscall_gtod_data.wall_time_tv; - + tv->tv_sec = __vsyscall_gtod_data.wall_time_sec; + nsec = __vsyscall_gtod_data.wall_time_nsec; } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); /* calculate interval: */ cycle_delta = (now - base) & mask; /* convert to nsecs: */ - nsec_delta = (cycle_delta * mult) >> shift; + nsec += (cycle_delta * mult) >> shift; - /* convert to usecs and add to timespec: */ - tv->tv_usec += nsec_delta / NSEC_PER_USEC; - while (tv->tv_usec > USEC_PER_SEC) { + while (nsec >= NSEC_PER_SEC) { tv->tv_sec += 1; - tv->tv_usec -= USEC_PER_SEC; + nsec -= NSEC_PER_SEC; } + tv->tv_usec = nsec / NSEC_PER_USEC; } int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) @@ -151,11 +175,16 @@ int __vsyscall(0) vgettimeofday(struct t * unlikely */ time_t __vsyscall(1) vtime(time_t *t) { + struct timeval tv; + time_t result; if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) return time_syscall(t); - else if (t) - *t = __vsyscall_gtod_data.wall_time_tv.tv_sec; - return __vsyscall_gtod_data.wall_time_tv.tv_sec; + + vgettimeofday(&tv, 0); + result = tv.tv_sec; + if (t) + *t = result; + return result; } /* Fast way to get current CPU and node. @@ -224,10 +253,10 @@ static int vsyscall_sysctl_change(ctl_ta return ret; /* gcc has some trouble with __va(__pa()), so just do it this way. */ - map1 = ioremap(__pa_symbol(&vsysc1), 2); + map1 = ioremap(__pa_vsymbol(&vsysc1), 2); if (!map1) return -ENOMEM; - map2 = ioremap(__pa_symbol(&vsysc2), 2); + map2 = ioremap(__pa_vsymbol(&vsysc2), 2); if (!map2) { ret = -ENOMEM; goto out; @@ -304,7 +333,7 @@ static int __cpuinit cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) { long cpu = (long)arg; - if (action == CPU_ONLINE) + if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1); return NOTIFY_DONE; } --- a/arch/x86/mm/fault_32-xen.c +++ b/arch/x86/mm/fault_32-xen.c @@ -14,19 +14,20 @@ #include #include #include -#include #include #include #include #include /* For unblank_screen() */ #include +#include /* for max_low_pfn */ +#include #include #include #include +#include #include #include -#include #include extern void die(const char *,struct pt_regs *,long); @@ -259,25 +260,20 @@ static void dump_fault_path(unsigned lon unsigned long page; page = read_cr3(); - page = ((unsigned long *) __va(page))[address >> 22]; - if (oops_may_print()) - printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page, - machine_to_phys(page)); + page = ((unsigned long *) __va(page))[address >> PGDIR_SHIFT]; + printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page, + machine_to_phys(page)); /* * We must not directly access the pte in the highpte * case if the page table is located in highmem. * And lets rather not kmap-atomic the pte, just in case * it's allocated already. 
*/ -#ifdef CONFIG_HIGHPTE - if ((page >> PAGE_SHIFT) >= highstart_pfn) - return; -#endif - if ((page & 1) && oops_may_print()) { - page &= PAGE_MASK; - address &= 0x003ff000; - page = machine_to_phys(page); - page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT]; + if ((machine_to_phys(page) >> PAGE_SHIFT) < max_low_pfn + && (page & _PAGE_PRESENT)) { + page = machine_to_phys(page & PAGE_MASK); + page = ((unsigned long *) __va(page))[(address >> PAGE_SHIFT) + & (PTRS_PER_PTE - 1)]; printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page, machine_to_phys(page)); } @@ -581,6 +577,11 @@ bad_area: bad_area_nosemaphore: /* User mode accesses just cause a SIGSEGV */ if (error_code & 4) { + /* + * It's possible to have interrupts off here. + */ + local_irq_enable(); + /* * Valid to do another page fault here because this one came * from user space. @@ -633,7 +634,7 @@ no_context: bust_spinlocks(1); if (oops_may_print()) { - #ifdef CONFIG_X86_PAE +#ifdef CONFIG_X86_PAE if (error_code & 16) { pte_t *pte = lookup_address(address); @@ -642,7 +643,7 @@ no_context: "NX-protected page - exploit attempt? " "(uid: %d)\n", current->uid); } - #endif +#endif if (address < PAGE_SIZE) printk(KERN_ALERT "BUG: unable to handle kernel NULL " "pointer dereference"); @@ -652,8 +653,8 @@ no_context: printk(" at virtual address %08lx\n",address); printk(KERN_ALERT " printing eip:\n"); printk("%08lx\n", regs->eip); + dump_fault_path(address); } - dump_fault_path(address); tsk->thread.cr2 = address; tsk->thread.trap_no = 14; tsk->thread.error_code = error_code; @@ -694,7 +695,6 @@ do_sigbus: force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); } -#if !HAVE_SHARED_KERNEL_PMD void vmalloc_sync_all(void) { /* @@ -710,6 +710,9 @@ void vmalloc_sync_all(void) static unsigned long start = TASK_SIZE; unsigned long address; + if (SHARED_KERNEL_PMD) + return; + BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); for (address = start; address >= TASK_SIZE && address < hypervisor_virt_start; @@ -739,4 +742,3 @@ void vmalloc_sync_all(void) start = address + (1UL << PMD_SHIFT); } } -#endif --- a/arch/x86/mm/fault_64-xen.c +++ b/arch/x86/mm/fault_64-xen.c @@ -15,22 +15,22 @@ #include #include #include -#include #include #include #include #include /* For unblank_screen() */ #include +#include #include #include #include +#include #include #include #include #include #include -#include #include /* Page fault error code bits */ @@ -537,6 +537,12 @@ bad_area: bad_area_nosemaphore: /* User mode accesses just cause a SIGSEGV */ if (error_code & PF_USER) { + + /* + * It's possible to have interrupts off here. + */ + local_irq_enable(); + if (is_prefetch(regs, address, error_code)) return; @@ -646,7 +652,7 @@ do_sigbus: } DEFINE_SPINLOCK(pgd_lock); -struct page *pgd_list; +LIST_HEAD(pgd_list); void vmalloc_sync_all(void) { @@ -666,8 +672,7 @@ void vmalloc_sync_all(void) if (pgd_none(*pgd_ref)) continue; spin_lock(&pgd_lock); - for (page = pgd_list; page; - page = (struct page *)page->index) { + list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; pgd = (pgd_t *)page_address(page) + pgd_index(address); if (pgd_none(*pgd)) --- a/arch/x86/mm/highmem_32-xen.c +++ b/arch/x86/mm/highmem_32-xen.c @@ -26,7 +26,7 @@ void kunmap(struct page *page) * However when holding an atomic kmap is is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. 
*/ -static void *__kmap_atomic(struct page *page, enum km_type type, pgprot_t prot) +void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) { enum fixed_addresses idx; unsigned long vaddr; @@ -49,15 +49,7 @@ static void *__kmap_atomic(struct page * void *kmap_atomic(struct page *page, enum km_type type) { - return __kmap_atomic(page, type, kmap_prot); -} - -/* Same as kmap_atomic but with PAGE_KERNEL_RO page protection. */ -void *kmap_atomic_pte(struct page *page, enum km_type type) -{ - return __kmap_atomic(page, type, - test_bit(PG_pinned, &page->flags) - ? PAGE_KERNEL_RO : kmap_prot); + return kmap_atomic_prot(page, type, kmap_prot); } void kunmap_atomic(void *kvaddr, enum km_type type) @@ -80,6 +72,7 @@ void kunmap_atomic(void *kvaddr, enum km #endif } + /*arch_flush_lazy_mmu_mode();*/ pagefault_enable(); } @@ -162,7 +155,6 @@ void copy_highpage(struct page *to, stru EXPORT_SYMBOL(kmap); EXPORT_SYMBOL(kunmap); EXPORT_SYMBOL(kmap_atomic); -EXPORT_SYMBOL(kmap_atomic_pte); EXPORT_SYMBOL(kunmap_atomic); EXPORT_SYMBOL(kmap_atomic_to_page); EXPORT_SYMBOL(clear_highpage); --- a/arch/x86/mm/init_32-xen.c +++ b/arch/x86/mm/init_32-xen.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -65,17 +66,19 @@ static pmd_t * __init one_md_table_init( pmd_t *pmd_table; #ifdef CONFIG_X86_PAE - pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); - paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT); - make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables); - set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - pud = pud_offset(pgd, 0); - if (pmd_table != pmd_offset(pud, 0)) - BUG(); -#else + if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) { + pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); + + paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT); + make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables); + set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); + pud = pud_offset(pgd, 0); + if (pmd_table != pmd_offset(pud, 0)) + BUG(); + } +#endif pud = pud_offset(pgd, 0); pmd_table = pmd_offset(pud, 0); -#endif return pmd_table; } @@ -86,16 +89,18 @@ static pmd_t * __init one_md_table_init( */ static pte_t * __init one_page_table_init(pmd_t *pmd) { +#if CONFIG_XEN_COMPAT <= 0x030002 if (pmd_none(*pmd)) { +#else + if (!(__pmd_val(*pmd) & _PAGE_PRESENT)) { +#endif pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); + paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT); make_lowmem_page_readonly(page_table, XENFEAT_writable_page_tables); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); - if (page_table != pte_offset_kernel(pmd, 0)) - BUG(); - - return page_table; + BUG_ON(page_table != pte_offset_kernel(pmd, 0)); } return pte_offset_kernel(pmd, 0); @@ -115,7 +120,6 @@ static pte_t * __init one_page_table_ini static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base) { pgd_t *pgd; - pud_t *pud; pmd_t *pmd; int pgd_idx, pmd_idx; unsigned long vaddr; @@ -126,12 +130,10 @@ static void __init page_table_range_init pgd = pgd_base + pgd_idx; for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { - if (pgd_none(*pgd)) - one_md_table_init(pgd); - pud = pud_offset(pgd, vaddr); - pmd = pmd_offset(pud, vaddr); + pmd = one_md_table_init(pgd); + pmd = pmd + pmd_index(vaddr); for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { - if (vaddr < hypervisor_virt_start && pmd_none(*pmd)) + if (vaddr < hypervisor_virt_start) one_page_table_init(pmd); vaddr += 
PMD_SIZE; @@ -194,24 +196,25 @@ static void __init kernel_physical_mappi /* Map with big pages if possible, otherwise create normal page tables. */ if (cpu_has_pse) { unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; - if (is_kernel_text(address) || is_kernel_text(address2)) set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); else set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); + pfn += PTRS_PER_PTE; } else { pte = one_page_table_init(pmd); - pte += pte_ofs; - for (; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) { - /* XEN: Only map initial RAM allocation. */ - if ((pfn >= max_ram_pfn) || pte_present(*pte)) - continue; - if (is_kernel_text(address)) - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); - else - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); + for (pte += pte_ofs; + pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; + pte++, pfn++, pte_ofs++, address += PAGE_SIZE) { + /* XEN: Only map initial RAM allocation. */ + if ((pfn >= max_ram_pfn) || pte_present(*pte)) + continue; + if (is_kernel_text(address)) + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); + else + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); } pte_ofs = 0; } @@ -381,15 +384,44 @@ extern void __init remap_numa_kva(void); pgd_t *swapper_pg_dir; +static void __init xen_pagetable_setup_start(pgd_t *base) +{ +} + +static void __init xen_pagetable_setup_done(pgd_t *base) +{ +} + +/* + * Build a proper pagetable for the kernel mappings. Up until this + * point, we've been running on some set of pagetables constructed by + * the boot process. + * + * If we're booting on native hardware, this will be a pagetable + * constructed in arch/i386/kernel/head.S, and not running in PAE mode + * (even if we'll end up running in PAE). The root of the pagetable + * will be swapper_pg_dir. + * + * If we're booting paravirtualized under a hypervisor, then there are + * more options: we may already be running PAE, and the pagetable may + * or may not be based in swapper_pg_dir. In any case, + * paravirt_pagetable_setup_start() will set up swapper_pg_dir + * appropriately for the rest of the initialization to work. + * + * In general, pagetable_init() assumes that the pagetable may already + * be partially populated, and so it avoids stomping on any existing + * mappings. 
+ */ static void __init pagetable_init (void) { - unsigned long vaddr; + unsigned long vaddr, end; pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base; + xen_pagetable_setup_start(pgd_base); + /* Enable PSE if available */ - if (cpu_has_pse) { + if (cpu_has_pse) set_in_cr4(X86_CR4_PSE); - } /* Enable PGE if available */ if (cpu_has_pge) { @@ -406,9 +438,12 @@ static void __init pagetable_init (void) * created - mappings will be set by set_fixmap(): */ vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; - page_table_range_init(vaddr, hypervisor_virt_start, pgd_base); + end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; + page_table_range_init(vaddr, end, pgd_base); permanent_kmaps_init(pgd_base); + + xen_pagetable_setup_done(pgd_base); } #if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI_SLEEP) @@ -750,34 +785,29 @@ int remove_memory(u64 start, u64 size) EXPORT_SYMBOL_GPL(remove_memory); #endif -struct kmem_cache *pgd_cache; struct kmem_cache *pmd_cache; void __init pgtable_cache_init(void) { + size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t); + if (PTRS_PER_PMD > 1) { pmd_cache = kmem_cache_create("pmd", PTRS_PER_PMD*sizeof(pmd_t), PTRS_PER_PMD*sizeof(pmd_t), - 0, + SLAB_PANIC, pmd_ctor, NULL); - if (!pmd_cache) - panic("pgtable_cache_init(): cannot create pmd cache"); + if (!SHARED_KERNEL_PMD) { + /* If we're in PAE mode and have a non-shared + kernel pmd, then the pgd size must be a + page size. This is because the pgd_list + links through the page structure, so there + can only be one pgd per page for this to + work. */ + pgd_size = PAGE_SIZE; + } } - pgd_cache = kmem_cache_create("pgd", -#ifndef CONFIG_XEN - PTRS_PER_PGD*sizeof(pgd_t), - PTRS_PER_PGD*sizeof(pgd_t), -#else - PAGE_SIZE, - PAGE_SIZE, -#endif - 0, - pgd_ctor, - PTRS_PER_PMD == 1 ? pgd_dtor : NULL); - if (!pgd_cache) - panic("pgtable_cache_init(): Cannot create pgd cache"); } /* @@ -811,13 +841,26 @@ static int noinline do_test_wp_bit(void) void mark_rodata_ro(void) { - unsigned long addr = (unsigned long)__start_rodata; - - for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE) - change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO); + unsigned long start = PFN_ALIGN(_text); + unsigned long size = PFN_ALIGN(_etext) - start; - printk("Write protecting the kernel read-only data: %uk\n", - (__end_rodata - __start_rodata) >> 10); +#ifndef CONFIG_KPROBES +#ifdef CONFIG_HOTPLUG_CPU + /* It must still be possible to apply SMP alternatives. */ + if (num_possible_cpus() <= 1) +#endif + { + change_page_attr(virt_to_page(start), + size >> PAGE_SHIFT, PAGE_KERNEL_RX); + printk("Write protecting the kernel text: %luk\n", size >> 10); + } +#endif + start += size; + size = (unsigned long)__end_rodata - start; + change_page_attr(virt_to_page(start), + size >> PAGE_SHIFT, PAGE_KERNEL_RO); + printk("Write protecting the kernel read-only data: %luk\n", + size >> 10); /* * change_page_attr() requires a global_flush_tlb() call after it. @@ -840,7 +883,7 @@ void free_init_pages(char *what, unsigne free_page(addr); totalram_pages++; } - printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); } void free_initmem(void) --- a/arch/x86/mm/init_64-xen.c +++ b/arch/x86/mm/init_64-xen.c @@ -25,10 +25,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include @@ -51,7 +53,7 @@ #define Dprintk(x...) 
#endif -struct dma_mapping_ops* dma_ops; +const struct dma_mapping_ops* dma_ops; EXPORT_SYMBOL(dma_ops); #if CONFIG_XEN_COMPAT <= 0x030002 @@ -191,6 +193,13 @@ void show_mem(void) for_each_online_pgdat(pgdat) { for (i = 0; i < pgdat->node_spanned_pages; ++i) { + /* this loop can take a while with 256 GB and 4k pages + so update the NMI watchdog */ + if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) { + touch_nmi_watchdog(); + } + if (!pfn_valid(pgdat->node_start_pfn + i)) + continue; page = pfn_to_page(pgdat->node_start_pfn + i); total++; if (PageReserved(page)) @@ -359,7 +368,7 @@ __set_fixmap (enum fixed_addresses idx, } } -unsigned long __initdata table_start, table_end; +unsigned long __meminitdata table_start, table_end; static __meminit void *alloc_static_page(unsigned long *phys) { @@ -376,7 +385,7 @@ static __meminit void *alloc_static_page start_pfn++; memset((void *)va, 0, PAGE_SIZE); return (void *)va; -} +} #define PTE_SIZE PAGE_SIZE @@ -412,28 +421,46 @@ static inline int make_readonly(unsigned #ifndef CONFIG_XEN /* Must run before zap_low_mappings */ -__init void *early_ioremap(unsigned long addr, unsigned long size) +__meminit void *early_ioremap(unsigned long addr, unsigned long size) { - unsigned long map = round_down(addr, LARGE_PAGE_SIZE); - - /* actually usually some more */ - if (size >= LARGE_PAGE_SIZE) { - return NULL; + unsigned long vaddr; + pmd_t *pmd, *last_pmd; + int i, pmds; + + pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; + vaddr = __START_KERNEL_map; + pmd = level2_kernel_pgt; + last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1; + for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) { + for (i = 0; i < pmds; i++) { + if (pmd_present(pmd[i])) + goto next; + } + vaddr += addr & ~PMD_MASK; + addr &= PMD_MASK; + for (i = 0; i < pmds; i++, addr += PMD_SIZE) + set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE)); + __flush_tlb(); + return (void *)vaddr; + next: + ; } - set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE)); - map += LARGE_PAGE_SIZE; - set_pmd(temp_mappings[1].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE)); - __flush_tlb(); - return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1)); + printk("early_ioremap(0x%lx, %lu) failed\n", addr, size); + return NULL; } /* To avoid virtual aliases later */ -__init void early_iounmap(void *addr, unsigned long size) +__meminit void early_iounmap(void *addr, unsigned long size) { - if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address) - printk("early_iounmap: bad address %p\n", addr); - set_pmd(temp_mappings[0].pmd, __pmd(0)); - set_pmd(temp_mappings[1].pmd, __pmd(0)); + unsigned long vaddr; + pmd_t *pmd; + int i, pmds; + + vaddr = (unsigned long)addr; + pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; + pmd = level2_kernel_pgt + pmd_index(vaddr); + for (i = 0; i < pmds; i++) + pmd_clear(pmd + i); __flush_tlb(); } #endif @@ -767,14 +794,6 @@ void __meminit init_memory_mapping(unsig __flush_tlb_all(); } -void __cpuinit zap_low_mappings(int cpu) -{ - /* this is not required for Xen */ -#if 0 - swap_low_mappings(); -#endif -} - #ifndef CONFIG_NUMA void __init paging_init(void) { @@ -960,17 +979,6 @@ void __init mem_init(void) reservedpages << (PAGE_SHIFT-10), datasize >> 10, initsize >> 10); - -#ifndef CONFIG_XEN -#ifdef CONFIG_SMP - /* - * Sync boot_level4_pgt mappings with the init_level4_pgt - * except for the low identity mappings which are already zapped - * in init_level4_pgt. 
This sync-up is essential for AP's bringup - */ - memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t)); -#endif -#endif } void free_init_pages(char *what, unsigned long begin, unsigned long end) @@ -980,7 +988,7 @@ void free_init_pages(char *what, unsigne if (begin >= end) return; - printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); for (addr = begin; addr < end; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); init_page_count(virt_to_page(addr)); @@ -989,24 +997,17 @@ void free_init_pages(char *what, unsigne if (addr >= __START_KERNEL_map) { /* make_readonly() reports all kernel addresses. */ __make_page_writable(__va(__pa(addr))); - if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) { - pgd_t *pgd = pgd_offset_k(addr); - pud_t *pud = pud_offset(pgd, addr); - pmd_t *pmd = pmd_offset(pud, addr); - pte_t *pte = pte_offset_kernel(pmd, addr); - - xen_l1_entry_update(pte, __pte(0)); /* fallback */ - } + change_page_attr_addr(addr, 1, __pgprot(0)); } free_page(addr); totalram_pages++; } + if (addr > __START_KERNEL_map) + global_flush_tlb(); } void free_initmem(void) { - memset(__initdata_begin, POISON_FREE_INITDATA, - __initdata_end - __initdata_begin); free_init_pages("unused kernel memory", (unsigned long)(&__init_begin), (unsigned long)(&__init_end)); @@ -1016,13 +1017,28 @@ void free_initmem(void) void mark_rodata_ro(void) { - unsigned long addr = (unsigned long)__start_rodata; + unsigned long start = (unsigned long)_stext, end; + +#ifdef CONFIG_HOTPLUG_CPU + /* It must still be possible to apply SMP alternatives. */ + if (num_possible_cpus() > 1) + start = (unsigned long)_etext; +#endif + +#ifdef CONFIG_KPROBES + start = (unsigned long)__start_rodata; +#endif + + end = (unsigned long)__end_rodata; + start = (start + PAGE_SIZE - 1) & PAGE_MASK; + end &= PAGE_MASK; + if (end <= start) + return; - for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE) - change_page_attr_addr(addr, 1, PAGE_KERNEL_RO); + change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO); - printk ("Write protecting the kernel read-only data: %luk\n", - (__end_rodata - __start_rodata) >> 10); + printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", + (end - start) >> 10); /* * change_page_attr_addr() requires a global_flush_tlb() call after it. @@ -1175,3 +1191,11 @@ int in_gate_area_no_task(unsigned long a { return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); } + +#ifndef CONFIG_XEN +void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size) +{ + return __alloc_bootmem_core(pgdat->bdata, size, + SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0); +} +#endif --- a/arch/x86/mm/ioremap_32-xen.c +++ b/arch/x86/mm/ioremap_32-xen.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include --- a/arch/x86/mm/pageattr_64-xen.c +++ b/arch/x86/mm/pageattr_64-xen.c @@ -215,13 +215,13 @@ void mm_pin_all(void) preempt_enable(); } -void _arch_dup_mmap(struct mm_struct *mm) +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { if (!mm->context.pinned) mm_pin(mm); } -void _arch_exit_mmap(struct mm_struct *mm) +void arch_exit_mmap(struct mm_struct *mm) { struct task_struct *tsk = current; @@ -343,10 +343,11 @@ static void flush_kernel_map(void *arg) struct page *pg; /* When clflush is available always use it because it is - much cheaper than WBINVD */ - if (!cpu_has_clflush) + much cheaper than WBINVD. 
Disable clflush for now because + the high level code is not ready yet */ + if (1 || !cpu_has_clflush) asm volatile("wbinvd" ::: "memory"); - list_for_each_entry(pg, l, lru) { + else list_for_each_entry(pg, l, lru) { void *adr = page_address(pg); if (cpu_has_clflush) cache_flush_page(adr); @@ -460,16 +461,24 @@ __change_page_attr(unsigned long address */ int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) { - int err = 0; + int err = 0, kernel_map = 0; int i; + if (address >= __START_KERNEL_map + && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) { + address = (unsigned long)__va(__pa(address)); + kernel_map = 1; + } + down_write(&init_mm.mmap_sem); for (i = 0; i < numpages; i++, address += PAGE_SIZE) { unsigned long pfn = __pa(address) >> PAGE_SHIFT; - err = __change_page_attr(address, pfn, prot, PAGE_KERNEL); - if (err) - break; + if (!kernel_map || pte_present(pfn_pte(0, prot))) { + err = __change_page_attr(address, pfn, prot, PAGE_KERNEL); + if (err) + break; + } /* Handle kernel mapping too which aliases part of the * lowmem */ if (__pa(address) < KERNEL_TEXT_SIZE) { --- a/arch/x86/mm/pgtable_32-xen.c +++ b/arch/x86/mm/pgtable_32-xen.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -218,8 +219,6 @@ void pmd_ctor(void *pmd, struct kmem_cac * against pageattr.c; it is the unique case in which a valid change * of kernel pagetables can't be lazily synchronized by vmalloc faults. * vmalloc faults work because attached pagetables are never freed. - * The locking scheme was chosen on the basis of manfred's - * recommendations and having no core impact whatsoever. * -- wli */ DEFINE_SPINLOCK(pgd_lock); @@ -245,37 +244,54 @@ static inline void pgd_list_del(pgd_t *p set_page_private(next, (unsigned long)pprev); } -void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused) + + +#if (PTRS_PER_PMD == 1) +/* Non-PAE pgd constructor */ +void pgd_ctor(void *pgd) { unsigned long flags; - if (PTRS_PER_PMD > 1) { - if (HAVE_SHARED_KERNEL_PMD) - clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - KERNEL_PGD_PTRS); - } else { - spin_lock_irqsave(&pgd_lock, flags); + /* !PAE, no pagetable sharing */ + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); + + spin_lock_irqsave(&pgd_lock, flags); + + /* must happen under lock */ + clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, + swapper_pg_dir + USER_PTRS_PER_PGD, + KERNEL_PGD_PTRS); + + paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, + __pa(swapper_pg_dir) >> PAGE_SHIFT, + USER_PTRS_PER_PGD, + KERNEL_PGD_PTRS); + pgd_list_add(pgd); + spin_unlock_irqrestore(&pgd_lock, flags); +} +#else /* PTRS_PER_PMD > 1 */ +/* PAE pgd constructor */ +void pgd_ctor(void *pgd) +{ + /* PAE, kernel PMD may be shared */ + + if (SHARED_KERNEL_PMD) { clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, KERNEL_PGD_PTRS); + } else { memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); - - /* must happen under lock */ - paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, - __pa(swapper_pg_dir) >> PAGE_SHIFT, - USER_PTRS_PER_PGD, PTRS_PER_PGD - USER_PTRS_PER_PGD); - - pgd_list_add(pgd); - spin_unlock_irqrestore(&pgd_lock, flags); } } +#endif /* PTRS_PER_PMD */ -/* never called when PTRS_PER_PMD > 1 */ -void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused) +void pgd_dtor(void *pgd) { unsigned long flags; /* can be called from interrupt context */ + if (SHARED_KERNEL_PMD) + return; + paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT); 
spin_lock_irqsave(&pgd_lock, flags); pgd_list_del(pgd); @@ -284,11 +300,46 @@ void pgd_dtor(void *pgd, struct kmem_cac pgd_test_and_unpin(pgd); } +#define UNSHARED_PTRS_PER_PGD \ + (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) + +/* If we allocate a pmd for part of the kernel address space, then + make sure its initialized with the appropriate kernel mappings. + Otherwise use a cached zeroed pmd. */ +static pmd_t *pmd_cache_alloc(int idx) +{ + pmd_t *pmd; + + if (idx >= USER_PTRS_PER_PGD) { + pmd = (pmd_t *)__get_free_page(GFP_KERNEL); + +#ifndef CONFIG_XEN + if (pmd) + memcpy(pmd, + (void *)pgd_page_vaddr(swapper_pg_dir[idx]), + sizeof(pmd_t) * PTRS_PER_PMD); +#endif + } else + pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); + + return pmd; +} + +static void pmd_cache_free(pmd_t *pmd, int idx) +{ + if (idx >= USER_PTRS_PER_PGD) { + make_lowmem_page_writable(pmd, XENFEAT_writable_page_tables); + memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); + free_page((unsigned long)pmd); + } else + kmem_cache_free(pmd_cache, pmd); +} + pgd_t *pgd_alloc(struct mm_struct *mm) { int i; - pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL); - pmd_t **pmd; + pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor); + pmd_t **pmds = NULL; unsigned long flags; pgd_test_and_unpin(pgd); @@ -296,37 +347,40 @@ pgd_t *pgd_alloc(struct mm_struct *mm) if (PTRS_PER_PMD == 1 || !pgd) return pgd; - if (HAVE_SHARED_KERNEL_PMD) { - for (i = 0; i < USER_PTRS_PER_PGD; ++i) { - pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); - if (!pmd) - goto out_oom; - paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT); - set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); +#ifdef CONFIG_XEN + if (!SHARED_KERNEL_PMD) { + /* + * We can race save/restore (if we sleep during a GFP_KERNEL memory + * allocation). We therefore store virtual addresses of pmds as they + * do not change across save/restore, and poke the machine addresses + * into the pgdir under the pgd_lock. + */ + pmds = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL); + if (!pmds) { + quicklist_free(0, pgd_dtor, pgd); + return NULL; } - return pgd; - } - - /* - * We can race save/restore (if we sleep during a GFP_KERNEL memory - * allocation). We therefore store virtual addresses of pmds as they - * do not change across save/restore, and poke the machine addresses - * into the pgdir under the pgd_lock. - */ - pmd = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL); - if (!pmd) { - kmem_cache_free(pgd_cache, pgd); - return NULL; } +#endif /* Allocate pmds, remember virtual addresses. */ - for (i = 0; i < PTRS_PER_PGD; ++i) { - pmd[i] = kmem_cache_alloc(pmd_cache, GFP_KERNEL); - if (!pmd[i]) + for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) { + pmd_t *pmd = pmd_cache_alloc(i); + + if (!pmd) goto out_oom; + paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT); + if (pmds) + pmds[i] = pmd; + else + set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); } +#ifdef CONFIG_XEN + if (SHARED_KERNEL_PMD) + return pgd; + spin_lock_irqsave(&pgd_lock, flags); /* Protect against save/restore: move below 4GB under pgd_lock. */ @@ -341,44 +395,43 @@ pgd_t *pgd_alloc(struct mm_struct *mm) /* Copy kernel pmd contents and write-protect the new pmds. 
*/ for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) { - unsigned long v = (unsigned long)i << PGDIR_SHIFT; - pgd_t *kpgd = pgd_offset_k(v); - pud_t *kpud = pud_offset(kpgd, v); - pmd_t *kpmd = pmd_offset(kpud, v); - memcpy(pmd[i], kpmd, PAGE_SIZE); + memcpy(pmds[i], + (void *)pgd_page_vaddr(swapper_pg_dir[i]), + sizeof(pmd_t) * PTRS_PER_PMD); make_lowmem_page_readonly( - pmd[i], XENFEAT_writable_page_tables); + pmds[i], XENFEAT_writable_page_tables); } /* It is safe to poke machine addresses of pmds under the pmd_lock. */ for (i = 0; i < PTRS_PER_PGD; i++) - set_pgd(&pgd[i], __pgd(1 + __pa(pmd[i]))); + set_pgd(&pgd[i], __pgd(1 + __pa(pmds[i]))); /* Ensure this pgd gets picked up and pinned on save/restore. */ pgd_list_add(pgd); spin_unlock_irqrestore(&pgd_lock, flags); - kfree(pmd); + kfree(pmds); +#endif return pgd; out_oom: - if (HAVE_SHARED_KERNEL_PMD) { + if (!pmds) { for (i--; i >= 0; i--) { pgd_t pgdent = pgd[i]; void* pmd = (void *)__va(pgd_val(pgdent)-1); paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); - kmem_cache_free(pmd_cache, pmd); + pmd_cache_free(pmd, i); } } else { for (i--; i >= 0; i--) { - paravirt_release_pd(__pa(pmd[i]) >> PAGE_SHIFT); - kmem_cache_free(pmd_cache, pmd[i]); + paravirt_release_pd(__pa(pmds[i]) >> PAGE_SHIFT); + pmd_cache_free(pmds[i], i); } - kfree(pmd); + kfree(pmds); } - kmem_cache_free(pgd_cache, pgd); + quicklist_free(0, pgd_dtor, pgd); return NULL; } @@ -398,35 +451,24 @@ void pgd_free(pgd_t *pgd) /* in the PAE case user pgd entries are overwritten before usage */ if (PTRS_PER_PMD > 1) { - for (i = 0; i < USER_PTRS_PER_PGD; ++i) { + for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) { pgd_t pgdent = pgd[i]; void* pmd = (void *)__va(pgd_val(pgdent)-1); paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); - kmem_cache_free(pmd_cache, pmd); + pmd_cache_free(pmd, i); } - if (!HAVE_SHARED_KERNEL_PMD) { - unsigned long flags; - spin_lock_irqsave(&pgd_lock, flags); - pgd_list_del(pgd); - spin_unlock_irqrestore(&pgd_lock, flags); - - for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) { - pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1); - make_lowmem_page_writable( - pmd, XENFEAT_writable_page_tables); - memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); - kmem_cache_free(pmd_cache, pmd); - } - - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) - xen_destroy_contiguous_region( - (unsigned long)pgd, 0); - } + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) + xen_destroy_contiguous_region((unsigned long)pgd, 0); } /* in the non-PAE case, free_pgtables() clears user pgd entries */ - kmem_cache_free(pgd_cache, pgd); + quicklist_free(0, pgd_dtor, pgd); +} + +void check_pgt_cache(void) +{ + quicklist_trim(0, pgd_dtor, 25, 16); } void make_lowmem_page_readonly(void *va, unsigned int feature) @@ -723,13 +765,13 @@ void mm_pin_all(void) spin_unlock_irqrestore(&pgd_lock, flags); } -void _arch_dup_mmap(struct mm_struct *mm) +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { if (!test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags)) mm_pin(mm); } -void _arch_exit_mmap(struct mm_struct *mm) +void arch_exit_mmap(struct mm_struct *mm) { struct task_struct *tsk = current; --- a/drivers/char/tpm/tpm_xen.c +++ b/drivers/char/tpm/tpm_xen.c @@ -463,7 +463,7 @@ static int tpmif_connect(struct xenbus_d tp->backend_id = domid; err = bind_listening_port_to_irqhandler( - domid, tpmif_int, SA_SAMPLE_RANDOM, "tpmif", tp); + domid, tpmif_int, IRQF_SAMPLE_RANDOM, "tpmif", tp); if (err <= 0) { WPRINTK("bind_listening_port_to_irqhandler failed " "(err=%d)\n", err); --- a/drivers/pci/msi-xen.c +++ 
b/drivers/pci/msi-xen.c @@ -12,16 +12,15 @@ #include #include #include -#include #include #include #include +#include #include #include #include -#include #include "pci.h" #include "msi.h" @@ -154,6 +153,7 @@ int register_msi_get_owner(int (*func)(s get_owner = func; return 0; } +EXPORT_SYMBOL(register_msi_get_owner); int unregister_msi_get_owner(int (*func)(struct pci_dev *dev)) { @@ -162,6 +162,7 @@ int unregister_msi_get_owner(int (*func) get_owner = NULL; return 0; } +EXPORT_SYMBOL(unregister_msi_get_owner); static int msi_get_dev_owner(struct pci_dev *dev) { @@ -263,11 +264,6 @@ static int msi_map_vector(struct pci_dev return msi_map_pirq_to_vector(dev, -1, entry_nr, table_base); } -static int msi_init(void) -{ - return 0; -} - #ifdef CONFIG_PM static void __pci_restore_msi_state(struct pci_dev *dev) { @@ -434,21 +430,32 @@ static int msix_capability_init(struct p } /** - * pci_msi_supported - check whether MSI may be enabled on device + * pci_msi_check_device - check whether MSI may be enabled on a device * @dev: pointer to the pci_dev data structure of MSI device function + * @nvec: how many MSIs have been requested ? + * @type: are we checking for MSI or MSI-X ? * * Look at global flags, the device itself, and its parent busses - * to return 0 if MSI are supported for the device. + * to determine if MSI/-X are supported for the device. If MSI/-X is + * supported return 0, else return an error code. **/ -static -int pci_msi_supported(struct pci_dev * dev) +static int pci_msi_check_device(struct pci_dev* dev, int nvec, int type) { struct pci_bus *bus; + int ret; /* MSI must be globally enabled and supported by the device */ if (!pci_msi_enable || !dev || dev->no_msi) return -EINVAL; + /* + * You can't ask to have 0 or less MSIs configured. + * a) it's stupid .. + * b) the list manipulation code assumes nvec >= 1. + */ + if (nvec < 1) + return -ERANGE; + /* Any bridge which does NOT route MSI transactions from it's * secondary bus to it's primary bus must set NO_MSI flag on * the secondary pci_bus. @@ -459,6 +466,13 @@ int pci_msi_supported(struct pci_dev * d if (bus->bus_flags & PCI_BUS_FLAGS_NO_MSI) return -EINVAL; + ret = arch_msi_check_device(dev, nvec, type); + if (ret) + return ret; + + if (!pci_find_capability(dev, type)) + return -EINVAL; + return 0; } @@ -476,18 +490,15 @@ extern int pci_frontend_enable_msi(struc int pci_enable_msi(struct pci_dev* dev) { struct pci_bus *bus; - int pos, temp, status; - - if (pci_msi_supported(dev) < 0) - return -EINVAL; + int temp, status; for (bus = dev->bus; bus; bus = bus->parent) if (bus->bus_flags & PCI_BUS_FLAGS_NO_MSI) return -EINVAL; - status = msi_init(); - if (status < 0) - return status; + status = pci_msi_check_device(dev, 1, PCI_CAP_ID_MSI); + if (status) + return status; #ifdef CONFIG_XEN_PCIDEV_FRONTEND if (!is_initial_xendomain()) @@ -508,10 +519,6 @@ int pci_enable_msi(struct pci_dev* dev) temp = dev->irq; - pos = pci_find_capability(dev, PCI_CAP_ID_MSI); - if (!pos) - return -EINVAL; - /* Check whether driver already requested for MSI-X irqs */ if (dev->msix_enabled) { printk(KERN_INFO "PCI: %s: Can't enable MSI. 
" @@ -526,15 +533,14 @@ int pci_enable_msi(struct pci_dev* dev) return status; } +EXPORT_SYMBOL(pci_enable_msi); extern void pci_frontend_disable_msi(struct pci_dev* dev); void pci_disable_msi(struct pci_dev* dev) { int pirq; - if (!pci_msi_enable) - return; - if (!dev) + if (!pci_msi_enable || !dev) return; #ifdef CONFIG_XEN_PCIDEV_FRONTEND @@ -559,6 +565,7 @@ void pci_disable_msi(struct pci_dev* dev pci_intx(dev, 1); /* enable intx */ dev->msi_enabled = 0; } +EXPORT_SYMBOL(pci_disable_msi); /** * pci_enable_msix - configure device's MSI-X capability structure @@ -583,7 +590,7 @@ int pci_enable_msix(struct pci_dev* dev, int i, j, temp; u16 control; - if (!entries || pci_msi_supported(dev) < 0) + if (!entries) return -EINVAL; #ifdef CONFIG_XEN_PCIDEV_FRONTEND @@ -621,14 +628,11 @@ int pci_enable_msix(struct pci_dev* dev, } #endif - status = msi_init(); - if (status < 0) + status = pci_msi_check_device(dev, nvec, PCI_CAP_ID_MSIX); + if (status) return status; pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); - if (!pos) - return -EINVAL; - pci_read_config_word(dev, msi_control_reg(pos), &control); nr_entries = multi_msix_capable(control); if (nvec > nr_entries) @@ -660,6 +664,7 @@ int pci_enable_msix(struct pci_dev* dev, return status; } +EXPORT_SYMBOL(pci_enable_msix); extern void pci_frontend_disable_msix(struct pci_dev* dev); void pci_disable_msix(struct pci_dev* dev) @@ -699,6 +704,7 @@ void pci_disable_msix(struct pci_dev* de pci_intx(dev, 1); /* enable intx */ dev->msix_enabled = 0; } +EXPORT_SYMBOL(pci_disable_msix); /** * msi_remove_pci_irq_vectors - reclaim MSI(X) irqs to unused state @@ -742,12 +748,57 @@ void pci_no_msi(void) pci_msi_enable = 0; } -EXPORT_SYMBOL(pci_enable_msi); -EXPORT_SYMBOL(pci_disable_msi); -EXPORT_SYMBOL(pci_enable_msix); -EXPORT_SYMBOL(pci_disable_msix); -#ifdef CONFIG_XEN -EXPORT_SYMBOL(register_msi_get_owner); -EXPORT_SYMBOL(unregister_msi_get_owner); +void pci_msi_init_pci_dev(struct pci_dev *dev) +{ +#ifndef CONFIG_XEN + INIT_LIST_HEAD(&dev->msi_list); #endif +} + + +/* Arch hooks */ + +int __attribute__ ((weak)) +arch_msi_check_device(struct pci_dev* dev, int nvec, int type) +{ + return 0; +} + +#ifndef CONFIG_XEN +int __attribute__ ((weak)) +arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *entry) +{ + return 0; +} + +int __attribute__ ((weak)) +arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + struct msi_desc *entry; + int ret; + list_for_each_entry(entry, &dev->msi_list, list) { + ret = arch_setup_msi_irq(dev, entry); + if (ret) + return ret; + } + + return 0; +} + +void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq) +{ + return; +} + +void __attribute__ ((weak)) +arch_teardown_msi_irqs(struct pci_dev *dev) +{ + struct msi_desc *entry; + + list_for_each_entry(entry, &dev->msi_list, list) { + if (entry->irq != 0) + arch_teardown_msi_irq(entry->irq); + } +} +#endif --- a/drivers/xen/blkfront/blkfront.c +++ b/drivers/xen/blkfront/blkfront.c @@ -241,7 +241,7 @@ static int setup_blkring(struct xenbus_d info->ring_ref = err; err = bind_listening_port_to_irqhandler( - dev->otherend_id, blkif_int, SA_SAMPLE_RANDOM, "blkif", info); + dev->otherend_id, blkif_int, IRQF_SAMPLE_RANDOM, "blkif", info); if (err <= 0) { xenbus_dev_fatal(dev, err, "bind_listening_port_to_irqhandler"); --- a/drivers/xen/char/mem.c +++ b/drivers/xen/char/mem.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include --- a/drivers/xen/core/hypervisor_sysfs.c +++ b/drivers/xen/core/hypervisor_sysfs.c @@ -50,7 +50,7 @@ static 
int __init hypervisor_subsys_init if (!is_running_on_xen()) return -ENODEV; - hypervisor_subsys.kset.kobj.ktype = &hyp_sysfs_kobj_type; + hypervisor_subsys.kobj.ktype = &hyp_sysfs_kobj_type; return 0; } --- a/drivers/xen/core/smpboot.c +++ b/drivers/xen/core/smpboot.c @@ -165,13 +165,12 @@ static void xen_smp_intr_exit(unsigned i void __cpuinit cpu_bringup(void) { + cpu_init(); #ifdef __i386__ - cpu_set_gdt(current_thread_info()->cpu); - secondary_cpu_init(); + identify_secondary_cpu(cpu_data + smp_processor_id()); #else - cpu_init(); -#endif identify_cpu(cpu_data + smp_processor_id()); +#endif touch_softlockup_watchdog(); preempt_disable(); local_irq_enable(); @@ -191,11 +190,6 @@ static void __cpuinit cpu_initialize_con static DEFINE_SPINLOCK(ctxt_lock); struct task_struct *idle = idle_task(cpu); -#ifdef __x86_64__ - struct desc_ptr *gdt_descr = &cpu_gdt_descr[cpu]; -#else - struct Xgt_desc_struct *gdt_descr = &per_cpu(cpu_gdt_descr, cpu); -#endif if (cpu_test_and_set(cpu, cpu_initialized_map)) return; @@ -218,11 +212,11 @@ static void __cpuinit cpu_initialize_con smp_trap_init(ctxt.trap_ctxt); ctxt.ldt_ents = 0; - - ctxt.gdt_frames[0] = virt_to_mfn(gdt_descr->address); - ctxt.gdt_ents = gdt_descr->size / 8; + ctxt.gdt_ents = GDT_SIZE / 8; #ifdef __i386__ + ctxt.gdt_frames[0] = virt_to_mfn(get_cpu_gdt_table(cpu)); + ctxt.user_regs.cs = __KERNEL_CS; ctxt.user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs); @@ -235,7 +229,11 @@ static void __cpuinit cpu_initialize_con ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); + + ctxt.user_regs.fs = __KERNEL_PERCPU; #else /* __x86_64__ */ + ctxt.gdt_frames[0] = virt_to_mfn(cpu_gdt_descr[cpu].address); + ctxt.user_regs.cs = __KERNEL_CS; ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs); @@ -265,9 +263,8 @@ void __init smp_prepare_cpus(unsigned in struct vcpu_get_physid cpu_id; #ifdef __x86_64__ struct desc_ptr *gdt_descr; -#else - struct Xgt_desc_struct *gdt_descr; #endif + void *gdt_addr; apicid = 0; if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, 0, &cpu_id) == 0) @@ -317,14 +314,12 @@ void __init smp_prepare_cpus(unsigned in } gdt_descr->size = GDT_SIZE; memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE); + gdt_addr = (void *)gdt_descr->address; #else - if (unlikely(!init_gdt(cpu, idle))) - continue; - gdt_descr = &per_cpu(cpu_gdt_descr, cpu); + init_gdt(cpu); + gdt_addr = get_cpu_gdt_table(cpu); #endif - make_page_readonly( - (void *)gdt_descr->address, - XENFEAT_writable_descriptor_tables); + make_page_readonly(gdt_addr, XENFEAT_writable_descriptor_tables); apicid = cpu; if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0) @@ -338,7 +333,9 @@ void __init smp_prepare_cpus(unsigned in #ifdef __x86_64__ cpu_pda(cpu)->pcurrent = idle; cpu_pda(cpu)->cpunumber = cpu; - clear_ti_thread_flag(idle->thread_info, TIF_FORK); + clear_ti_thread_flag(task_thread_info(idle), TIF_FORK); +#else + per_cpu(current_task, cpu) = idle; #endif irq_ctx_init(cpu); @@ -363,8 +360,12 @@ void __init smp_prepare_cpus(unsigned in #endif } -void __devinit smp_prepare_boot_cpu(void) +void __init smp_prepare_boot_cpu(void) { +#ifdef __i386__ + init_gdt(smp_processor_id()); + switch_to_new_gdt(); +#endif prefill_possible_map(); } --- a/drivers/xen/core/xen_sysfs.c +++ b/drivers/xen/core/xen_sysfs.c @@ -29,12 +29,12 @@ HYPERVISOR_ATTR_RO(type); static int __init xen_sysfs_type_init(void) { - return sysfs_create_file(&hypervisor_subsys.kset.kobj, &type_attr.attr); + return 
sysfs_create_file(&hypervisor_subsys.kobj, &type_attr.attr); } static void xen_sysfs_type_destroy(void) { - sysfs_remove_file(&hypervisor_subsys.kset.kobj, &type_attr.attr); + sysfs_remove_file(&hypervisor_subsys.kobj, &type_attr.attr); } /* xen version attributes */ @@ -90,13 +90,13 @@ static struct attribute_group version_gr static int __init xen_sysfs_version_init(void) { - return sysfs_create_group(&hypervisor_subsys.kset.kobj, + return sysfs_create_group(&hypervisor_subsys.kobj, &version_group); } static void xen_sysfs_version_destroy(void) { - sysfs_remove_group(&hypervisor_subsys.kset.kobj, &version_group); + sysfs_remove_group(&hypervisor_subsys.kobj, &version_group); } /* UUID */ @@ -126,12 +126,12 @@ HYPERVISOR_ATTR_RO(uuid); static int __init xen_sysfs_uuid_init(void) { - return sysfs_create_file(&hypervisor_subsys.kset.kobj, &uuid_attr.attr); + return sysfs_create_file(&hypervisor_subsys.kobj, &uuid_attr.attr); } static void xen_sysfs_uuid_destroy(void) { - sysfs_remove_file(&hypervisor_subsys.kset.kobj, &uuid_attr.attr); + sysfs_remove_file(&hypervisor_subsys.kobj, &uuid_attr.attr); } /* xen compilation attributes */ @@ -204,13 +204,13 @@ static struct attribute_group xen_compil int __init static xen_compilation_init(void) { - return sysfs_create_group(&hypervisor_subsys.kset.kobj, + return sysfs_create_group(&hypervisor_subsys.kobj, &xen_compilation_group); } static void xen_compilation_destroy(void) { - sysfs_remove_group(&hypervisor_subsys.kset.kobj, + sysfs_remove_group(&hypervisor_subsys.kobj, &xen_compilation_group); } @@ -325,13 +325,13 @@ static struct attribute_group xen_proper static int __init xen_properties_init(void) { - return sysfs_create_group(&hypervisor_subsys.kset.kobj, + return sysfs_create_group(&hypervisor_subsys.kobj, &xen_properties_group); } static void xen_properties_destroy(void) { - sysfs_remove_group(&hypervisor_subsys.kset.kobj, + sysfs_remove_group(&hypervisor_subsys.kobj, &xen_properties_group); } @@ -350,13 +350,13 @@ HYPERVISOR_ATTR_RO(vmcoreinfo); static int __init xen_sysfs_vmcoreinfo_init(void) { - return sysfs_create_file(&hypervisor_subsys.kset.kobj, + return sysfs_create_file(&hypervisor_subsys.kobj, &vmcoreinfo_attr.attr); } static void xen_sysfs_vmcoreinfo_destroy(void) { - sysfs_remove_file(&hypervisor_subsys.kset.kobj, &vmcoreinfo_attr.attr); + sysfs_remove_file(&hypervisor_subsys.kobj, &vmcoreinfo_attr.attr); } #endif --- a/drivers/xen/netback/netback.c +++ b/drivers/xen/netback/netback.c @@ -179,7 +179,7 @@ static struct sk_buff *netbk_copy_skb(st goto err; skb_reserve(nskb, 16 + NET_IP_ALIGN); - headlen = nskb->end - nskb->data; + headlen = skb_end_pointer(nskb) - nskb->data; if (headlen > skb_headlen(skb)) headlen = skb_headlen(skb); ret = skb_copy_bits(skb, 0, __skb_put(nskb, headlen), headlen); @@ -225,11 +225,15 @@ static struct sk_buff *netbk_copy_skb(st len -= copy; } +#ifdef NET_SKBUFF_DATA_USES_OFFSET + offset = 0; +#else offset = nskb->data - skb->data; +#endif - nskb->h.raw = skb->h.raw + offset; - nskb->nh.raw = skb->nh.raw + offset; - nskb->mac.raw = skb->mac.raw + offset; + nskb->transport_header = skb->transport_header + offset; + nskb->network_header = skb->network_header + offset; + nskb->mac_header = skb->mac_header + offset; return nskb; @@ -1601,7 +1605,7 @@ static int __init netback_init(void) (void)bind_virq_to_irqhandler(VIRQ_DEBUG, 0, netif_be_dbg, - SA_SHIRQ, + IRQF_SHARED, "net-be-dbg", &netif_be_dbg); #endif --- a/drivers/xen/netfront/netfront.c +++ b/drivers/xen/netfront/netfront.c @@ -513,7 +513,7 @@ 
static int setup_device(struct xenbus_de memcpy(netdev->dev_addr, info->mac, ETH_ALEN); err = bind_listening_port_to_irqhandler( - dev->otherend_id, netif_int, SA_SAMPLE_RANDOM, netdev->name, + dev->otherend_id, netif_int, IRQF_SAMPLE_RANDOM, netdev->name, netdev); if (err < 0) goto fail; --- a/drivers/xen/pciback/xenbus.c +++ b/drivers/xen/pciback/xenbus.c @@ -99,7 +99,7 @@ static int pciback_do_attach(struct pcib err = bind_interdomain_evtchn_to_irqhandler( pdev->xdev->otherend_id, remote_evtchn, pciback_handle_event, - SA_SAMPLE_RANDOM, "pciback", pdev); + IRQF_SAMPLE_RANDOM, "pciback", pdev); if (err < 0) { xenbus_dev_fatal(pdev->xdev, err, "Error binding event channel to IRQ"); --- a/drivers/xen/pcifront/xenbus.c +++ b/drivers/xen/pcifront/xenbus.c @@ -10,10 +10,6 @@ #include #include "pcifront.h" -#ifndef __init_refok -#define __init_refok -#endif - #define INVALID_GRANT_REF (0) #define INVALID_EVTCHN (-1) --- a/drivers/xen/scsifront/xenbus.c +++ b/drivers/xen/scsifront/xenbus.c @@ -96,7 +96,7 @@ static int scsifront_alloc_ring(struct v err = bind_listening_port_to_irqhandler( dev->otherend_id, scsifront_intr, - SA_SAMPLE_RANDOM, "scsifront", info); + IRQF_SAMPLE_RANDOM, "scsifront", info); if (err <= 0) { xenbus_dev_fatal(dev, err, "bind_listening_port_to_irqhandler"); --- a/drivers/xen/sfc_netback/accel_fwd.c +++ b/drivers/xen/sfc_netback/accel_fwd.c @@ -308,7 +308,7 @@ static struct netback_accel *for_a_vnic( static inline int packet_is_arp_reply(struct sk_buff *skb) { return skb->protocol == ntohs(ETH_P_ARP) - && skb->nh.arph->ar_op == ntohs(ARPOP_REPLY); + && arp_hdr(skb)->ar_op == ntohs(ARPOP_REPLY); } @@ -392,12 +392,13 @@ void netback_accel_tx_packet(struct sk_b BUG_ON(fwd_priv == NULL); - if (is_broadcast_ether_addr(skb->mac.raw) && packet_is_arp_reply(skb)) { + if (is_broadcast_ether_addr(skb_mac_header(skb)) + && packet_is_arp_reply(skb)) { /* * update our fast path forwarding to reflect this * gratuitous ARP */ - mac = skb->mac.raw+ETH_ALEN; + mac = skb_mac_header(skb)+ETH_ALEN; DPRINTK("%s: found gratuitous ARP for " MAC_FMT "\n", __FUNCTION__, MAC_ARG(mac)); --- a/drivers/xen/sfc_netback/accel_solarflare.c +++ b/drivers/xen/sfc_netback/accel_solarflare.c @@ -114,7 +114,7 @@ bend_dl_tx_packet(struct efx_dl_device * BUG_ON(port == NULL); NETBACK_ACCEL_STATS_OP(global_stats.dl_tx_packets++); - if (skb->mac.raw != NULL) + if (skb_mac_header_was_set(skb)) netback_accel_tx_packet(skb, port->fwd_priv); else { DPRINTK("Ignoring packet with missing mac address\n"); --- a/drivers/xen/sfc_netfront/accel_tso.c +++ b/drivers/xen/sfc_netfront/accel_tso.c @@ -33,10 +33,9 @@ #include "accel_tso.h" -#define PTR_DIFF(p1, p2) ((u8*)(p1) - (u8*)(p2)) -#define ETH_HDR_LEN(skb) ((skb)->nh.raw - (skb)->data) -#define SKB_TCP_OFF(skb) PTR_DIFF ((skb)->h.th, (skb)->data) -#define SKB_IP_OFF(skb) PTR_DIFF ((skb)->nh.iph, (skb)->data) +#define ETH_HDR_LEN(skb) skb_network_offset(skb) +#define SKB_TCP_OFF(skb) skb_transport_offset(skb) +#define SKB_IP_OFF(skb) skb_network_offset(skb) /* * Set a maximum number of buffers in each output packet to make life @@ -114,9 +113,8 @@ struct netfront_accel_tso_state { static inline void tso_check_safe(struct sk_buff *skb) { EPRINTK_ON(skb->protocol != htons (ETH_P_IP)); EPRINTK_ON(((struct ethhdr*) skb->data)->h_proto != htons (ETH_P_IP)); - EPRINTK_ON(skb->nh.iph->protocol != IPPROTO_TCP); - EPRINTK_ON((SKB_TCP_OFF(skb) - + (skb->h.th->doff << 2u)) > skb_headlen(skb)); + EPRINTK_ON(ip_hdr(skb)->protocol != IPPROTO_TCP); + EPRINTK_ON((SKB_TCP_OFF(skb) + 
tcp_hdrlen(skb)) > skb_headlen(skb)); } @@ -129,17 +127,17 @@ static inline void tso_start(struct netf * All ethernet/IP/TCP headers combined size is TCP header size * plus offset of TCP header relative to start of packet. */ - st->p.header_length = (skb->h.th->doff << 2u) + SKB_TCP_OFF(skb); + st->p.header_length = tcp_hdrlen(skb) + SKB_TCP_OFF(skb); st->p.full_packet_size = (st->p.header_length + skb_shinfo(skb)->gso_size); st->p.gso_size = skb_shinfo(skb)->gso_size; - st->p.ip_id = htons(skb->nh.iph->id); - st->seqnum = ntohl(skb->h.th->seq); + st->p.ip_id = htons(ip_hdr(skb)->id); + st->seqnum = ntohl(tcp_hdr(skb)->seq); - EPRINTK_ON(skb->h.th->urg); - EPRINTK_ON(skb->h.th->syn); - EPRINTK_ON(skb->h.th->rst); + EPRINTK_ON(tcp_hdr(skb)->urg); + EPRINTK_ON(tcp_hdr(skb)->syn); + EPRINTK_ON(tcp_hdr(skb)->rst); st->remaining_len = skb->len - st->p.header_length; @@ -258,8 +256,8 @@ int tso_start_new_packet(netfront_accel_ /* This packet will be the last in the TSO burst. */ ip_length = (st->p.header_length - ETH_HDR_LEN(skb) + st->remaining_len); - tsoh_th->fin = skb->h.th->fin; - tsoh_th->psh = skb->h.th->psh; + tsoh_th->fin = tcp_hdr(skb)->fin; + tsoh_th->psh = tcp_hdr(skb)->psh; } tsoh_iph->tot_len = htons(ip_length); --- a/drivers/xen/sfc_netfront/accel_vi.c +++ b/drivers/xen/sfc_netfront/accel_vi.c @@ -463,7 +463,7 @@ netfront_accel_enqueue_skb_multi(netfron if (skb->ip_summed == CHECKSUM_PARTIAL) { /* Set to zero to encourage falcon to work it out for us */ - *(u16*)(skb->h.raw + skb->csum_offset) = 0; + *(u16*)(skb->head + skb->csum_start + skb->csum_offset) = 0; } if (multi_post_start_new_buffer(vnic, &state)) { @@ -582,7 +582,7 @@ netfront_accel_enqueue_skb_single(netfro if (skb->ip_summed == CHECKSUM_PARTIAL) { /* Set to zero to encourage falcon to work it out for us */ - *(u16*)(skb->h.raw + skb->csum_offset) = 0; + *(u16*)(skb->head + skb->csum_start + skb->csum_offset) = 0; } NETFRONT_ACCEL_PKTBUFF_FOR_EACH_FRAGMENT (skb, idx, frag_data, frag_len, { --- a/drivers/xen/sfc_netfront/accel_xenbus.c +++ b/drivers/xen/sfc_netfront/accel_xenbus.c @@ -356,7 +356,7 @@ static int vnic_setup_domU_shared_state( /* Create xenbus msg event channel */ err = bind_listening_port_to_irqhandler (dev->otherend_id, netfront_accel_msg_channel_irq_from_bend, - SA_SAMPLE_RANDOM, "vnicctrl", vnic); + IRQF_SAMPLE_RANDOM, "vnicctrl", vnic); if (err < 0) { EPRINTK("Couldn't bind msg event channel\n"); goto fail_msg_irq; @@ -367,7 +367,7 @@ static int vnic_setup_domU_shared_state( /* Create xenbus net event channel */ err = bind_listening_port_to_irqhandler (dev->otherend_id, netfront_accel_net_channel_irq_from_bend, - SA_SAMPLE_RANDOM, "vnicfront", vnic); + IRQF_SAMPLE_RANDOM, "vnicfront", vnic); if (err < 0) { EPRINTK("Couldn't bind net event channel\n"); goto fail_net_irq; --- a/fs/aio.c +++ b/fs/aio.c @@ -38,7 +38,7 @@ #ifdef CONFIG_EPOLL #include -#include +#include #endif #if DEBUG > 1 @@ -1309,7 +1309,7 @@ static const struct file_operations aioq /* make_aio_fd: * Create a file descriptor that can be used to poll the event queue. - * Based and piggybacked on the excellent epoll code. + * Based on the excellent epoll code. 
*/ static int make_aio_fd(struct kioctx *ioctx) @@ -1318,7 +1318,8 @@ static int make_aio_fd(struct kioctx *io struct inode *inode; struct file *file; - error = ep_getfd(&fd, &inode, &file, NULL, &aioq_fops); + error = anon_inode_getfd(&fd, &inode, &file, "[aioq]", + &aioq_fops, ioctx); if (error) return error; --- a/include/asm-x86/mach-xen/asm/desc_32.h +++ b/include/asm-x86/mach-xen/asm/desc_32.h @@ -11,23 +11,24 @@ #include -extern struct desc_struct cpu_gdt_table[GDT_ENTRIES]; - struct Xgt_desc_struct { unsigned short size; unsigned long address __attribute__((packed)); unsigned short pad; } __attribute__ ((packed)); -extern struct Xgt_desc_struct idt_descr; -DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); -extern struct Xgt_desc_struct early_gdt_descr; +struct gdt_page +{ + struct desc_struct gdt[GDT_ENTRIES]; +} __attribute__((aligned(PAGE_SIZE))); +DECLARE_PER_CPU(struct gdt_page, gdt_page); static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) { - return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address; + return per_cpu(gdt_page, cpu).gdt; } +extern struct Xgt_desc_struct idt_descr; extern struct desc_struct idt_table[]; extern void set_intr_gate(unsigned int irq, void * addr); @@ -55,53 +56,32 @@ static inline void pack_gate(__u32 *a, _ #define DESCTYPE_S 0x10 /* !system */ #ifndef CONFIG_XEN -#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8)) - -#define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr)) -#define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr)) +#define load_TR_desc() native_load_tr_desc() +#define load_gdt(dtr) native_load_gdt(dtr) +#define load_idt(dtr) native_load_idt(dtr) #define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr)) #define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt)) -#define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr)) -#define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr)) -#define store_tr(tr) __asm__ ("str %0":"=m" (tr)) +#define store_gdt(dtr) native_store_gdt(dtr) +#define store_idt(dtr) native_store_idt(dtr) +#define store_tr(tr) (tr = native_store_tr()) #define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt)) -#endif -#if TLS_SIZE != 24 -# error update this code. 
-#endif - -static inline void load_TLS(struct thread_struct *t, unsigned int cpu) -{ -#define C(i) if (HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), \ - *(u64 *)&t->tls_array[i]) \ - BUG() - C(0); C(1); C(2); -#undef C -} +#define load_TLS(t, cpu) native_load_tls(t, cpu) +#define set_ldt native_set_ldt -#ifndef CONFIG_XEN #define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) #define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) #define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) -static inline void write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b) +static inline void write_dt_entry(struct desc_struct *dt, + int entry, u32 entry_low, u32 entry_high) { - __u32 *lp = (__u32 *)((char *)dt + entry*8); - *lp = entry_a; - *(lp+1) = entry_b; + dt[entry].a = entry_low; + dt[entry].b = entry_high; } -#define set_ldt native_set_ldt -#else -extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b); -extern int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b); -#define set_ldt xen_set_ldt -#endif -#ifndef CONFIG_XEN -static inline fastcall void native_set_ldt(const void *addr, - unsigned int entries) +static inline void native_set_ldt(const void *addr, unsigned int entries) { if (likely(entries == 0)) __asm__ __volatile__("lldt %w0"::"q" (0)); @@ -116,6 +96,65 @@ static inline fastcall void native_set_l __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8)); } } + + +static inline void native_load_tr_desc(void) +{ + asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); +} + +static inline void native_load_gdt(const struct Xgt_desc_struct *dtr) +{ + asm volatile("lgdt %0"::"m" (*dtr)); +} + +static inline void native_load_idt(const struct Xgt_desc_struct *dtr) +{ + asm volatile("lidt %0"::"m" (*dtr)); +} + +static inline void native_store_gdt(struct Xgt_desc_struct *dtr) +{ + asm ("sgdt %0":"=m" (*dtr)); +} + +static inline void native_store_idt(struct Xgt_desc_struct *dtr) +{ + asm ("sidt %0":"=m" (*dtr)); +} + +static inline unsigned long native_store_tr(void) +{ + unsigned long tr; + asm ("str %0":"=r" (tr)); + return tr; +} + +static inline void native_load_tls(struct thread_struct *t, unsigned int cpu) +{ + unsigned int i; + struct desc_struct *gdt = get_cpu_gdt_table(cpu); + + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) + gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]; +} +#else +#define load_TLS(t, cpu) xen_load_tls(t, cpu) +#define set_ldt xen_set_ldt + +extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b); +extern int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b); + +static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu) +{ + unsigned int i; + struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN; + + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) + if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]), + *(u64 *)&t->tls_array[i])) + BUG(); +} #endif #ifndef CONFIG_X86_NO_IDT --- a/include/asm-x86/mach-xen/asm/desc_64.h +++ b/include/asm-x86/mach-xen/asm/desc_64.h @@ -127,16 +127,6 @@ static inline void set_ldt_desc(unsigned DESC_LDT, size * 8 - 1); } -static inline void set_seg_base(unsigned cpu, int entry, void *base) -{ - struct desc_struct *d = &cpu_gdt(cpu)[entry]; - u32 addr = (u32)(u64)base; - BUG_ON((u64)base >> 32); - d->base0 = addr & 0xffff; - d->base1 = (addr >> 16) & 0xff; - d->base2 = (addr >> 24) & 0xff; -} - #define LDT_entry_a(info) \ 
((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff)) /* Don't allow setting of the lm bit. It is useless anyways because @@ -165,25 +155,15 @@ static inline void set_seg_base(unsigned (info)->useable == 0 && \ (info)->lm == 0) -#if TLS_SIZE != 24 -# error update this code. -#endif - static inline void load_TLS(struct thread_struct *t, unsigned int cpu) { -#if 0 + unsigned int i; u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN); - gdt[0] = t->tls_array[0]; - gdt[1] = t->tls_array[1]; - gdt[2] = t->tls_array[2]; -#endif -#define C(i) \ - if (HYPERVISOR_update_descriptor(virt_to_machine(&cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]), \ - t->tls_array[i])) \ - BUG(); - C(0); C(1); C(2); -#undef C + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) + if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]), + t->tls_array[i])) + BUG(); } /* --- a/include/asm-x86/mach-xen/asm/dma-mapping_64.h +++ b/include/asm-x86/mach-xen/asm/dma-mapping_64.h @@ -51,7 +51,7 @@ struct dma_mapping_ops { }; extern dma_addr_t bad_dma_address; -extern struct dma_mapping_ops* dma_ops; +extern const struct dma_mapping_ops* dma_ops; extern int iommu_merge; #if 0 --- a/include/asm-x86/mach-xen/asm/fixmap_32.h +++ b/include/asm-x86/mach-xen/asm/fixmap_32.h @@ -19,10 +19,8 @@ * the start of the fixmap. */ extern unsigned long __FIXADDR_TOP; -#ifdef CONFIG_COMPAT_VDSO -#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO) -#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1) -#endif +#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO) +#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1) #ifndef __ASSEMBLY__ #include @@ -85,6 +83,9 @@ enum fixed_addresses { #ifdef CONFIG_PCI_MMCONFIG FIX_PCIE_MCFG, #endif +#ifdef CONFIG_PARAVIRT + FIX_PARAVIRT_BOOTMAP, +#endif FIX_SHARED_INFO, #define NR_FIX_ISAMAPS 256 FIX_ISAMAP_END, --- a/include/asm-x86/mach-xen/asm/fixmap_64.h +++ b/include/asm-x86/mach-xen/asm/fixmap_64.h @@ -15,7 +15,6 @@ #include #include #include -#include #include /* --- a/include/asm-x86/mach-xen/asm/highmem.h +++ b/include/asm-x86/mach-xen/asm/highmem.h @@ -67,12 +67,18 @@ extern void FASTCALL(kunmap_high(struct void *kmap(struct page *page); void kunmap(struct page *page); +void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); void *kmap_atomic(struct page *page, enum km_type type); void *kmap_atomic_pte(struct page *page, enum km_type type); void kunmap_atomic(void *kvaddr, enum km_type type); void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); struct page *kmap_atomic_to_page(void *ptr); +#define kmap_atomic_pte(page, type) \ + kmap_atomic_prot(page, type, \ + test_bit(PG_pinned, &(page)->flags) \ + ? 
PAGE_KERNEL_RO : kmap_prot) + #define flush_cache_kmaps() do { } while (0) void clear_highpage(struct page *); --- a/include/asm-x86/mach-xen/asm/io_32.h +++ b/include/asm-x86/mach-xen/asm/io_32.h @@ -263,15 +263,18 @@ static inline void flush_write_buffers(v #endif /* __KERNEL__ */ -#define __SLOW_DOWN_IO "outb %%al,$0x80;" +static inline void xen_io_delay(void) +{ + asm volatile("outb %%al,$0x80" : : : "memory"); +} static inline void slow_down_io(void) { - __asm__ __volatile__( - __SLOW_DOWN_IO + xen_io_delay(); #ifdef REALLY_SLOW_IO - __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO + xen_io_delay(); + xen_io_delay(); + xen_io_delay(); #endif - : : ); } #ifdef CONFIG_X86_NUMAQ --- a/include/asm-x86/mach-xen/asm/irqflags_32.h +++ b/include/asm-x86/mach-xen/asm/irqflags_32.h @@ -11,6 +11,40 @@ #define _ASM_IRQFLAGS_H #ifndef __ASSEMBLY__ +#define xen_save_fl(void) (current_vcpu_info()->evtchn_upcall_mask) + +#define xen_restore_fl(f) \ +do { \ + vcpu_info_t *_vcpu; \ + barrier(); \ + _vcpu = current_vcpu_info(); \ + if ((_vcpu->evtchn_upcall_mask = (f)) == 0) { \ + barrier(); /* unmask then check (avoid races) */\ + if (unlikely(_vcpu->evtchn_upcall_pending)) \ + force_evtchn_callback(); \ + } \ +} while (0) + +#define xen_irq_disable() \ +do { \ + current_vcpu_info()->evtchn_upcall_mask = 1; \ + barrier(); \ +} while (0) + +#define xen_irq_enable() \ +do { \ + vcpu_info_t *_vcpu; \ + barrier(); \ + _vcpu = current_vcpu_info(); \ + _vcpu->evtchn_upcall_mask = 0; \ + barrier(); /* unmask then check (avoid races) */ \ + if (unlikely(_vcpu->evtchn_upcall_pending)) \ + force_evtchn_callback(); \ +} while (0) + +void xen_safe_halt(void); + +void xen_halt(void); /* * The use of 'barrier' in the following reflects their use as local-lock @@ -20,48 +54,31 @@ * includes these barriers, for example. 
*/ -#define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask) +#define __raw_local_save_flags() xen_save_fl() -#define raw_local_irq_restore(x) \ -do { \ - vcpu_info_t *_vcpu; \ - barrier(); \ - _vcpu = current_vcpu_info(); \ - if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \ - barrier(); /* unmask then check (avoid races) */ \ - if (unlikely(_vcpu->evtchn_upcall_pending)) \ - force_evtchn_callback(); \ - } \ -} while (0) +#define raw_local_irq_restore(flags) xen_restore_fl(flags) -#define raw_local_irq_disable() \ -do { \ - current_vcpu_info()->evtchn_upcall_mask = 1; \ - barrier(); \ -} while (0) +#define raw_local_irq_disable() xen_irq_disable() -#define raw_local_irq_enable() \ -do { \ - vcpu_info_t *_vcpu; \ - barrier(); \ - _vcpu = current_vcpu_info(); \ - _vcpu->evtchn_upcall_mask = 0; \ - barrier(); /* unmask then check (avoid races) */ \ - if (unlikely(_vcpu->evtchn_upcall_pending)) \ - force_evtchn_callback(); \ -} while (0) +#define raw_local_irq_enable() xen_irq_enable() /* * Used in the idle loop; sti takes one instruction cycle * to complete: */ -void raw_safe_halt(void); +static inline void raw_safe_halt(void) +{ + xen_safe_halt(); +} /* * Used when interrupts are already enabled or to * shutdown the processor: */ -void halt(void); +static inline void halt(void) +{ + xen_halt(); +} /* * For spinlocks, etc: --- a/include/asm-x86/mach-xen/asm/irqflags_64.h +++ b/include/asm-x86/mach-xen/asm/irqflags_64.h @@ -9,6 +9,7 @@ */ #ifndef _ASM_IRQFLAGS_H #define _ASM_IRQFLAGS_H +#include #ifndef __ASSEMBLY__ /* @@ -50,19 +51,19 @@ static inline void raw_local_irq_disable { unsigned long flags = __raw_local_save_flags(); - raw_local_irq_restore((flags & ~(1 << 9)) | (1 << 18)); + raw_local_irq_restore((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC); } static inline void raw_local_irq_enable(void) { unsigned long flags = __raw_local_save_flags(); - raw_local_irq_restore((flags | (1 << 9)) & ~(1 << 18)); + raw_local_irq_restore((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC)); } static inline int raw_irqs_disabled_flags(unsigned long flags) { - return !(flags & (1<<9)) || (flags & (1 << 18)); + return !(flags & X86_EFLAGS_IF) || (flags & X86_EFLAGS_AC); } #else /* CONFIG_X86_VSMP */ @@ -118,13 +119,21 @@ static inline int raw_irqs_disabled_flag * Used in the idle loop; sti takes one instruction cycle * to complete: */ -void raw_safe_halt(void); +void xen_safe_halt(void); +static inline void raw_safe_halt(void) +{ + xen_safe_halt(); +} /* * Used when interrupts are already enabled or to * shutdown the processor: */ -void halt(void); +void xen_halt(void); +static inline void halt(void) +{ + xen_halt(); +} #else /* __ASSEMBLY__: */ # ifdef CONFIG_TRACE_IRQFLAGS --- a/include/asm-x86/mach-xen/asm/mmu_context_32.h +++ b/include/asm-x86/mach-xen/asm/mmu_context_32.h @@ -6,6 +6,20 @@ #include #include +void arch_exit_mmap(struct mm_struct *mm); +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); + +void mm_pin(struct mm_struct *mm); +void mm_unpin(struct mm_struct *mm); +void mm_pin_all(void); + +static inline void xen_activate_mm(struct mm_struct *prev, + struct mm_struct *next) +{ + if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags)) + mm_pin(next); +} + /* * Used for LDT copy/destruction. 
*/ @@ -37,10 +51,6 @@ static inline void __prepare_arch_switch : : "r" (0) ); } -extern void mm_pin(struct mm_struct *mm); -extern void mm_unpin(struct mm_struct *mm); -void mm_pin_all(void); - static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) @@ -97,11 +107,10 @@ static inline void switch_mm(struct mm_s #define deactivate_mm(tsk, mm) \ asm("movl %0,%%gs": :"r" (0)); -static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next) -{ - if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags)) - mm_pin(next); - switch_mm(prev, next, NULL); -} +#define activate_mm(prev, next) \ + do { \ + xen_activate_mm(prev, next); \ + switch_mm((prev),(next),NULL); \ + } while(0) #endif --- a/include/asm-x86/mach-xen/asm/mmu_context_64.h +++ b/include/asm-x86/mach-xen/asm/mmu_context_64.h @@ -9,6 +9,9 @@ #include #include +void arch_exit_mmap(struct mm_struct *mm); +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); + /* * possibly do the LDT unload here? */ --- a/include/asm-x86/mach-xen/asm/page_64.h +++ b/include/asm-x86/mach-xen/asm/page_64.h @@ -7,6 +7,7 @@ #include #include #endif +#include #include /* @@ -19,18 +20,14 @@ /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 -#ifdef __ASSEMBLY__ -#define PAGE_SIZE (0x1 << PAGE_SHIFT) -#else -#define PAGE_SIZE (1UL << PAGE_SHIFT) -#endif +#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) /* See Documentation/x86_64/mm.txt for a description of the memory map. */ #define __PHYSICAL_MASK_SHIFT 46 -#define __PHYSICAL_MASK ((1UL << __PHYSICAL_MASK_SHIFT) - 1) +#define __PHYSICAL_MASK ((_AC(1,UL) << __PHYSICAL_MASK_SHIFT) - 1) #define __VIRTUAL_MASK_SHIFT 48 -#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) +#define __VIRTUAL_MASK ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1) #define PHYSICAL_PAGE_MASK (~(PAGE_SIZE-1) & __PHYSICAL_MASK) @@ -55,10 +52,10 @@ #define N_EXCEPTION_STACKS 5 /* hw limit: 7 */ #define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1)) -#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT) +#define LARGE_PAGE_SIZE (_AC(1,UL) << PMD_SHIFT) #define HPAGE_SHIFT PMD_SHIFT -#define HPAGE_SIZE ((1UL) << HPAGE_SHIFT) +#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT) #define HPAGE_MASK (~(HPAGE_SIZE - 1)) #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) @@ -152,17 +149,23 @@ static inline pgd_t __pgd(unsigned long #define __pgprot(x) ((pgprot_t) { (x) } ) -#define __PHYSICAL_START ((unsigned long)CONFIG_PHYSICAL_START) -#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) -#define __START_KERNEL_map 0xffffffff80000000UL -#define __PAGE_OFFSET 0xffff880000000000UL +#endif /* !__ASSEMBLY__ */ -#else #define __PHYSICAL_START CONFIG_PHYSICAL_START +#define __KERNEL_ALIGN 0x200000 + +/* + * Make sure kernel is aligned to 2MB address. Catching it at compile + * time is better. 
Change your config file and compile the kernel + * for a 2MB aligned address (CONFIG_PHYSICAL_START) + */ +#if (CONFIG_PHYSICAL_START % __KERNEL_ALIGN) != 0 +#error "CONFIG_PHYSICAL_START must be a multiple of 2MB" +#endif + #define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) -#define __START_KERNEL_map 0xffffffff80000000 -#define __PAGE_OFFSET 0xffff880000000000 -#endif /* !__ASSEMBLY__ */ +#define __START_KERNEL_map _AC(0xffffffff80000000, UL) +#define __PAGE_OFFSET _AC(0xffff880000000000, UL) #if CONFIG_XEN_COMPAT <= 0x030002 #undef LOAD_OFFSET @@ -172,20 +175,20 @@ static inline pgd_t __pgd(unsigned long /* to align the pointer to the (next) page boundary */ #define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) -#define KERNEL_TEXT_SIZE (40UL*1024*1024) -#define KERNEL_TEXT_START 0xffffffff80000000UL +#define KERNEL_TEXT_SIZE (40*1024*1024) +#define KERNEL_TEXT_START _AC(0xffffffff80000000, UL) + +#define PAGE_OFFSET __PAGE_OFFSET -#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) +#ifndef __ASSEMBLY__ +static inline unsigned long __phys_addr(unsigned long x) +{ + return x - (x >= __START_KERNEL_map ? __START_KERNEL_map : PAGE_OFFSET); +} +#endif -/* Note: __pa(&symbol_visible_to_c) should be always replaced with __pa_symbol. - Otherwise you risk miscompilation. */ -#define __pa(x) (((unsigned long)(x)>=__START_KERNEL_map)?(unsigned long)(x) - (unsigned long)__START_KERNEL_map:(unsigned long)(x) - PAGE_OFFSET) -/* __pa_symbol should be used for C visible symbols. - This seems to be the official gcc blessed way to do such arithmetic. */ -#define __pa_symbol(x) \ - ({unsigned long v; \ - asm("" : "=r" (v) : "0" (x)); \ - __pa(v); }) +#define __pa(x) __phys_addr((unsigned long)(x)) +#define __pa_symbol(x) __phys_addr((unsigned long)(x)) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) #define __boot_va(x) __va(x) --- a/include/asm-x86/mach-xen/asm/pgalloc_32.h +++ b/include/asm-x86/mach-xen/asm/pgalloc_32.h @@ -1,7 +1,6 @@ #ifndef _I386_PGALLOC_H #define _I386_PGALLOC_H -#include #include #include /* for struct page */ #include /* for phys_to_virt and page_to_pseudophys */ @@ -69,6 +68,4 @@ do { \ #define pud_populate(mm, pmd, pte) BUG() #endif -#define check_pgt_cache() do { } while (0) - #endif /* _I386_PGALLOC_H */ --- a/include/asm-x86/mach-xen/asm/pgalloc_64.h +++ b/include/asm-x86/mach-xen/asm/pgalloc_64.h @@ -1,7 +1,6 @@ #ifndef _X86_64_PGALLOC_H #define _X86_64_PGALLOC_H -#include #include #include #include @@ -100,24 +99,16 @@ static inline void pgd_list_add(pgd_t *p struct page *page = virt_to_page(pgd); spin_lock(&pgd_lock); - page->index = (pgoff_t)pgd_list; - if (pgd_list) - pgd_list->private = (unsigned long)&page->index; - pgd_list = page; - page->private = (unsigned long)&pgd_list; + list_add(&page->lru, &pgd_list); spin_unlock(&pgd_lock); } static inline void pgd_list_del(pgd_t *pgd) { - struct page *next, **pprev, *page = virt_to_page(pgd); + struct page *page = virt_to_page(pgd); spin_lock(&pgd_lock); - next = (struct page *)page->index; - pprev = (struct page **)page->private; - *pprev = next; - if (next) - next->private = (unsigned long)pprev; + list_del(&page->lru); spin_unlock(&pgd_lock); } --- a/include/asm-x86/mach-xen/asm/pgtable_32.h +++ b/include/asm-x86/mach-xen/asm/pgtable_32.h @@ -24,11 +24,11 @@ #include #include #include +#include /* Is this pagetable pinned? 
*/ #define PG_pinned PG_arch_1 -struct mm_struct; struct vm_area_struct; /* @@ -38,17 +38,16 @@ struct vm_area_struct; #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) extern unsigned long empty_zero_page[1024]; extern pgd_t *swapper_pg_dir; -extern struct kmem_cache *pgd_cache; extern struct kmem_cache *pmd_cache; extern spinlock_t pgd_lock; extern struct page *pgd_list; +void check_pgt_cache(void); void pmd_ctor(void *, struct kmem_cache *, unsigned long); -void pgd_ctor(void *, struct kmem_cache *, unsigned long); -void pgd_dtor(void *, struct kmem_cache *, unsigned long); void pgtable_cache_init(void); void paging_init(void); + /* * The Linux x86 paging architecture is 'compile-time dual-mode', it * implements both the traditional 2-level x86 page tables and the @@ -165,6 +164,7 @@ void paging_init(void); extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC; #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) +#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW) #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD) #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) @@ -172,6 +172,7 @@ extern unsigned long long __PAGE_KERNEL, #define PAGE_KERNEL __pgprot(__PAGE_KERNEL) #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) +#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) #define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) #define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) @@ -275,7 +276,13 @@ static inline pte_t pte_mkhuge(pte_t pte */ #define pte_update(mm, addr, ptep) do { } while (0) #define pte_update_defer(mm, addr, ptep) do { } while (0) -#define paravirt_map_pt_hook(slot, va, pfn) do { } while (0) + +/* local pte updates need not use xchg for locking */ +static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res) +{ + xen_set_pte(ptep, __pte(0)); + return res; +} /* * We only update the dirty/accessed state if we set @@ -286,17 +293,34 @@ static inline pte_t pte_mkhuge(pte_t pte */ #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS #define ptep_set_access_flags(vma, address, ptep, entry, dirty) \ -do { \ - if (dirty) \ +({ \ + int __changed = !pte_same(*(ptep), entry); \ + if (__changed && (dirty)) \ ptep_establish(vma, address, ptep, entry); \ -} while (0) + __changed; \ +}) -/* - * We don't actually have these, but we want to advertise them so that - * we can encompass the flush here. 
- */ #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY +#define ptep_test_and_clear_dirty(vma, addr, ptep) ({ \ + int __ret = 0; \ + if (pte_dirty(*(ptep))) \ + __ret = test_and_clear_bit(_PAGE_BIT_DIRTY, \ + &(ptep)->pte_low); \ + if (__ret) \ + pte_update((vma)->vm_mm, addr, ptep); \ + __ret; \ +}) + #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +#define ptep_test_and_clear_young(vma, addr, ptep) ({ \ + int __ret = 0; \ + if (pte_young(*(ptep))) \ + __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \ + &(ptep)->pte_low); \ + if (__ret) \ + pte_update((vma)->vm_mm, addr, ptep); \ + __ret; \ +}) /* * Rules for using ptep_establish: the pte MUST be a user pte, and @@ -323,7 +347,7 @@ do { \ int __dirty = pte_dirty(__pte); \ __pte = pte_mkclean(__pte); \ if (test_bit(PG_pinned, &virt_to_page((vma)->vm_mm->pgd)->flags)) \ - ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \ + (void)ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \ else if (__dirty) \ (ptep)->pte_low = __pte.pte_low; \ __dirty; \ @@ -336,7 +360,7 @@ do { \ int __young = pte_young(__pte); \ __pte = pte_mkold(__pte); \ if (test_bit(PG_pinned, &virt_to_page((vma)->vm_mm->pgd)->flags)) \ - ptep_set_access_flags(vma, address, ptep, __pte, __young); \ + (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \ else if (__young) \ (ptep)->pte_low = __pte.pte_low; \ __young; \ @@ -349,7 +373,7 @@ static inline pte_t ptep_get_and_clear(s if (!pte_none(pte) && (mm != &init_mm || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) { - pte = raw_ptep_get_and_clear(ptep, pte); + pte = xen_ptep_get_and_clear(ptep, pte); pte_update(mm, addr, ptep); } return pte; @@ -491,24 +515,10 @@ extern pte_t *lookup_address(unsigned lo #endif #if defined(CONFIG_HIGHPTE) -#define pte_offset_map(dir, address) \ -({ \ - pte_t *__ptep; \ - unsigned pfn = pmd_val(*(dir)) >> PAGE_SHIFT; \ - __ptep = (pte_t *)kmap_atomic_pte(pfn_to_page(pfn),KM_PTE0); \ - paravirt_map_pt_hook(KM_PTE0,__ptep, pfn); \ - __ptep = __ptep + pte_index(address); \ - __ptep; \ -}) -#define pte_offset_map_nested(dir, address) \ -({ \ - pte_t *__ptep; \ - unsigned pfn = pmd_val(*(dir)) >> PAGE_SHIFT; \ - __ptep = (pte_t *)kmap_atomic_pte(pfn_to_page(pfn),KM_PTE1); \ - paravirt_map_pt_hook(KM_PTE1,__ptep, pfn); \ - __ptep = __ptep + pte_index(address); \ - __ptep; \ -}) +#define pte_offset_map(dir, address) \ + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + pte_index(address)) +#define pte_offset_map_nested(dir, address) \ + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE1) + pte_index(address)) #define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0) #define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1) #else @@ -584,10 +594,6 @@ int xen_change_pte_range(struct mm_struc #define io_remap_pfn_range(vma,from,pfn,size,prot) \ direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO) -#define MK_IOSPACE_PFN(space, pfn) (pfn) -#define GET_IOSPACE(pfn) 0 -#define GET_PFN(pfn) (pfn) - #include #endif /* _I386_PGTABLE_H */ --- a/include/asm-x86/mach-xen/asm/pgtable-3level-defs.h +++ b/include/asm-x86/mach-xen/asm/pgtable-3level-defs.h @@ -1,7 +1,7 @@ #ifndef _I386_PGTABLE_3LEVEL_DEFS_H #define _I386_PGTABLE_3LEVEL_DEFS_H -#define HAVE_SHARED_KERNEL_PMD 0 +#define SHARED_KERNEL_PMD 0 /* * PGDIR_SHIFT determines what a top-level page table entry can map --- a/include/asm-x86/mach-xen/asm/pgtable-3level.h +++ b/include/asm-x86/mach-xen/asm/pgtable-3level.h @@ -52,32 +52,40 @@ static inline int pte_exec_kernel(pte_t * value and then use set_pte to update it. 
-ben */ -static inline void set_pte(pte_t *ptep, pte_t pte) +static inline void xen_set_pte(pte_t *ptep, pte_t pte) { ptep->pte_high = pte.pte_high; smp_wmb(); ptep->pte_low = pte.pte_low; } -#define set_pte_atomic(pteptr,pteval) \ - set_64bit((unsigned long long *)(pteptr),__pte_val(pteval)) -#define set_pte_at(_mm,addr,ptep,pteval) do { \ - if (((_mm) != current->mm && (_mm) != &init_mm) || \ - HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \ - set_pte((ptep), (pteval)); \ -} while (0) - -#define set_pmd(pmdptr,pmdval) \ - xen_l2_entry_update((pmdptr), (pmdval)) -#define set_pud(pudptr,pudval) \ - xen_l3_entry_update((pudptr), (pudval)) +static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep , pte_t pte) +{ + if ((mm != current->mm && mm != &init_mm) || + HYPERVISOR_update_va_mapping(addr, pte, 0)) + xen_set_pte(ptep, pte); +} + +static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte) +{ + set_64bit((unsigned long long *)(ptep),__pte_val(pte)); +} +static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd) +{ + xen_l2_entry_update(pmdp, pmd); +} +static inline void xen_set_pud(pud_t *pudp, pud_t pud) +{ + xen_l3_entry_update(pudp, pud); +} /* * For PTEs and PDEs, we must clear the P-bit first when clearing a page table * entry, so clear the bottom half first and enforce ordering with a compiler * barrier. */ -static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { if ((mm != current->mm && mm != &init_mm) || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) { @@ -87,7 +95,18 @@ static inline void pte_clear(struct mm_s } } -#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) +static inline void xen_pmd_clear(pmd_t *pmd) +{ + xen_l2_entry_update(pmd, __pmd(0)); +} + +#define set_pte(ptep, pte) xen_set_pte(ptep, pte) +#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte) +#define set_pte_atomic(ptep, pte) xen_set_pte_atomic(ptep, pte) +#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd) +#define set_pud(pudp, pud) xen_set_pud(pudp, pud) +#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep) +#define pmd_clear(pmd) xen_pmd_clear(pmd) /* * Pentium-II erratum A13: in PAE mode we explicitly have to flush @@ -108,7 +127,8 @@ static inline void pud_clear (pud_t * pu #define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \ pmd_index(address)) -static inline pte_t raw_ptep_get_and_clear(pte_t *ptep, pte_t res) +#ifdef CONFIG_SMP +static inline pte_t xen_ptep_get_and_clear(pte_t *ptep, pte_t res) { uint64_t val = __pte_val(res); if (__cmpxchg64(ptep, val, 0) != val) { @@ -119,6 +139,9 @@ static inline pte_t raw_ptep_get_and_cle } return res; } +#else +#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte) +#endif #define __HAVE_ARCH_PTEP_CLEAR_FLUSH #define ptep_clear_flush(vma, addr, ptep) \ @@ -165,13 +188,13 @@ extern unsigned long long __supported_pt static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) { return __pte((((unsigned long long)page_nr << PAGE_SHIFT) | - pgprot_val(pgprot)) & __supported_pte_mask); + pgprot_val(pgprot)) & __supported_pte_mask); } static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) { return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) | - pgprot_val(pgprot)) & __supported_pte_mask); + pgprot_val(pgprot)) & __supported_pte_mask); } /* @@ -191,6 +214,4 @@ static inline pmd_t pfn_pmd(unsigned lon #define 
__pmd_free_tlb(tlb, x) do { } while (0) -void vmalloc_sync_all(void); - #endif /* _I386_PGTABLE_3LEVEL_H */ --- a/include/asm-x86/mach-xen/asm/pgtable_64.h +++ b/include/asm-x86/mach-xen/asm/pgtable_64.h @@ -1,12 +1,14 @@ #ifndef _X86_64_PGTABLE_H #define _X86_64_PGTABLE_H +#include +#ifndef __ASSEMBLY__ + /* * This file contains the functions and defines necessary to modify and use * the x86-64 page table tree. */ #include -#include #include #include #include @@ -33,11 +35,9 @@ extern pte_t *lookup_address(unsigned lo #endif extern pud_t level3_kernel_pgt[512]; -extern pud_t level3_physmem_pgt[512]; extern pud_t level3_ident_pgt[512]; extern pmd_t level2_kernel_pgt[512]; extern pgd_t init_level4_pgt[]; -extern pgd_t boot_level4_pgt[]; extern unsigned long __supported_pte_mask; #define swapper_pg_dir init_level4_pgt @@ -52,6 +52,8 @@ extern void clear_kernel_mapping(unsigne extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)]; #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) +#endif /* !__ASSEMBLY__ */ + /* * PGDIR_SHIFT determines what a top-level page table entry can map */ @@ -76,6 +78,8 @@ extern unsigned long empty_zero_page[PAG */ #define PTRS_PER_PTE 512 +#ifndef __ASSEMBLY__ + #define pte_ERROR(e) \ printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \ &(e), __pte_val(e), pte_pfn(e)) @@ -118,22 +122,23 @@ static inline void pgd_clear (pgd_t * pg #define pte_pgprot(a) (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK)) -#define PMD_SIZE (1UL << PMD_SHIFT) +#endif /* !__ASSEMBLY__ */ + +#define PMD_SIZE (_AC(1,UL) << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) -#define PUD_SIZE (1UL << PUD_SHIFT) +#define PUD_SIZE (_AC(1,UL) << PUD_SHIFT) #define PUD_MASK (~(PUD_SIZE-1)) -#define PGDIR_SIZE (1UL << PGDIR_SHIFT) +#define PGDIR_SIZE (_AC(1,UL) << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) #define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1) #define FIRST_USER_ADDRESS 0 -#ifndef __ASSEMBLY__ -#define MAXMEM 0x3fffffffffffUL -#define VMALLOC_START 0xffffc20000000000UL -#define VMALLOC_END 0xffffe1ffffffffffUL -#define MODULES_VADDR 0xffffffff88000000UL -#define MODULES_END 0xfffffffffff00000UL +#define MAXMEM _AC(0x3fffffffffff, UL) +#define VMALLOC_START _AC(0xffffc20000000000, UL) +#define VMALLOC_END _AC(0xffffe1ffffffffff, UL) +#define MODULES_VADDR _AC(0xffffffff88000000, UL) +#define MODULES_END _AC(0xfffffffffff00000, UL) #define MODULES_LEN (MODULES_END - MODULES_VADDR) #define _PAGE_BIT_PRESENT 0 @@ -159,16 +164,18 @@ static inline void pgd_clear (pgd_t * pg #define _PAGE_GLOBAL 0x100 /* Global TLB entry */ #define _PAGE_PROTNONE 0x080 /* If not present */ -#define _PAGE_NX (1UL<<_PAGE_BIT_NX) +#define _PAGE_NX (_AC(1,UL)<<_PAGE_BIT_NX) /* Mapped page is I/O or foreign and has no associated page struct. 
*/ #define _PAGE_IO 0x200 +#ifndef __ASSEMBLY__ #if CONFIG_XEN_COMPAT <= 0x030002 extern unsigned int __kernel_page_user; #else #define __kernel_page_user 0 #endif +#endif #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY) #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user) @@ -233,6 +240,8 @@ extern unsigned int __kernel_page_user; #define __S110 PAGE_SHARED_EXEC #define __S111 PAGE_SHARED_EXEC +#ifndef __ASSEMBLY__ + static inline unsigned long pgd_bad(pgd_t pgd) { return __pgd_val(pgd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER); @@ -344,6 +353,20 @@ static inline pte_t pte_mkwrite(pte_t pt static inline pte_t pte_mkhuge(pte_t pte) { __pte_val(pte) |= _PAGE_PSE; return pte; } static inline pte_t pte_clrhuge(pte_t pte) { __pte_val(pte) &= ~_PAGE_PSE; return pte; } +static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) +{ + if (!pte_dirty(*ptep)) + return 0; + return test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte); +} + +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) +{ + if (!pte_young(*ptep)) + return 0; + return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte); +} + static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = *ptep; @@ -468,18 +491,12 @@ static inline pte_t pte_modify(pte_t pte * bit at the same time. */ #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS #define ptep_set_access_flags(vma, address, ptep, entry, dirty) \ - do { \ - if (dirty) \ - ptep_establish(vma, address, ptep, entry); \ - } while (0) - - -/* - * i386 says: We don't actually have these, but we want to advertise - * them so that we can encompass the flush here. 
- */ -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +({ \ + int __changed = !pte_same(*(ptep), entry); \ + if (__changed && (dirty)) \ + ptep_establish(vma, address, ptep, entry); \ + __changed; \ +}) #define __HAVE_ARCH_PTEP_CLEAR_DIRTY_FLUSH #define ptep_clear_flush_dirty(vma, address, ptep) \ @@ -488,7 +505,7 @@ static inline pte_t pte_modify(pte_t pte int __dirty = pte_dirty(__pte); \ __pte = pte_mkclean(__pte); \ if ((vma)->vm_mm->context.pinned) \ - ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \ + (void)ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \ else if (__dirty) \ set_pte(ptep, __pte); \ __dirty; \ @@ -501,7 +518,7 @@ static inline pte_t pte_modify(pte_t pte int __young = pte_young(__pte); \ __pte = pte_mkold(__pte); \ if ((vma)->vm_mm->context.pinned) \ - ptep_set_access_flags(vma, address, ptep, __pte, __young); \ + (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \ else if (__young) \ set_pte(ptep, __pte); \ __young; \ @@ -515,10 +532,7 @@ static inline pte_t pte_modify(pte_t pte #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) extern spinlock_t pgd_lock; -extern struct page *pgd_list; -void vmalloc_sync_all(void); - -#endif /* !__ASSEMBLY__ */ +extern struct list_head pgd_list; extern int kern_addr_valid(unsigned long addr); @@ -557,10 +571,6 @@ int xen_change_pte_range(struct mm_struc #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO) -#define MK_IOSPACE_PFN(space, pfn) (pfn) -#define GET_IOSPACE(pfn) 0 -#define GET_PFN(pfn) (pfn) - #define HAVE_ARCH_UNMAPPED_AREA #define pgtable_cache_init() do { } while (0) @@ -574,11 +584,14 @@ int xen_change_pte_range(struct mm_struc #define kc_offset_to_vaddr(o) \ (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? 
((o) | (~__VIRTUAL_MASK)) : (o)) +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY #define __HAVE_ARCH_PTEP_GET_AND_CLEAR #define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL #define __HAVE_ARCH_PTEP_CLEAR_FLUSH #define __HAVE_ARCH_PTEP_SET_WRPROTECT #define __HAVE_ARCH_PTE_SAME #include +#endif /* !__ASSEMBLY__ */ #endif /* _X86_64_PGTABLE_H */ --- a/include/asm-x86/mach-xen/asm/processor_32.h +++ b/include/asm-x86/mach-xen/asm/processor_32.h @@ -21,6 +21,7 @@ #include #include #include +#include #include /* flag for disabling the tsc */ @@ -118,7 +119,8 @@ extern char ignore_fpu_irq; void __init cpu_detect(struct cpuinfo_x86 *c); -extern void identify_cpu(struct cpuinfo_x86 *); +extern void identify_boot_cpu(void); +extern void identify_secondary_cpu(struct cpuinfo_x86 *); extern void print_cpu_info(struct cpuinfo_x86 *); extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); extern unsigned short num_cache_leaves; @@ -129,29 +131,8 @@ extern void detect_ht(struct cpuinfo_x86 static inline void detect_ht(struct cpuinfo_x86 *c) {} #endif -/* - * EFLAGS bits - */ -#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ -#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ -#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */ -#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ -#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */ -#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */ -#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */ -#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */ -#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */ -#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */ -#define X86_EFLAGS_NT 0x00004000 /* Nested Task */ -#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */ -#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */ -#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */ -#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */ -#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ -#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ - -static inline fastcall void xen_cpuid(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) +static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) { /* ecx is often an input as well as an output. 
*/ __asm__(XEN_CPUID @@ -165,21 +146,6 @@ static inline fastcall void xen_cpuid(un #define load_cr3(pgdir) write_cr3(__pa(pgdir)) /* - * Intel CPU features in CR4 - */ -#define X86_CR4_VME 0x0001 /* enable vm86 extensions */ -#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */ -#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */ -#define X86_CR4_DE 0x0008 /* enable debugging extensions */ -#define X86_CR4_PSE 0x0010 /* enable page size extensions */ -#define X86_CR4_PAE 0x0020 /* enable physical address extensions */ -#define X86_CR4_MCE 0x0040 /* Machine check enable */ -#define X86_CR4_PGE 0x0080 /* enable global pages */ -#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */ -#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */ -#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */ - -/* * Save the cr4 feature set we're using (ie * Pentium 4MB enable and PPro Global page * enable), so that any CPU's that boot up @@ -206,26 +172,6 @@ static inline void clear_in_cr4 (unsigne } /* - * NSC/Cyrix CPU configuration register indexes - */ - -#define CX86_PCR0 0x20 -#define CX86_GCR 0xb8 -#define CX86_CCR0 0xc0 -#define CX86_CCR1 0xc1 -#define CX86_CCR2 0xc2 -#define CX86_CCR3 0xc3 -#define CX86_CCR4 0xe8 -#define CX86_CCR5 0xe9 -#define CX86_CCR6 0xea -#define CX86_CCR7 0xeb -#define CX86_PCR1 0xf0 -#define CX86_DIR0 0xfe -#define CX86_DIR1 0xff -#define CX86_ARR_BASE 0xc4 -#define CX86_RCR_BASE 0xdc - -/* * NSC/Cyrix CPU indexed register access macros */ @@ -351,7 +297,8 @@ typedef struct { struct thread_struct; #ifndef CONFIG_X86_NO_TSS -struct tss_struct { +/* This is the TSS defined by the hardware. */ +struct i386_hw_tss { unsigned short back_link,__blh; unsigned long esp0; unsigned short ss0,__ss0h; @@ -375,6 +322,11 @@ struct tss_struct { unsigned short gs, __gsh; unsigned short ldt, __ldth; unsigned short trace, io_bitmap_base; +} __attribute__((packed)); + +struct tss_struct { + struct i386_hw_tss x86_tss; + /* * The extra 1 is there because the CPU will access an * additional byte beyond the end of the IO permission @@ -428,10 +380,11 @@ struct thread_struct { }; #define INIT_THREAD { \ + .esp0 = sizeof(init_stack) + (long)&init_stack, \ .vm86_info = NULL, \ .sysenter_cs = __KERNEL_CS, \ .io_bitmap_ptr = NULL, \ - .fs = __KERNEL_PDA, \ + .fs = __KERNEL_PERCPU, \ } /* @@ -441,10 +394,12 @@ struct thread_struct { * be within the limit. */ #define INIT_TSS { \ - .esp0 = sizeof(init_stack) + (long)&init_stack, \ - .ss0 = __KERNEL_DS, \ - .ss1 = __KERNEL_CS, \ - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ + .x86_tss = { \ + .esp0 = sizeof(init_stack) + (long)&init_stack, \ + .ss0 = __KERNEL_DS, \ + .ss1 = __KERNEL_CS, \ + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ + }, \ .io_bitmap = { [ 0 ... 
IO_BITMAP_LONGS] = ~0 }, \ } @@ -551,38 +506,33 @@ static inline void rep_nop(void) #define cpu_relax() rep_nop() -#define paravirt_enabled() 0 -#define __cpuid xen_cpuid - #ifndef CONFIG_X86_NO_TSS -static inline void __load_esp0(struct tss_struct *tss, struct thread_struct *thread) +static inline void native_load_esp0(struct tss_struct *tss, struct thread_struct *thread) { - tss->esp0 = thread->esp0; + tss->x86_tss.esp0 = thread->esp0; /* This can only happen when SEP is enabled, no need to test "SEP"arately */ - if (unlikely(tss->ss1 != thread->sysenter_cs)) { - tss->ss1 = thread->sysenter_cs; + if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { + tss->x86_tss.ss1 = thread->sysenter_cs; wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); } } -#define load_esp0(tss, thread) \ - __load_esp0(tss, thread) #else -#define load_esp0(tss, thread) do { \ +#define xen_load_esp0(tss, thread) do { \ if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)) \ BUG(); \ } while (0) #endif -/* - * These special macros can be used to get or set a debugging register - */ -#define get_debugreg(var, register) \ - (var) = HYPERVISOR_get_debugreg(register) -#define set_debugreg(value, register) \ - WARN_ON(HYPERVISOR_set_debugreg(register, value)) +static inline unsigned long xen_get_debugreg(int regno) +{ + return HYPERVISOR_get_debugreg(regno); +} -#define set_iopl_mask xen_set_iopl_mask +static inline void xen_set_debugreg(int regno, unsigned long value) +{ + WARN_ON(HYPERVISOR_set_debugreg(regno, value)); +} /* * Set IOPL bits in EFLAGS from given mask @@ -597,6 +547,21 @@ static inline void xen_set_iopl_mask(uns } +#define paravirt_enabled() 0 +#define __cpuid xen_cpuid + +#define load_esp0 xen_load_esp0 + +/* + * These special macros can be used to get or set a debugging register + */ +#define get_debugreg(var, register) \ + (var) = xen_get_debugreg(register) +#define set_debugreg(value, register) \ + xen_set_debugreg(register, value) + +#define set_iopl_mask xen_set_iopl_mask + /* * Generic CPUID function * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx @@ -749,8 +714,14 @@ extern unsigned long boot_option_idle_ov extern void enable_sep_cpu(void); extern int sysenter_setup(void); -extern int init_gdt(int cpu, struct task_struct *idle); +/* Defined in head.S */ +extern struct Xgt_desc_struct early_gdt_descr; + extern void cpu_set_gdt(int); -extern void secondary_cpu_init(void); +extern void switch_to_new_gdt(void); +extern void cpu_init(void); +extern void init_gdt(int cpu); + +extern int force_mwait; #endif /* __ASM_I386_PROCESSOR_H */ --- a/include/asm-x86/mach-xen/asm/processor_64.h +++ b/include/asm-x86/mach-xen/asm/processor_64.h @@ -20,6 +20,7 @@ #include #include #include +#include #define TF_MASK 0x00000100 #define IF_MASK 0x00000200 @@ -103,42 +104,6 @@ extern unsigned int init_intel_cacheinfo extern unsigned short num_cache_leaves; /* - * EFLAGS bits - */ -#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ -#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ -#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */ -#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ -#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */ -#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */ -#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */ -#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */ -#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */ -#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */ -#define X86_EFLAGS_NT 0x00004000 /* Nested Task */ -#define X86_EFLAGS_RF 0x00010000 /* Resume 
Flag */ -#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */ -#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */ -#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */ -#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ -#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ - -/* - * Intel CPU features in CR4 - */ -#define X86_CR4_VME 0x0001 /* enable vm86 extensions */ -#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */ -#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */ -#define X86_CR4_DE 0x0008 /* enable debugging extensions */ -#define X86_CR4_PSE 0x0010 /* enable page size extensions */ -#define X86_CR4_PAE 0x0020 /* enable physical address extensions */ -#define X86_CR4_MCE 0x0040 /* Machine check enable */ -#define X86_CR4_PGE 0x0080 /* enable global pages */ -#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */ -#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */ -#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */ - -/* * Save the cr4 feature set we're using (ie * Pentium 4MB enable and PPro Global page * enable), so that any CPU's that boot up @@ -203,7 +168,7 @@ struct i387_fxsave_struct { u32 mxcsr; u32 mxcsr_mask; u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ - u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 128 bytes */ + u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ u32 padding[24]; } __attribute__ ((aligned (16))); @@ -436,22 +401,6 @@ static inline void prefetchw(void *x) #define cpu_relax() rep_nop() /* - * NSC/Cyrix CPU configuration register indexes - */ -#define CX86_CCR0 0xc0 -#define CX86_CCR1 0xc1 -#define CX86_CCR2 0xc2 -#define CX86_CCR3 0xc3 -#define CX86_CCR4 0xe8 -#define CX86_CCR5 0xe9 -#define CX86_CCR6 0xea -#define CX86_CCR7 0xeb -#define CX86_DIR0 0xfe -#define CX86_DIR1 0xff -#define CX86_ARR_BASE 0xc4 -#define CX86_RCR_BASE 0xdc - -/* * NSC/Cyrix CPU indexed register access macros */ --- a/include/asm-x86/mach-xen/asm/segment_32.h +++ b/include/asm-x86/mach-xen/asm/segment_32.h @@ -39,7 +39,7 @@ * 25 - APM BIOS support * * 26 - ESPFIX small SS - * 27 - PDA [ per-cpu private data area ] + * 27 - per-cpu [ offset to per-cpu data area ] * 28 - unused * 29 - unused * 30 - unused @@ -74,8 +74,12 @@ #define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14) #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8) -#define GDT_ENTRY_PDA (GDT_ENTRY_KERNEL_BASE + 15) -#define __KERNEL_PDA (GDT_ENTRY_PDA * 8) +#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15) +#ifdef CONFIG_SMP +#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8) +#else +#define __KERNEL_PERCPU 0 +#endif #define GDT_ENTRY_DOUBLEFAULT_TSS 31 --- a/include/asm-x86/mach-xen/asm/smp_32.h +++ b/include/asm-x86/mach-xen/asm/smp_32.h @@ -8,19 +8,15 @@ #include #include #include -#include #endif -#ifdef CONFIG_X86_LOCAL_APIC -#ifndef __ASSEMBLY__ -#include +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(__ASSEMBLY__) #include #include +#include #ifdef CONFIG_X86_IO_APIC #include #endif -#include -#endif #endif #define BAD_APICID 0xFFu @@ -52,9 +48,76 @@ extern void cpu_exit_clear(void); extern void cpu_uninit(void); #endif -#ifndef CONFIG_PARAVIRT +#ifndef CONFIG_XEN +struct smp_ops +{ + void (*smp_prepare_boot_cpu)(void); + void (*smp_prepare_cpus)(unsigned max_cpus); + int (*cpu_up)(unsigned cpu); + void (*smp_cpus_done)(unsigned max_cpus); + + void (*smp_send_stop)(void); + void (*smp_send_reschedule)(int cpu); + int (*smp_call_function_mask)(cpumask_t mask, + void 
(*func)(void *info), void *info, + int wait); +}; + +extern struct smp_ops smp_ops; + +static inline void smp_prepare_boot_cpu(void) +{ + smp_ops.smp_prepare_boot_cpu(); +} +static inline void smp_prepare_cpus(unsigned int max_cpus) +{ + smp_ops.smp_prepare_cpus(max_cpus); +} +static inline int __cpu_up(unsigned int cpu) +{ + return smp_ops.cpu_up(cpu); +} +static inline void smp_cpus_done(unsigned int max_cpus) +{ + smp_ops.smp_cpus_done(max_cpus); +} + +static inline void smp_send_stop(void) +{ + smp_ops.smp_send_stop(); +} +static inline void smp_send_reschedule(int cpu) +{ + smp_ops.smp_send_reschedule(cpu); +} +static inline int smp_call_function_mask(cpumask_t mask, + void (*func) (void *info), void *info, + int wait) +{ + return smp_ops.smp_call_function_mask(mask, func, info, wait); +} + +void native_smp_prepare_boot_cpu(void); +void native_smp_prepare_cpus(unsigned int max_cpus); +int native_cpu_up(unsigned int cpunum); +void native_smp_cpus_done(unsigned int max_cpus); + #define startup_ipi_hook(phys_apicid, start_eip, start_esp) \ do { } while (0) + +#else + + +void xen_smp_send_stop(void); +void xen_smp_send_reschedule(int cpu); +int xen_smp_call_function_mask(cpumask_t mask, + void (*func) (void *info), void *info, + int wait); + +#define smp_send_stop xen_smp_send_stop +#define smp_send_reschedule xen_smp_send_reschedule +#define smp_call_function_mask xen_smp_call_function_mask + #endif /* @@ -62,7 +125,8 @@ do { } while (0) * from the initial startup. We map APIC_BASE very early in page_setup(), * so this is correct in the x86 case. */ -#define raw_smp_processor_id() (read_pda(cpu_number)) +DECLARE_PER_CPU(int, cpu_number); +#define raw_smp_processor_id() (x86_read_percpu(cpu_number)) extern cpumask_t cpu_possible_map; #define cpu_callin_map cpu_possible_map @@ -73,20 +137,6 @@ static inline int num_booting_cpus(void) return cpus_weight(cpu_possible_map); } -#ifdef CONFIG_X86_LOCAL_APIC - -#ifdef APIC_DEFINITION -extern int hard_smp_processor_id(void); -#else -#include -static inline int hard_smp_processor_id(void) -{ - /* we don't want to mark this access volatile - bad code generation */ - return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID)); -} -#endif -#endif - #define safe_smp_processor_id() smp_processor_id() extern int __cpu_disable(void); extern void __cpu_die(unsigned int cpu); @@ -102,10 +152,31 @@ extern unsigned int num_processors; #define NO_PROC_ID 0xFF /* No processor magic marker */ -#endif +#endif /* CONFIG_SMP */ #ifndef __ASSEMBLY__ +#ifdef CONFIG_X86_LOCAL_APIC + +#ifdef APIC_DEFINITION +extern int hard_smp_processor_id(void); +#else +#include +static inline int hard_smp_processor_id(void) +{ + /* we don't want to mark this access volatile - bad code generation */ + return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID)); +} +#endif /* APIC_DEFINITION */ + +#else /* CONFIG_X86_LOCAL_APIC */ + +#ifndef CONFIG_SMP +#define hard_smp_processor_id() 0 +#endif + +#endif /* CONFIG_X86_LOCAL_APIC */ + extern u8 apicid_2_node[]; #ifdef CONFIG_X86_LOCAL_APIC --- a/include/asm-x86/mach-xen/asm/smp_64.h +++ b/include/asm-x86/mach-xen/asm/smp_64.h @@ -11,12 +11,11 @@ extern int disable_apic; #ifdef CONFIG_X86_LOCAL_APIC -#include #include +#include #ifdef CONFIG_X86_IO_APIC #include #endif -#include #include #endif @@ -41,7 +40,6 @@ extern void lock_ipi_call_lock(void); extern void unlock_ipi_call_lock(void); extern int smp_num_siblings; extern void smp_send_reschedule(int cpu); -void smp_stop_cpu(void); extern cpumask_t cpu_sibling_map[NR_CPUS]; extern 
cpumask_t cpu_core_map[NR_CPUS]; @@ -62,14 +60,6 @@ static inline int num_booting_cpus(void) #define raw_smp_processor_id() read_pda(cpunumber) -#ifdef CONFIG_X86_LOCAL_APIC -static inline int hard_smp_processor_id(void) -{ - /* we don't want to mark this access volatile - bad code generation */ - return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID)); -} -#endif - extern int __cpu_disable(void); extern void __cpu_die(unsigned int cpu); extern void prefill_possible_map(void); @@ -78,6 +68,14 @@ extern unsigned __cpuinitdata disabled_c #define NO_PROC_ID 0xFF /* No processor magic marker */ +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_X86_LOCAL_APIC +static inline int hard_smp_processor_id(void) +{ + /* we don't want to mark this access volatile - bad code generation */ + return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID)); +} #endif /* --- a/include/asm-x86/mach-xen/asm/system_32.h +++ b/include/asm-x86/mach-xen/asm/system_32.h @@ -4,7 +4,7 @@ #include #include #include -#include /* for LOCK_PREFIX */ +#include #include #include @@ -90,308 +90,102 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t" #define savesegment(seg, value) \ asm volatile("mov %%" #seg ",%0":"=rm" (value)) -#define read_cr0() ({ \ - unsigned int __dummy; \ - __asm__ __volatile__( \ - "movl %%cr0,%0\n\t" \ - :"=r" (__dummy)); \ - __dummy; \ -}) -#define write_cr0(x) \ - __asm__ __volatile__("movl %0,%%cr0": :"r" (x)) - -#define read_cr2() (current_vcpu_info()->arch.cr2) -#define write_cr2(x) \ - __asm__ __volatile__("movl %0,%%cr2": :"r" (x)) - -#define read_cr3() ({ \ - unsigned int __dummy; \ - __asm__ ( \ - "movl %%cr3,%0\n\t" \ - :"=r" (__dummy)); \ - __dummy = xen_cr3_to_pfn(__dummy); \ - mfn_to_pfn(__dummy) << PAGE_SHIFT; \ -}) -#define write_cr3(x) ({ \ - unsigned int __dummy = pfn_to_mfn((x) >> PAGE_SHIFT); \ - __dummy = xen_pfn_to_cr3(__dummy); \ - __asm__ __volatile__("movl %0,%%cr3": :"r" (__dummy)); \ -}) -#define read_cr4() ({ \ - unsigned int __dummy; \ - __asm__( \ - "movl %%cr4,%0\n\t" \ - :"=r" (__dummy)); \ - __dummy; \ -}) -#define read_cr4_safe() ({ \ - unsigned int __dummy; \ - /* This could fault if %cr4 does not exist */ \ - __asm__("1: movl %%cr4, %0 \n" \ - "2: \n" \ - ".section __ex_table,\"a\" \n" \ - ".long 1b,2b \n" \ - ".previous \n" \ - : "=r" (__dummy): "0" (0)); \ - __dummy; \ -}) - -#define write_cr4(x) \ - __asm__ __volatile__("movl %0,%%cr4": :"r" (x)) - -#define wbinvd() \ - __asm__ __volatile__ ("wbinvd": : :"memory") - -/* Clear the 'TS' bit */ -#define clts() (HYPERVISOR_fpu_taskswitch(0)) - -/* Set the 'TS' bit */ -#define stts() (HYPERVISOR_fpu_taskswitch(1)) - -#endif /* __KERNEL__ */ - -static inline unsigned long get_limit(unsigned long segment) +static inline void xen_clts(void) { - unsigned long __limit; - __asm__("lsll %1,%0" - :"=r" (__limit):"r" (segment)); - return __limit+1; + HYPERVISOR_fpu_taskswitch(0); } -#define nop() __asm__ __volatile__ ("nop") - -#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr)))) - -#define tas(ptr) (xchg((ptr),1)) - -struct __xchg_dummy { unsigned long a[100]; }; -#define __xg(x) ((struct __xchg_dummy *)(x)) +static inline unsigned long xen_read_cr0(void) +{ + unsigned long val; + asm volatile("movl %%cr0,%0\n\t" :"=r" (val)); + return val; +} +static inline void xen_write_cr0(unsigned long val) +{ + asm volatile("movl %0,%%cr0": :"r" (val)); +} -#ifdef CONFIG_X86_CMPXCHG64 +#define xen_read_cr2() (current_vcpu_info()->arch.cr2) -/* - * The semantics of XCHGCMP8B are a bit strange, this is why - * there is 
a loop and the loading of %%eax and %%edx has to - * be inside. This inlines well in most cases, the cached - * cost is around ~38 cycles. (in the future we might want - * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that - * might have an implicit FPU-save as a cost, so it's not - * clear which path to go.) - * - * cmpxchg8b must be used with the lock prefix here to allow - * the instruction to be executed atomically, see page 3-102 - * of the instruction set reference 24319102.pdf. We need - * the reader side to see the coherent 64bit value. - */ -static inline void __set_64bit (unsigned long long * ptr, - unsigned int low, unsigned int high) +static inline void xen_write_cr2(unsigned long val) { - __asm__ __volatile__ ( - "\n1:\t" - "movl (%0), %%eax\n\t" - "movl 4(%0), %%edx\n\t" - "lock cmpxchg8b (%0)\n\t" - "jnz 1b" - : /* no outputs */ - : "D"(ptr), - "b"(low), - "c"(high) - : "ax","dx","memory"); + asm volatile("movl %0,%%cr2": :"r" (val)); } -static inline void __set_64bit_constant (unsigned long long *ptr, - unsigned long long value) +static inline unsigned long xen_read_cr3(void) { - __set_64bit(ptr,(unsigned int)(value), (unsigned int)((value)>>32ULL)); + unsigned long val; + asm volatile("movl %%cr3,%0\n\t" :"=r" (val)); + return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT; } -#define ll_low(x) *(((unsigned int*)&(x))+0) -#define ll_high(x) *(((unsigned int*)&(x))+1) -static inline void __set_64bit_var (unsigned long long *ptr, - unsigned long long value) +static inline void xen_write_cr3(unsigned long val) { - __set_64bit(ptr,ll_low(value), ll_high(value)); + val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT)); + asm volatile("movl %0,%%cr3": :"r" (val)); } -#define set_64bit(ptr,value) \ -(__builtin_constant_p(value) ? \ - __set_64bit_constant(ptr, value) : \ - __set_64bit_var(ptr, value) ) - -#define _set_64bit(ptr,value) \ -(__builtin_constant_p(value) ? \ - __set_64bit(ptr, (unsigned int)(value), (unsigned int)((value)>>32ULL) ) : \ - __set_64bit(ptr, ll_low(value), ll_high(value)) ) - -#endif - -/* - * Note: no "lock" prefix even on SMP: xchg always implies lock anyway - * Note 2: xchg has side effect, so that attribute volatile is necessary, - * but generally the primitive is invalid, *ptr is output argument. --ANK - */ -static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size) +static inline unsigned long xen_read_cr4(void) { - switch (size) { - case 1: - __asm__ __volatile__("xchgb %b0,%1" - :"=q" (x) - :"m" (*__xg(ptr)), "0" (x) - :"memory"); - break; - case 2: - __asm__ __volatile__("xchgw %w0,%1" - :"=r" (x) - :"m" (*__xg(ptr)), "0" (x) - :"memory"); - break; - case 4: - __asm__ __volatile__("xchgl %0,%1" - :"=r" (x) - :"m" (*__xg(ptr)), "0" (x) - :"memory"); - break; - } - return x; + unsigned long val; + asm volatile("movl %%cr4,%0\n\t" :"=r" (val)); + return val; } -/* - * Atomic compare and exchange. Compare OLD with MEM, if identical, - * store NEW in MEM. Return the initial value in MEM. Success is - * indicated by comparing RETURN with OLD. 
- */ - -#ifdef CONFIG_X86_CMPXCHG -#define __HAVE_ARCH_CMPXCHG 1 -#define cmpxchg(ptr,o,n)\ - ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\ - (unsigned long)(n),sizeof(*(ptr)))) -#define sync_cmpxchg(ptr,o,n)\ - ((__typeof__(*(ptr)))__sync_cmpxchg((ptr),(unsigned long)(o),\ - (unsigned long)(n),sizeof(*(ptr)))) -#endif +static inline unsigned long xen_read_cr4_safe(void) +{ + unsigned long val; + /* This could fault if %cr4 does not exist */ + asm("1: movl %%cr4, %0 \n" + "2: \n" + ".section __ex_table,\"a\" \n" + ".long 1b,2b \n" + ".previous \n" + : "=r" (val): "0" (0)); + return val; +} -static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, - unsigned long new, int size) +static inline void xen_write_cr4(unsigned long val) { - unsigned long prev; - switch (size) { - case 1: - __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2" - : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 2: - __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 4: - __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - } - return old; + asm volatile("movl %0,%%cr4": :"r" (val)); } -/* - * Always use locked operations when touching memory shared with a - * hypervisor, since the system may be SMP even if the guest kernel - * isn't. - */ -static inline unsigned long __sync_cmpxchg(volatile void *ptr, - unsigned long old, - unsigned long new, int size) -{ - unsigned long prev; - switch (size) { - case 1: - __asm__ __volatile__("lock; cmpxchgb %b1,%2" - : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 2: - __asm__ __volatile__("lock; cmpxchgw %w1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 4: - __asm__ __volatile__("lock; cmpxchgl %1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - } - return old; +static inline void xen_wbinvd(void) +{ + asm volatile("wbinvd": : :"memory"); } -#ifndef CONFIG_X86_CMPXCHG -/* - * Building a kernel capable running on 80386. It may be necessary to - * simulate the cmpxchg on the 80386 CPU. For that purpose we define - * a function for each of the sizes we support. 
- */ +#define read_cr0() (xen_read_cr0()) +#define write_cr0(x) (xen_write_cr0(x)) +#define read_cr2() (xen_read_cr2()) +#define write_cr2(x) (xen_write_cr2(x)) +#define read_cr3() (xen_read_cr3()) +#define write_cr3(x) (xen_write_cr3(x)) +#define read_cr4() (xen_read_cr4()) +#define read_cr4_safe() (xen_read_cr4_safe()) +#define write_cr4(x) (xen_write_cr4(x)) +#define wbinvd() (xen_wbinvd()) -extern unsigned long cmpxchg_386_u8(volatile void *, u8, u8); -extern unsigned long cmpxchg_386_u16(volatile void *, u16, u16); -extern unsigned long cmpxchg_386_u32(volatile void *, u32, u32); - -static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old, - unsigned long new, int size) -{ - switch (size) { - case 1: - return cmpxchg_386_u8(ptr, old, new); - case 2: - return cmpxchg_386_u16(ptr, old, new); - case 4: - return cmpxchg_386_u32(ptr, old, new); - } - return old; -} - -#define cmpxchg(ptr,o,n) \ -({ \ - __typeof__(*(ptr)) __ret; \ - if (likely(boot_cpu_data.x86 > 3)) \ - __ret = __cmpxchg((ptr), (unsigned long)(o), \ - (unsigned long)(n), sizeof(*(ptr))); \ - else \ - __ret = cmpxchg_386((ptr), (unsigned long)(o), \ - (unsigned long)(n), sizeof(*(ptr))); \ - __ret; \ -}) -#endif +/* Clear the 'TS' bit */ +#define clts() (xen_clts()) -#ifdef CONFIG_X86_CMPXCHG64 +/* Set the 'TS' bit */ +#define stts() (HYPERVISOR_fpu_taskswitch(1)) -static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long long old, - unsigned long long new) +#endif /* __KERNEL__ */ + +static inline unsigned long get_limit(unsigned long segment) { - unsigned long long prev; - __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3" - : "=A"(prev) - : "b"((unsigned long)new), - "c"((unsigned long)(new >> 32)), - "m"(*__xg(ptr)), - "0"(old) - : "memory"); - return prev; -} - -#define cmpxchg64(ptr,o,n)\ - ((__typeof__(*(ptr)))__cmpxchg64((ptr),(unsigned long long)(o),\ - (unsigned long long)(n))) + unsigned long __limit; + __asm__("lsll %1,%0" + :"=r" (__limit):"r" (segment)); + return __limit+1; +} + +#define nop() __asm__ __volatile__ ("nop") -#endif - /* * Force strict CPU ordering. 
* And yes, this is required on UP too when we're talking --- a/include/asm-x86/mach-xen/asm/system_64.h +++ b/include/asm-x86/mach-xen/asm/system_64.h @@ -3,7 +3,7 @@ #include #include -#include +#include #include #include @@ -43,7 +43,7 @@ [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \ [ti_flags] "i" (offsetof(struct thread_info, flags)),\ [tif_fork] "i" (TIF_FORK), \ - [thread_info] "i" (offsetof(struct task_struct, thread_info)), \ + [thread_info] "i" (offsetof(struct task_struct, stack)), \ [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \ : "memory", "cc" __EXTRA_CLOBBER) @@ -92,6 +92,12 @@ static inline void write_cr0(unsigned lo machine_to_phys(__dummy); \ }) +static inline void write_cr3(unsigned long val) +{ + val = phys_to_machine(val); + asm volatile("movq %0,%%cr3" :: "r" (val) : "memory"); +} + static inline unsigned long read_cr4(void) { unsigned long cr4; @@ -101,7 +107,7 @@ static inline unsigned long read_cr4(voi static inline void write_cr4(unsigned long val) { - asm volatile("movq %0,%%cr4" :: "r" (val)); + asm volatile("movq %0,%%cr4" :: "r" (val) : "memory"); } #define stts() (HYPERVISOR_fpu_taskswitch(1)) @@ -122,100 +128,6 @@ static inline void sched_cacheflush(void #define nop() __asm__ __volatile__ ("nop") -#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr)))) - -#define tas(ptr) (xchg((ptr),1)) - -#define __xg(x) ((volatile long *)(x)) - -static inline void set_64bit(volatile unsigned long *ptr, unsigned long val) -{ - *ptr = val; -} - -#define _set_64bit set_64bit - -/* - * Note: no "lock" prefix even on SMP: xchg always implies lock anyway - * Note 2: xchg has side effect, so that attribute volatile is necessary, - * but generally the primitive is invalid, *ptr is output argument. --ANK - */ -static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size) -{ - switch (size) { - case 1: - __asm__ __volatile__("xchgb %b0,%1" - :"=q" (x) - :"m" (*__xg(ptr)), "0" (x) - :"memory"); - break; - case 2: - __asm__ __volatile__("xchgw %w0,%1" - :"=r" (x) - :"m" (*__xg(ptr)), "0" (x) - :"memory"); - break; - case 4: - __asm__ __volatile__("xchgl %k0,%1" - :"=r" (x) - :"m" (*__xg(ptr)), "0" (x) - :"memory"); - break; - case 8: - __asm__ __volatile__("xchgq %0,%1" - :"=r" (x) - :"m" (*__xg(ptr)), "0" (x) - :"memory"); - break; - } - return x; -} - -/* - * Atomic compare and exchange. Compare OLD with MEM, if identical, - * store NEW in MEM. Return the initial value in MEM. Success is - * indicated by comparing RETURN with OLD. 
- */ - -#define __HAVE_ARCH_CMPXCHG 1 - -static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, - unsigned long new, int size) -{ - unsigned long prev; - switch (size) { - case 1: - __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2" - : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 2: - __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 4: - __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %k1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 8: - __asm__ __volatile__(LOCK_PREFIX "cmpxchgq %1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - } - return old; -} - -#define cmpxchg(ptr,o,n)\ - ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\ - (unsigned long)(n),sizeof(*(ptr)))) - #ifdef CONFIG_SMP #define smp_mb() mb() #define smp_rmb() rmb() --- a/include/asm-x86/mach-xen/asm/tlbflush_32.h +++ b/include/asm-x86/mach-xen/asm/tlbflush_32.h @@ -29,8 +29,13 @@ * and page-granular flushes are available only on i486 and up. */ +#define TLB_FLUSH_ALL 0xffffffff + + #ifndef CONFIG_SMP +#include + #define flush_tlb() __flush_tlb() #define flush_tlb_all() __flush_tlb_all() #define local_flush_tlb() __flush_tlb() @@ -55,7 +60,7 @@ static inline void flush_tlb_range(struc __flush_tlb(); } -#else +#else /* SMP */ #include @@ -84,9 +89,7 @@ struct tlb_state char __cacheline_padding[L1_CACHE_BYTES-8]; }; DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate); - - -#endif +#endif /* SMP */ #define flush_tlb_kernel_range(start, end) flush_tlb_all() --- a/include/asm-x86/mach-xen/asm/tlbflush_64.h +++ b/include/asm-x86/mach-xen/asm/tlbflush_64.h @@ -2,7 +2,9 @@ #define _X8664_TLBFLUSH_H #include +#include #include +#include #define __flush_tlb() xen_tlb_flush() --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -239,7 +239,7 @@ struct pci_dev { int rom_attr_enabled; /* has display of the rom attribute been enabled? 
*/ struct bin_attribute *res_attr[DEVICE_COUNT_RESOURCE]; /* sysfs file for resources */ struct bin_attribute *res_attr_wc[DEVICE_COUNT_RESOURCE]; /* sysfs file for WC mapping of resources */ -#ifdef CONFIG_PCI_MSI +#if defined(CONFIG_PCI_MSI) && !defined(CONFIG_XEN) struct list_head msi_list; #endif struct pci_vpd *vpd; --- a/lib/swiotlb-xen.c +++ b/lib/swiotlb-xen.c @@ -723,7 +723,6 @@ swiotlb_dma_supported (struct device *hw return (mask >= ((1UL << dma_bits) - 1)); } -EXPORT_SYMBOL(swiotlb_init); EXPORT_SYMBOL(swiotlb_map_single); EXPORT_SYMBOL(swiotlb_unmap_single); EXPORT_SYMBOL(swiotlb_map_sg); --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1744,12 +1744,17 @@ static struct netdev_queue *dev_pick_tx( inline int skb_checksum_setup(struct sk_buff *skb) { if (skb->proto_csum_blank) { + struct iphdr *iph; + unsigned char *th; + if (skb->protocol != htons(ETH_P_IP)) goto out; - skb->h.raw = (unsigned char *)skb->nh.iph + 4*skb->nh.iph->ihl; - if (skb->h.raw >= skb->tail) + iph = ip_hdr(skb); + th = skb_network_header(skb) + 4 * iph->ihl; + if (th >= skb_tail_pointer(skb)) goto out; - switch (skb->nh.iph->protocol) { + skb->csum_start = th - skb->head; + switch (iph->protocol) { case IPPROTO_TCP: skb->csum_offset = offsetof(struct tcphdr, check); break; @@ -1760,10 +1765,10 @@ inline int skb_checksum_setup(struct sk_ if (net_ratelimit()) printk(KERN_ERR "Attempting to checksum a non-" "TCP/UDP packet, dropping a protocol" - " %d packet", skb->nh.iph->protocol); + " %d packet", iph->protocol); goto out; } - if ((skb->h.raw + skb->csum_offset + 2) > skb->tail) + if ((th + skb->csum_offset + 2) > skb_tail_pointer(skb)) goto out; skb->ip_summed = CHECKSUM_PARTIAL; skb->proto_csum_blank = 0; --- a/scripts/Makefile.xen.awk +++ b/scripts/Makefile.xen.awk @@ -13,7 +13,7 @@ BEGIN { next } -/:[[:space:]]*%\.[cS][[:space:]]/ { +/:[[:space:]]*\$\(src\)\/%\.[cS][[:space:]]/ { line = gensub(/%.([cS])/, "%-xen.\\1", "g", $0) line = gensub(/(single-used-m)/, "xen-\\1", "g", line) print line
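
Note on the pgtable-3level.h hunks above: the renamed xen_set_pte()/xen_pte_clear() helpers depend on a specific update order for 64-bit PAE entries, because the low word carries the Present bit. On install the high word is written first and the low word last; on teardown the low word is cleared first, with a barrier in between, so the CPU never walks a half-written but Present entry. The following is a minimal standalone sketch of that ordering only; pae_pte_install(), pae_pte_clear() and struct pae_pte are hypothetical names, not the kernel's pte_t API.

/* Simplified model of a PAE page-table entry: low word holds Present. */
struct pae_pte {
	unsigned int low;	/* contains the Present bit */
	unsigned int high;
};

/* Compiler write barrier, standing in for smp_wmb() in the sketch. */
#define demo_wmb()	__asm__ __volatile__("" ::: "memory")

static void pae_pte_install(struct pae_pte *p, unsigned int lo, unsigned int hi)
{
	p->high = hi;		/* upper half first: entry is still not Present */
	demo_wmb();
	p->low = lo;		/* writing the low half publishes the entry */
}

static void pae_pte_clear(struct pae_pte *p)
{
	p->low = 0;		/* drop the Present bit first */
	demo_wmb();
	p->high = 0;		/* then the rest of the entry */
}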
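
Note on the pgtable_64.h hunk: PMD_SIZE, PUD_SIZE, PGDIR_SIZE and the MAXMEM/VMALLOC/MODULES constants are converted to _AC() so the header can be included from assembly, where a UL literal suffix is not accepted. A rough sketch of the underlying trick follows; the DEMO_-prefixed macros are illustrative stand-ins for the real <linux/const.h> helpers, not kernel identifiers.

/* Paste the type suffix only when compiling C, drop it for assembly. */
#ifdef __ASSEMBLY__
# define DEMO_AC(x, y)		x
#else
# define DEMO__AC(x, y)		(x##y)
# define DEMO_AC(x, y)		DEMO__AC(x, y)
#endif

#define DEMO_PMD_SHIFT		21
/* Expands to (1UL << 21) in C and plain (1 << 21) in assembly. */
#define DEMO_PMD_SIZE		(DEMO_AC(1, UL) << DEMO_PMD_SHIFT)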
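
Note on the smp_32.h hunk: on non-Xen builds the generic SMP entry points now dispatch through a struct smp_ops table of function pointers that the platform fills in at boot, while the Xen build keeps direct xen_* bindings. A compressed, self-contained illustration of that dispatch pattern follows; the demo_ names are hypothetical and are not kernel symbols.

#include <stdio.h>

/* Table of platform hooks, analogous in shape to struct smp_ops. */
struct demo_smp_ops {
	void (*send_reschedule)(int cpu);
};

static void native_send_reschedule(int cpu)
{
	printf("native reschedule IPI to CPU %d\n", cpu);
}

/* The platform installs its implementations here during early boot. */
static struct demo_smp_ops demo_ops = {
	.send_reschedule = native_send_reschedule,
};

/* Generic code only ever calls the thin wrapper. */
static inline void demo_smp_send_reschedule(int cpu)
{
	demo_ops.send_reschedule(cpu);
}

int main(void)
{
	demo_smp_send_reschedule(1);
	return 0;
}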