Subject: xen3 common From: http://xenbits.xensource.com/linux-2.6.18-xen.hg (tip 728:832aac894efd) Patch-mainline: obsolete Acked-by: jbeulich@novell.com List of files that don't require modification anymore (and hence removed from this patch), for reference and in case upstream wants to take the forward porting patches: 2.6.22/include/linux/sched.h 2.6.22/kernel/softlockup.c 2.6.22/kernel/timer.c 2.6.25/mm/highmem.c --- drivers/Makefile | 1 drivers/acpi/Makefile | 3 drivers/acpi/hardware/hwsleep.c | 15 drivers/acpi/processor_core.c | 72 +++ drivers/acpi/processor_extcntl.c | 241 +++++++++++ drivers/acpi/processor_idle.c | 24 - drivers/acpi/processor_perflib.c | 21 drivers/acpi/sleep/main.c | 9 drivers/char/agp/intel-agp.c | 10 drivers/char/mem.c | 16 drivers/char/tpm/Makefile | 2 drivers/char/tpm/tpm.h | 15 drivers/char/tpm/tpm_vtpm.c | 542 +++++++++++++++++++++++++ drivers/char/tpm/tpm_vtpm.h | 55 ++ drivers/char/tpm/tpm_xen.c | 722 ++++++++++++++++++++++++++++++++++ drivers/ide/ide-lib.c | 8 drivers/oprofile/buffer_sync.c | 87 +++- drivers/oprofile/cpu_buffer.c | 51 +- drivers/oprofile/cpu_buffer.h | 9 drivers/oprofile/event_buffer.h | 3 drivers/oprofile/oprof.c | 30 + drivers/oprofile/oprof.h | 3 drivers/oprofile/oprofile_files.c | 201 +++++++++ fs/compat_ioctl.c | 19 include/acpi/processor.h | 143 ++++++ include/asm-generic/pci.h | 2 include/asm-generic/pgtable.h | 4 include/linux/aio.h | 5 include/linux/highmem.h | 8 include/linux/interrupt.h | 6 include/linux/kexec.h | 13 include/linux/mm.h | 8 include/linux/oprofile.h | 12 include/linux/page-flags.h | 26 + include/linux/pci.h | 11 include/linux/vermagic.h | 7 kernel/irq/spurious.c | 2 kernel/kexec.c | 71 ++- kernel/sysctl.c | 2 mm/memory.c | 42 + mm/mprotect.c | 2 mm/page_alloc.c | 12 net/core/dev.c | 62 ++ net/core/skbuff.c | 4 net/ipv4/netfilter/nf_nat_proto_tcp.c | 3 net/ipv4/netfilter/nf_nat_proto_udp.c | 4 net/ipv4/xfrm4_output.c | 2 scripts/Makefile.build | 14 scripts/Makefile.lib | 6 51 files changed, 2671 insertions(+), 86 deletions(-) --- a/drivers/acpi/hardware/hwsleep.c +++ b/drivers/acpi/hardware/hwsleep.c @@ -241,7 +241,11 @@ acpi_status asmlinkage acpi_enter_sleep_ u32 PM1Bcontrol; struct acpi_bit_register_info *sleep_type_reg_info; struct acpi_bit_register_info *sleep_enable_reg_info; +#if !(defined(CONFIG_XEN) && defined(CONFIG_X86)) u32 in_value; +#else + int err; +#endif struct acpi_object_list arg_list; union acpi_object arg; acpi_status status; @@ -351,6 +355,7 @@ acpi_status asmlinkage acpi_enter_sleep_ ACPI_FLUSH_CPU_CACHE(); +#if !(defined(CONFIG_XEN) && defined(CONFIG_X86)) status = acpi_hw_register_write(ACPI_REGISTER_PM1A_CONTROL, PM1Acontrol); if (ACPI_FAILURE(status)) { @@ -397,6 +402,16 @@ acpi_status asmlinkage acpi_enter_sleep_ /* Spin until we wake */ } while (!in_value); +#else + /* PV ACPI just need check hypercall return value */ + err = acpi_notify_hypervisor_state(sleep_state, + PM1Acontrol, PM1Bcontrol); + if (err) { + ACPI_DEBUG_PRINT((ACPI_DB_ERROR, + "Hypervisor failure [%d]\n", err)); + return_ACPI_STATUS(AE_ERROR); + } +#endif return_ACPI_STATUS(AE_OK); } --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -34,6 +34,9 @@ processor-objs += processor_core.o proce ifdef CONFIG_CPU_FREQ processor-objs += processor_perflib.o endif +ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL +processor-objs += processor_perflib.o processor_extcntl.o +endif obj-y += sleep/ obj-y += bus.o glue.o --- a/drivers/acpi/processor_core.c +++ b/drivers/acpi/processor_core.c @@ -620,7 +620,8 @@ static int acpi_processor_get_info(struc */ if (pr->id == -1) { if (ACPI_FAILURE - (acpi_processor_hotadd_init(pr->handle, &pr->id))) { + (acpi_processor_hotadd_init(pr->handle, &pr->id)) && + !processor_cntl_external()) { return -ENODEV; } } @@ -662,7 +663,11 @@ static int acpi_processor_get_info(struc return 0; } +#ifndef CONFIG_XEN static DEFINE_PER_CPU(void *, processor_device_array); +#else +static void *processor_device_array[NR_ACPI_CPUS]; +#endif static int __cpuinit acpi_processor_start(struct acpi_device *device) { @@ -671,30 +676,46 @@ static int __cpuinit acpi_processor_star struct acpi_processor *pr; struct sys_device *sysdev; + processor_extcntl_init(); + pr = acpi_driver_data(device); result = acpi_processor_get_info(pr, device->flags.unique_id); - if (result) { + if (result || + ((pr->id == -1) && !processor_cntl_external())) { /* Processor is physically not present */ return 0; } - BUG_ON((pr->id >= nr_cpu_ids) || (pr->id < 0)); + BUG_ON(!processor_cntl_external() && + ((pr->id >= nr_cpu_ids) || (pr->id < 0))); /* * Buggy BIOS check * ACPI id of processors can be reported wrongly by the BIOS. * Don't trust it blindly */ +#ifndef CONFIG_XEN if (per_cpu(processor_device_array, pr->id) != NULL && per_cpu(processor_device_array, pr->id) != device) { +#else + BUG_ON(pr->acpi_id >= NR_ACPI_CPUS); + if (processor_device_array[pr->acpi_id] != NULL && + processor_device_array[pr->acpi_id] != device) { +#endif printk(KERN_WARNING "BIOS reported wrong ACPI id " "for the processor\n"); return -ENODEV; } +#ifndef CONFIG_XEN per_cpu(processor_device_array, pr->id) = device; per_cpu(processors, pr->id) = pr; +#else + processor_device_array[pr->acpi_id] = device; + if (pr->id != -1) + per_cpu(processors, pr->id) = pr; +#endif result = acpi_processor_add_fs(device); if (result) @@ -710,15 +731,28 @@ static int __cpuinit acpi_processor_star /* _PDC call should be done before doing anything else (if reqd.). */ arch_acpi_processor_init_pdc(pr); acpi_processor_set_pdc(pr); -#ifdef CONFIG_CPU_FREQ +#if defined(CONFIG_CPU_FREQ) || defined(CONFIG_PROCESSOR_EXTERNAL_CONTROL) acpi_processor_ppc_has_changed(pr); #endif - acpi_processor_get_throttling_info(pr); - acpi_processor_get_limit_info(pr); + + /* + * pr->id may equal to -1 while processor_cntl_external enabled. + * throttle and thermal module don't support this case. + * Tx only works when dom0 vcpu == pcpu num by far, as we give + * control to dom0. + */ + if (pr->id != -1) { + acpi_processor_get_throttling_info(pr); + acpi_processor_get_limit_info(pr); + } acpi_processor_power_init(pr, device); + result = processor_extcntl_prepare(pr); + if (result) + goto end; + pr->cdev = thermal_cooling_device_register("Processor", device, &processor_cooling_ops); if (IS_ERR(pr->cdev)) { @@ -846,7 +880,7 @@ static int acpi_processor_remove(struct pr = acpi_driver_data(device); - if (pr->id >= nr_cpu_ids) { + if (!processor_cntl_external() && pr->id >= nr_cpu_ids) { kfree(pr); return 0; } @@ -872,8 +906,14 @@ static int acpi_processor_remove(struct pr->cdev = NULL; } +#ifndef CONFIG_XEN per_cpu(processors, pr->id) = NULL; per_cpu(processor_device_array, pr->id) = NULL; +#else + if (pr->id != -1) + per_cpu(processors, pr->id) = NULL; + processor_device_array[pr->acpi_id] = NULL; +#endif kfree(pr); return 0; @@ -933,6 +973,10 @@ int acpi_processor_device_add(acpi_handl if (!pr) return -ENODEV; + if (processor_cntl_external()) + processor_notify_external(pr, + PROCESSOR_HOTPLUG, HOTPLUG_TYPE_ADD); + if ((pr->id >= 0) && (pr->id < nr_cpu_ids)) { kobject_uevent(&(*device)->dev.kobj, KOBJ_ONLINE); } @@ -972,6 +1016,10 @@ static void __ref acpi_processor_hotplug break; } + if (processor_cntl_external()) + processor_notify_external(pr, + PROCESSOR_HOTPLUG, HOTPLUG_TYPE_ADD); + if (pr->id >= 0 && (pr->id < nr_cpu_ids)) { kobject_uevent(&device->dev.kobj, KOBJ_OFFLINE); break; @@ -1003,6 +1051,11 @@ static void __ref acpi_processor_hotplug if ((pr->id < nr_cpu_ids) && (cpu_present(pr->id))) kobject_uevent(&device->dev.kobj, KOBJ_OFFLINE); + + if (processor_cntl_external()) + processor_notify_external(pr, PROCESSOR_HOTPLUG, + HOTPLUG_TYPE_REMOVE); + break; default: ACPI_DEBUG_PRINT((ACPI_DB_INFO, @@ -1067,6 +1120,11 @@ static acpi_status acpi_processor_hotadd static int acpi_processor_handle_eject(struct acpi_processor *pr) { +#ifdef CONFIG_XEN + if (pr->id == -1) + return (0); +#endif + if (cpu_online(pr->id)) cpu_down(pr->id); --- /dev/null +++ b/drivers/acpi/processor_extcntl.c @@ -0,0 +1,241 @@ +/* + * processor_extcntl.c - channel to external control logic + * + * Copyright (C) 2008, Intel corporation + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. + * + */ + +#include +#include +#include +#include +#include +#include + +#include + +#define ACPI_PROCESSOR_COMPONENT 0x01000000 +#define ACPI_PROCESSOR_CLASS "processor" +#define ACPI_PROCESSOR_DRIVER_NAME "ACPI Processor Driver" +#define _COMPONENT ACPI_PROCESSOR_COMPONENT +ACPI_MODULE_NAME("acpi_processor") + +static int processor_extcntl_parse_csd(struct acpi_processor *pr); +static int processor_extcntl_get_performance(struct acpi_processor *pr); +/* + * External processor control logic may register with its own set of + * ops to get ACPI related notification. One example is like VMM. + */ +const struct processor_extcntl_ops *processor_extcntl_ops; +EXPORT_SYMBOL(processor_extcntl_ops); + +static int processor_notify_smm(void) +{ + acpi_status status; + static int is_done = 0; + + /* only need successfully notify BIOS once */ + /* avoid double notification which may lead to unexpected result */ + if (is_done) + return 0; + + /* Can't write pstate_cnt to smi_cmd if either value is zero */ + if ((!acpi_fadt.smi_cmd) || (!acpi_fadt.pstate_cnt)) { + ACPI_DEBUG_PRINT((ACPI_DB_INFO,"No SMI port or pstate_cnt\n")); + return 0; + } + + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "Writing pstate_cnt [0x%x] to smi_cmd [0x%x]\n", + acpi_fadt.pstate_cnt, acpi_fadt.smi_cmd)); + + /* FADT v1 doesn't support pstate_cnt, many BIOS vendors use + * it anyway, so we need to support it... */ + if (acpi_fadt_is_v1) { + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "Using v1.0 FADT reserved value for pstate_cnt\n")); + } + + status = acpi_os_write_port(acpi_fadt.smi_cmd, + (u32) acpi_fadt.pstate_cnt, 8); + if (ACPI_FAILURE(status)) + return status; + + is_done = 1; + + return 0; +} + +int processor_notify_external(struct acpi_processor *pr, int event, int type) +{ + int ret = -EINVAL; + + if (!processor_cntl_external()) + return -EINVAL; + + switch (event) { + case PROCESSOR_PM_INIT: + case PROCESSOR_PM_CHANGE: + if ((type >= PM_TYPE_MAX) || + !processor_extcntl_ops->pm_ops[type]) + break; + + ret = processor_extcntl_ops->pm_ops[type](pr, event); + break; + case PROCESSOR_HOTPLUG: + if (processor_extcntl_ops->hotplug) + ret = processor_extcntl_ops->hotplug(pr, type); + break; + default: + printk(KERN_ERR "Unsupport processor events %d.\n", event); + break; + } + + return ret; +} + +/* + * External control logic can decide to grab full or part of physical + * processor control bits. Take a VMM for example, physical processors + * are owned by VMM and thus existence information like hotplug is + * always required to be notified to VMM. Similar is processor idle + * state which is also necessarily controlled by VMM. But for other + * control bits like performance/throttle states, VMM may choose to + * control or not upon its own policy. + */ +void processor_extcntl_init(void) +{ + if (!processor_extcntl_ops) + arch_acpi_processor_init_extcntl(&processor_extcntl_ops); +} + +/* + * This is called from ACPI processor init, and targeted to hold + * some tricky housekeeping jobs to satisfy external control model. + * For example, we may put dependency parse stub here for idle + * and performance state. Those information may be not available + * if splitting from dom0 control logic like cpufreq driver. + */ +int processor_extcntl_prepare(struct acpi_processor *pr) +{ + /* parse cstate dependency information */ + if (processor_pm_external()) + processor_extcntl_parse_csd(pr); + + /* Initialize performance states */ + if (processor_pmperf_external()) + processor_extcntl_get_performance(pr); + + return 0; +} + +/* + * Currently no _CSD is implemented which is why existing ACPI code + * doesn't parse _CSD at all. But to keep interface complete with + * external control logic, we put a placeholder here for future + * compatibility. + */ +static int processor_extcntl_parse_csd(struct acpi_processor *pr) +{ + int i; + + for (i = 0; i < pr->power.count; i++) { + if (!pr->power.states[i].valid) + continue; + + /* No dependency by default */ + pr->power.states[i].domain_info = NULL; + pr->power.states[i].csd_count = 0; + } + + return 0; +} + +/* + * Existing ACPI module does parse performance states at some point, + * when acpi-cpufreq driver is loaded which however is something + * we'd like to disable to avoid confliction with external control + * logic. So we have to collect raw performance information here + * when ACPI processor object is found and started. + */ +static int processor_extcntl_get_performance(struct acpi_processor *pr) +{ + int ret; + struct acpi_processor_performance *perf; + struct acpi_psd_package *pdomain; + + if (pr->performance) + return -EBUSY; + + perf = kzalloc(sizeof(struct acpi_processor_performance), GFP_KERNEL); + if (!perf) + return -ENOMEM; + + pr->performance = perf; + /* Get basic performance state information */ + ret = acpi_processor_get_performance_info(pr); + if (ret < 0) + goto err_out; + + /* + * Well, here we need retrieve performance dependency information + * from _PSD object. The reason why existing interface is not used + * is due to the reason that existing interface sticks to Linux cpu + * id to construct some bitmap, however we want to split ACPI + * processor objects from Linux cpu id logic. For example, even + * when Linux is configured as UP, we still want to parse all ACPI + * processor objects to external logic. In this case, it's preferred + * to use ACPI ID instead. + */ + pdomain = &pr->performance->domain_info; + pdomain->num_processors = 0; + ret = acpi_processor_get_psd(pr); + if (ret < 0) { + /* + * _PSD is optional - assume no coordination if absent (or + * broken), matching native kernels' behavior. + */ + pdomain->num_entries = ACPI_PSD_REV0_ENTRIES; + pdomain->revision = ACPI_PSD_REV0_REVISION; + pdomain->domain = pr->acpi_id; + pdomain->coord_type = DOMAIN_COORD_TYPE_SW_ALL; + pdomain->num_processors = 1; + } + + /* Some sanity check */ + if ((pdomain->revision != ACPI_PSD_REV0_REVISION) || + (pdomain->num_entries != ACPI_PSD_REV0_ENTRIES) || + ((pdomain->coord_type != DOMAIN_COORD_TYPE_SW_ALL) && + (pdomain->coord_type != DOMAIN_COORD_TYPE_SW_ANY) && + (pdomain->coord_type != DOMAIN_COORD_TYPE_HW_ALL))) { + ret = -EINVAL; + goto err_out; + } + + /* Last step is to notify BIOS that external logic exists */ + processor_notify_smm(); + + processor_notify_external(pr, PROCESSOR_PM_INIT, PM_TYPE_PERF); + + return 0; +err_out: + pr->performance = NULL; + kfree(perf); + return ret; +} --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -905,7 +905,8 @@ static int acpi_processor_get_power_info */ cx.entry_method = ACPI_CSTATE_HALT; snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI HLT"); - } else { + /* This doesn't apply to external control case */ + } else if (!processor_pm_external()) { continue; } if (cx.type == ACPI_STATE_C1 && @@ -944,6 +945,12 @@ static int acpi_processor_get_power_info cx.power = obj->integer.value; +#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL + /* cache control methods to notify external logic */ + if (processor_pm_external()) + memcpy(&cx.reg, reg, sizeof(*reg)); +#endif + current_count++; memcpy(&(pr->power.states[current_count]), &cx, sizeof(cx)); @@ -1285,14 +1292,18 @@ int acpi_processor_cst_has_changed(struc * been initialized. */ if (pm_idle_save) { - pm_idle = pm_idle_save; + if (!processor_pm_external()) + pm_idle = pm_idle_save; /* Relies on interrupts forcing exit from idle. */ synchronize_sched(); } pr->flags.power = 0; result = acpi_processor_get_power_info(pr); - if ((pr->flags.power == 1) && (pr->flags.power_setup_done)) + if (processor_pm_external()) + processor_notify_external(pr, + PROCESSOR_PM_CHANGE, PM_TYPE_IDLE); + else if ((pr->flags.power == 1) && (pr->flags.power_setup_done)) pm_idle = acpi_processor_idle; return result; @@ -1814,7 +1825,7 @@ int __cpuinit acpi_processor_power_init( printk(")\n"); #ifndef CONFIG_CPU_IDLE - if (pr->id == 0) { + if (!processor_pm_external() && (pr->id == 0)) { pm_idle_save = pm_idle; pm_idle = acpi_processor_idle; } @@ -1828,6 +1839,11 @@ int __cpuinit acpi_processor_power_init( acpi_driver_data(device)); if (!entry) return -EIO; + + if (processor_pm_external()) + processor_notify_external(pr, + PROCESSOR_PM_INIT, PM_TYPE_IDLE); + return 0; } --- a/drivers/acpi/processor_perflib.c +++ b/drivers/acpi/processor_perflib.c @@ -80,6 +80,7 @@ MODULE_PARM_DESC(ignore_ppc, "If the fre static int acpi_processor_ppc_status; +#ifdef CONFIG_CPU_FREQ static int acpi_processor_ppc_notifier(struct notifier_block *nb, unsigned long event, void *data) { @@ -122,6 +123,7 @@ static int acpi_processor_ppc_notifier(s static struct notifier_block acpi_ppc_notifier_block = { .notifier_call = acpi_processor_ppc_notifier, }; +#endif /* CONFIG_CPU_FREQ */ static int acpi_processor_get_platform_limit(struct acpi_processor *pr) { @@ -166,9 +168,15 @@ int acpi_processor_ppc_has_changed(struc if (ret < 0) return (ret); else +#ifdef CONFIG_CPU_FREQ return cpufreq_update_policy(pr->id); +#elif CONFIG_PROCESSOR_EXTERNAL_CONTROL + return processor_notify_external(pr, + PROCESSOR_PM_CHANGE, PM_TYPE_PERF); +#endif } +#ifdef CONFIG_CPU_FREQ void acpi_processor_ppc_init(void) { if (!cpufreq_register_notifier @@ -187,6 +195,7 @@ void acpi_processor_ppc_exit(void) acpi_processor_ppc_status &= ~PPC_REGISTERED; } +#endif /* CONFIG_CPU_FREQ */ static int acpi_processor_get_performance_control(struct acpi_processor *pr) { @@ -328,7 +337,10 @@ static int acpi_processor_get_performanc return result; } -static int acpi_processor_get_performance_info(struct acpi_processor *pr) +#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL +static +#endif +int acpi_processor_get_performance_info(struct acpi_processor *pr) { int result = 0; acpi_status status = AE_OK; @@ -356,6 +368,7 @@ static int acpi_processor_get_performanc return 0; } +#ifdef CONFIG_CPU_FREQ int acpi_processor_notify_smm(struct module *calling_module) { acpi_status status; @@ -416,6 +429,7 @@ int acpi_processor_notify_smm(struct mod } EXPORT_SYMBOL(acpi_processor_notify_smm); +#endif /* CONFIG_CPU_FREQ */ #ifdef CONFIG_X86_ACPI_CPUFREQ_PROC_INTF /* /proc/acpi/processor/../performance interface (DEPRECATED) */ @@ -507,7 +521,10 @@ static void acpi_cpufreq_remove_file(str } #endif /* CONFIG_X86_ACPI_CPUFREQ_PROC_INTF */ -static int acpi_processor_get_psd(struct acpi_processor *pr) +#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL +static +#endif +int acpi_processor_get_psd(struct acpi_processor *pr) { int result = 0; acpi_status status = AE_OK; --- a/drivers/acpi/sleep/main.c +++ b/drivers/acpi/sleep/main.c @@ -27,6 +27,7 @@ u8 sleep_states[ACPI_S_STATE_COUNT]; static int acpi_sleep_prepare(u32 acpi_state) { #ifdef CONFIG_ACPI_SLEEP +#ifndef CONFIG_ACPI_PV_SLEEP /* do we have a wakeup address for S2 and S3? */ if (acpi_state == ACPI_STATE_S3) { if (!acpi_wakeup_address) { @@ -36,6 +37,7 @@ static int acpi_sleep_prepare(u32 acpi_s (acpi_physical_address)acpi_wakeup_address); } +#endif ACPI_FLUSH_CPU_CACHE(); acpi_enable_wakeup_device_prep(acpi_state); #endif @@ -208,7 +210,14 @@ static int acpi_suspend_enter(suspend_st break; case ACPI_STATE_S3: +#ifdef CONFIG_ACPI_PV_SLEEP + /* Hyperviosr will save and restore CPU context + * and then we can skip low level housekeeping here. + */ + acpi_enter_sleep_state(acpi_state); +#else do_suspend_lowlevel(); +#endif break; } --- a/drivers/char/agp/intel-agp.c +++ b/drivers/char/agp/intel-agp.c @@ -247,6 +247,13 @@ static void *i8xx_alloc_pages(void) if (page == NULL) return NULL; +#ifdef CONFIG_XEN + if (xen_create_contiguous_region((unsigned long)page_address(page), 2, 32)) { + __free_pages(page, 2); + return NULL; + } +#endif + if (set_pages_uc(page, 4) < 0) { set_pages_wb(page, 4); __free_pages(page, 2); @@ -266,6 +273,9 @@ static void i8xx_destroy_pages(void *add page = virt_to_page(addr); set_pages_wb(page, 4); +#ifdef CONFIG_XEN + xen_destroy_contiguous_region((unsigned long)page_address(page), 2); +#endif put_page(page); __free_pages(page, 2); atomic_dec(&agp_bridge->current_memory_agp); --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -110,6 +110,7 @@ void __attribute__((weak)) unxlate_dev_m { } +#ifndef ARCH_HAS_DEV_MEM /* * This funcion reads the *physical* memory. The f_pos points directly to the * memory location. @@ -254,6 +255,7 @@ static ssize_t write_mem(struct file * f *ppos += written; return written; } +#endif int __attribute__((weak)) phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot) @@ -372,6 +374,9 @@ static int mmap_mem(struct file * file, static int mmap_kmem(struct file * file, struct vm_area_struct * vma) { unsigned long pfn; +#ifdef CONFIG_XEN + unsigned long i, count; +#endif /* Turn a kernel-virtual address into a physical page frame */ pfn = __pa((u64)vma->vm_pgoff << PAGE_SHIFT) >> PAGE_SHIFT; @@ -386,6 +391,13 @@ static int mmap_kmem(struct file * file, if (!pfn_valid(pfn)) return -EIO; +#ifdef CONFIG_XEN + count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + for (i = 0; i < count; i++) + if ((pfn + i) != mfn_to_local_pfn(pfn_to_mfn(pfn + i))) + return -EIO; +#endif + vma->vm_pgoff = pfn; return mmap_mem(file, vma); } @@ -802,6 +814,7 @@ static int open_port(struct inode * inod #define open_kmem open_mem #define open_oldmem open_mem +#ifndef ARCH_HAS_DEV_MEM static const struct file_operations mem_fops = { .llseek = memory_lseek, .read = read_mem, @@ -810,6 +823,9 @@ static const struct file_operations mem_ .open = open_mem, .get_unmapped_area = get_unmapped_area_mem, }; +#else +extern const struct file_operations mem_fops; +#endif #ifdef CONFIG_DEVKMEM static const struct file_operations kmem_fops = { --- a/drivers/char/tpm/Makefile +++ b/drivers/char/tpm/Makefile @@ -9,3 +9,5 @@ obj-$(CONFIG_TCG_TIS) += tpm_tis.o obj-$(CONFIG_TCG_NSC) += tpm_nsc.o obj-$(CONFIG_TCG_ATMEL) += tpm_atmel.o obj-$(CONFIG_TCG_INFINEON) += tpm_infineon.o +obj-$(CONFIG_TCG_XEN) += tpm_xenu.o +tpm_xenu-y = tpm_xen.o tpm_vtpm.o --- a/drivers/char/tpm/tpm.h +++ b/drivers/char/tpm/tpm.h @@ -107,6 +107,9 @@ struct tpm_chip { struct dentry **bios_dir; struct list_head list; +#ifdef CONFIG_XEN + void *priv; +#endif void (*release) (struct device *); }; @@ -124,6 +127,18 @@ static inline void tpm_write_index(int b outb(value & 0xFF, base+1); } +#ifdef CONFIG_XEN +static inline void *chip_get_private(const struct tpm_chip *chip) +{ + return chip->priv; +} + +static inline void chip_set_private(struct tpm_chip *chip, void *priv) +{ + chip->priv = priv; +} +#endif + extern void tpm_get_timeouts(struct tpm_chip *); extern void tpm_gen_interrupt(struct tpm_chip *); extern void tpm_continue_selftest(struct tpm_chip *); --- /dev/null +++ b/drivers/char/tpm/tpm_vtpm.c @@ -0,0 +1,542 @@ +/* + * Copyright (C) 2006 IBM Corporation + * + * Authors: + * Stefan Berger + * + * Generic device driver part for device drivers in a virtualized + * environment. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + * + */ + +#include +#include +#include +#include +#include +#include "tpm.h" +#include "tpm_vtpm.h" + +/* read status bits */ +enum { + STATUS_BUSY = 0x01, + STATUS_DATA_AVAIL = 0x02, + STATUS_READY = 0x04 +}; + +struct transmission { + struct list_head next; + + unsigned char *request; + size_t request_len; + size_t request_buflen; + + unsigned char *response; + size_t response_len; + size_t response_buflen; + + unsigned int flags; +}; + +enum { + TRANSMISSION_FLAG_WAS_QUEUED = 0x1 +}; + + +enum { + DATAEX_FLAG_QUEUED_ONLY = 0x1 +}; + + +/* local variables */ + +/* local function prototypes */ +static int _vtpm_send_queued(struct tpm_chip *chip); + + +/* ============================================================= + * Some utility functions + * ============================================================= + */ +static void vtpm_state_init(struct vtpm_state *vtpms) +{ + vtpms->current_request = NULL; + spin_lock_init(&vtpms->req_list_lock); + init_waitqueue_head(&vtpms->req_wait_queue); + INIT_LIST_HEAD(&vtpms->queued_requests); + + vtpms->current_response = NULL; + spin_lock_init(&vtpms->resp_list_lock); + init_waitqueue_head(&vtpms->resp_wait_queue); + + vtpms->disconnect_time = jiffies; +} + + +static inline struct transmission *transmission_alloc(void) +{ + return kzalloc(sizeof(struct transmission), GFP_ATOMIC); +} + +static unsigned char * +transmission_set_req_buffer(struct transmission *t, + unsigned char *buffer, size_t len) +{ + if (t->request_buflen < len) { + kfree(t->request); + t->request = kmalloc(len, GFP_KERNEL); + if (!t->request) { + t->request_buflen = 0; + return NULL; + } + t->request_buflen = len; + } + + memcpy(t->request, buffer, len); + t->request_len = len; + + return t->request; +} + +static unsigned char * +transmission_set_res_buffer(struct transmission *t, + const unsigned char *buffer, size_t len) +{ + if (t->response_buflen < len) { + kfree(t->response); + t->response = kmalloc(len, GFP_ATOMIC); + if (!t->response) { + t->response_buflen = 0; + return NULL; + } + t->response_buflen = len; + } + + memcpy(t->response, buffer, len); + t->response_len = len; + + return t->response; +} + +static inline void transmission_free(struct transmission *t) +{ + kfree(t->request); + kfree(t->response); + kfree(t); +} + +/* ============================================================= + * Interface with the lower layer driver + * ============================================================= + */ +/* + * Lower layer uses this function to make a response available. + */ +int vtpm_vd_recv(const struct tpm_chip *chip, + const unsigned char *buffer, size_t count, + void *ptr) +{ + unsigned long flags; + int ret_size = 0; + struct transmission *t; + struct vtpm_state *vtpms; + + vtpms = (struct vtpm_state *)chip_get_private(chip); + + /* + * The list with requests must contain one request + * only and the element there must be the one that + * was passed to me from the front-end. + */ + spin_lock_irqsave(&vtpms->resp_list_lock, flags); + if (vtpms->current_request != ptr) { + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags); + return 0; + } + + if ((t = vtpms->current_request)) { + transmission_free(t); + vtpms->current_request = NULL; + } + + t = transmission_alloc(); + if (t) { + if (!transmission_set_res_buffer(t, buffer, count)) { + transmission_free(t); + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags); + return -ENOMEM; + } + ret_size = count; + vtpms->current_response = t; + wake_up_interruptible(&vtpms->resp_wait_queue); + } + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags); + + return ret_size; +} + + +/* + * Lower layer indicates its status (connected/disconnected) + */ +void vtpm_vd_status(const struct tpm_chip *chip, u8 vd_status) +{ + struct vtpm_state *vtpms; + + vtpms = (struct vtpm_state *)chip_get_private(chip); + + vtpms->vd_status = vd_status; + if ((vtpms->vd_status & TPM_VD_STATUS_CONNECTED) == 0) { + vtpms->disconnect_time = jiffies; + } +} + +/* ============================================================= + * Interface with the generic TPM driver + * ============================================================= + */ +static int vtpm_recv(struct tpm_chip *chip, u8 *buf, size_t count) +{ + int rc = 0; + unsigned long flags; + struct vtpm_state *vtpms; + + vtpms = (struct vtpm_state *)chip_get_private(chip); + + /* + * Check if the previous operation only queued the command + * In this case there won't be a response, so I just + * return from here and reset that flag. In any other + * case I should receive a response from the back-end. + */ + spin_lock_irqsave(&vtpms->resp_list_lock, flags); + if ((vtpms->flags & DATAEX_FLAG_QUEUED_ONLY) != 0) { + vtpms->flags &= ~DATAEX_FLAG_QUEUED_ONLY; + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags); + /* + * The first few commands (measurements) must be + * queued since it might not be possible to talk to the + * TPM, yet. + * Return a response of up to 30 '0's. + */ + + count = min_t(size_t, count, 30); + memset(buf, 0x0, count); + return count; + } + /* + * Check whether something is in the responselist and if + * there's nothing in the list wait for something to appear. + */ + + if (!vtpms->current_response) { + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags); + interruptible_sleep_on_timeout(&vtpms->resp_wait_queue, + 1000); + spin_lock_irqsave(&vtpms->resp_list_lock ,flags); + } + + if (vtpms->current_response) { + struct transmission *t = vtpms->current_response; + vtpms->current_response = NULL; + rc = min(count, t->response_len); + memcpy(buf, t->response, rc); + transmission_free(t); + } + + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags); + return rc; +} + +static int vtpm_send(struct tpm_chip *chip, u8 *buf, size_t count) +{ + int rc = 0; + unsigned long flags; + struct transmission *t = transmission_alloc(); + struct vtpm_state *vtpms; + + vtpms = (struct vtpm_state *)chip_get_private(chip); + + if (!t) + return -ENOMEM; + /* + * If there's a current request, it must be the + * previous request that has timed out. + */ + spin_lock_irqsave(&vtpms->req_list_lock, flags); + if (vtpms->current_request != NULL) { + printk("WARNING: Sending although there is a request outstanding.\n" + " Previous request must have timed out.\n"); + transmission_free(vtpms->current_request); + vtpms->current_request = NULL; + } + spin_unlock_irqrestore(&vtpms->req_list_lock, flags); + + /* + * Queue the packet if the driver below is not + * ready, yet, or there is any packet already + * in the queue. + * If the driver below is ready, unqueue all + * packets first before sending our current + * packet. + * For each unqueued packet, except for the + * last (=current) packet, call the function + * tpm_xen_recv to wait for the response to come + * back. + */ + if ((vtpms->vd_status & TPM_VD_STATUS_CONNECTED) == 0) { + if (time_after(jiffies, + vtpms->disconnect_time + HZ * 10)) { + rc = -ENOENT; + } else { + goto queue_it; + } + } else { + /* + * Send all queued packets. + */ + if (_vtpm_send_queued(chip) == 0) { + + vtpms->current_request = t; + + rc = vtpm_vd_send(vtpms->tpm_private, + buf, + count, + t); + /* + * The generic TPM driver will call + * the function to receive the response. + */ + if (rc < 0) { + vtpms->current_request = NULL; + goto queue_it; + } + } else { +queue_it: + if (!transmission_set_req_buffer(t, buf, count)) { + transmission_free(t); + rc = -ENOMEM; + goto exit; + } + /* + * An error occurred. Don't event try + * to send the current request. Just + * queue it. + */ + spin_lock_irqsave(&vtpms->req_list_lock, flags); + vtpms->flags |= DATAEX_FLAG_QUEUED_ONLY; + list_add_tail(&t->next, &vtpms->queued_requests); + spin_unlock_irqrestore(&vtpms->req_list_lock, flags); + } + } + +exit: + return rc; +} + + +/* + * Send all queued requests. + */ +static int _vtpm_send_queued(struct tpm_chip *chip) +{ + int rc; + int error = 0; + long flags; + unsigned char buffer[1]; + struct vtpm_state *vtpms; + vtpms = (struct vtpm_state *)chip_get_private(chip); + + spin_lock_irqsave(&vtpms->req_list_lock, flags); + + while (!list_empty(&vtpms->queued_requests)) { + /* + * Need to dequeue them. + * Read the result into a dummy buffer. + */ + struct transmission *qt = (struct transmission *) + vtpms->queued_requests.next; + list_del(&qt->next); + vtpms->current_request = qt; + spin_unlock_irqrestore(&vtpms->req_list_lock, flags); + + rc = vtpm_vd_send(vtpms->tpm_private, + qt->request, + qt->request_len, + qt); + + if (rc < 0) { + spin_lock_irqsave(&vtpms->req_list_lock, flags); + if ((qt = vtpms->current_request) != NULL) { + /* + * requeue it at the beginning + * of the list + */ + list_add(&qt->next, + &vtpms->queued_requests); + } + vtpms->current_request = NULL; + error = 1; + break; + } + /* + * After this point qt is not valid anymore! + * It is freed when the front-end is delivering + * the data by calling tpm_recv + */ + /* + * Receive response into provided dummy buffer + */ + rc = vtpm_recv(chip, buffer, sizeof(buffer)); + spin_lock_irqsave(&vtpms->req_list_lock, flags); + } + + spin_unlock_irqrestore(&vtpms->req_list_lock, flags); + + return error; +} + +static void vtpm_cancel(struct tpm_chip *chip) +{ + unsigned long flags; + struct vtpm_state *vtpms = (struct vtpm_state *)chip_get_private(chip); + + spin_lock_irqsave(&vtpms->resp_list_lock,flags); + + if (!vtpms->current_response && vtpms->current_request) { + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags); + interruptible_sleep_on(&vtpms->resp_wait_queue); + spin_lock_irqsave(&vtpms->resp_list_lock,flags); + } + + if (vtpms->current_response) { + struct transmission *t = vtpms->current_response; + vtpms->current_response = NULL; + transmission_free(t); + } + + spin_unlock_irqrestore(&vtpms->resp_list_lock,flags); +} + +static u8 vtpm_status(struct tpm_chip *chip) +{ + u8 rc = 0; + unsigned long flags; + struct vtpm_state *vtpms; + + vtpms = (struct vtpm_state *)chip_get_private(chip); + + spin_lock_irqsave(&vtpms->resp_list_lock, flags); + /* + * Data are available if: + * - there's a current response + * - the last packet was queued only (this is fake, but necessary to + * get the generic TPM layer to call the receive function.) + */ + if (vtpms->current_response || + 0 != (vtpms->flags & DATAEX_FLAG_QUEUED_ONLY)) { + rc = STATUS_DATA_AVAIL; + } else if (!vtpms->current_response && !vtpms->current_request) { + rc = STATUS_READY; + } + + spin_unlock_irqrestore(&vtpms->resp_list_lock, flags); + return rc; +} + +static struct file_operations vtpm_ops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .open = tpm_open, + .read = tpm_read, + .write = tpm_write, + .release = tpm_release, +}; + +static DEVICE_ATTR(pubek, S_IRUGO, tpm_show_pubek, NULL); +static DEVICE_ATTR(pcrs, S_IRUGO, tpm_show_pcrs, NULL); +static DEVICE_ATTR(enabled, S_IRUGO, tpm_show_enabled, NULL); +static DEVICE_ATTR(active, S_IRUGO, tpm_show_active, NULL); +static DEVICE_ATTR(owned, S_IRUGO, tpm_show_owned, NULL); +static DEVICE_ATTR(temp_deactivated, S_IRUGO, tpm_show_temp_deactivated, + NULL); +static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps, NULL); +static DEVICE_ATTR(cancel, S_IWUSR |S_IWGRP, NULL, tpm_store_cancel); + +static struct attribute *vtpm_attrs[] = { + &dev_attr_pubek.attr, + &dev_attr_pcrs.attr, + &dev_attr_enabled.attr, + &dev_attr_active.attr, + &dev_attr_owned.attr, + &dev_attr_temp_deactivated.attr, + &dev_attr_caps.attr, + &dev_attr_cancel.attr, + NULL, +}; + +static struct attribute_group vtpm_attr_grp = { .attrs = vtpm_attrs }; + +#define TPM_LONG_TIMEOUT (10 * 60 * HZ) + +static struct tpm_vendor_specific tpm_vtpm = { + .recv = vtpm_recv, + .send = vtpm_send, + .cancel = vtpm_cancel, + .status = vtpm_status, + .req_complete_mask = STATUS_BUSY | STATUS_DATA_AVAIL, + .req_complete_val = STATUS_DATA_AVAIL, + .req_canceled = STATUS_READY, + .attr_group = &vtpm_attr_grp, + .miscdev = { + .fops = &vtpm_ops, + }, + .duration = { + TPM_LONG_TIMEOUT, + TPM_LONG_TIMEOUT, + TPM_LONG_TIMEOUT, + }, +}; + +struct tpm_chip *init_vtpm(struct device *dev, + struct tpm_private *tp) +{ + long rc; + struct tpm_chip *chip; + struct vtpm_state *vtpms; + + vtpms = kzalloc(sizeof(struct vtpm_state), GFP_KERNEL); + if (!vtpms) + return ERR_PTR(-ENOMEM); + + vtpm_state_init(vtpms); + vtpms->tpm_private = tp; + + chip = tpm_register_hardware(dev, &tpm_vtpm); + if (!chip) { + rc = -ENODEV; + goto err_free_mem; + } + + chip_set_private(chip, vtpms); + + return chip; + +err_free_mem: + kfree(vtpms); + + return ERR_PTR(rc); +} + +void cleanup_vtpm(struct device *dev) +{ + struct tpm_chip *chip = dev_get_drvdata(dev); + struct vtpm_state *vtpms = (struct vtpm_state*)chip_get_private(chip); + tpm_remove_hardware(dev); + kfree(vtpms); +} --- /dev/null +++ b/drivers/char/tpm/tpm_vtpm.h @@ -0,0 +1,55 @@ +#ifndef TPM_VTPM_H +#define TPM_VTPM_H + +struct tpm_chip; +struct tpm_private; + +struct vtpm_state { + struct transmission *current_request; + spinlock_t req_list_lock; + wait_queue_head_t req_wait_queue; + + struct list_head queued_requests; + + struct transmission *current_response; + spinlock_t resp_list_lock; + wait_queue_head_t resp_wait_queue; // processes waiting for responses + + u8 vd_status; + u8 flags; + + unsigned long disconnect_time; + + /* + * The following is a private structure of the underlying + * driver. It is passed as parameter in the send function. + */ + struct tpm_private *tpm_private; +}; + + +enum vdev_status { + TPM_VD_STATUS_DISCONNECTED = 0x0, + TPM_VD_STATUS_CONNECTED = 0x1 +}; + +/* this function is called from tpm_vtpm.c */ +int vtpm_vd_send(struct tpm_private * tp, + const u8 * buf, size_t count, void *ptr); + +/* these functions are offered by tpm_vtpm.c */ +struct tpm_chip *init_vtpm(struct device *, + struct tpm_private *); +void cleanup_vtpm(struct device *); +int vtpm_vd_recv(const struct tpm_chip* chip, + const unsigned char *buffer, size_t count, void *ptr); +void vtpm_vd_status(const struct tpm_chip *, u8 status); + +static inline struct tpm_private *tpm_private_from_dev(struct device *dev) +{ + struct tpm_chip *chip = dev_get_drvdata(dev); + struct vtpm_state *vtpms = chip_get_private(chip); + return vtpms->tpm_private; +} + +#endif --- /dev/null +++ b/drivers/char/tpm/tpm_xen.c @@ -0,0 +1,722 @@ +/* + * Copyright (c) 2005, IBM Corporation + * + * Author: Stefan Berger, stefanb@us.ibm.com + * Grant table support: Mahadevan Gomathisankaran + * + * This code has been derived from drivers/xen/netfront/netfront.c + * + * Copyright (c) 2002-2004, K A Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "tpm.h" +#include "tpm_vtpm.h" + +#undef DEBUG + +/* local structures */ +struct tpm_private { + struct tpm_chip *chip; + + tpmif_tx_interface_t *tx; + atomic_t refcnt; + unsigned int irq; + u8 is_connected; + u8 is_suspended; + + spinlock_t tx_lock; + + struct tx_buffer *tx_buffers[TPMIF_TX_RING_SIZE]; + + atomic_t tx_busy; + void *tx_remember; + + domid_t backend_id; + wait_queue_head_t wait_q; + + struct xenbus_device *dev; + int ring_ref; +}; + +struct tx_buffer { + unsigned int size; // available space in data + unsigned int len; // used space in data + unsigned char *data; // pointer to a page +}; + + +/* locally visible variables */ +static grant_ref_t gref_head; +static struct tpm_private *my_priv; + +/* local function prototypes */ +static irqreturn_t tpmif_int(int irq, + void *tpm_priv, + struct pt_regs *ptregs); +static void tpmif_rx_action(unsigned long unused); +static int tpmif_connect(struct xenbus_device *dev, + struct tpm_private *tp, + domid_t domid); +static DECLARE_TASKLET(tpmif_rx_tasklet, tpmif_rx_action, 0); +static int tpmif_allocate_tx_buffers(struct tpm_private *tp); +static void tpmif_free_tx_buffers(struct tpm_private *tp); +static void tpmif_set_connected_state(struct tpm_private *tp, + u8 newstate); +static int tpm_xmit(struct tpm_private *tp, + const u8 * buf, size_t count, int userbuffer, + void *remember); +static void destroy_tpmring(struct tpm_private *tp); +void __exit tpmif_exit(void); + +#define DPRINTK(fmt, args...) \ + pr_debug("xen_tpm_fr (%s:%d) " fmt, __FUNCTION__, __LINE__, ##args) +#define IPRINTK(fmt, args...) \ + printk(KERN_INFO "xen_tpm_fr: " fmt, ##args) +#define WPRINTK(fmt, args...) \ + printk(KERN_WARNING "xen_tpm_fr: " fmt, ##args) + +#define GRANT_INVALID_REF 0 + + +static inline int +tx_buffer_copy(struct tx_buffer *txb, const u8 *src, int len, + int isuserbuffer) +{ + int copied = len; + + if (len > txb->size) + copied = txb->size; + if (isuserbuffer) { + if (copy_from_user(txb->data, src, copied)) + return -EFAULT; + } else { + memcpy(txb->data, src, copied); + } + txb->len = len; + return copied; +} + +static inline struct tx_buffer *tx_buffer_alloc(void) +{ + struct tx_buffer *txb; + + txb = kzalloc(sizeof(struct tx_buffer), GFP_KERNEL); + if (!txb) + return NULL; + + txb->len = 0; + txb->size = PAGE_SIZE; + txb->data = (unsigned char *)__get_free_page(GFP_KERNEL); + if (txb->data == NULL) { + kfree(txb); + txb = NULL; + } + + return txb; +} + + +static inline void tx_buffer_free(struct tx_buffer *txb) +{ + if (txb) { + free_page((long)txb->data); + kfree(txb); + } +} + +/************************************************************** + Utility function for the tpm_private structure +**************************************************************/ +static void tpm_private_init(struct tpm_private *tp) +{ + spin_lock_init(&tp->tx_lock); + init_waitqueue_head(&tp->wait_q); + atomic_set(&tp->refcnt, 1); +} + +static void tpm_private_put(void) +{ + if (!atomic_dec_and_test(&my_priv->refcnt)) + return; + + tpmif_free_tx_buffers(my_priv); + kfree(my_priv); + my_priv = NULL; +} + +static struct tpm_private *tpm_private_get(void) +{ + int err; + + if (my_priv) { + atomic_inc(&my_priv->refcnt); + return my_priv; + } + + my_priv = kzalloc(sizeof(struct tpm_private), GFP_KERNEL); + if (!my_priv) + return NULL; + + tpm_private_init(my_priv); + err = tpmif_allocate_tx_buffers(my_priv); + if (err < 0) + tpm_private_put(); + + return my_priv; +} + +/************************************************************** + + The interface to let the tpm plugin register its callback + function and send data to another partition using this module + +**************************************************************/ + +static DEFINE_MUTEX(suspend_lock); +/* + * Send data via this module by calling this function + */ +int vtpm_vd_send(struct tpm_private *tp, + const u8 * buf, size_t count, void *ptr) +{ + int sent; + + mutex_lock(&suspend_lock); + sent = tpm_xmit(tp, buf, count, 0, ptr); + mutex_unlock(&suspend_lock); + + return sent; +} + +/************************************************************** + XENBUS support code +**************************************************************/ + +static int setup_tpmring(struct xenbus_device *dev, + struct tpm_private *tp) +{ + tpmif_tx_interface_t *sring; + int err; + + tp->ring_ref = GRANT_INVALID_REF; + + sring = (void *)__get_free_page(GFP_KERNEL); + if (!sring) { + xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring"); + return -ENOMEM; + } + tp->tx = sring; + + err = xenbus_grant_ring(dev, virt_to_mfn(tp->tx)); + if (err < 0) { + free_page((unsigned long)sring); + tp->tx = NULL; + xenbus_dev_fatal(dev, err, "allocating grant reference"); + goto fail; + } + tp->ring_ref = err; + + err = tpmif_connect(dev, tp, dev->otherend_id); + if (err) + goto fail; + + return 0; +fail: + destroy_tpmring(tp); + return err; +} + + +static void destroy_tpmring(struct tpm_private *tp) +{ + tpmif_set_connected_state(tp, 0); + + if (tp->ring_ref != GRANT_INVALID_REF) { + gnttab_end_foreign_access(tp->ring_ref, (unsigned long)tp->tx); + tp->ring_ref = GRANT_INVALID_REF; + tp->tx = NULL; + } + + if (tp->irq) + unbind_from_irqhandler(tp->irq, tp); + + tp->irq = 0; +} + + +static int talk_to_backend(struct xenbus_device *dev, + struct tpm_private *tp) +{ + const char *message = NULL; + int err; + struct xenbus_transaction xbt; + + err = setup_tpmring(dev, tp); + if (err) { + xenbus_dev_fatal(dev, err, "setting up ring"); + goto out; + } + +again: + err = xenbus_transaction_start(&xbt); + if (err) { + xenbus_dev_fatal(dev, err, "starting transaction"); + goto destroy_tpmring; + } + + err = xenbus_printf(xbt, dev->nodename, + "ring-ref","%u", tp->ring_ref); + if (err) { + message = "writing ring-ref"; + goto abort_transaction; + } + + err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u", + irq_to_evtchn_port(tp->irq)); + if (err) { + message = "writing event-channel"; + goto abort_transaction; + } + + err = xenbus_transaction_end(xbt, 0); + if (err == -EAGAIN) + goto again; + if (err) { + xenbus_dev_fatal(dev, err, "completing transaction"); + goto destroy_tpmring; + } + + xenbus_switch_state(dev, XenbusStateConnected); + + return 0; + +abort_transaction: + xenbus_transaction_end(xbt, 1); + if (message) + xenbus_dev_error(dev, err, "%s", message); +destroy_tpmring: + destroy_tpmring(tp); +out: + return err; +} + +/** + * Callback received when the backend's state changes. + */ +static void backend_changed(struct xenbus_device *dev, + enum xenbus_state backend_state) +{ + struct tpm_private *tp = tpm_private_from_dev(&dev->dev); + DPRINTK("\n"); + + switch (backend_state) { + case XenbusStateInitialising: + case XenbusStateInitWait: + case XenbusStateInitialised: + case XenbusStateReconfiguring: + case XenbusStateReconfigured: + case XenbusStateUnknown: + break; + + case XenbusStateConnected: + tpmif_set_connected_state(tp, 1); + break; + + case XenbusStateClosing: + tpmif_set_connected_state(tp, 0); + xenbus_frontend_closed(dev); + break; + + case XenbusStateClosed: + tpmif_set_connected_state(tp, 0); + if (tp->is_suspended == 0) + device_unregister(&dev->dev); + xenbus_frontend_closed(dev); + break; + } +} + +static int tpmfront_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int err; + int handle; + struct tpm_private *tp = tpm_private_get(); + + if (!tp) + return -ENOMEM; + + tp->chip = init_vtpm(&dev->dev, tp); + if (IS_ERR(tp->chip)) + return PTR_ERR(tp->chip); + + err = xenbus_scanf(XBT_NIL, dev->nodename, + "handle", "%i", &handle); + if (XENBUS_EXIST_ERR(err)) + return err; + + if (err < 0) { + xenbus_dev_fatal(dev,err,"reading virtual-device"); + return err; + } + + tp->dev = dev; + + err = talk_to_backend(dev, tp); + if (err) { + tpm_private_put(); + return err; + } + + return 0; +} + + +static int tpmfront_remove(struct xenbus_device *dev) +{ + struct tpm_private *tp = tpm_private_from_dev(&dev->dev); + destroy_tpmring(tp); + cleanup_vtpm(&dev->dev); + return 0; +} + +static int tpmfront_suspend(struct xenbus_device *dev) +{ + struct tpm_private *tp = tpm_private_from_dev(&dev->dev); + u32 ctr; + + /* Take the lock, preventing any application from sending. */ + mutex_lock(&suspend_lock); + tp->is_suspended = 1; + + for (ctr = 0; atomic_read(&tp->tx_busy); ctr++) { + if ((ctr % 10) == 0) + printk("TPM-FE [INFO]: Waiting for outstanding " + "request.\n"); + /* Wait for a request to be responded to. */ + interruptible_sleep_on_timeout(&tp->wait_q, 100); + } + + return 0; +} + +static int tpmfront_suspend_finish(struct tpm_private *tp) +{ + tp->is_suspended = 0; + /* Allow applications to send again. */ + mutex_unlock(&suspend_lock); + return 0; +} + +static int tpmfront_suspend_cancel(struct xenbus_device *dev) +{ + struct tpm_private *tp = tpm_private_from_dev(&dev->dev); + return tpmfront_suspend_finish(tp); +} + +static int tpmfront_resume(struct xenbus_device *dev) +{ + struct tpm_private *tp = tpm_private_from_dev(&dev->dev); + destroy_tpmring(tp); + return talk_to_backend(dev, tp); +} + +static int tpmif_connect(struct xenbus_device *dev, + struct tpm_private *tp, + domid_t domid) +{ + int err; + + tp->backend_id = domid; + + err = bind_listening_port_to_irqhandler( + domid, tpmif_int, SA_SAMPLE_RANDOM, "tpmif", tp); + if (err <= 0) { + WPRINTK("bind_listening_port_to_irqhandler failed " + "(err=%d)\n", err); + return err; + } + tp->irq = err; + + return 0; +} + +static struct xenbus_device_id tpmfront_ids[] = { + { "vtpm" }, + { "" } +}; + +static struct xenbus_driver tpmfront = { + .name = "vtpm", + .owner = THIS_MODULE, + .ids = tpmfront_ids, + .probe = tpmfront_probe, + .remove = tpmfront_remove, + .resume = tpmfront_resume, + .otherend_changed = backend_changed, + .suspend = tpmfront_suspend, + .suspend_cancel = tpmfront_suspend_cancel, +}; + +static void __init init_tpm_xenbus(void) +{ + xenbus_register_frontend(&tpmfront); +} + +static int tpmif_allocate_tx_buffers(struct tpm_private *tp) +{ + unsigned int i; + + for (i = 0; i < TPMIF_TX_RING_SIZE; i++) { + tp->tx_buffers[i] = tx_buffer_alloc(); + if (!tp->tx_buffers[i]) { + tpmif_free_tx_buffers(tp); + return -ENOMEM; + } + } + return 0; +} + +static void tpmif_free_tx_buffers(struct tpm_private *tp) +{ + unsigned int i; + + for (i = 0; i < TPMIF_TX_RING_SIZE; i++) + tx_buffer_free(tp->tx_buffers[i]); +} + +static void tpmif_rx_action(unsigned long priv) +{ + struct tpm_private *tp = (struct tpm_private *)priv; + int i = 0; + unsigned int received; + unsigned int offset = 0; + u8 *buffer; + tpmif_tx_request_t *tx = &tp->tx->ring[i].req; + + atomic_set(&tp->tx_busy, 0); + wake_up_interruptible(&tp->wait_q); + + received = tx->size; + + buffer = kmalloc(received, GFP_ATOMIC); + if (!buffer) + return; + + for (i = 0; i < TPMIF_TX_RING_SIZE && offset < received; i++) { + struct tx_buffer *txb = tp->tx_buffers[i]; + tpmif_tx_request_t *tx; + unsigned int tocopy; + + tx = &tp->tx->ring[i].req; + tocopy = tx->size; + if (tocopy > PAGE_SIZE) + tocopy = PAGE_SIZE; + + memcpy(&buffer[offset], txb->data, tocopy); + + gnttab_release_grant_reference(&gref_head, tx->ref); + + offset += tocopy; + } + + vtpm_vd_recv(tp->chip, buffer, received, tp->tx_remember); + kfree(buffer); +} + + +static irqreturn_t tpmif_int(int irq, void *tpm_priv, struct pt_regs *ptregs) +{ + struct tpm_private *tp = tpm_priv; + unsigned long flags; + + spin_lock_irqsave(&tp->tx_lock, flags); + tpmif_rx_tasklet.data = (unsigned long)tp; + tasklet_schedule(&tpmif_rx_tasklet); + spin_unlock_irqrestore(&tp->tx_lock, flags); + + return IRQ_HANDLED; +} + + +static int tpm_xmit(struct tpm_private *tp, + const u8 * buf, size_t count, int isuserbuffer, + void *remember) +{ + tpmif_tx_request_t *tx; + TPMIF_RING_IDX i; + unsigned int offset = 0; + + spin_lock_irq(&tp->tx_lock); + + if (unlikely(atomic_read(&tp->tx_busy))) { + printk("tpm_xmit: There's an outstanding request/response " + "on the way!\n"); + spin_unlock_irq(&tp->tx_lock); + return -EBUSY; + } + + if (tp->is_connected != 1) { + spin_unlock_irq(&tp->tx_lock); + return -EIO; + } + + for (i = 0; count > 0 && i < TPMIF_TX_RING_SIZE; i++) { + struct tx_buffer *txb = tp->tx_buffers[i]; + int copied; + + if (!txb) { + DPRINTK("txb (i=%d) is NULL. buffers initilized?\n" + "Not transmitting anything!\n", i); + spin_unlock_irq(&tp->tx_lock); + return -EFAULT; + } + + copied = tx_buffer_copy(txb, &buf[offset], count, + isuserbuffer); + if (copied < 0) { + /* An error occurred */ + spin_unlock_irq(&tp->tx_lock); + return copied; + } + count -= copied; + offset += copied; + + tx = &tp->tx->ring[i].req; + tx->addr = virt_to_machine(txb->data); + tx->size = txb->len; + tx->unused = 0; + + DPRINTK("First 4 characters sent by TPM-FE are " + "0x%02x 0x%02x 0x%02x 0x%02x\n", + txb->data[0],txb->data[1],txb->data[2],txb->data[3]); + + /* Get the granttable reference for this page. */ + tx->ref = gnttab_claim_grant_reference(&gref_head); + if (tx->ref == -ENOSPC) { + spin_unlock_irq(&tp->tx_lock); + DPRINTK("Grant table claim reference failed in " + "func:%s line:%d file:%s\n", + __FUNCTION__, __LINE__, __FILE__); + return -ENOSPC; + } + gnttab_grant_foreign_access_ref(tx->ref, + tp->backend_id, + virt_to_mfn(txb->data), + 0 /*RW*/); + wmb(); + } + + atomic_set(&tp->tx_busy, 1); + tp->tx_remember = remember; + + mb(); + + notify_remote_via_irq(tp->irq); + + spin_unlock_irq(&tp->tx_lock); + return offset; +} + + +static void tpmif_notify_upperlayer(struct tpm_private *tp) +{ + /* Notify upper layer about the state of the connection to the BE. */ + vtpm_vd_status(tp->chip, (tp->is_connected + ? TPM_VD_STATUS_CONNECTED + : TPM_VD_STATUS_DISCONNECTED)); +} + + +static void tpmif_set_connected_state(struct tpm_private *tp, u8 is_connected) +{ + /* + * Don't notify upper layer if we are in suspend mode and + * should disconnect - assumption is that we will resume + * The mutex keeps apps from sending. + */ + if (is_connected == 0 && tp->is_suspended == 1) + return; + + /* + * Unlock the mutex if we are connected again + * after being suspended - now resuming. + * This also removes the suspend state. + */ + if (is_connected == 1 && tp->is_suspended == 1) + tpmfront_suspend_finish(tp); + + if (is_connected != tp->is_connected) { + tp->is_connected = is_connected; + tpmif_notify_upperlayer(tp); + } +} + + + +/* ================================================================= + * Initialization function. + * ================================================================= + */ + + +static int __init tpmif_init(void) +{ + struct tpm_private *tp; + + if (is_initial_xendomain()) + return -EPERM; + + tp = tpm_private_get(); + if (!tp) + return -ENOMEM; + + IPRINTK("Initialising the vTPM driver.\n"); + if (gnttab_alloc_grant_references(TPMIF_TX_RING_SIZE, + &gref_head) < 0) { + tpm_private_put(); + return -EFAULT; + } + + init_tpm_xenbus(); + return 0; +} + + +module_init(tpmif_init); + +MODULE_LICENSE("Dual BSD/GPL"); --- a/drivers/ide/ide-lib.c +++ b/drivers/ide/ide-lib.c @@ -177,12 +177,12 @@ void ide_toggle_bounce(ide_drive_t *driv { u64 addr = BLK_BOUNCE_HIGH; /* dma64_addr_t */ - if (!PCI_DMA_BUS_IS_PHYS) { - addr = BLK_BOUNCE_ANY; - } else if (on && drive->media == ide_disk) { + if (on && drive->media == ide_disk) { struct device *dev = drive->hwif->dev; - if (dev && dev->dma_mask) + if (!PCI_DMA_BUS_IS_PHYS) + addr = BLK_BOUNCE_ANY; + else if (dev && dev->dma_mask) addr = *dev->dma_mask; } --- a/drivers/Makefile +++ b/drivers/Makefile @@ -37,6 +37,7 @@ obj-y += base/ block/ misc/ mfd/ net/ obj-$(CONFIG_NUBUS) += nubus/ obj-$(CONFIG_ATM) += atm/ obj-y += macintosh/ +obj-$(CONFIG_XEN) += xen/ obj-$(CONFIG_IDE) += ide/ obj-$(CONFIG_SCSI) += scsi/ obj-$(CONFIG_ATA) += ata/ --- a/drivers/oprofile/buffer_sync.c +++ b/drivers/oprofile/buffer_sync.c @@ -6,6 +6,10 @@ * * @author John Levon * + * Modified by Aravind Menon for Xen + * These modifications are: + * Copyright (C) 2005 Hewlett-Packard Co. + * * This is the core of the buffer management. Each * CPU buffer is processed and entered into the * global event buffer. Such processing is necessary @@ -40,6 +44,7 @@ static cpumask_t marked_cpus = CPU_MASK_ static DEFINE_SPINLOCK(task_mortuary); static void process_task_mortuary(void); +static int cpu_current_domain[NR_CPUS]; /* Take ownership of the task struct and place it on the * list for processing. Only after two full buffer syncs @@ -148,6 +153,11 @@ static void end_sync(void) int sync_start(void) { int err; + int i; + + for (i = 0; i < NR_CPUS; i++) { + cpu_current_domain[i] = COORDINATOR_DOMAIN; + } start_cpu_work(); @@ -274,15 +284,31 @@ static void add_cpu_switch(int i) last_cookie = INVALID_COOKIE; } -static void add_kernel_ctx_switch(unsigned int in_kernel) +static void add_cpu_mode_switch(unsigned int cpu_mode) { add_event_entry(ESCAPE_CODE); - if (in_kernel) - add_event_entry(KERNEL_ENTER_SWITCH_CODE); - else - add_event_entry(KERNEL_EXIT_SWITCH_CODE); + switch (cpu_mode) { + case CPU_MODE_USER: + add_event_entry(USER_ENTER_SWITCH_CODE); + break; + case CPU_MODE_KERNEL: + add_event_entry(KERNEL_ENTER_SWITCH_CODE); + break; + case CPU_MODE_XEN: + add_event_entry(XEN_ENTER_SWITCH_CODE); + break; + default: + break; + } } - + +static void add_domain_switch(unsigned long domain_id) +{ + add_event_entry(ESCAPE_CODE); + add_event_entry(DOMAIN_SWITCH_CODE); + add_event_entry(domain_id); +} + static void add_user_ctx_switch(struct task_struct const * task, unsigned long cookie) { @@ -347,9 +373,9 @@ static int add_us_sample(struct mm_struc * for later lookup from userspace. */ static int -add_sample(struct mm_struct * mm, struct op_sample * s, int in_kernel) +add_sample(struct mm_struct * mm, struct op_sample * s, int cpu_mode) { - if (in_kernel) { + if (cpu_mode >= CPU_MODE_KERNEL) { add_sample_entry(s->eip, s->event); return 1; } else if (mm) { @@ -495,15 +521,21 @@ void sync_buffer(int cpu) struct mm_struct *mm = NULL; struct task_struct * new; unsigned long cookie = 0; - int in_kernel = 1; + int cpu_mode = 1; unsigned int i; sync_buffer_state state = sb_buffer_start; unsigned long available; + int domain_switch = 0; mutex_lock(&buffer_mutex); add_cpu_switch(cpu); + /* We need to assign the first samples in this CPU buffer to the + same domain that we were processing at the last sync_buffer */ + if (cpu_current_domain[cpu] != COORDINATOR_DOMAIN) { + add_domain_switch(cpu_current_domain[cpu]); + } /* Remember, only we can modify tail_pos */ available = get_slots(cpu_buf); @@ -511,16 +543,18 @@ void sync_buffer(int cpu) for (i = 0; i < available; ++i) { struct op_sample * s = &cpu_buf->buffer[cpu_buf->tail_pos]; - if (is_code(s->eip)) { - if (s->event <= CPU_IS_KERNEL) { - /* kernel/userspace switch */ - in_kernel = s->event; + if (is_code(s->eip) && !domain_switch) { + if (s->event <= CPU_MODE_XEN) { + /* xen/kernel/userspace switch */ + cpu_mode = s->event; if (state == sb_buffer_start) state = sb_sample_start; - add_kernel_ctx_switch(s->event); + add_cpu_mode_switch(s->event); } else if (s->event == CPU_TRACE_BEGIN) { state = sb_bt_start; add_trace_begin(); + } else if (s->event == CPU_DOMAIN_SWITCH) { + domain_switch = 1; } else { struct mm_struct * oldmm = mm; @@ -534,11 +568,21 @@ void sync_buffer(int cpu) add_user_ctx_switch(new, cookie); } } else { - if (state >= sb_bt_start && - !add_sample(mm, s, in_kernel)) { - if (state == sb_bt_start) { - state = sb_bt_ignore; - atomic_inc(&oprofile_stats.bt_lost_no_mapping); + if (domain_switch) { + cpu_current_domain[cpu] = s->eip; + add_domain_switch(s->eip); + domain_switch = 0; + } else { + if (cpu_current_domain[cpu] != + COORDINATOR_DOMAIN) { + add_sample_entry(s->eip, s->event); + } + else if (state >= sb_bt_start && + !add_sample(mm, s, cpu_mode)) { + if (state == sb_bt_start) { + state = sb_bt_ignore; + atomic_inc(&oprofile_stats.bt_lost_no_mapping); + } } } } @@ -547,6 +591,11 @@ void sync_buffer(int cpu) } release_mm(mm); + /* We reset domain to COORDINATOR at each CPU switch */ + if (cpu_current_domain[cpu] != COORDINATOR_DOMAIN) { + add_domain_switch(COORDINATOR_DOMAIN); + } + mark_done(cpu); mutex_unlock(&buffer_mutex); --- a/drivers/oprofile/cpu_buffer.c +++ b/drivers/oprofile/cpu_buffer.c @@ -6,6 +6,10 @@ * * @author John Levon * + * Modified by Aravind Menon for Xen + * These modifications are: + * Copyright (C) 2005 Hewlett-Packard Co. + * * Each CPU has a local buffer that stores PC value/event * pairs. We also log context switches when we notice them. * Eventually each CPU's buffer is processed into the global @@ -34,6 +38,8 @@ static void wq_sync_buffer(struct work_s #define DEFAULT_TIMER_EXPIRE (HZ / 10) static int work_enabled; +static int32_t current_domain = COORDINATOR_DOMAIN; + void free_cpu_buffers(void) { int i; @@ -72,7 +78,7 @@ int alloc_cpu_buffers(void) goto fail; b->last_task = NULL; - b->last_is_kernel = -1; + b->last_cpu_mode = -1; b->tracing = 0; b->buffer_size = buffer_size; b->tail_pos = 0; @@ -130,7 +136,7 @@ void cpu_buffer_reset(struct oprofile_cp * collected will populate the buffer with proper * values to initialize the buffer */ - cpu_buf->last_is_kernel = -1; + cpu_buf->last_cpu_mode = -1; cpu_buf->last_task = NULL; } @@ -180,13 +186,13 @@ add_code(struct oprofile_cpu_buffer * bu * because of the head/tail separation of the writer and reader * of the CPU buffer. * - * is_kernel is needed because on some architectures you cannot + * cpu_mode is needed because on some architectures you cannot * tell if you are in kernel or user space simply by looking at - * pc. We tag this in the buffer by generating kernel enter/exit - * events whenever is_kernel changes + * pc. We tag this in the buffer by generating kernel/user (and xen) + * enter events whenever cpu_mode changes */ static int log_sample(struct oprofile_cpu_buffer * cpu_buf, unsigned long pc, - int is_kernel, unsigned long event) + int cpu_mode, unsigned long event) { struct task_struct * task; @@ -202,18 +208,18 @@ static int log_sample(struct oprofile_cp return 0; } - is_kernel = !!is_kernel; - task = current; /* notice a switch from user->kernel or vice versa */ - if (cpu_buf->last_is_kernel != is_kernel) { - cpu_buf->last_is_kernel = is_kernel; - add_code(cpu_buf, is_kernel); + if (cpu_buf->last_cpu_mode != cpu_mode) { + cpu_buf->last_cpu_mode = cpu_mode; + add_code(cpu_buf, cpu_mode); } - + /* notice a task switch */ - if (cpu_buf->last_task != task) { + /* if not processing other domain samples */ + if ((cpu_buf->last_task != task) && + (current_domain == COORDINATOR_DOMAIN)) { cpu_buf->last_task = task; add_code(cpu_buf, (unsigned long)task); } @@ -297,6 +303,25 @@ void oprofile_add_trace(unsigned long pc add_sample(cpu_buf, pc, 0); } +int oprofile_add_domain_switch(int32_t domain_id) +{ + struct oprofile_cpu_buffer * cpu_buf = &cpu_buffer[smp_processor_id()]; + + /* should have space for switching into and out of domain + (2 slots each) plus one sample and one cpu mode switch */ + if (((nr_available_slots(cpu_buf) < 6) && + (domain_id != COORDINATOR_DOMAIN)) || + (nr_available_slots(cpu_buf) < 2)) + return 0; + + add_code(cpu_buf, CPU_DOMAIN_SWITCH); + add_sample(cpu_buf, domain_id, 0); + + current_domain = domain_id; + + return 1; +} + /* * This serves to avoid cpu buffer overflow, and makes sure * the task mortuary progresses --- a/drivers/oprofile/cpu_buffer.h +++ b/drivers/oprofile/cpu_buffer.h @@ -37,7 +37,7 @@ struct oprofile_cpu_buffer { volatile unsigned long tail_pos; unsigned long buffer_size; struct task_struct * last_task; - int last_is_kernel; + int last_cpu_mode; int tracing; struct op_sample * buffer; unsigned long sample_received; @@ -53,7 +53,10 @@ DECLARE_PER_CPU(struct oprofile_cpu_buff void cpu_buffer_reset(struct oprofile_cpu_buffer * cpu_buf); /* transient events for the CPU buffer -> event buffer */ -#define CPU_IS_KERNEL 1 -#define CPU_TRACE_BEGIN 2 +#define CPU_MODE_USER 0 +#define CPU_MODE_KERNEL 1 +#define CPU_MODE_XEN 2 +#define CPU_TRACE_BEGIN 3 +#define CPU_DOMAIN_SWITCH 4 #endif /* OPROFILE_CPU_BUFFER_H */ --- a/drivers/oprofile/event_buffer.h +++ b/drivers/oprofile/event_buffer.h @@ -30,6 +30,9 @@ void wake_up_buffer_waiter(void); #define INVALID_COOKIE ~0UL #define NO_COOKIE 0UL +/* Constant used to refer to coordinator domain (Xen) */ +#define COORDINATOR_DOMAIN -1 + extern const struct file_operations event_buffer_fops; /* mutex between sync_cpu_buffers() and the --- a/drivers/oprofile/oprof.c +++ b/drivers/oprofile/oprof.c @@ -5,6 +5,10 @@ * @remark Read the file COPYING * * @author John Levon + * + * Modified by Aravind Menon for Xen + * These modifications are: + * Copyright (C) 2005 Hewlett-Packard Co. */ #include @@ -33,6 +37,32 @@ static DEFINE_MUTEX(start_mutex); */ static int timer = 0; +int oprofile_set_active(int active_domains[], unsigned int adomains) +{ + int err; + + if (!oprofile_ops.set_active) + return -EINVAL; + + mutex_lock(&start_mutex); + err = oprofile_ops.set_active(active_domains, adomains); + mutex_unlock(&start_mutex); + return err; +} + +int oprofile_set_passive(int passive_domains[], unsigned int pdomains) +{ + int err; + + if (!oprofile_ops.set_passive) + return -EINVAL; + + mutex_lock(&start_mutex); + err = oprofile_ops.set_passive(passive_domains, pdomains); + mutex_unlock(&start_mutex); + return err; +} + int oprofile_setup(void) { int err; --- a/drivers/oprofile/oprof.h +++ b/drivers/oprofile/oprof.h @@ -35,5 +35,8 @@ void oprofile_create_files(struct super_ void oprofile_timer_init(struct oprofile_operations * ops); int oprofile_set_backtrace(unsigned long depth); + +int oprofile_set_active(int active_domains[], unsigned int adomains); +int oprofile_set_passive(int passive_domains[], unsigned int pdomains); #endif /* OPROF_H */ --- a/drivers/oprofile/oprofile_files.c +++ b/drivers/oprofile/oprofile_files.c @@ -5,15 +5,21 @@ * @remark Read the file COPYING * * @author John Levon + * + * Modified by Aravind Menon for Xen + * These modifications are: + * Copyright (C) 2005 Hewlett-Packard Co. */ #include #include +#include +#include #include "event_buffer.h" #include "oprofile_stats.h" #include "oprof.h" - + unsigned long fs_buffer_size = 131072; unsigned long fs_cpu_buffer_size = 8192; unsigned long fs_buffer_watershed = 32768; /* FIXME: tune */ @@ -117,11 +123,202 @@ static ssize_t dump_write(struct file * static const struct file_operations dump_fops = { .write = dump_write, }; - + +#define TMPBUFSIZE 512 + +static unsigned int adomains = 0; +static int active_domains[MAX_OPROF_DOMAINS + 1]; +static DEFINE_MUTEX(adom_mutex); + +static ssize_t adomain_write(struct file * file, char const __user * buf, + size_t count, loff_t * offset) +{ + char *tmpbuf; + char *startp, *endp; + int i; + unsigned long val; + ssize_t retval = count; + + if (*offset) + return -EINVAL; + if (count > TMPBUFSIZE - 1) + return -EINVAL; + + if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL))) + return -ENOMEM; + + if (copy_from_user(tmpbuf, buf, count)) { + kfree(tmpbuf); + return -EFAULT; + } + tmpbuf[count] = 0; + + mutex_lock(&adom_mutex); + + startp = tmpbuf; + /* Parse one more than MAX_OPROF_DOMAINS, for easy error checking */ + for (i = 0; i <= MAX_OPROF_DOMAINS; i++) { + val = simple_strtoul(startp, &endp, 0); + if (endp == startp) + break; + while (ispunct(*endp) || isspace(*endp)) + endp++; + active_domains[i] = val; + if (active_domains[i] != val) + /* Overflow, force error below */ + i = MAX_OPROF_DOMAINS + 1; + startp = endp; + } + /* Force error on trailing junk */ + adomains = *startp ? MAX_OPROF_DOMAINS + 1 : i; + + kfree(tmpbuf); + + if (adomains > MAX_OPROF_DOMAINS + || oprofile_set_active(active_domains, adomains)) { + adomains = 0; + retval = -EINVAL; + } + + mutex_unlock(&adom_mutex); + return retval; +} + +static ssize_t adomain_read(struct file * file, char __user * buf, + size_t count, loff_t * offset) +{ + char * tmpbuf; + size_t len; + int i; + ssize_t retval; + + if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL))) + return -ENOMEM; + + mutex_lock(&adom_mutex); + + len = 0; + for (i = 0; i < adomains; i++) + len += snprintf(tmpbuf + len, + len < TMPBUFSIZE ? TMPBUFSIZE - len : 0, + "%u ", active_domains[i]); + WARN_ON(len > TMPBUFSIZE); + if (len != 0 && len <= TMPBUFSIZE) + tmpbuf[len-1] = '\n'; + + mutex_unlock(&adom_mutex); + + retval = simple_read_from_buffer(buf, count, offset, tmpbuf, len); + + kfree(tmpbuf); + return retval; +} + + +static struct file_operations active_domain_ops = { + .read = adomain_read, + .write = adomain_write, +}; + +static unsigned int pdomains = 0; +static int passive_domains[MAX_OPROF_DOMAINS]; +static DEFINE_MUTEX(pdom_mutex); + +static ssize_t pdomain_write(struct file * file, char const __user * buf, + size_t count, loff_t * offset) +{ + char *tmpbuf; + char *startp, *endp; + int i; + unsigned long val; + ssize_t retval = count; + + if (*offset) + return -EINVAL; + if (count > TMPBUFSIZE - 1) + return -EINVAL; + + if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL))) + return -ENOMEM; + + if (copy_from_user(tmpbuf, buf, count)) { + kfree(tmpbuf); + return -EFAULT; + } + tmpbuf[count] = 0; + + mutex_lock(&pdom_mutex); + + startp = tmpbuf; + /* Parse one more than MAX_OPROF_DOMAINS, for easy error checking */ + for (i = 0; i <= MAX_OPROF_DOMAINS; i++) { + val = simple_strtoul(startp, &endp, 0); + if (endp == startp) + break; + while (ispunct(*endp) || isspace(*endp)) + endp++; + passive_domains[i] = val; + if (passive_domains[i] != val) + /* Overflow, force error below */ + i = MAX_OPROF_DOMAINS + 1; + startp = endp; + } + /* Force error on trailing junk */ + pdomains = *startp ? MAX_OPROF_DOMAINS + 1 : i; + + kfree(tmpbuf); + + if (pdomains > MAX_OPROF_DOMAINS + || oprofile_set_passive(passive_domains, pdomains)) { + pdomains = 0; + retval = -EINVAL; + } + + mutex_unlock(&pdom_mutex); + return retval; +} + +static ssize_t pdomain_read(struct file * file, char __user * buf, + size_t count, loff_t * offset) +{ + char * tmpbuf; + size_t len; + int i; + ssize_t retval; + + if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL))) + return -ENOMEM; + + mutex_lock(&pdom_mutex); + + len = 0; + for (i = 0; i < pdomains; i++) + len += snprintf(tmpbuf + len, + len < TMPBUFSIZE ? TMPBUFSIZE - len : 0, + "%u ", passive_domains[i]); + WARN_ON(len > TMPBUFSIZE); + if (len != 0 && len <= TMPBUFSIZE) + tmpbuf[len-1] = '\n'; + + mutex_unlock(&pdom_mutex); + + retval = simple_read_from_buffer(buf, count, offset, tmpbuf, len); + + kfree(tmpbuf); + return retval; +} + +static struct file_operations passive_domain_ops = { + .read = pdomain_read, + .write = pdomain_write, +}; + void oprofile_create_files(struct super_block * sb, struct dentry * root) { oprofilefs_create_file(sb, root, "enable", &enable_fops); oprofilefs_create_file_perm(sb, root, "dump", &dump_fops, 0666); + oprofilefs_create_file(sb, root, "active_domains", &active_domain_ops); + oprofilefs_create_file(sb, root, "passive_domains", &passive_domain_ops); oprofilefs_create_file(sb, root, "buffer", &event_buffer_fops); oprofilefs_create_ulong(sb, root, "buffer_size", &fs_buffer_size); oprofilefs_create_ulong(sb, root, "buffer_watershed", &fs_buffer_watershed); #--- a/fs/aio.c #+++ b/fs/aio.c #@@ -36,6 +36,11 @@ # #include # #include #+#ifdef CONFIG_EPOLL #+#include #+#include #+#endif #+ # #if DEBUG > 1 # #define dprintk printk # #else #@@ -1010,6 +1015,11 @@ put_rq: # if (waitqueue_active(&ctx->wait)) # wake_up(&ctx->wait); # #+#ifdef CONFIG_EPOLL #+ if (ctx->file && waitqueue_active(&ctx->poll_wait)) #+ wake_up(&ctx->poll_wait); #+#endif #+ # spin_unlock_irqrestore(&ctx->ctx_lock, flags); # return ret; # } #@@ -1017,6 +1027,8 @@ put_rq: # /* aio_read_evt # * Pull an event off of the ioctx's event ring. Returns the number of # * events fetched (0 or 1 ;-) #+ * If ent parameter is 0, just returns the number of events that would #+ * be fetched. # * FIXME: make this use cmpxchg. # * TODO: make the ringbuffer user mmap()able (requires FIXME). # */ #@@ -1039,13 +1051,18 @@ static int aio_read_evt(struct kioctx *i # # head = ring->head % info->nr; # if (head != ring->tail) { #- struct io_event *evp = aio_ring_event(info, head, KM_USER1); #- *ent = *evp; #- head = (head + 1) % info->nr; #- smp_mb(); /* finish reading the event before updatng the head */ #- ring->head = head; #- ret = 1; #- put_aio_ring_event(evp, KM_USER1); #+ if (ent) { /* event requested */ #+ struct io_event *evp = #+ aio_ring_event(info, head, KM_USER1); #+ *ent = *evp; #+ head = (head + 1) % info->nr; #+ /* finish reading the event before updatng the head */ #+ smp_mb(); #+ ring->head = head; #+ ret = 1; #+ put_aio_ring_event(evp, KM_USER1); #+ } else /* only need to know availability */ #+ ret = 1; # } # spin_unlock(&info->ring_lock); # #@@ -1235,6 +1252,13 @@ static void io_destroy(struct kioctx *io # # aio_cancel_all(ioctx); # wait_for_all_aios(ioctx); #+#ifdef CONFIG_EPOLL #+ /* forget the poll file, but it's up to the user to close it */ #+ if (ioctx->file) { #+ ioctx->file->private_data = 0; #+ ioctx->file = 0; #+ } #+#endif # # /* # * Wake up any waiters. The setting of ctx->dead must be seen #@@ -1245,6 +1269,67 @@ static void io_destroy(struct kioctx *io # put_ioctx(ioctx); /* once for the lookup */ # } # #+#ifdef CONFIG_EPOLL #+ #+static int aio_queue_fd_close(struct inode *inode, struct file *file) #+{ #+ struct kioctx *ioctx = file->private_data; #+ if (ioctx) { #+ file->private_data = 0; #+ spin_lock_irq(&ioctx->ctx_lock); #+ ioctx->file = 0; #+ spin_unlock_irq(&ioctx->ctx_lock); #+ } #+ return 0; #+} #+ #+static unsigned int aio_queue_fd_poll(struct file *file, poll_table *wait) #+{ unsigned int pollflags = 0; #+ struct kioctx *ioctx = file->private_data; #+ #+ if (ioctx) { #+ #+ spin_lock_irq(&ioctx->ctx_lock); #+ /* Insert inside our poll wait queue */ #+ poll_wait(file, &ioctx->poll_wait, wait); #+ #+ /* Check our condition */ #+ if (aio_read_evt(ioctx, 0)) #+ pollflags = POLLIN | POLLRDNORM; #+ spin_unlock_irq(&ioctx->ctx_lock); #+ } #+ #+ return pollflags; #+} #+ #+static const struct file_operations aioq_fops = { #+ .release = aio_queue_fd_close, #+ .poll = aio_queue_fd_poll #+}; #+ #+/* make_aio_fd: #+ * Create a file descriptor that can be used to poll the event queue. #+ * Based and piggybacked on the excellent epoll code. #+ */ #+ #+static int make_aio_fd(struct kioctx *ioctx) #+{ #+ int error, fd; #+ struct inode *inode; #+ struct file *file; #+ #+ error = ep_getfd(&fd, &inode, &file, NULL, &aioq_fops); #+ if (error) #+ return error; #+ #+ /* associate the file with the IO context */ #+ file->private_data = ioctx; #+ ioctx->file = file; #+ init_waitqueue_head(&ioctx->poll_wait); #+ return fd; #+} #+#endif #+ # /* sys_io_setup: # * Create an aio_context capable of receiving at least nr_events. # * ctxp must not point to an aio_context that already exists, and #@@ -1257,18 +1342,30 @@ static void io_destroy(struct kioctx *io # * resources are available. May fail with -EFAULT if an invalid # * pointer is passed for ctxp. Will fail with -ENOSYS if not # * implemented. #+ * #+ * To request a selectable fd, the user context has to be initialized #+ * to 1, instead of 0, and the return value is the fd. #+ * This keeps the system call compatible, since a non-zero value #+ * was not allowed so far. # */ # asmlinkage long sys_io_setup(unsigned nr_events, aio_context_t __user *ctxp) # { # struct kioctx *ioctx = NULL; # unsigned long ctx; # long ret; #+ int make_fd = 0; # # ret = get_user(ctx, ctxp); # if (unlikely(ret)) # goto out; # # ret = -EINVAL; #+#ifdef CONFIG_EPOLL #+ if (ctx == 1) { #+ make_fd = 1; #+ ctx = 0; #+ } #+#endif # if (unlikely(ctx || nr_events == 0)) { # pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n", # ctx, nr_events); #@@ -1279,8 +1376,12 @@ asmlinkage long sys_io_setup(unsigned nr # ret = PTR_ERR(ioctx); # if (!IS_ERR(ioctx)) { # ret = put_user(ioctx->user_id, ctxp); #- if (!ret) #- return 0; #+#ifdef CONFIG_EPOLL #+ if (make_fd && ret >= 0) #+ ret = make_aio_fd(ioctx); #+#endif #+ if (ret >= 0) #+ return ret; # # get_ioctx(ioctx); /* io_destroy() expects us to hold a ref */ # io_destroy(ioctx); --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -114,6 +114,13 @@ #include #endif +#ifdef CONFIG_XEN +#include +#include +#include +#include +#endif + static int do_ioctl32_pointer(unsigned int fd, unsigned int cmd, unsigned long arg, struct file *f) { @@ -2727,6 +2734,18 @@ IGNORE_IOCTL(FBIOGETCMAP32) IGNORE_IOCTL(FBIOSCURSOR32) IGNORE_IOCTL(FBIOGCURSOR32) #endif + +#ifdef CONFIG_XEN +HANDLE_IOCTL(IOCTL_PRIVCMD_MMAP_32, privcmd_ioctl_32) +HANDLE_IOCTL(IOCTL_PRIVCMD_MMAPBATCH_32, privcmd_ioctl_32) +COMPATIBLE_IOCTL(IOCTL_PRIVCMD_HYPERCALL) +COMPATIBLE_IOCTL(IOCTL_EVTCHN_BIND_VIRQ) +COMPATIBLE_IOCTL(IOCTL_EVTCHN_BIND_INTERDOMAIN) +COMPATIBLE_IOCTL(IOCTL_EVTCHN_BIND_UNBOUND_PORT) +COMPATIBLE_IOCTL(IOCTL_EVTCHN_UNBIND) +COMPATIBLE_IOCTL(IOCTL_EVTCHN_NOTIFY) +COMPATIBLE_IOCTL(IOCTL_EVTCHN_RESET) +#endif }; #define IOCTL_HASHSIZE 256 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -17,6 +17,12 @@ #define ACPI_PROCESSOR_MAX_THROTTLE 250 /* 25% */ #define ACPI_PROCESSOR_MAX_DUTY_WIDTH 4 +#ifdef CONFIG_XEN +#define NR_ACPI_CPUS (NR_CPUS < 256 ? 256 : NR_CPUS) +#else +#define NR_ACPI_CPUS NR_CPUS +#endif /* CONFIG_XEN */ + #define ACPI_PDC_REVISION_ID 0x1 #define ACPI_PSD_REV0_REVISION 0 /* Support for _PSD as in ACPI 3.0 */ @@ -42,6 +48,17 @@ struct acpi_processor_cx; +#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL +struct acpi_csd_package { + acpi_integer num_entries; + acpi_integer revision; + acpi_integer domain; + acpi_integer coord_type; + acpi_integer num_processors; + acpi_integer index; +} __attribute__ ((packed)); +#endif + struct acpi_power_register { u8 descriptor; u16 length; @@ -74,6 +91,12 @@ struct acpi_processor_cx { u32 power; u32 usage; u64 time; +#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL + /* Require raw information for external control logic */ + struct acpi_power_register reg; + u32 csd_count; + struct acpi_csd_package *domain_info; +#endif struct acpi_processor_cx_policy promotion; struct acpi_processor_cx_policy demotion; char desc[ACPI_CX_DESC_LEN]; @@ -304,6 +327,9 @@ static inline void acpi_processor_ppc_ex { return; } +#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL +int acpi_processor_ppc_has_changed(struct acpi_processor *pr); +#else static inline int acpi_processor_ppc_has_changed(struct acpi_processor *pr) { static unsigned int printout = 1; @@ -316,6 +342,7 @@ static inline int acpi_processor_ppc_has } return 0; } +#endif /* CONFIG_PROCESSOR_EXTERNAL_CONTROL */ #endif /* CONFIG_CPU_FREQ */ /* in processor_throttling.c */ @@ -352,4 +379,120 @@ static inline void acpi_thermal_cpufreq_ } #endif +/* + * Following are interfaces geared to external processor PM control + * logic like a VMM + */ +/* Events notified to external control logic */ +#define PROCESSOR_PM_INIT 1 +#define PROCESSOR_PM_CHANGE 2 +#define PROCESSOR_HOTPLUG 3 + +/* Objects for the PM events */ +#define PM_TYPE_IDLE 0 +#define PM_TYPE_PERF 1 +#define PM_TYPE_THR 2 +#define PM_TYPE_MAX 3 + +/* Processor hotplug events */ +#define HOTPLUG_TYPE_ADD 0 +#define HOTPLUG_TYPE_REMOVE 1 + +#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL +struct processor_extcntl_ops { + /* Transfer processor PM events to external control logic */ + int (*pm_ops[PM_TYPE_MAX])(struct acpi_processor *pr, int event); + /* Notify physical processor status to external control logic */ + int (*hotplug)(struct acpi_processor *pr, int type); +}; +extern const struct processor_extcntl_ops *processor_extcntl_ops; + +static inline int processor_cntl_external(void) +{ + return (processor_extcntl_ops != NULL); +} + +static inline int processor_pm_external(void) +{ + return processor_cntl_external() && + (processor_extcntl_ops->pm_ops[PM_TYPE_IDLE] != NULL); +} + +static inline int processor_pmperf_external(void) +{ + return processor_cntl_external() && + (processor_extcntl_ops->pm_ops[PM_TYPE_PERF] != NULL); +} + +static inline int processor_pmthr_external(void) +{ + return processor_cntl_external() && + (processor_extcntl_ops->pm_ops[PM_TYPE_THR] != NULL); +} + +extern int processor_notify_external(struct acpi_processor *pr, + int event, int type); +extern void processor_extcntl_init(void); +extern int processor_extcntl_prepare(struct acpi_processor *pr); +extern int acpi_processor_get_performance_info(struct acpi_processor *pr); +extern int acpi_processor_get_psd(struct acpi_processor *pr); +void arch_acpi_processor_init_extcntl(const struct processor_extcntl_ops **); +#else +static inline int processor_cntl_external(void) {return 0;} +static inline int processor_pm_external(void) {return 0;} +static inline int processor_pmperf_external(void) {return 0;} +static inline int processor_pmthr_external(void) {return 0;} +static inline int processor_notify_external(struct acpi_processor *pr, + int event, int type) +{ + return 0; +} +static inline void processor_extcntl_init(void) {} +static inline int processor_extcntl_prepare(struct acpi_processor *pr) +{ + return 0; +} +#endif /* CONFIG_PROCESSOR_EXTERNAL_CONTROL */ + +#ifdef CONFIG_XEN +static inline void xen_convert_pct_reg(struct xen_pct_register *xpct, + struct acpi_pct_register *apct) +{ + xpct->descriptor = apct->descriptor; + xpct->length = apct->length; + xpct->space_id = apct->space_id; + xpct->bit_width = apct->bit_width; + xpct->bit_offset = apct->bit_offset; + xpct->reserved = apct->reserved; + xpct->address = apct->address; +} + +static inline void xen_convert_pss_states(struct xen_processor_px *xpss, + struct acpi_processor_px *apss, int state_count) +{ + int i; + for(i=0; icore_frequency = apss->core_frequency; + xpss->power = apss->power; + xpss->transition_latency = apss->transition_latency; + xpss->bus_master_latency = apss->bus_master_latency; + xpss->control = apss->control; + xpss->status = apss->status; + xpss++; + apss++; + } +} + +static inline void xen_convert_psd_pack(struct xen_psd_package *xpsd, + struct acpi_psd_package *apsd) +{ + xpsd->num_entries = apsd->num_entries; + xpsd->revision = apsd->revision; + xpsd->domain = apsd->domain; + xpsd->coord_type = apsd->coord_type; + xpsd->num_processors = apsd->num_processors; +} + +#endif /* CONFIG_XEN */ + #endif --- a/include/asm-generic/pci.h +++ b/include/asm-generic/pci.h @@ -43,7 +43,9 @@ pcibios_select_root(struct pci_dev *pdev return root; } +#ifndef pcibios_scan_all_fns #define pcibios_scan_all_fns(a, b) 0 +#endif #ifndef HAVE_ARCH_PCI_GET_LEGACY_IDE_IRQ static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -99,6 +99,10 @@ static inline void ptep_set_wrprotect(st } #endif +#ifndef arch_change_pte_range +#define arch_change_pte_range(mm, pmd, addr, end, newprot) 0 +#endif + #ifndef __HAVE_ARCH_PTE_SAME #define pte_same(A,B) (pte_val(A) == pte_val(B)) #endif --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -199,6 +199,11 @@ struct kioctx { struct aio_ring_info ring_info; struct delayed_work wq; +#ifdef CONFIG_EPOLL + // poll integration + wait_queue_head_t poll_wait; + struct file *file; +#endif }; /* prototypes */ --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -62,6 +62,7 @@ static inline void *kmap_atomic(struct p #endif /* CONFIG_HIGHMEM */ +#ifndef __HAVE_ARCH_CLEAR_USER_HIGHPAGE /* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ static inline void clear_user_highpage(struct page *page, unsigned long vaddr) { @@ -69,6 +70,7 @@ static inline void clear_user_highpage(s clear_user_page(addr, vaddr, page); kunmap_atomic(addr, KM_USER0); } +#endif #ifndef __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE /** @@ -115,12 +117,14 @@ alloc_zeroed_user_highpage_movable(struc return __alloc_zeroed_user_highpage(__GFP_MOVABLE, vma, vaddr); } +#ifndef __HAVE_ARCH_CLEAR_HIGHPAGE static inline void clear_highpage(struct page *page) { void *kaddr = kmap_atomic(page, KM_USER0); clear_page(kaddr); kunmap_atomic(kaddr, KM_USER0); } +#endif static inline void zero_user_segments(struct page *page, unsigned start1, unsigned end1, @@ -174,6 +178,8 @@ static inline void copy_user_highpage(st #endif +#ifndef __HAVE_ARCH_COPY_HIGHPAGE + static inline void copy_highpage(struct page *to, struct page *from) { char *vfrom, *vto; @@ -185,4 +191,6 @@ static inline void copy_highpage(struct kunmap_atomic(vto, KM_USER1); } +#endif + #endif /* _LINUX_HIGHMEM_H */ --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -218,6 +218,12 @@ static inline int disable_irq_wake(unsig } #endif /* CONFIG_GENERIC_HARDIRQS */ +#ifdef CONFIG_HAVE_IRQ_IGNORE_UNHANDLED +int irq_ignore_unhandled(unsigned int irq); +#else +#define irq_ignore_unhandled(irq) 0 +#endif + #ifndef __ARCH_SET_SOFTIRQ_PENDING #define set_softirq_pending(x) (local_softirq_pending() = (x)) #define or_softirq_pending(x) (local_softirq_pending() |= (x)) --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -46,6 +46,13 @@ KEXEC_CORE_NOTE_NAME_BYTES + \ KEXEC_CORE_NOTE_DESC_BYTES ) +#ifndef KEXEC_ARCH_HAS_PAGE_MACROS +#define kexec_page_to_pfn(page) page_to_pfn(page) +#define kexec_pfn_to_page(pfn) pfn_to_page(pfn) +#define kexec_virt_to_phys(addr) virt_to_phys(addr) +#define kexec_phys_to_virt(addr) phys_to_virt(addr) +#endif + /* * This structure is used to hold the arguments that are used when loading * kernel binaries. @@ -108,6 +115,12 @@ struct kimage { extern void machine_kexec(struct kimage *image); extern int machine_kexec_prepare(struct kimage *image); extern void machine_kexec_cleanup(struct kimage *image); +#ifdef CONFIG_XEN +extern int xen_machine_kexec_load(struct kimage *image); +extern void xen_machine_kexec_unload(struct kimage *image); +extern void xen_machine_kexec_setup_resources(void); +extern void xen_machine_kexec_register_resources(struct resource *res); +#endif extern asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, struct kexec_segment __user *segments, --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -113,6 +113,9 @@ extern unsigned int kobjsize(const void #define VM_CAN_NONLINEAR 0x08000000 /* Has ->fault & does nonlinear pages */ #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ #define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ +#ifdef CONFIG_XEN +#define VM_FOREIGN 0x40000000 /* Has pages belonging to another VM */ +#endif #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS @@ -183,6 +186,11 @@ struct vm_operations_struct { */ int (*access)(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); + + /* Area-specific function for clearing the PTE at @ptep. Returns the + * original value of @ptep. */ + pte_t (*zap_pte)(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, int is_fullmm); #ifdef CONFIG_NUMA /* * set_policy() op must add a reference to any non-NULL @new mempolicy --- a/include/linux/oprofile.h +++ b/include/linux/oprofile.h @@ -16,6 +16,8 @@ #include #include #include + +#include /* Each escaped entry is prefixed by ESCAPE_CODE * then one of the following codes, then the @@ -28,7 +30,7 @@ #define CPU_SWITCH_CODE 2 #define COOKIE_SWITCH_CODE 3 #define KERNEL_ENTER_SWITCH_CODE 4 -#define KERNEL_EXIT_SWITCH_CODE 5 +#define USER_ENTER_SWITCH_CODE 5 #define MODULE_LOADED_CODE 6 #define CTX_TGID_CODE 7 #define TRACE_BEGIN_CODE 8 @@ -36,6 +38,7 @@ #define XEN_ENTER_SWITCH_CODE 10 #define SPU_PROFILING_CODE 11 #define SPU_CTX_SWITCH_CODE 12 +#define DOMAIN_SWITCH_CODE 13 struct super_block; struct dentry; @@ -47,6 +50,11 @@ struct oprofile_operations { /* create any necessary configuration files in the oprofile fs. * Optional. */ int (*create_files)(struct super_block * sb, struct dentry * root); + /* setup active domains with Xen */ + int (*set_active)(int *active_domains, unsigned int adomains); + /* setup passive domains with Xen */ + int (*set_passive)(int *passive_domains, unsigned int pdomains); + /* Do any necessary interrupt setup. Optional. */ int (*setup)(void); /* Do any necessary interrupt shutdown. Optional. */ @@ -106,6 +114,8 @@ void oprofile_add_pc(unsigned long pc, i /* add a backtrace entry, to be called from the ->backtrace callback */ void oprofile_add_trace(unsigned long eip); +/* add a domain switch entry */ +int oprofile_add_domain_switch(int32_t domain_id); /** * Create a file of the given name as a child of the given root, with --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -97,6 +97,9 @@ enum pageflags { #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR PG_uncached, /* Page has been mapped as uncached */ #endif +#ifdef CONFIG_XEN + PG_foreign, /* Page is owned by foreign allocator. */ +#endif __NR_PAGEFLAGS, /* Filesystems */ @@ -269,6 +272,19 @@ static inline void SetPageUptodate(struc CLEARPAGEFLAG(Uptodate, uptodate) +#define PageForeign(page) test_bit(PG_foreign, &(page)->flags) +#define SetPageForeign(_page, dtor) do { \ + set_bit(PG_foreign, &(_page)->flags); \ + BUG_ON((dtor) == (void (*)(struct page *))0); \ + (_page)->index = (long)(dtor); \ +} while (0) +#define ClearPageForeign(page) do { \ + clear_bit(PG_foreign, &(page)->flags); \ + (page)->index = 0; \ +} while (0) +#define PageForeignDestructor(_page) \ + ((void (*)(struct page *))(_page)->index)(_page) + extern void cancel_dirty_page(struct page *page, unsigned int account_size); int test_clear_page_writeback(struct page *page); @@ -339,8 +355,16 @@ PAGEFLAG(MemError, memerror) PAGEFLAG_FALSE(MemError) #endif +#if !defined(CONFIG_XEN) +# define PAGE_FLAGS_XEN 0 +#elif defined(CONFIG_X86) +# define PAGE_FLAGS_XEN ((1 << PG_pinned) | (1 << PG_foreign)) +#else +# define PAGE_FLAGS_XEN (1 << PG_foreign) +#endif + #define PAGE_FLAGS (1 << PG_lru | 1 << PG_private | 1 << PG_locked | \ - 1 << PG_buddy | 1 << PG_writeback | \ + 1 << PG_buddy | 1 << PG_writeback | PAGE_FLAGS_XEN | \ 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active) /* --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -211,6 +211,9 @@ struct pci_dev { * directly, use the values stored here. They might be different! */ unsigned int irq; +#ifdef CONFIG_XEN + unsigned int irq_old; +#endif struct resource resource[DEVICE_COUNT_RESOURCE]; /* I/O and memory regions + expansion ROMs */ /* These fields are used by common fixups */ @@ -767,6 +770,10 @@ static inline void msi_remove_pci_irq_ve static inline void pci_restore_msi_state(struct pci_dev *dev) { } +#ifdef CONFIG_XEN +#define register_msi_get_owner(func) 0 +#define unregister_msi_get_owner(func) 0 +#endif #else extern int pci_enable_msi(struct pci_dev *dev); extern void pci_msi_shutdown(struct pci_dev *dev); @@ -777,6 +784,10 @@ extern void pci_msix_shutdown(struct pci extern void pci_disable_msix(struct pci_dev *dev); extern void msi_remove_pci_irq_vectors(struct pci_dev *dev); extern void pci_restore_msi_state(struct pci_dev *dev); +#ifdef CONFIG_XEN +extern int register_msi_get_owner(int (*func)(struct pci_dev *dev)); +extern int unregister_msi_get_owner(int (*func)(struct pci_dev *dev)); +#endif #endif #ifdef CONFIG_HT_IRQ #--- a/include/linux/skbuff.h #+++ b/include/linux/skbuff.h #@@ -217,6 +217,8 @@ typedef unsigned char *sk_buff_data_t; # * @local_df: allow local fragmentation # * @cloned: Head may be cloned (check refcnt to be sure) # * @nohdr: Payload reference only, must not modify header #+ * @proto_data_valid: Protocol data validated since arriving at localhost #+ * @proto_csum_blank: Protocol csum must be added before leaving localhost # * @pkt_type: Packet class # * @fclone: skbuff clone status # * @ip_summed: Driver fed us an IP checksum #@@ -323,7 +325,11 @@ struct sk_buff { # #if defined(CONFIG_MAC80211) || defined(CONFIG_MAC80211_MODULE) # __u8 do_not_encrypt:1; # #endif #- /* 0/13/14 bit hole */ #+#ifdef CONFIG_XEN #+ __u8 proto_data_valid:1, #+ proto_csum_blank:1; #+#endif #+ /* 10-16 bit hole */ #ifdef CONFIG_NET_DMA # dma_cookie_t dma_cookie; --- a/include/linux/vermagic.h +++ b/include/linux/vermagic.h @@ -22,6 +22,11 @@ #else #define MODULE_VERMAGIC_MODVERSIONS "" #endif +#ifdef CONFIG_XEN +#define MODULE_VERMAGIC_XEN "Xen " +#else +#define MODULE_VERMAGIC_XEN +#endif #ifndef MODULE_ARCH_VERMAGIC #define MODULE_ARCH_VERMAGIC "" #endif @@ -30,5 +35,5 @@ UTS_RELEASE " " \ MODULE_VERMAGIC_SMP MODULE_VERMAGIC_PREEMPT \ MODULE_VERMAGIC_MODULE_UNLOAD MODULE_VERMAGIC_MODVERSIONS \ - MODULE_ARCH_VERMAGIC + MODULE_VERMAGIC_XEN MODULE_ARCH_VERMAGIC --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -182,7 +182,7 @@ void note_interrupt(unsigned int irq, st */ if (time_after(jiffies, desc->last_unhandled + HZ/10)) desc->irqs_unhandled = 1; - else + else if (!irq_ignore_unhandled(irq)) desc->irqs_unhandled++; desc->last_unhandled = jiffies; if (unlikely(action_ret != IRQ_NONE)) --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -359,13 +359,26 @@ static int kimage_is_destination_range(s return 0; } -static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) +static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order, unsigned long limit) { struct page *pages; pages = alloc_pages(gfp_mask, order); if (pages) { unsigned int count, i; +#ifdef CONFIG_XEN + int address_bits; + + if (limit == ~0UL) + address_bits = BITS_PER_LONG; + else + address_bits = long_log2(limit); + + if (xen_limit_pages_to_max_mfn(pages, order, address_bits) < 0) { + __free_pages(pages, order); + return NULL; + } +#endif pages->mapping = NULL; set_page_private(pages, order); count = 1 << order; @@ -384,6 +397,9 @@ static void kimage_free_pages(struct pag count = 1 << order; for (i = 0; i < count; i++) ClearPageReserved(page + i); +#ifdef CONFIG_XEN + xen_destroy_contiguous_region((unsigned long)page_address(page), order); +#endif __free_pages(page, order); } @@ -429,10 +445,10 @@ static struct page *kimage_alloc_normal_ do { unsigned long pfn, epfn, addr, eaddr; - pages = kimage_alloc_pages(GFP_KERNEL, order); + pages = kimage_alloc_pages(GFP_KERNEL, order, KEXEC_CONTROL_MEMORY_LIMIT); if (!pages) break; - pfn = page_to_pfn(pages); + pfn = kexec_page_to_pfn(pages); epfn = pfn + count; addr = pfn << PAGE_SHIFT; eaddr = epfn << PAGE_SHIFT; @@ -466,6 +482,7 @@ static struct page *kimage_alloc_normal_ return pages; } +#ifndef CONFIG_XEN static struct page *kimage_alloc_crash_control_pages(struct kimage *image, unsigned int order) { @@ -519,7 +536,7 @@ static struct page *kimage_alloc_crash_c } /* If I don't overlap any segments I have found my hole! */ if (i == image->nr_segments) { - pages = pfn_to_page(hole_start >> PAGE_SHIFT); + pages = kexec_pfn_to_page(hole_start >> PAGE_SHIFT); break; } } @@ -546,6 +563,13 @@ struct page *kimage_alloc_control_pages( return pages; } +#else /* !CONFIG_XEN */ +struct page *kimage_alloc_control_pages(struct kimage *image, + unsigned int order) +{ + return kimage_alloc_normal_control_pages(image, order); +} +#endif static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) { @@ -561,7 +585,7 @@ static int kimage_add_entry(struct kimag return -ENOMEM; ind_page = page_address(page); - *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; + *image->entry = kexec_virt_to_phys(ind_page) | IND_INDIRECTION; image->entry = ind_page; image->last_entry = ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); @@ -620,13 +644,13 @@ static void kimage_terminate(struct kima #define for_each_kimage_entry(image, ptr, entry) \ for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ ptr = (entry & IND_INDIRECTION)? \ - phys_to_virt((entry & PAGE_MASK)): ptr +1) + kexec_phys_to_virt((entry & PAGE_MASK)): ptr +1) static void kimage_free_entry(kimage_entry_t entry) { struct page *page; - page = pfn_to_page(entry >> PAGE_SHIFT); + page = kexec_pfn_to_page(entry >> PAGE_SHIFT); kimage_free_pages(page); } @@ -638,6 +662,10 @@ static void kimage_free(struct kimage *i if (!image) return; +#ifdef CONFIG_XEN + xen_machine_kexec_unload(image); +#endif + kimage_free_extra_pages(image); for_each_kimage_entry(image, ptr, entry) { if (entry & IND_INDIRECTION) { @@ -713,7 +741,7 @@ static struct page *kimage_alloc_page(st * have a match. */ list_for_each_entry(page, &image->dest_pages, lru) { - addr = page_to_pfn(page) << PAGE_SHIFT; + addr = kexec_page_to_pfn(page) << PAGE_SHIFT; if (addr == destination) { list_del(&page->lru); return page; @@ -724,16 +752,16 @@ static struct page *kimage_alloc_page(st kimage_entry_t *old; /* Allocate a page, if we run out of memory give up */ - page = kimage_alloc_pages(gfp_mask, 0); + page = kimage_alloc_pages(gfp_mask, 0, KEXEC_SOURCE_MEMORY_LIMIT); if (!page) return NULL; /* If the page cannot be used file it away */ - if (page_to_pfn(page) > + if (kexec_page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { list_add(&page->lru, &image->unuseable_pages); continue; } - addr = page_to_pfn(page) << PAGE_SHIFT; + addr = kexec_page_to_pfn(page) << PAGE_SHIFT; /* If it is the destination page we want use it */ if (addr == destination) @@ -756,7 +784,7 @@ static struct page *kimage_alloc_page(st struct page *old_page; old_addr = *old & PAGE_MASK; - old_page = pfn_to_page(old_addr >> PAGE_SHIFT); + old_page = kexec_pfn_to_page(old_addr >> PAGE_SHIFT); copy_highpage(page, old_page); *old = addr | (*old & ~PAGE_MASK); @@ -812,7 +840,7 @@ static int kimage_load_normal_segment(st result = -ENOMEM; goto out; } - result = kimage_add_page(image, page_to_pfn(page) + result = kimage_add_page(image, kexec_page_to_pfn(page) << PAGE_SHIFT); if (result < 0) goto out; @@ -844,6 +872,7 @@ out: return result; } +#ifndef CONFIG_XEN static int kimage_load_crash_segment(struct kimage *image, struct kexec_segment *segment) { @@ -866,7 +895,7 @@ static int kimage_load_crash_segment(str char *ptr; size_t uchunk, mchunk; - page = pfn_to_page(maddr >> PAGE_SHIFT); + page = kexec_pfn_to_page(maddr >> PAGE_SHIFT); if (!page) { result = -ENOMEM; goto out; @@ -915,6 +944,13 @@ static int kimage_load_segment(struct ki return result; } +#else /* CONFIG_XEN */ +static int kimage_load_segment(struct kimage *image, + struct kexec_segment *segment) +{ + return kimage_load_normal_segment(image, segment); +} +#endif /* * Exec Kernel system call: for obvious reasons only root may call it. @@ -1019,6 +1055,13 @@ asmlinkage long sys_kexec_load(unsigned } kimage_terminate(image); } +#ifdef CONFIG_XEN + if (image) { + result = xen_machine_kexec_load(image); + if (result) + goto out; + } +#endif /* Install the new kernel, and Uninstall the old */ image = xchg(dest_image, image); --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -751,7 +751,7 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif -#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) +#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) && !defined(CONFIG_ACPI_PV_SLEEP) { .procname = "acpi_video_flags", .data = &acpi_realmode_flags, --- a/mm/memory.c +++ b/mm/memory.c @@ -446,6 +446,12 @@ struct page *vm_normal_page(struct vm_ar { unsigned long pfn; +#if defined(CONFIG_XEN) && defined(CONFIG_X86) + /* XEN: Covers user-space grant mappings (even of local pages). */ + if (unlikely(vma->vm_flags & VM_FOREIGN)) + return NULL; +#endif + if (HAVE_PTE_SPECIAL) { if (likely(!pte_special(pte))) { VM_BUG_ON(!pfn_valid(pte_pfn(pte))); @@ -474,7 +480,14 @@ struct page *vm_normal_page(struct vm_ar } } +#ifndef CONFIG_XEN VM_BUG_ON(!pfn_valid(pfn)); +#else + if (unlikely(!pfn_valid(pfn))) { + VM_BUG_ON(!(vma->vm_flags & VM_RESERVED)); + return NULL; + } +#endif /* * NOTE! We still have PageReserved() pages in the page tables. @@ -745,8 +758,12 @@ static unsigned long zap_pte_range(struc page->index > details->last_index)) continue; } - ptent = ptep_get_and_clear_full(mm, addr, pte, - tlb->fullmm); + if (unlikely(vma->vm_ops && vma->vm_ops->zap_pte)) + ptent = vma->vm_ops->zap_pte(vma, addr, pte, + tlb->fullmm); + else + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); tlb_remove_tlb_entry(tlb, pte, addr); if (unlikely(!page)) continue; @@ -996,6 +1013,7 @@ unsigned long zap_page_range(struct vm_a tlb_finish_mmu(tlb, address, end); return end; } +EXPORT_SYMBOL(zap_page_range); /** * zap_vma_ptes - remove ptes mapping the vma @@ -1193,6 +1211,26 @@ int get_user_pages(struct task_struct *t continue; } +#ifdef CONFIG_XEN + if (vma && (vma->vm_flags & VM_FOREIGN)) { + struct page **map = vma->vm_private_data; + int offset = (start - vma->vm_start) >> PAGE_SHIFT; + if (map[offset] != NULL) { + if (pages) { + struct page *page = map[offset]; + + pages[i] = page; + get_page(page); + } + if (vmas) + vmas[i] = vma; + i++; + start += PAGE_SIZE; + len--; + continue; + } + } +#endif if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP)) || !(vm_flags & vma->vm_flags)) return i ? : -EFAULT; --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -92,6 +92,8 @@ static inline void change_pmd_range(stru next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; + if (arch_change_pte_range(mm, pmd, addr, next, newprot)) + continue; change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable); } while (pmd++, addr = next, addr != end); } --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -533,7 +533,14 @@ static void __free_pages_ok(struct page unsigned long flags; int i; int reserved = 0; +#ifdef CONFIG_XEN + if (PageForeign(page)) { + PageForeignDestructor(page); + return; + } +#endif + for (i = 0 ; i < (1 << order) ; ++i) reserved += free_pages_check(page + i); if (reserved) @@ -995,6 +1001,12 @@ static void free_hot_cold_page(struct pa struct per_cpu_pages *pcp; unsigned long flags; +#ifdef CONFIG_XEN + if (PageForeign(page)) { + PageForeignDestructor(page); + return; + } +#endif if (PageAnon(page)) page->mapping = NULL; if (free_pages_check(page)) --- a/net/core/dev.c +++ b/net/core/dev.c @@ -131,6 +131,12 @@ #include "net-sysfs.h" +#ifdef CONFIG_XEN +#include +#include +#include +#endif + /* * The list of packet types we will receive (as opposed to discard) * and the routines to invoke. @@ -1734,6 +1740,42 @@ static struct netdev_queue *dev_pick_tx( return netdev_get_tx_queue(dev, queue_index); } +#ifdef CONFIG_XEN +inline int skb_checksum_setup(struct sk_buff *skb) +{ + if (skb->proto_csum_blank) { + if (skb->protocol != htons(ETH_P_IP)) + goto out; + skb->h.raw = (unsigned char *)skb->nh.iph + 4*skb->nh.iph->ihl; + if (skb->h.raw >= skb->tail) + goto out; + switch (skb->nh.iph->protocol) { + case IPPROTO_TCP: + skb->csum = offsetof(struct tcphdr, check); + break; + case IPPROTO_UDP: + skb->csum = offsetof(struct udphdr, check); + break; + default: + if (net_ratelimit()) + printk(KERN_ERR "Attempting to checksum a non-" + "TCP/UDP packet, dropping a protocol" + " %d packet", skb->nh.iph->protocol); + goto out; + } + if ((skb->h.raw + skb->csum + 2) > skb->tail) + goto out; + skb->ip_summed = CHECKSUM_HW; + skb->proto_csum_blank = 0; + } + return 0; +out: + return -EPROTO; +} +#else +inline int skb_checksum_setup(struct sk_buff *skb) { return 0; } +#endif + /** * dev_queue_xmit - transmit a buffer * @skb: buffer to transmit @@ -1766,6 +1808,12 @@ int dev_queue_xmit(struct sk_buff *skb) struct Qdisc *q; int rc = -ENOMEM; + /* If a checksum-deferred packet is forwarded to a device that needs a + * checksum, correct the pointers and force checksumming. + */ + if (skb_checksum_setup(skb)) + goto out_kfree_skb; + /* GSO will handle the following emulations directly. */ if (netif_needs_gso(dev, skb)) goto gso; @@ -2271,6 +2319,19 @@ int netif_receive_skb(struct sk_buff *sk } #endif +#ifdef CONFIG_XEN + switch (skb->ip_summed) { + case CHECKSUM_UNNECESSARY: + skb->proto_data_valid = 1; + break; + case CHECKSUM_HW: + /* XXX Implement me. */ + default: + skb->proto_data_valid = 0; + break; + } +#endif + list_for_each_entry_rcu(ptype, &ptype_all, list) { if (ptype->dev == null_or_orig || ptype->dev == skb->dev || @@ -4925,6 +4986,7 @@ EXPORT_SYMBOL(unregister_netdevice_notif EXPORT_SYMBOL(net_enable_timestamp); EXPORT_SYMBOL(net_disable_timestamp); EXPORT_SYMBOL(dev_get_flags); +EXPORT_SYMBOL(skb_checksum_setup); #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) EXPORT_SYMBOL(br_handle_frame_hook); --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -559,6 +559,10 @@ static struct sk_buff *__skb_clone(struc n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len; n->cloned = 1; n->nohdr = 0; +#ifdef CONFIG_XEN + C(proto_data_valid); + C(proto_csum_blank); +#endif n->destructor = NULL; C(iif); C(tail); --- a/net/ipv4/netfilter/nf_nat_proto_tcp.c +++ b/net/ipv4/netfilter/nf_nat_proto_tcp.c @@ -75,6 +75,9 @@ tcp_manip_pkt(struct sk_buff *skb, if (hdrsize < sizeof(*hdr)) return true; + if (skb_checksum_setup(skb)) + return false; + inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1); inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0); return true; --- a/net/ipv4/netfilter/nf_nat_proto_udp.c +++ b/net/ipv4/netfilter/nf_nat_proto_udp.c @@ -60,6 +60,10 @@ udp_manip_pkt(struct sk_buff *skb, newport = tuple->dst.u.udp.port; portptr = &hdr->dest; } + + if (skb_checksum_setup(skb)) + return false; + if (hdr->check || skb->ip_summed == CHECKSUM_PARTIAL) { inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1); inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -81,7 +81,7 @@ static int xfrm4_output_finish(struct sk #endif skb->protocol = htons(ETH_P_IP); - return xfrm_output(skb); + return skb_checksum_setup(skb) ?: xfrm_output(skb); } int xfrm4_output(struct sk_buff *skb) --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -73,6 +73,20 @@ ifndef obj $(warning kbuild: Makefile.build is included improperly) endif +ifeq ($(CONFIG_XEN),y) +$(objtree)/scripts/Makefile.xen: $(srctree)/scripts/Makefile.xen.awk $(srctree)/scripts/Makefile.build + @echo ' Updating $@' + $(if $(shell echo a | $(AWK) '{ print gensub(/a/, "AA", "g"); }'),\ + ,$(error 'Your awk program does not define gensub. Use gawk or another awk with gensub')) + @$(AWK) -f $< $(filter-out $<,$^) >$@ + +xen-src-single-used-m := $(patsubst $(srctree)/%,%,$(wildcard $(addprefix $(srctree)/,$(single-used-m:.o=-xen.c)))) +xen-single-used-m := $(xen-src-single-used-m:-xen.c=.o) +single-used-m := $(filter-out $(xen-single-used-m),$(single-used-m)) + +-include $(objtree)/scripts/Makefile.xen +endif + # =========================================================================== ifneq ($(strip $(lib-y) $(lib-m) $(lib-n) $(lib-)),) --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -17,6 +17,12 @@ obj-m := $(filter-out $(obj-y),$(obj-m)) lib-y := $(filter-out $(obj-y), $(sort $(lib-y) $(lib-m))) +# Remove objects forcibly disabled + +obj-y := $(filter-out $(disabled-obj-y),$(obj-y)) +obj-m := $(filter-out $(disabled-obj-y),$(obj-m)) +lib-y := $(filter-out $(disabled-obj-y),$(lib-y)) + # Handle objects in subdirs # ---------------------------------------------------------------------------