protected: nVHE-based mode with support for guests whose
state is kept private from the host.
- Not valid if the kernel is running in EL2.
Defaults to VHE/nVHE based on hardware support. Setting
mode to "protected" will disable kexec and hibernation
F: arch/riscv/include/uapi/asm/kvm*
F: arch/riscv/kvm/
F: tools/testing/selftests/kvm/*/riscv/
-F: tools/testing/selftests/kvm/riscv/
KERNEL VIRTUAL MACHINE for s390 (KVM/s390)
M: Christian Borntraeger <borntraeger@linux.ibm.com>
struct arch_timer_cpu timer_cpu;
struct kvm_pmu pmu;
- /*
- * Anything that is not used directly from assembly code goes
- * here.
- */
-
/*
* Guest registers we preserve during guest debugging.
*
/*
* Code only run in VHE/NVHE hyp context can assume VHE is present or
* absent. Otherwise fall back to caps.
+ * This allows the compiler to discard VHE-specific code from the
+ * nVHE object, reducing the number of external symbol references
+ * needed to link.
*/
if (is_vhe_hyp_code())
return true;
#ifdef CONFIG_KVM
static bool is_kvm_protected_mode(const struct arm64_cpu_capabilities *entry, int __unused)
{
- if (kvm_get_mode() != KVM_MODE_PROTECTED)
- return false;
-
- if (is_kernel_in_hyp_mode()) {
- pr_warn("Protected KVM not available with VHE\n");
- return false;
- }
-
- return true;
+ return kvm_get_mode() == KVM_MODE_PROTECTED;
}
#endif /* CONFIG_KVM */
struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
struct arch_timer_context *timer;
+ if (WARN(!vcpu, "No vcpu context!\n"))
+ return false;
+
if (vintid == vcpu_vtimer(vcpu)->irq.irq)
timer = vcpu_vtimer(vcpu);
else if (vintid == vcpu_ptimer(vcpu)->irq.irq)
if (ret)
goto out_free_stage2_pgd;
- if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL)) {
+ ret = -ENOMEM;
goto out_free_stage2_pgd;
+ }
cpumask_copy(kvm->arch.supported_cpus, cpu_possible_mask);
kvm_vgic_early_init(kvm);
return -EINVAL;
if (strcmp(arg, "protected") == 0) {
- kvm_mode = KVM_MODE_PROTECTED;
+ if (!is_kernel_in_hyp_mode())
+ kvm_mode = KVM_MODE_PROTECTED;
+ else
+ pr_warn_once("Protected KVM not available with VHE\n");
+
return 0;
}
vcpu->arch.flags &= ~KVM_ARM64_FP_ENABLED;
vcpu->arch.flags |= KVM_ARM64_FP_HOST;
+ vcpu->arch.flags &= ~KVM_ARM64_HOST_SVE_ENABLED;
if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN)
vcpu->arch.flags |= KVM_ARM64_HOST_SVE_ENABLED;
* operations. Do this for ZA as well for now for simplicity.
*/
if (system_supports_sme()) {
+ vcpu->arch.flags &= ~KVM_ARM64_HOST_SME_ENABLED;
if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN)
vcpu->arch.flags |= KVM_ARM64_HOST_SME_ENABLED;
int host_stage2_idmap_locked(phys_addr_t addr, u64 size,
enum kvm_pgtable_prot prot)
{
- hyp_assert_lock_held(&host_kvm.lock);
-
return host_stage2_try(__host_stage2_idmap, addr, addr + size, prot);
}
int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id)
{
- hyp_assert_lock_held(&host_kvm.lock);
-
return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_kvm.pgt,
addr, size, &host_s2_pool, owner_id);
}
case SYS_ID_AA64MMFR2_EL1:
return get_pvm_id_aa64mmfr2(vcpu);
default:
- /*
- * Should never happen because all cases are covered in
- * pvm_sys_reg_descs[].
- */
- WARN_ON(1);
- break;
+ /* Unhandled ID register, RAZ */
+ return 0;
}
-
- return 0;
}
static u64 read_id_reg(const struct kvm_vcpu *vcpu,
/* Mark the specified system register as an AArch64 feature id register. */
#define AARCH64(REG) { SYS_DESC(REG), .access = pvm_access_id_aarch64 }
+/*
+ * sys_reg_desc initialiser for architecturally unallocated cpufeature ID
+ * register with encoding Op0=3, Op1=0, CRn=0, CRm=crm, Op2=op2
+ * (1 <= crm < 8, 0 <= Op2 < 8).
+ */
+#define ID_UNALLOCATED(crm, op2) { \
+ Op0(3), Op1(0), CRn(0), CRm(crm), Op2(op2), \
+ .access = pvm_access_id_aarch64, \
+}
+
/* Mark the specified system register as Read-As-Zero/Write-Ignored */
#define RAZ_WI(REG) { SYS_DESC(REG), .access = pvm_access_raz_wi }
AARCH32(SYS_MVFR0_EL1),
AARCH32(SYS_MVFR1_EL1),
AARCH32(SYS_MVFR2_EL1),
+ ID_UNALLOCATED(3,3),
AARCH32(SYS_ID_PFR2_EL1),
AARCH32(SYS_ID_DFR1_EL1),
AARCH32(SYS_ID_MMFR5_EL1),
+ ID_UNALLOCATED(3,7),
/* AArch64 ID registers */
/* CRm=4 */
AARCH64(SYS_ID_AA64PFR0_EL1),
AARCH64(SYS_ID_AA64PFR1_EL1),
+ ID_UNALLOCATED(4,2),
+ ID_UNALLOCATED(4,3),
AARCH64(SYS_ID_AA64ZFR0_EL1),
+ ID_UNALLOCATED(4,5),
+ ID_UNALLOCATED(4,6),
+ ID_UNALLOCATED(4,7),
AARCH64(SYS_ID_AA64DFR0_EL1),
AARCH64(SYS_ID_AA64DFR1_EL1),
+ ID_UNALLOCATED(5,2),
+ ID_UNALLOCATED(5,3),
AARCH64(SYS_ID_AA64AFR0_EL1),
AARCH64(SYS_ID_AA64AFR1_EL1),
+ ID_UNALLOCATED(5,6),
+ ID_UNALLOCATED(5,7),
AARCH64(SYS_ID_AA64ISAR0_EL1),
AARCH64(SYS_ID_AA64ISAR1_EL1),
+ AARCH64(SYS_ID_AA64ISAR2_EL1),
+ ID_UNALLOCATED(6,3),
+ ID_UNALLOCATED(6,4),
+ ID_UNALLOCATED(6,5),
+ ID_UNALLOCATED(6,6),
+ ID_UNALLOCATED(6,7),
AARCH64(SYS_ID_AA64MMFR0_EL1),
AARCH64(SYS_ID_AA64MMFR1_EL1),
AARCH64(SYS_ID_AA64MMFR2_EL1),
+ ID_UNALLOCATED(7,3),
+ ID_UNALLOCATED(7,4),
+ ID_UNALLOCATED(7,5),
+ ID_UNALLOCATED(7,6),
+ ID_UNALLOCATED(7,7),
/* Scalable Vector Registers are restricted. */
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_SET,
vgic_mmio_read_pending, vgic_mmio_write_spending,
- NULL, vgic_uaccess_write_spending, 1,
+ vgic_uaccess_read_pending, vgic_uaccess_write_spending, 1,
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_CLEAR,
vgic_mmio_read_pending, vgic_mmio_write_cpending,
- NULL, vgic_uaccess_write_cpending, 1,
+ vgic_uaccess_read_pending, vgic_uaccess_write_cpending, 1,
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_SET,
vgic_mmio_read_active, vgic_mmio_write_sactive,
return 0;
}
-static unsigned long vgic_v3_uaccess_read_pending(struct kvm_vcpu *vcpu,
- gpa_t addr, unsigned int len)
-{
- u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
- u32 value = 0;
- int i;
-
- /*
- * pending state of interrupt is latched in pending_latch variable.
- * Userspace will save and restore pending state and line_level
- * separately.
- * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.rst
- * for handling of ISPENDR and ICPENDR.
- */
- for (i = 0; i < len * 8; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
- bool state = irq->pending_latch;
-
- if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
- int err;
-
- err = irq_get_irqchip_state(irq->host_irq,
- IRQCHIP_STATE_PENDING,
- &state);
- WARN_ON(err);
- }
-
- if (state)
- value |= (1U << i);
-
- vgic_put_irq(vcpu->kvm, irq);
- }
-
- return value;
-}
-
static int vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len,
unsigned long val)
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISPENDR,
vgic_mmio_read_pending, vgic_mmio_write_spending,
- vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 1,
+ vgic_uaccess_read_pending, vgic_v3_uaccess_write_pending, 1,
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICPENDR,
vgic_mmio_read_pending, vgic_mmio_write_cpending,
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISPENDR0,
vgic_mmio_read_pending, vgic_mmio_write_spending,
- vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 4,
+ vgic_uaccess_read_pending, vgic_v3_uaccess_write_pending, 4,
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICPENDR0,
vgic_mmio_read_pending, vgic_mmio_write_cpending,
return 0;
}
-unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
- gpa_t addr, unsigned int len)
+static unsigned long __read_pending(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ bool is_user)
{
u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
u32 value = 0;
unsigned long flags;
bool val;
+ /*
+ * When used from userspace with a GICv3 model:
+ *
+ * Pending state of interrupt is latched in pending_latch
+ * variable. Userspace will save and restore pending state
+ * and line_level separately.
+ * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.rst
+ * for handling of ISPENDR and ICPENDR.
+ */
raw_spin_lock_irqsave(&irq->irq_lock, flags);
if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
int err;
IRQCHIP_STATE_PENDING,
&val);
WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);
- } else if (vgic_irq_is_mapped_level(irq)) {
+ } else if (!is_user && vgic_irq_is_mapped_level(irq)) {
val = vgic_get_phys_line_level(irq);
} else {
- val = irq_is_pending(irq);
+ switch (vcpu->kvm->arch.vgic.vgic_model) {
+ case KVM_DEV_TYPE_ARM_VGIC_V3:
+ if (is_user) {
+ val = irq->pending_latch;
+ break;
+ }
+ fallthrough;
+ default:
+ val = irq_is_pending(irq);
+ break;
+ }
}
value |= ((u32)val << i);
return value;
}
+unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ return __read_pending(vcpu, addr, len, false);
+}
+
+unsigned long vgic_uaccess_read_pending(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ return __read_pending(vcpu, addr, len, true);
+}
+
static bool is_vgic_v2_sgi(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
{
return (vgic_irq_is_sgi(irq->intid) &&
unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len);
+unsigned long vgic_uaccess_read_pending(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len);
+
void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len,
unsigned long val);
* the next context-switch, we broadcast TLB flush + I-cache
* invalidation over the inner shareable domain on rollover.
*/
- kvm_call_hyp(__kvm_flush_vm_context);
+ kvm_call_hyp(__kvm_flush_vm_context);
}
static bool check_update_reserved_vmid(u64 vmid, u64 newvmid)
* We ran out of VMIDs so we increment vmid_version and
* start assigning VMIDs from 1.
*
- * This also means existing VMIDs assignement to all Guest
+ * This also means existing VMIDs assignment to all Guest
* instances is invalid and we have force VMID re-assignement
* for all Guest instances. The Guest instances that were not
* running will automatically pick-up new VMIDs because will
};
enum kvm_apicv_inhibit {
+
+ /********************************************************************/
+ /* INHIBITs that are relevant to both Intel's APICv and AMD's AVIC. */
+ /********************************************************************/
+
+ /*
+ * APIC acceleration is disabled by a module parameter
+ * and/or not supported in hardware.
+ */
APICV_INHIBIT_REASON_DISABLE,
+
+ /*
+ * APIC acceleration is inhibited because AutoEOI feature is
+ * being used by a HyperV guest.
+ */
APICV_INHIBIT_REASON_HYPERV,
+
+ /*
+ * APIC acceleration is inhibited because the userspace didn't yet
+ * enable the kernel/split irqchip.
+ */
+ APICV_INHIBIT_REASON_ABSENT,
+
+ /* APIC acceleration is inhibited because KVM_GUESTDBG_BLOCKIRQ
+ * (out of band, debug measure of blocking all interrupts on this vCPU)
+ * was enabled, to avoid AVIC/APICv bypassing it.
+ */
+ APICV_INHIBIT_REASON_BLOCKIRQ,
+
+ /*
+ * For simplicity, the APIC acceleration is inhibited
+ * first time either APIC ID or APIC base are changed by the guest
+ * from their reset values.
+ */
+ APICV_INHIBIT_REASON_APIC_ID_MODIFIED,
+ APICV_INHIBIT_REASON_APIC_BASE_MODIFIED,
+
+ /******************************************************/
+ /* INHIBITs that are relevant only to the AMD's AVIC. */
+ /******************************************************/
+
+ /*
+ * AVIC is inhibited on a vCPU because it runs a nested guest.
+ *
+ * This is needed because unlike APICv, the peers of this vCPU
+ * cannot use the doorbell mechanism to signal interrupts via AVIC when
+ * a vCPU runs nested.
+ */
APICV_INHIBIT_REASON_NESTED,
+
+ /*
+ * On SVM, the wait for the IRQ window is implemented with pending vIRQ,
+ * which cannot be injected when the AVIC is enabled, thus AVIC
+ * is inhibited while KVM waits for IRQ window.
+ */
APICV_INHIBIT_REASON_IRQWIN,
+
+ /*
+ * PIT (i8254) 're-inject' mode, relies on EOI intercept,
+ * which AVIC doesn't support for edge triggered interrupts.
+ */
APICV_INHIBIT_REASON_PIT_REINJ,
+
+ /*
+ * AVIC is inhibited because the guest has x2apic in its CPUID.
+ */
APICV_INHIBIT_REASON_X2APIC,
- APICV_INHIBIT_REASON_BLOCKIRQ,
- APICV_INHIBIT_REASON_ABSENT,
+
+ /*
+ * AVIC is disabled because SEV doesn't support it.
+ */
APICV_INHIBIT_REASON_SEV,
};
}
}
+static void kvm_lapic_xapic_id_updated(struct kvm_lapic *apic)
+{
+ struct kvm *kvm = apic->vcpu->kvm;
+
+ if (KVM_BUG_ON(apic_x2apic_mode(apic), kvm))
+ return;
+
+ if (kvm_xapic_id(apic) == apic->vcpu->vcpu_id)
+ return;
+
+ kvm_set_apicv_inhibit(apic->vcpu->kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
+}
+
static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
{
int ret = 0;
switch (reg) {
case APIC_ID: /* Local APIC ID */
- if (!apic_x2apic_mode(apic))
+ if (!apic_x2apic_mode(apic)) {
kvm_apic_set_xapic_id(apic, val >> 24);
- else
+ kvm_lapic_xapic_id_updated(apic);
+ } else {
ret = 1;
+ }
break;
case APIC_TASKPRI:
MSR_IA32_APICBASE_BASE;
if ((value & MSR_IA32_APICBASE_ENABLE) &&
- apic->base_address != APIC_DEFAULT_PHYS_BASE)
- pr_warn_once("APIC base relocation is unsupported by KVM");
+ apic->base_address != APIC_DEFAULT_PHYS_BASE) {
+ kvm_set_apicv_inhibit(apic->vcpu->kvm,
+ APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
+ }
}
void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
__kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
}
+ } else {
+ kvm_lapic_xapic_id_updated(vcpu->arch.apic);
}
return 0;
root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
i << 30, PT32_ROOT_LEVEL, true);
mmu->pae_root[i] = root | PT_PRESENT_MASK |
- shadow_me_mask;
+ shadow_me_value;
}
mmu->root.hpa = __pa(mmu->pae_root);
} else {
static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
u32 icrl, u32 icrh, u32 index)
{
- u32 dest, apic_id;
- struct kvm_vcpu *vcpu;
+ u32 l1_physical_id, dest;
+ struct kvm_vcpu *target_vcpu;
int dest_mode = icrl & APIC_DEST_MASK;
int shorthand = icrl & APIC_SHORT_MASK;
struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
- u32 *avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page);
if (shorthand != APIC_DEST_NOSHORT)
return -EINVAL;
- /*
- * The AVIC incomplete IPI #vmexit info provides index into
- * the physical APIC ID table, which can be used to derive
- * guest physical APIC ID.
- */
+ if (apic_x2apic_mode(source))
+ dest = icrh;
+ else
+ dest = GET_APIC_DEST_FIELD(icrh);
+
if (dest_mode == APIC_DEST_PHYSICAL) {
- apic_id = index;
+ /* broadcast destination, use slow path */
+ if (apic_x2apic_mode(source) && dest == X2APIC_BROADCAST)
+ return -EINVAL;
+ if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST)
+ return -EINVAL;
+
+ l1_physical_id = dest;
+
+ if (WARN_ON_ONCE(l1_physical_id != index))
+ return -EINVAL;
+
} else {
- if (!apic_x2apic_mode(source)) {
- /* For xAPIC logical mode, the index is for logical APIC table. */
- apic_id = avic_logical_id_table[index] & 0x1ff;
+ u32 bitmap, cluster;
+ int logid_index;
+
+ if (apic_x2apic_mode(source)) {
+ /* 16 bit dest mask, 16 bit cluster id */
+ bitmap = dest & 0xFFFF0000;
+ cluster = (dest >> 16) << 4;
+ } else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) {
+ /* 8 bit dest mask*/
+ bitmap = dest;
+ cluster = 0;
} else {
- return -EINVAL;
+ /* 4 bit desk mask, 4 bit cluster id */
+ bitmap = dest & 0xF;
+ cluster = (dest >> 4) << 2;
}
- }
- /*
- * Assuming vcpu ID is the same as physical apic ID,
- * and use it to retrieve the target vCPU.
- */
- vcpu = kvm_get_vcpu_by_id(kvm, apic_id);
- if (!vcpu)
- return -EINVAL;
+ if (unlikely(!bitmap))
+ /* guest bug: nobody to send the logical interrupt to */
+ return 0;
- if (apic_x2apic_mode(vcpu->arch.apic))
- dest = icrh;
- else
- dest = GET_APIC_DEST_FIELD(icrh);
+ if (!is_power_of_2(bitmap))
+ /* multiple logical destinations, use slow path */
+ return -EINVAL;
- /*
- * Try matching the destination APIC ID with the vCPU.
- */
- if (kvm_apic_match_dest(vcpu, source, shorthand, dest, dest_mode)) {
- vcpu->arch.apic->irr_pending = true;
- svm_complete_interrupt_delivery(vcpu,
- icrl & APIC_MODE_MASK,
- icrl & APIC_INT_LEVELTRIG,
- icrl & APIC_VECTOR_MASK);
- return 0;
+ logid_index = cluster + __ffs(bitmap);
+
+ if (apic_x2apic_mode(source)) {
+ l1_physical_id = logid_index;
+ } else {
+ u32 *avic_logical_id_table =
+ page_address(kvm_svm->avic_logical_id_table_page);
+
+ u32 logid_entry = avic_logical_id_table[logid_index];
+
+ if (WARN_ON_ONCE(index != logid_index))
+ return -EINVAL;
+
+ /* guest bug: non existing/reserved logical destination */
+ if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
+ return 0;
+
+ l1_physical_id = logid_entry &
+ AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
+ }
}
- return -EINVAL;
+ target_vcpu = kvm_get_vcpu_by_id(kvm, l1_physical_id);
+ if (unlikely(!target_vcpu))
+ /* guest bug: non existing vCPU is a target of this IPI*/
+ return 0;
+
+ target_vcpu->arch.apic->irr_pending = true;
+ svm_complete_interrupt_delivery(target_vcpu,
+ icrl & APIC_MODE_MASK,
+ icrl & APIC_INT_LEVELTRIG,
+ icrl & APIC_VECTOR_MASK);
+ return 0;
}
static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
return ret;
}
-static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
-{
- u64 *old, *new;
- struct vcpu_svm *svm = to_svm(vcpu);
- u32 id = kvm_xapic_id(vcpu->arch.apic);
-
- if (vcpu->vcpu_id == id)
- return 0;
-
- old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id);
- new = avic_get_physical_id_entry(vcpu, id);
- if (!new || !old)
- return 1;
-
- /* We need to move physical_id_entry to new offset */
- *new = *old;
- *old = 0ULL;
- to_svm(vcpu)->avic_physical_id_cache = new;
-
- /*
- * Also update the guest physical APIC ID in the logical
- * APIC ID table entry if already setup the LDR.
- */
- if (svm->ldr_reg)
- avic_handle_ldr_update(vcpu);
-
- return 0;
-}
-
static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
AVIC_UNACCEL_ACCESS_OFFSET_MASK;
switch (offset) {
- case APIC_ID:
- if (avic_handle_apic_id_update(vcpu))
- return 0;
- break;
case APIC_LDR:
if (avic_handle_ldr_update(vcpu))
return 0;
void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
{
- if (avic_handle_apic_id_update(vcpu) != 0)
- return;
avic_handle_dfr_update(vcpu);
avic_handle_ldr_update(vcpu);
}
BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
BIT(APICV_INHIBIT_REASON_X2APIC) |
BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
- BIT(APICV_INHIBIT_REASON_SEV);
+ BIT(APICV_INHIBIT_REASON_SEV) |
+ BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
+ BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
return supported & BIT(reason);
}
return ret;
}
-void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
u64 entry;
int h_physical_id = kvm_cpu_get_apicid(cpu);
avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
}
-void __avic_vcpu_put(struct kvm_vcpu *vcpu)
+void avic_vcpu_put(struct kvm_vcpu *vcpu)
{
u64 entry;
struct vcpu_svm *svm = to_svm(vcpu);
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
}
-static void avic_vcpu_load(struct kvm_vcpu *vcpu)
-{
- int cpu = get_cpu();
-
- WARN_ON(cpu != vcpu->cpu);
-
- __avic_vcpu_load(vcpu, cpu);
-
- put_cpu();
-}
-
-static void avic_vcpu_put(struct kvm_vcpu *vcpu)
-{
- preempt_disable();
-
- __avic_vcpu_put(vcpu);
-
- preempt_enable();
-}
void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
vmcb_mark_dirty(vmcb, VMCB_AVIC);
if (activated)
- avic_vcpu_load(vcpu);
+ avic_vcpu_load(vcpu, vcpu->cpu);
else
avic_vcpu_put(vcpu);
if (!kvm_vcpu_apicv_active(vcpu))
return;
- avic_vcpu_load(vcpu);
+ avic_vcpu_load(vcpu, vcpu->cpu);
}
struct kvm_vcpu *vcpu = &svm->vcpu;
struct vmcb *vmcb01 = svm->vmcb01.ptr;
struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
+ u32 pause_count12;
+ u32 pause_thresh12;
/*
* Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2,
if (!nested_vmcb_needs_vls_intercept(svm))
vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+ pause_count12 = svm->pause_filter_enabled ? svm->nested.ctl.pause_filter_count : 0;
+ pause_thresh12 = svm->pause_threshold_enabled ? svm->nested.ctl.pause_filter_thresh : 0;
if (kvm_pause_in_guest(svm->vcpu.kvm)) {
- /* use guest values since host doesn't use them */
- vmcb02->control.pause_filter_count =
- svm->pause_filter_enabled ?
- svm->nested.ctl.pause_filter_count : 0;
+ /* use guest values since host doesn't intercept PAUSE */
+ vmcb02->control.pause_filter_count = pause_count12;
+ vmcb02->control.pause_filter_thresh = pause_thresh12;
- vmcb02->control.pause_filter_thresh =
- svm->pause_threshold_enabled ?
- svm->nested.ctl.pause_filter_thresh : 0;
-
- } else if (!vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) {
- /* use host values when guest doesn't use them */
+ } else {
+ /* start from host values otherwise */
vmcb02->control.pause_filter_count = vmcb01->control.pause_filter_count;
vmcb02->control.pause_filter_thresh = vmcb01->control.pause_filter_thresh;
- } else {
- /*
- * Intercept every PAUSE otherwise and
- * ignore both host and guest values
- */
- vmcb02->control.pause_filter_count = 0;
- vmcb02->control.pause_filter_thresh = 0;
+
+ /* ... but ensure filtering is disabled if so requested. */
+ if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) {
+ if (!pause_count12)
+ vmcb02->control.pause_filter_count = 0;
+ if (!pause_thresh12)
+ vmcb02->control.pause_filter_thresh = 0;
+ }
}
nested_svm_transition_tlb_flush(vcpu);
vmcb12->control.event_inj = svm->nested.ctl.event_inj;
vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err;
- if (!kvm_pause_in_guest(vcpu->kvm) && vmcb02->control.pause_filter_count)
+ if (!kvm_pause_in_guest(vcpu->kvm)) {
vmcb01->control.pause_filter_count = vmcb02->control.pause_filter_count;
+ vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS);
+
+ }
nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
struct vmcb_control_area *control = &svm->vmcb->control;
int old = control->pause_filter_count;
- if (kvm_pause_in_guest(vcpu->kvm) || !old)
+ if (kvm_pause_in_guest(vcpu->kvm))
return;
control->pause_filter_count = __grow_ple_window(old,
struct vmcb_control_area *control = &svm->vmcb->control;
int old = control->pause_filter_count;
- if (kvm_pause_in_guest(vcpu->kvm) || !old)
+ if (kvm_pause_in_guest(vcpu->kvm))
return;
control->pause_filter_count =
indirect_branch_prediction_barrier();
}
if (kvm_vcpu_apicv_active(vcpu))
- __avic_vcpu_load(vcpu, cpu);
+ avic_vcpu_load(vcpu, cpu);
}
static void svm_vcpu_put(struct kvm_vcpu *vcpu)
{
if (kvm_vcpu_apicv_active(vcpu))
- __avic_vcpu_put(vcpu);
+ avic_vcpu_put(vcpu);
svm_prepare_host_switch(vcpu);
int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu);
int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu);
int avic_init_vcpu(struct vcpu_svm *svm);
-void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
-void __avic_vcpu_put(struct kvm_vcpu *vcpu);
+void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void avic_vcpu_put(struct kvm_vcpu *vcpu);
void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu);
void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
BIT(APICV_INHIBIT_REASON_ABSENT) |
BIT(APICV_INHIBIT_REASON_HYPERV) |
- BIT(APICV_INHIBIT_REASON_BLOCKIRQ);
+ BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
+ BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
+ BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
return supported & BIT(reason);
}
return;
down_read(&vcpu->kvm->arch.apicv_update_lock);
+ preempt_disable();
activate = kvm_vcpu_apicv_activated(vcpu);
kvm_make_request(KVM_REQ_EVENT, vcpu);
out:
+ preempt_enable();
up_read(&vcpu->kvm->arch.apicv_update_lock);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
UNAME_M := riscv
endif
-LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/rbtree.c lib/sparsebit.c lib/test_util.c lib/guest_modes.c lib/perf_test_util.c
-LIBKVM_x86_64 = lib/x86_64/apic.c lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/svm.c lib/x86_64/ucall.c lib/x86_64/handlers.S
-LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c lib/aarch64/handlers.S lib/aarch64/spinlock.c lib/aarch64/gic.c lib/aarch64/gic_v3.c lib/aarch64/vgic.c
-LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_handler.c
-LIBKVM_riscv = lib/riscv/processor.c lib/riscv/ucall.c
+LIBKVM += lib/assert.c
+LIBKVM += lib/elf.c
+LIBKVM += lib/guest_modes.c
+LIBKVM += lib/io.c
+LIBKVM += lib/kvm_util.c
+LIBKVM += lib/perf_test_util.c
+LIBKVM += lib/rbtree.c
+LIBKVM += lib/sparsebit.c
+LIBKVM += lib/test_util.c
+
+LIBKVM_x86_64 += lib/x86_64/apic.c
+LIBKVM_x86_64 += lib/x86_64/handlers.S
+LIBKVM_x86_64 += lib/x86_64/perf_test_util.c
+LIBKVM_x86_64 += lib/x86_64/processor.c
+LIBKVM_x86_64 += lib/x86_64/svm.c
+LIBKVM_x86_64 += lib/x86_64/ucall.c
+LIBKVM_x86_64 += lib/x86_64/vmx.c
+
+LIBKVM_aarch64 += lib/aarch64/gic.c
+LIBKVM_aarch64 += lib/aarch64/gic_v3.c
+LIBKVM_aarch64 += lib/aarch64/handlers.S
+LIBKVM_aarch64 += lib/aarch64/processor.c
+LIBKVM_aarch64 += lib/aarch64/spinlock.c
+LIBKVM_aarch64 += lib/aarch64/ucall.c
+LIBKVM_aarch64 += lib/aarch64/vgic.c
+
+LIBKVM_s390x += lib/s390x/diag318_test_handler.c
+LIBKVM_s390x += lib/s390x/processor.c
+LIBKVM_s390x += lib/s390x/ucall.c
+
+LIBKVM_riscv += lib/riscv/processor.c
+LIBKVM_riscv += lib/riscv/ucall.c
TEST_GEN_PROGS_x86_64 = x86_64/cpuid_test
TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test
# $(TEST_GEN_PROGS) starts with $(OUTPUT)/
include ../lib.mk
-STATIC_LIBS := $(OUTPUT)/libkvm.a
LIBKVM_C := $(filter %.c,$(LIBKVM))
LIBKVM_S := $(filter %.S,$(LIBKVM))
LIBKVM_C_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_C))
LIBKVM_S_OBJ := $(patsubst %.S, $(OUTPUT)/%.o, $(LIBKVM_S))
-EXTRA_CLEAN += $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) $(STATIC_LIBS) cscope.*
+LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ)
+
+EXTRA_CLEAN += $(LIBKVM_OBJS) cscope.*
x := $(shell mkdir -p $(sort $(dir $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ))))
$(LIBKVM_C_OBJ): $(OUTPUT)/%.o: %.c
$(LIBKVM_S_OBJ): $(OUTPUT)/%.o: %.S
$(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@
-LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ)
-$(OUTPUT)/libkvm.a: $(LIBKVM_OBJS)
- $(AR) crs $@ $^
-
x := $(shell mkdir -p $(sort $(dir $(TEST_GEN_PROGS))))
-all: $(STATIC_LIBS)
-$(TEST_GEN_PROGS): $(STATIC_LIBS)
+$(TEST_GEN_PROGS): $(LIBKVM_OBJS)
cscope: include_paths = $(LINUX_TOOL_INCLUDE) $(LINUX_HDR_PATH) include lib ..
cscope:
static void help(char *name)
{
puts("");
- printf("usage: %s [-h] [-i iterations] [-p offset] [-g]"
- "[-m mode] [-b vcpu bytes] [-v vcpus] [-o] [-s mem type]"
+ printf("usage: %s [-h] [-i iterations] [-p offset] [-g] "
+ "[-m mode] [-n] [-b vcpu bytes] [-v vcpus] [-o] [-s mem type]"
"[-x memslots]\n", name);
puts("");
printf(" -i: specify iteration counts (default: %"PRIu64")\n",
printf(" -p: specify guest physical test memory offset\n"
" Warning: a low offset can conflict with the loaded test code.\n");
guest_modes_help();
+ printf(" -n: Run the vCPUs in nested mode (L2)\n");
printf(" -b: specify the size of the memory region which should be\n"
" dirtied by each vCPU. e.g. 10M or 3G.\n"
" (default: 1G)\n");
guest_modes_append_default();
- while ((opt = getopt(argc, argv, "ghi:p:m:b:f:v:os:x:")) != -1) {
+ while ((opt = getopt(argc, argv, "ghi:p:m:nb:f:v:os:x:")) != -1) {
switch (opt) {
case 'g':
dirty_log_manual_caps = 0;
case 'm':
guest_modes_cmdline(optarg);
break;
+ case 'n':
+ perf_test_args.nested = true;
+ break;
case 'b':
guest_percpu_mem_size = parse_size(optarg);
break;
struct perf_test_args {
struct kvm_vm *vm;
+ /* The starting address and size of the guest test region. */
uint64_t gpa;
+ uint64_t size;
uint64_t guest_page_size;
int wr_fract;
+ /* Run vCPUs in L2 instead of L1, if the architecture supports it. */
+ bool nested;
+
struct perf_test_vcpu_args vcpu_args[KVM_MAX_VCPUS];
};
void perf_test_start_vcpu_threads(int vcpus, void (*vcpu_fn)(struct perf_test_vcpu_args *));
void perf_test_join_vcpu_threads(int vcpus);
+void perf_test_guest_code(uint32_t vcpu_id);
+
+uint64_t perf_test_nested_pages(int nr_vcpus);
+void perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus);
#endif /* SELFTEST_KVM_PERF_TEST_UTIL_H */
struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid);
void vm_xsave_req_perm(int bit);
-enum x86_page_size {
- X86_PAGE_SIZE_4K = 0,
- X86_PAGE_SIZE_2M,
- X86_PAGE_SIZE_1G,
+enum pg_level {
+ PG_LEVEL_NONE,
+ PG_LEVEL_4K,
+ PG_LEVEL_2M,
+ PG_LEVEL_1G,
+ PG_LEVEL_512G,
+ PG_LEVEL_NUM
};
-void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
- enum x86_page_size page_size);
+
+#define PG_LEVEL_SHIFT(_level) ((_level - 1) * 9 + 12)
+#define PG_LEVEL_SIZE(_level) (1ull << PG_LEVEL_SHIFT(_level))
+
+#define PG_SIZE_4K PG_LEVEL_SIZE(PG_LEVEL_4K)
+#define PG_SIZE_2M PG_LEVEL_SIZE(PG_LEVEL_2M)
+#define PG_SIZE_1G PG_LEVEL_SIZE(PG_LEVEL_1G)
+
+void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level);
/*
* Basic CPU control in CR0
#define X86_CR0_CD (1UL<<30) /* Cache Disable */
#define X86_CR0_PG (1UL<<31) /* Paging */
-/* VMX_EPT_VPID_CAP bits */
-#define VMX_EPT_VPID_CAP_AD_BITS (1ULL << 21)
-
#define XSTATE_XTILE_CFG_BIT 17
#define XSTATE_XTILE_DATA_BIT 18
#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f
#define VMX_MISC_SAVE_EFER_LMA 0x00000020
+#define VMX_EPT_VPID_CAP_1G_PAGES 0x00020000
+#define VMX_EPT_VPID_CAP_AD_BITS 0x00200000
+
#define EXIT_REASON_FAILED_VMENTRY 0x80000000
#define EXIT_REASON_EXCEPTION_NMI 0
#define EXIT_REASON_EXTERNAL_INTERRUPT 1
bool nested_vmx_supported(void);
void nested_vmx_check_supported(void);
+bool ept_1g_pages_supported(void);
void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm,
uint64_t nested_paddr, uint64_t paddr);
uint64_t nested_paddr, uint64_t paddr, uint64_t size);
void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm,
uint32_t memslot);
+void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm,
+ uint64_t addr, uint64_t size);
void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm,
uint32_t eptp_memslot);
void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm);
* Continuously write to the first 8 bytes of each page in the
* specified region.
*/
-static void guest_code(uint32_t vcpu_id)
+void perf_test_guest_code(uint32_t vcpu_id)
{
struct perf_test_args *pta = &perf_test_args;
struct perf_test_vcpu_args *vcpu_args = &pta->vcpu_args[vcpu_id];
{
struct perf_test_args *pta = &perf_test_args;
struct kvm_vm *vm;
- uint64_t guest_num_pages;
+ uint64_t guest_num_pages, slot0_pages = DEFAULT_GUEST_PHY_PAGES;
uint64_t backing_src_pagesz = get_backing_src_pagesz(backing_src);
+ uint64_t region_end_gfn;
int i;
pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));
"Guest memory cannot be evenly divided into %d slots.",
slots);
+ /*
+ * If using nested, allocate extra pages for the nested page tables and
+ * in-memory data structures.
+ */
+ if (pta->nested)
+ slot0_pages += perf_test_nested_pages(vcpus);
+
/*
* Pass guest_num_pages to populate the page tables for test memory.
* The memory is also added to memslot 0, but that's a benign side
* effect as KVM allows aliasing HVAs in meslots.
*/
- vm = vm_create_with_vcpus(mode, vcpus, DEFAULT_GUEST_PHY_PAGES,
- guest_num_pages, 0, guest_code, NULL);
+ vm = vm_create_with_vcpus(mode, vcpus, slot0_pages, guest_num_pages, 0,
+ perf_test_guest_code, NULL);
pta->vm = vm;
+ /* Put the test region at the top guest physical memory. */
+ region_end_gfn = vm_get_max_gfn(vm) + 1;
+
+#ifdef __x86_64__
+ /*
+ * When running vCPUs in L2, restrict the test region to 48 bits to
+ * avoid needing 5-level page tables to identity map L2.
+ */
+ if (pta->nested)
+ region_end_gfn = min(region_end_gfn, (1UL << 48) / pta->guest_page_size);
+#endif
/*
* If there should be more memory in the guest test region than there
* can be pages in the guest, it will definitely cause problems.
*/
- TEST_ASSERT(guest_num_pages < vm_get_max_gfn(vm),
+ TEST_ASSERT(guest_num_pages < region_end_gfn,
"Requested more guest memory than address space allows.\n"
" guest pages: %" PRIx64 " max gfn: %" PRIx64
" vcpus: %d wss: %" PRIx64 "]\n",
- guest_num_pages, vm_get_max_gfn(vm), vcpus,
+ guest_num_pages, region_end_gfn - 1, vcpus,
vcpu_memory_bytes);
- pta->gpa = (vm_get_max_gfn(vm) - guest_num_pages) * pta->guest_page_size;
+ pta->gpa = (region_end_gfn - guest_num_pages) * pta->guest_page_size;
pta->gpa = align_down(pta->gpa, backing_src_pagesz);
#ifdef __s390x__
/* Align to 1M (segment size) */
pta->gpa = align_down(pta->gpa, 1 << 20);
#endif
- pr_info("guest physical test memory offset: 0x%lx\n", pta->gpa);
+ pta->size = guest_num_pages * pta->guest_page_size;
+ pr_info("guest physical test memory: [0x%lx, 0x%lx)\n",
+ pta->gpa, pta->gpa + pta->size);
/* Add extra memory slots for testing */
for (i = 0; i < slots; i++) {
perf_test_setup_vcpus(vm, vcpus, vcpu_memory_bytes, partition_vcpu_memory_access);
+ if (pta->nested) {
+ pr_info("Configuring vCPUs to run in L2 (nested).\n");
+ perf_test_setup_nested(vm, vcpus);
+ }
+
ucall_init(vm, NULL);
/* Export the shared variables to the guest. */
sync_global_to_guest(vm, perf_test_args);
}
+uint64_t __weak perf_test_nested_pages(int nr_vcpus)
+{
+ return 0;
+}
+
+void __weak perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus)
+{
+ pr_info("%s() not support on this architecture, skipping.\n", __func__);
+ exit(KSFT_SKIP);
+}
+
static void *vcpu_thread_main(void *data)
{
struct vcpu_thread *vcpu = data;
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * x86_64-specific extensions to perf_test_util.c.
+ *
+ * Copyright (C) 2022, Google, Inc.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "perf_test_util.h"
+#include "../kvm_util_internal.h"
+#include "processor.h"
+#include "vmx.h"
+
+void perf_test_l2_guest_code(uint64_t vcpu_id)
+{
+ perf_test_guest_code(vcpu_id);
+ vmcall();
+}
+
+extern char perf_test_l2_guest_entry[];
+__asm__(
+"perf_test_l2_guest_entry:"
+" mov (%rsp), %rdi;"
+" call perf_test_l2_guest_code;"
+" ud2;"
+);
+
+static void perf_test_l1_guest_code(struct vmx_pages *vmx, uint64_t vcpu_id)
+{
+#define L2_GUEST_STACK_SIZE 64
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+ unsigned long *rsp;
+
+ GUEST_ASSERT(vmx->vmcs_gpa);
+ GUEST_ASSERT(prepare_for_vmx_operation(vmx));
+ GUEST_ASSERT(load_vmcs(vmx));
+ GUEST_ASSERT(ept_1g_pages_supported());
+
+ rsp = &l2_guest_stack[L2_GUEST_STACK_SIZE - 1];
+ *rsp = vcpu_id;
+ prepare_vmcs(vmx, perf_test_l2_guest_entry, rsp);
+
+ GUEST_ASSERT(!vmlaunch());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+ GUEST_DONE();
+}
+
+uint64_t perf_test_nested_pages(int nr_vcpus)
+{
+ /*
+ * 513 page tables is enough to identity-map 256 TiB of L2 with 1G
+ * pages and 4-level paging, plus a few pages per-vCPU for data
+ * structures such as the VMCS.
+ */
+ return 513 + 10 * nr_vcpus;
+}
+
+void perf_test_setup_ept(struct vmx_pages *vmx, struct kvm_vm *vm)
+{
+ uint64_t start, end;
+
+ prepare_eptp(vmx, vm, 0);
+
+ /*
+ * Identity map the first 4G and the test region with 1G pages so that
+ * KVM can shadow the EPT12 with the maximum huge page size supported
+ * by the backing source.
+ */
+ nested_identity_map_1g(vmx, vm, 0, 0x100000000ULL);
+
+ start = align_down(perf_test_args.gpa, PG_SIZE_1G);
+ end = align_up(perf_test_args.gpa + perf_test_args.size, PG_SIZE_1G);
+ nested_identity_map_1g(vmx, vm, start, end - start);
+}
+
+void perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus)
+{
+ struct vmx_pages *vmx, *vmx0 = NULL;
+ struct kvm_regs regs;
+ vm_vaddr_t vmx_gva;
+ int vcpu_id;
+
+ nested_vmx_check_supported();
+
+ for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
+ vmx = vcpu_alloc_vmx(vm, &vmx_gva);
+
+ if (vcpu_id == 0) {
+ perf_test_setup_ept(vmx, vm);
+ vmx0 = vmx;
+ } else {
+ /* Share the same EPT table across all vCPUs. */
+ vmx->eptp = vmx0->eptp;
+ vmx->eptp_hva = vmx0->eptp_hva;
+ vmx->eptp_gpa = vmx0->eptp_gpa;
+ }
+
+ /*
+ * Override the vCPU to run perf_test_l1_guest_code() which will
+ * bounce it into L2 before calling perf_test_guest_code().
+ */
+ vcpu_regs_get(vm, vcpu_id, ®s);
+ regs.rip = (unsigned long) perf_test_l1_guest_code;
+ vcpu_regs_set(vm, vcpu_id, ®s);
+ vcpu_args_set(vm, vcpu_id, 2, vmx_gva, vcpu_id);
+ }
+}
int level)
{
uint64_t *page_table = addr_gpa2hva(vm, pt_pfn << vm->page_shift);
- int index = vaddr >> (vm->page_shift + level * 9) & 0x1ffu;
+ int index = (vaddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu;
return &page_table[index];
}
uint64_t pt_pfn,
uint64_t vaddr,
uint64_t paddr,
- int level,
- enum x86_page_size page_size)
+ int current_level,
+ int target_level)
{
- uint64_t *pte = virt_get_pte(vm, pt_pfn, vaddr, level);
+ uint64_t *pte = virt_get_pte(vm, pt_pfn, vaddr, current_level);
if (!(*pte & PTE_PRESENT_MASK)) {
*pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK;
- if (level == page_size)
+ if (current_level == target_level)
*pte |= PTE_LARGE_MASK | (paddr & PHYSICAL_PAGE_MASK);
else
*pte |= vm_alloc_page_table(vm) & PHYSICAL_PAGE_MASK;
* a hugepage at this level, and that there isn't a hugepage at
* this level.
*/
- TEST_ASSERT(level != page_size,
+ TEST_ASSERT(current_level != target_level,
"Cannot create hugepage at level: %u, vaddr: 0x%lx\n",
- page_size, vaddr);
+ current_level, vaddr);
TEST_ASSERT(!(*pte & PTE_LARGE_MASK),
"Cannot create page table at level: %u, vaddr: 0x%lx\n",
- level, vaddr);
+ current_level, vaddr);
}
return pte;
}
-void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
- enum x86_page_size page_size)
+void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level)
{
- const uint64_t pg_size = 1ull << ((page_size * 9) + 12);
+ const uint64_t pg_size = PG_LEVEL_SIZE(level);
uint64_t *pml4e, *pdpe, *pde;
uint64_t *pte;
* early if a hugepage was created.
*/
pml4e = virt_create_upper_pte(vm, vm->pgd >> vm->page_shift,
- vaddr, paddr, 3, page_size);
+ vaddr, paddr, PG_LEVEL_512G, level);
if (*pml4e & PTE_LARGE_MASK)
return;
- pdpe = virt_create_upper_pte(vm, PTE_GET_PFN(*pml4e), vaddr, paddr, 2, page_size);
+ pdpe = virt_create_upper_pte(vm, PTE_GET_PFN(*pml4e), vaddr, paddr, PG_LEVEL_1G, level);
if (*pdpe & PTE_LARGE_MASK)
return;
- pde = virt_create_upper_pte(vm, PTE_GET_PFN(*pdpe), vaddr, paddr, 1, page_size);
+ pde = virt_create_upper_pte(vm, PTE_GET_PFN(*pdpe), vaddr, paddr, PG_LEVEL_2M, level);
if (*pde & PTE_LARGE_MASK)
return;
/* Fill in page table entry. */
- pte = virt_get_pte(vm, PTE_GET_PFN(*pde), vaddr, 0);
+ pte = virt_get_pte(vm, PTE_GET_PFN(*pde), vaddr, PG_LEVEL_4K);
TEST_ASSERT(!(*pte & PTE_PRESENT_MASK),
"PTE already present for 4k page at vaddr: 0x%lx\n", vaddr);
*pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK);
void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
{
- __virt_pg_map(vm, vaddr, paddr, X86_PAGE_SIZE_4K);
+ __virt_pg_map(vm, vaddr, paddr, PG_LEVEL_4K);
}
static uint64_t *_vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid,
return true;
}
+static bool ept_vpid_cap_supported(uint64_t mask)
+{
+ return rdmsr(MSR_IA32_VMX_EPT_VPID_CAP) & mask;
+}
+
+bool ept_1g_pages_supported(void)
+{
+ return ept_vpid_cap_supported(VMX_EPT_VPID_CAP_1G_PAGES);
+}
+
/*
* Initialize the control fields to the most basic settings possible.
*/
struct eptPageTablePointer eptp = {
.memory_type = VMX_BASIC_MEM_TYPE_WB,
.page_walk_length = 3, /* + 1 */
- .ad_enabled = !!(rdmsr(MSR_IA32_VMX_EPT_VPID_CAP) & VMX_EPT_VPID_CAP_AD_BITS),
+ .ad_enabled = ept_vpid_cap_supported(VMX_EPT_VPID_CAP_AD_BITS),
.address = vmx->eptp_gpa >> PAGE_SHIFT_4K,
};
}
}
-void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm,
- uint64_t nested_paddr, uint64_t paddr)
+static void nested_create_pte(struct kvm_vm *vm,
+ struct eptPageTableEntry *pte,
+ uint64_t nested_paddr,
+ uint64_t paddr,
+ int current_level,
+ int target_level)
+{
+ if (!pte->readable) {
+ pte->writable = true;
+ pte->readable = true;
+ pte->executable = true;
+ pte->page_size = (current_level == target_level);
+ if (pte->page_size)
+ pte->address = paddr >> vm->page_shift;
+ else
+ pte->address = vm_alloc_page_table(vm) >> vm->page_shift;
+ } else {
+ /*
+ * Entry already present. Assert that the caller doesn't want
+ * a hugepage at this level, and that there isn't a hugepage at
+ * this level.
+ */
+ TEST_ASSERT(current_level != target_level,
+ "Cannot create hugepage at level: %u, nested_paddr: 0x%lx\n",
+ current_level, nested_paddr);
+ TEST_ASSERT(!pte->page_size,
+ "Cannot create page table at level: %u, nested_paddr: 0x%lx\n",
+ current_level, nested_paddr);
+ }
+}
+
+
+void __nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm,
+ uint64_t nested_paddr, uint64_t paddr, int target_level)
{
- uint16_t index[4];
- struct eptPageTableEntry *pml4e;
+ const uint64_t page_size = PG_LEVEL_SIZE(target_level);
+ struct eptPageTableEntry *pt = vmx->eptp_hva, *pte;
+ uint16_t index;
TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
"unknown or unsupported guest mode, mode: 0x%x", vm->mode);
- TEST_ASSERT((nested_paddr % vm->page_size) == 0,
+ TEST_ASSERT((nested_paddr >> 48) == 0,
+ "Nested physical address 0x%lx requires 5-level paging",
+ nested_paddr);
+ TEST_ASSERT((nested_paddr % page_size) == 0,
"Nested physical address not on page boundary,\n"
- " nested_paddr: 0x%lx vm->page_size: 0x%x",
- nested_paddr, vm->page_size);
+ " nested_paddr: 0x%lx page_size: 0x%lx",
+ nested_paddr, page_size);
TEST_ASSERT((nested_paddr >> vm->page_shift) <= vm->max_gfn,
"Physical address beyond beyond maximum supported,\n"
" nested_paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x",
paddr, vm->max_gfn, vm->page_size);
- TEST_ASSERT((paddr % vm->page_size) == 0,
+ TEST_ASSERT((paddr % page_size) == 0,
"Physical address not on page boundary,\n"
- " paddr: 0x%lx vm->page_size: 0x%x",
- paddr, vm->page_size);
+ " paddr: 0x%lx page_size: 0x%lx",
+ paddr, page_size);
TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn,
"Physical address beyond beyond maximum supported,\n"
" paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x",
paddr, vm->max_gfn, vm->page_size);
- index[0] = (nested_paddr >> 12) & 0x1ffu;
- index[1] = (nested_paddr >> 21) & 0x1ffu;
- index[2] = (nested_paddr >> 30) & 0x1ffu;
- index[3] = (nested_paddr >> 39) & 0x1ffu;
-
- /* Allocate page directory pointer table if not present. */
- pml4e = vmx->eptp_hva;
- if (!pml4e[index[3]].readable) {
- pml4e[index[3]].address = vm_alloc_page_table(vm) >> vm->page_shift;
- pml4e[index[3]].writable = true;
- pml4e[index[3]].readable = true;
- pml4e[index[3]].executable = true;
- }
+ for (int level = PG_LEVEL_512G; level >= PG_LEVEL_4K; level--) {
+ index = (nested_paddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu;
+ pte = &pt[index];
- /* Allocate page directory table if not present. */
- struct eptPageTableEntry *pdpe;
- pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size);
- if (!pdpe[index[2]].readable) {
- pdpe[index[2]].address = vm_alloc_page_table(vm) >> vm->page_shift;
- pdpe[index[2]].writable = true;
- pdpe[index[2]].readable = true;
- pdpe[index[2]].executable = true;
- }
+ nested_create_pte(vm, pte, nested_paddr, paddr, level, target_level);
- /* Allocate page table if not present. */
- struct eptPageTableEntry *pde;
- pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size);
- if (!pde[index[1]].readable) {
- pde[index[1]].address = vm_alloc_page_table(vm) >> vm->page_shift;
- pde[index[1]].writable = true;
- pde[index[1]].readable = true;
- pde[index[1]].executable = true;
- }
+ if (pte->page_size)
+ break;
- /* Fill in page table entry. */
- struct eptPageTableEntry *pte;
- pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size);
- pte[index[0]].address = paddr >> vm->page_shift;
- pte[index[0]].writable = true;
- pte[index[0]].readable = true;
- pte[index[0]].executable = true;
+ pt = addr_gpa2hva(vm, pte->address * vm->page_size);
+ }
/*
* For now mark these as accessed and dirty because the only
* testcase we have needs that. Can be reconsidered later.
*/
- pte[index[0]].accessed = true;
- pte[index[0]].dirty = true;
+ pte->accessed = true;
+ pte->dirty = true;
+
+}
+
+void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm,
+ uint64_t nested_paddr, uint64_t paddr)
+{
+ __nested_pg_map(vmx, vm, nested_paddr, paddr, PG_LEVEL_4K);
}
/*
* nested_paddr - Nested guest physical address to map
* paddr - VM Physical Address
* size - The size of the range to map
- * eptp_memslot - Memory region slot for new virtual translation tables
+ * level - The level at which to map the range
*
* Output Args: None
*
* Within the VM given by vm, creates a nested guest translation for the
* page range starting at nested_paddr to the page range starting at paddr.
*/
-void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm,
- uint64_t nested_paddr, uint64_t paddr, uint64_t size)
+void __nested_map(struct vmx_pages *vmx, struct kvm_vm *vm,
+ uint64_t nested_paddr, uint64_t paddr, uint64_t size,
+ int level)
{
- size_t page_size = vm->page_size;
+ size_t page_size = PG_LEVEL_SIZE(level);
size_t npages = size / page_size;
TEST_ASSERT(nested_paddr + size > nested_paddr, "Vaddr overflow");
TEST_ASSERT(paddr + size > paddr, "Paddr overflow");
while (npages--) {
- nested_pg_map(vmx, vm, nested_paddr, paddr);
+ __nested_pg_map(vmx, vm, nested_paddr, paddr, level);
nested_paddr += page_size;
paddr += page_size;
}
}
+void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm,
+ uint64_t nested_paddr, uint64_t paddr, uint64_t size)
+{
+ __nested_map(vmx, vm, nested_paddr, paddr, size, PG_LEVEL_4K);
+}
+
/* Prepare an identity extended page table that maps all the
* physical pages in VM.
*/
}
}
+/* Identity map a region with 1GiB Pages. */
+void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm,
+ uint64_t addr, uint64_t size)
+{
+ __nested_map(vmx, vm, addr, addr, size, PG_LEVEL_1G);
+}
+
void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm,
uint32_t eptp_memslot)
{
#ifdef __x86_64__
/* Identity map memory in the guest using 1gb pages. */
for (i = 0; i < slot_size; i += size_1gb)
- __virt_pg_map(vm, gpa + i, gpa + i, X86_PAGE_SIZE_1G);
+ __virt_pg_map(vm, gpa + i, gpa + i, PG_LEVEL_1G);
#else
for (i = 0; i < slot_size; i += vm_get_page_size(vm))
virt_pg_map(vm, gpa + i, gpa + i);
run = vcpu_state(vm, VCPU_ID);
/* Map 1gb page without a backing memlot. */
- __virt_pg_map(vm, MMIO_GPA, MMIO_GPA, X86_PAGE_SIZE_1G);
+ __virt_pg_map(vm, MMIO_GPA, MMIO_GPA, PG_LEVEL_1G);
r = _vcpu_run(vm, VCPU_ID);
vcpu->stat.generic.blocking = 1;
+ preempt_disable();
kvm_arch_vcpu_blocking(vcpu);
-
prepare_to_rcuwait(wait);
+ preempt_enable();
+
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
waited = true;
schedule();
}
- finish_rcuwait(wait);
+ preempt_disable();
+ finish_rcuwait(wait);
kvm_arch_vcpu_unblocking(vcpu);
+ preempt_enable();
vcpu->stat.generic.blocking = 0;