Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Author:     Linus Torvalds <torvalds@linux-foundation.org>
AuthorDate: Tue, 14 Jun 2022 14:57:18 +0000 (07:57 -0700)
Commit:     Linus Torvalds <torvalds@linux-foundation.org>
CommitDate: Tue, 14 Jun 2022 14:57:18 +0000 (07:57 -0700)
Pull kvm fixes from Paolo Bonzini:
 "While last week's pull request contained miscellaneous fixes for x86,
  this one covers other architectures, selftests changes, and a bigger
  series for APIC virtualization bugs that were discovered during 5.20
  development. The idea is to base 5.20 development for KVM on top of
  this tag.

  ARM64:

   - Properly reset the SVE/SME flags on vcpu load

   - Fix a vgic-v2 regression regarding accessing the pending state of a
     HW interrupt from userspace (and make the code common with vgic-v3)

   - Fix access to the idreg range for protected guests

   - Ignore 'kvm-arm.mode=protected' when using VHE

   - Return an error from kvm_arch_init_vm() on allocation failure

   - A bunch of small cleanups (comments, annotations, indentation)

  RISC-V:

   - Typo fix in arch/riscv/kvm/vmid.c

   - Remove broken reference pattern from MAINTAINERS entry

  x86-64:

   - Fix error in page tables with MKTME enabled

   - Dirty page tracking performance test extended to running a nested
     guest

   - Disable APICv/AVIC in cases where it cannot be implemented correctly"

[ This merge also fixes a misplaced end parenthesis bug introduced in
  commit 3743c2f02517 ("KVM: x86: inhibit APICv/AVIC on changes to APIC
  ID or APIC base") pointed out by Sean Christopherson ]

Link: https://lore.kernel.org/all/20220610191813.371682-1-seanjc@google.com/
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (34 commits)
  KVM: selftests: Restrict test region to 48-bit physical addresses when using nested
  KVM: selftests: Add option to run dirty_log_perf_test vCPUs in L2
  KVM: selftests: Clean up LIBKVM files in Makefile
  KVM: selftests: Link selftests directly with lib object files
  KVM: selftests: Drop unnecessary rule for STATIC_LIBS
  KVM: selftests: Add a helper to check EPT/VPID capabilities
  KVM: selftests: Move VMX_EPT_VPID_CAP_AD_BITS to vmx.h
  KVM: selftests: Refactor nested_map() to specify target level
  KVM: selftests: Drop stale function parameter comment for nested_map()
  KVM: selftests: Add option to create 2M and 1G EPT mappings
  KVM: selftests: Replace x86_page_size with PG_LEVEL_XX
  KVM: x86: SVM: fix nested PAUSE filtering when L0 intercepts PAUSE
  KVM: x86: SVM: drop preempt-safe wrappers for avic_vcpu_load/put
  KVM: x86: disable preemption around the call to kvm_arch_vcpu_{un|}blocking
  KVM: x86: disable preemption while updating apicv inhibition
  KVM: x86: SVM: fix avic_kick_target_vcpus_fast
  KVM: x86: SVM: remove avic's broken code that updated APIC ID
  KVM: x86: inhibit APICv/AVIC on changes to APIC ID or APIC base
  KVM: x86: document AVIC/APICv inhibit reasons
  KVM: x86/mmu: Set memory encryption "value", not "mask", in shadow PDPTRs
  ...

37 files changed:
Documentation/admin-guide/kernel-parameters.txt
MAINTAINERS
arch/arm64/include/asm/kvm_host.h
arch/arm64/include/asm/virt.h
arch/arm64/kernel/cpufeature.c
arch/arm64/kvm/arch_timer.c
arch/arm64/kvm/arm.c
arch/arm64/kvm/fpsimd.c
arch/arm64/kvm/hyp/nvhe/mem_protect.c
arch/arm64/kvm/hyp/nvhe/sys_regs.c
arch/arm64/kvm/vgic/vgic-mmio-v2.c
arch/arm64/kvm/vgic/vgic-mmio-v3.c
arch/arm64/kvm/vgic/vgic-mmio.c
arch/arm64/kvm/vgic/vgic-mmio.h
arch/arm64/kvm/vmid.c
arch/riscv/kvm/vmid.c
arch/x86/include/asm/kvm_host.h
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu/mmu.c
arch/x86/kvm/svm/avic.c
arch/x86/kvm/svm/nested.c
arch/x86/kvm/svm/svm.c
arch/x86/kvm/svm/svm.h
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/x86.c
tools/testing/selftests/kvm/Makefile
tools/testing/selftests/kvm/dirty_log_perf_test.c
tools/testing/selftests/kvm/include/perf_test_util.h
tools/testing/selftests/kvm/include/x86_64/processor.h
tools/testing/selftests/kvm/include/x86_64/vmx.h
tools/testing/selftests/kvm/lib/perf_test_util.c
tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c [new file with mode: 0644]
tools/testing/selftests/kvm/lib/x86_64/processor.c
tools/testing/selftests/kvm/lib/x86_64/vmx.c
tools/testing/selftests/kvm/max_guest_memory_test.c
tools/testing/selftests/kvm/x86_64/mmu_role_test.c
virt/kvm/kvm_main.c

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index bba2f27d73582e2f9df6bd095b26531848e68e1f..2522b11e593f2397840d59541f9befcebed505b1 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
 
                        protected: nVHE-based mode with support for guests whose
                                   state is kept private from the host.
-                                  Not valid if the kernel is running in EL2.
 
                        Defaults to VHE/nVHE based on hardware support. Setting
                        mode to "protected" will disable kexec and hibernation
diff --git a/MAINTAINERS b/MAINTAINERS
index 1fc9ead83d2aa3e60ccc4cfa8ee16df09ef579bf..43d3d07afccd4b92c15a8dc6ec08ec618caa4304 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10872,7 +10872,6 @@ F:      arch/riscv/include/asm/kvm*
 F:     arch/riscv/include/uapi/asm/kvm*
 F:     arch/riscv/kvm/
 F:     tools/testing/selftests/kvm/*/riscv/
-F:     tools/testing/selftests/kvm/riscv/
 
 KERNEL VIRTUAL MACHINE for s390 (KVM/s390)
 M:     Christian Borntraeger <borntraeger@linux.ibm.com>
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 47a1e25e25bbc836e2c99cec064fd9dfd4ce7a09..de32152cea0484fb03ea11d6417fb410db577b68 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -362,11 +362,6 @@ struct kvm_vcpu_arch {
        struct arch_timer_cpu timer_cpu;
        struct kvm_pmu pmu;
 
-       /*
-        * Anything that is not used directly from assembly code goes
-        * here.
-        */
-
        /*
         * Guest registers we preserve during guest debugging.
         *
diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h
index 3c8af033a9976cdfd567a5a6086f013aa30e11dd..0e80db4327b604c3613890289fc07e01ed6df6ae 100644
--- a/arch/arm64/include/asm/virt.h
+++ b/arch/arm64/include/asm/virt.h
@@ -113,6 +113,9 @@ static __always_inline bool has_vhe(void)
        /*
         * Code only run in VHE/NVHE hyp context can assume VHE is present or
         * absent. Otherwise fall back to caps.
+        * This allows the compiler to discard VHE-specific code from the
+        * nVHE object, reducing the number of external symbol references
+        * needed to link.
         */
        if (is_vhe_hyp_code())
                return true;
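
As background for the comment added above: the hyp code is built once per
world, and in each object is_vhe_hyp_code() collapses to a compile-time
constant, which is what lets the compiler drop the other world's code. A
minimal sketch of the pattern, using a hypothetical IN_NVHE_OBJECT define
in place of the kernel's real per-object macros (e.g. __KVM_NVHE_HYPERVISOR__):

    #include <stdbool.h>

    #ifdef IN_NVHE_OBJECT
    # define is_vhe_hyp_code() false  /* constant while building the nVHE object */
    #else
    # define is_vhe_hyp_code() true   /* constant while building the VHE object */
    #endif

    static inline bool has_vhe_sketch(void)
    {
            /*
             * Because the predicate is a compile-time constant here, one
             * arm of this branch is dead code and is discarded, so the
             * nVHE object retains no references to VHE-only symbols.
             */
            if (is_vhe_hyp_code())
                    return true;
            return false;
    }
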
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 42ea2bd856c6059e3813d6219d754986aeca7669..79fac13ab2efc5d3ed09762f3860d6a4d67efe56 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1974,15 +1974,7 @@ static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap)
 #ifdef CONFIG_KVM
 static bool is_kvm_protected_mode(const struct arm64_cpu_capabilities *entry, int __unused)
 {
-       if (kvm_get_mode() != KVM_MODE_PROTECTED)
-               return false;
-
-       if (is_kernel_in_hyp_mode()) {
-               pr_warn("Protected KVM not available with VHE\n");
-               return false;
-       }
-
-       return true;
+       return kvm_get_mode() == KVM_MODE_PROTECTED;
 }
 #endif /* CONFIG_KVM */
 
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index 4e39ace073af0fa70ff255c301f869df6463ad2f..3b8d062e30ea412b148ccb6bc214b0907b3d16f2 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -1230,6 +1230,9 @@ bool kvm_arch_timer_get_input_level(int vintid)
        struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
        struct arch_timer_context *timer;
 
+       if (WARN(!vcpu, "No vcpu context!\n"))
+               return false;
+
        if (vintid == vcpu_vtimer(vcpu)->irq.irq)
                timer = vcpu_vtimer(vcpu);
        else if (vintid == vcpu_ptimer(vcpu)->irq.irq)
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 400bb0fe2745b4549a616b717a396f4df18ed697..a0188144a122be9ea8e7673f13a98c2dbd9eae64 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -150,8 +150,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        if (ret)
                goto out_free_stage2_pgd;
 
-       if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL))
+       if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL)) {
+               ret = -ENOMEM;
                goto out_free_stage2_pgd;
+       }
        cpumask_copy(kvm->arch.supported_cpus, cpu_possible_mask);
 
        kvm_vgic_early_init(kvm);
@@ -2271,7 +2273,11 @@ static int __init early_kvm_mode_cfg(char *arg)
                return -EINVAL;
 
        if (strcmp(arg, "protected") == 0) {
-               kvm_mode = KVM_MODE_PROTECTED;
+               if (!is_kernel_in_hyp_mode())
+                       kvm_mode = KVM_MODE_PROTECTED;
+               else
+                       pr_warn_once("Protected KVM not available with VHE\n");
+
                return 0;
        }
 
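
The ENOMEM hunk above fixes the classic goto-cleanup pitfall: after the
earlier steps succeed, ret is still 0, so bailing out on the failed
allocation without setting ret made kvm_arch_init_vm() report success. A
self-contained illustration of the pattern (the stub names are invented
for the example):

    #include <errno.h>
    #include <stdbool.h>

    static int  do_step_one(void)    { return 0; }     /* succeeds */
    static bool allocate_thing(void) { return false; } /* simulate failure */
    static void cleanup(void)        { }

    static int init_vm_sketch(void)
    {
            int ret = do_step_one();
            if (ret)
                    goto out;

            if (!allocate_thing()) {
                    ret = -ENOMEM;  /* the fix: without this assignment we
                                     * reach "out" with ret == 0 and the
                                     * caller never sees the failure */
                    goto out;
            }
            return 0;
    out:
            cleanup();
            return ret;
    }
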
diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c
index 3d251a4d2cf7bfdb3ed82df94d30b8d6b48a458c..6012b08ecb14e7f6464374cf780c9bf271af1cca 100644
--- a/arch/arm64/kvm/fpsimd.c
+++ b/arch/arm64/kvm/fpsimd.c
@@ -80,6 +80,7 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu)
        vcpu->arch.flags &= ~KVM_ARM64_FP_ENABLED;
        vcpu->arch.flags |= KVM_ARM64_FP_HOST;
 
+       vcpu->arch.flags &= ~KVM_ARM64_HOST_SVE_ENABLED;
        if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN)
                vcpu->arch.flags |= KVM_ARM64_HOST_SVE_ENABLED;
 
@@ -93,6 +94,7 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu)
         * operations. Do this for ZA as well for now for simplicity.
         */
        if (system_supports_sme()) {
+               vcpu->arch.flags &= ~KVM_ARM64_HOST_SME_ENABLED;
                if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN)
                        vcpu->arch.flags |= KVM_ARM64_HOST_SME_ENABLED;
 
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 78edf077fa3b61e9d99bcba88b8e370b8dd09bd3..1e78acf9662eb17490f2973ccd5a8831c876ee9b 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -314,15 +314,11 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range)
 int host_stage2_idmap_locked(phys_addr_t addr, u64 size,
                             enum kvm_pgtable_prot prot)
 {
-       hyp_assert_lock_held(&host_kvm.lock);
-
        return host_stage2_try(__host_stage2_idmap, addr, addr + size, prot);
 }
 
 int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id)
 {
-       hyp_assert_lock_held(&host_kvm.lock);
-
        return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_kvm.pgt,
                               addr, size, &host_s2_pool, owner_id);
 }
diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c
index b6d86e423319f5cce0fda2fd191a1dc994652b88..35a4331ba5f31e6be1ebdcf8b87e65910346c023 100644
--- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c
+++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c
@@ -243,15 +243,9 @@ u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id)
        case SYS_ID_AA64MMFR2_EL1:
                return get_pvm_id_aa64mmfr2(vcpu);
        default:
-               /*
-                * Should never happen because all cases are covered in
-                * pvm_sys_reg_descs[].
-                */
-               WARN_ON(1);
-               break;
+               /* Unhandled ID register, RAZ */
+               return 0;
        }
-
-       return 0;
 }
 
 static u64 read_id_reg(const struct kvm_vcpu *vcpu,
@@ -332,6 +326,16 @@ static bool pvm_gic_read_sre(struct kvm_vcpu *vcpu,
 /* Mark the specified system register as an AArch64 feature id register. */
 #define AARCH64(REG) { SYS_DESC(REG), .access = pvm_access_id_aarch64 }
 
+/*
+ * sys_reg_desc initialiser for architecturally unallocated cpufeature ID
+ * register with encoding Op0=3, Op1=0, CRn=0, CRm=crm, Op2=op2
+ * (1 <= crm < 8, 0 <= Op2 < 8).
+ */
+#define ID_UNALLOCATED(crm, op2) {                     \
+       Op0(3), Op1(0), CRn(0), CRm(crm), Op2(op2),     \
+       .access = pvm_access_id_aarch64,                \
+}
+
 /* Mark the specified system register as Read-As-Zero/Write-Ignored */
 #define RAZ_WI(REG) { SYS_DESC(REG), .access = pvm_access_raz_wi }
 
@@ -375,24 +379,46 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = {
        AARCH32(SYS_MVFR0_EL1),
        AARCH32(SYS_MVFR1_EL1),
        AARCH32(SYS_MVFR2_EL1),
+       ID_UNALLOCATED(3,3),
        AARCH32(SYS_ID_PFR2_EL1),
        AARCH32(SYS_ID_DFR1_EL1),
        AARCH32(SYS_ID_MMFR5_EL1),
+       ID_UNALLOCATED(3,7),
 
        /* AArch64 ID registers */
        /* CRm=4 */
        AARCH64(SYS_ID_AA64PFR0_EL1),
        AARCH64(SYS_ID_AA64PFR1_EL1),
+       ID_UNALLOCATED(4,2),
+       ID_UNALLOCATED(4,3),
        AARCH64(SYS_ID_AA64ZFR0_EL1),
+       ID_UNALLOCATED(4,5),
+       ID_UNALLOCATED(4,6),
+       ID_UNALLOCATED(4,7),
        AARCH64(SYS_ID_AA64DFR0_EL1),
        AARCH64(SYS_ID_AA64DFR1_EL1),
+       ID_UNALLOCATED(5,2),
+       ID_UNALLOCATED(5,3),
        AARCH64(SYS_ID_AA64AFR0_EL1),
        AARCH64(SYS_ID_AA64AFR1_EL1),
+       ID_UNALLOCATED(5,6),
+       ID_UNALLOCATED(5,7),
        AARCH64(SYS_ID_AA64ISAR0_EL1),
        AARCH64(SYS_ID_AA64ISAR1_EL1),
+       AARCH64(SYS_ID_AA64ISAR2_EL1),
+       ID_UNALLOCATED(6,3),
+       ID_UNALLOCATED(6,4),
+       ID_UNALLOCATED(6,5),
+       ID_UNALLOCATED(6,6),
+       ID_UNALLOCATED(6,7),
        AARCH64(SYS_ID_AA64MMFR0_EL1),
        AARCH64(SYS_ID_AA64MMFR1_EL1),
        AARCH64(SYS_ID_AA64MMFR2_EL1),
+       ID_UNALLOCATED(7,3),
+       ID_UNALLOCATED(7,4),
+       ID_UNALLOCATED(7,5),
+       ID_UNALLOCATED(7,6),
+       ID_UNALLOCATED(7,7),
 
        /* Scalable Vector Registers are restricted. */
 
diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v2.c b/arch/arm64/kvm/vgic/vgic-mmio-v2.c
index 77a67e9d3d14b947fa8e8369da4a184024bf03df..e070cda86e12ffeb12a94284cc34e93d55cf4353 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v2.c
@@ -429,11 +429,11 @@ static const struct vgic_register_region vgic_v2_dist_registers[] = {
                VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_SET,
                vgic_mmio_read_pending, vgic_mmio_write_spending,
-               NULL, vgic_uaccess_write_spending, 1,
+               vgic_uaccess_read_pending, vgic_uaccess_write_spending, 1,
                VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_CLEAR,
                vgic_mmio_read_pending, vgic_mmio_write_cpending,
-               NULL, vgic_uaccess_write_cpending, 1,
+               vgic_uaccess_read_pending, vgic_uaccess_write_cpending, 1,
                VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_SET,
                vgic_mmio_read_active, vgic_mmio_write_sactive,
diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
index f7aa7bcd6fb8cdaed3272b577fa5fa15de290203..f15e29cc63ce1bf6b1dbac3502c1ea370bae1e3d 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
@@ -353,42 +353,6 @@ static unsigned long vgic_mmio_read_v3_idregs(struct kvm_vcpu *vcpu,
        return 0;
 }
 
-static unsigned long vgic_v3_uaccess_read_pending(struct kvm_vcpu *vcpu,
-                                                 gpa_t addr, unsigned int len)
-{
-       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-       u32 value = 0;
-       int i;
-
-       /*
-        * pending state of interrupt is latched in pending_latch variable.
-        * Userspace will save and restore pending state and line_level
-        * separately.
-        * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.rst
-        * for handling of ISPENDR and ICPENDR.
-        */
-       for (i = 0; i < len * 8; i++) {
-               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-               bool state = irq->pending_latch;
-
-               if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
-                       int err;
-
-                       err = irq_get_irqchip_state(irq->host_irq,
-                                                   IRQCHIP_STATE_PENDING,
-                                                   &state);
-                       WARN_ON(err);
-               }
-
-               if (state)
-                       value |= (1U << i);
-
-               vgic_put_irq(vcpu->kvm, irq);
-       }
-
-       return value;
-}
-
 static int vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu,
                                         gpa_t addr, unsigned int len,
                                         unsigned long val)
@@ -666,7 +630,7 @@ static const struct vgic_register_region vgic_v3_dist_registers[] = {
                VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISPENDR,
                vgic_mmio_read_pending, vgic_mmio_write_spending,
-               vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 1,
+               vgic_uaccess_read_pending, vgic_v3_uaccess_write_pending, 1,
                VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICPENDR,
                vgic_mmio_read_pending, vgic_mmio_write_cpending,
@@ -750,7 +714,7 @@ static const struct vgic_register_region vgic_v3_rd_registers[] = {
                VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISPENDR0,
                vgic_mmio_read_pending, vgic_mmio_write_spending,
-               vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 4,
+               vgic_uaccess_read_pending, vgic_v3_uaccess_write_pending, 4,
                VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICPENDR0,
                vgic_mmio_read_pending, vgic_mmio_write_cpending,
diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c
index 49837d3a3ef5625ba0c5e4d2fed57e362b55ffc0..997d0fce2088345891b22d45e7728412abd90f7d 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio.c
@@ -226,8 +226,9 @@ int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu,
        return 0;
 }
 
-unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
-                                    gpa_t addr, unsigned int len)
+static unsigned long __read_pending(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len,
+                                   bool is_user)
 {
        u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
        u32 value = 0;
@@ -239,6 +240,15 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
                unsigned long flags;
                bool val;
 
+               /*
+                * When used from userspace with a GICv3 model:
+                *
+                * Pending state of interrupt is latched in pending_latch
+                * variable.  Userspace will save and restore pending state
+                * and line_level separately.
+                * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.rst
+                * for handling of ISPENDR and ICPENDR.
+                */
                raw_spin_lock_irqsave(&irq->irq_lock, flags);
                if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
                        int err;
@@ -248,10 +258,20 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
                                                    IRQCHIP_STATE_PENDING,
                                                    &val);
                        WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);
-               } else if (vgic_irq_is_mapped_level(irq)) {
+               } else if (!is_user && vgic_irq_is_mapped_level(irq)) {
                        val = vgic_get_phys_line_level(irq);
                } else {
-                       val = irq_is_pending(irq);
+                       switch (vcpu->kvm->arch.vgic.vgic_model) {
+                       case KVM_DEV_TYPE_ARM_VGIC_V3:
+                               if (is_user) {
+                                       val = irq->pending_latch;
+                                       break;
+                               }
+                               fallthrough;
+                       default:
+                               val = irq_is_pending(irq);
+                               break;
+                       }
                }
 
                value |= ((u32)val << i);
@@ -263,6 +283,18 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
        return value;
 }
 
+unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
+                                    gpa_t addr, unsigned int len)
+{
+       return __read_pending(vcpu, addr, len, false);
+}
+
+unsigned long vgic_uaccess_read_pending(struct kvm_vcpu *vcpu,
+                                       gpa_t addr, unsigned int len)
+{
+       return __read_pending(vcpu, addr, len, true);
+}
+
 static bool is_vgic_v2_sgi(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
 {
        return (vgic_irq_is_sgi(irq->intid) &&
diff --git a/arch/arm64/kvm/vgic/vgic-mmio.h b/arch/arm64/kvm/vgic/vgic-mmio.h
index 3fa696f198a37ab9aad81832db4b76d61bcf1ece..6082d4b66d3983495967dc02f63db8d72106b77c 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio.h
+++ b/arch/arm64/kvm/vgic/vgic-mmio.h
@@ -149,6 +149,9 @@ int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu,
 unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
                                     gpa_t addr, unsigned int len);
 
+unsigned long vgic_uaccess_read_pending(struct kvm_vcpu *vcpu,
+                                       gpa_t addr, unsigned int len);
+
 void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
                              gpa_t addr, unsigned int len,
                              unsigned long val);
diff --git a/arch/arm64/kvm/vmid.c b/arch/arm64/kvm/vmid.c
index 8d5f0506fd87f392de4792431c1fdf7c0b2c2745..d78ae63d7c15f802c3d4796d56daa5e6bde319c2 100644
--- a/arch/arm64/kvm/vmid.c
+++ b/arch/arm64/kvm/vmid.c
@@ -66,7 +66,7 @@ static void flush_context(void)
         * the next context-switch, we broadcast TLB flush + I-cache
         * invalidation over the inner shareable domain on rollover.
         */
-        kvm_call_hyp(__kvm_flush_vm_context);
+       kvm_call_hyp(__kvm_flush_vm_context);
 }
 
 static bool check_update_reserved_vmid(u64 vmid, u64 newvmid)
diff --git a/arch/riscv/kvm/vmid.c b/arch/riscv/kvm/vmid.c
index 9f764df125db9f1190249f6f3c69fac6d41e468e..6cd93995fb65e1931a5739dc19e5d58429e3fa1b 100644
--- a/arch/riscv/kvm/vmid.c
+++ b/arch/riscv/kvm/vmid.c
@@ -97,7 +97,7 @@ void kvm_riscv_gstage_vmid_update(struct kvm_vcpu *vcpu)
                 * We ran out of VMIDs so we increment vmid_version and
                 * start assigning VMIDs from 1.
                 *
-                * This also means existing VMIDs assignement to all Guest
+                * This also means existing VMIDs assignment to all Guest
                 * instances is invalid and we have force VMID re-assignement
                 * for all Guest instances. The Guest instances that were not
                 * running will automatically pick-up new VMIDs because will
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3a240a64ac68f4274f093b99b54f4a8914f9372d..9217bd6cf0d14888bd2e8e7034858100934feaf1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1047,14 +1047,77 @@ struct kvm_x86_msr_filter {
 };
 
 enum kvm_apicv_inhibit {
+
+       /********************************************************************/
+       /* INHIBITs that are relevant to both Intel's APICv and AMD's AVIC. */
+       /********************************************************************/
+
+       /*
+        * APIC acceleration is disabled by a module parameter
+        * and/or not supported in hardware.
+        */
        APICV_INHIBIT_REASON_DISABLE,
+
+       /*
+        * APIC acceleration is inhibited because AutoEOI feature is
+        * being used by a HyperV guest.
+        */
        APICV_INHIBIT_REASON_HYPERV,
+
+       /*
+        * APIC acceleration is inhibited because the userspace didn't yet
+        * enable the kernel/split irqchip.
+        */
+       APICV_INHIBIT_REASON_ABSENT,
+
+       /* APIC acceleration is inhibited because KVM_GUESTDBG_BLOCKIRQ
+        * (out of band, debug measure of blocking all interrupts on this vCPU)
+        * was enabled, to avoid AVIC/APICv bypassing it.
+        */
+       APICV_INHIBIT_REASON_BLOCKIRQ,
+
+       /*
+        * For simplicity, the APIC acceleration is inhibited
+        * first time either APIC ID or APIC base are changed by the guest
+        * from their reset values.
+        */
+       APICV_INHIBIT_REASON_APIC_ID_MODIFIED,
+       APICV_INHIBIT_REASON_APIC_BASE_MODIFIED,
+
+       /******************************************************/
+       /* INHIBITs that are relevant only to the AMD's AVIC. */
+       /******************************************************/
+
+       /*
+        * AVIC is inhibited on a vCPU because it runs a nested guest.
+        *
+        * This is needed because unlike APICv, the peers of this vCPU
+        * cannot use the doorbell mechanism to signal interrupts via AVIC when
+        * a vCPU runs nested.
+        */
        APICV_INHIBIT_REASON_NESTED,
+
+       /*
+        * On SVM, the wait for the IRQ window is implemented with pending vIRQ,
+        * which cannot be injected when the AVIC is enabled, thus AVIC
+        * is inhibited while KVM waits for IRQ window.
+        */
        APICV_INHIBIT_REASON_IRQWIN,
+
+       /*
+        * PIT (i8254) 're-inject' mode, relies on EOI intercept,
+        * which AVIC doesn't support for edge triggered interrupts.
+        */
        APICV_INHIBIT_REASON_PIT_REINJ,
+
+       /*
+        * AVIC is inhibited because the guest has x2apic in its CPUID.
+        */
        APICV_INHIBIT_REASON_X2APIC,
-       APICV_INHIBIT_REASON_BLOCKIRQ,
-       APICV_INHIBIT_REASON_ABSENT,
+
+       /*
+        * AVIC is disabled because SEV doesn't support it.
+        */
        APICV_INHIBIT_REASON_SEV,
 };
 
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index f1bdac3f5aa8a845cb34dbb02cb3321878be8048..0e68b4c937fcd9971d5f24c943a8868618bb3fb2 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2039,6 +2039,19 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
        }
 }
 
+static void kvm_lapic_xapic_id_updated(struct kvm_lapic *apic)
+{
+       struct kvm *kvm = apic->vcpu->kvm;
+
+       if (KVM_BUG_ON(apic_x2apic_mode(apic), kvm))
+               return;
+
+       if (kvm_xapic_id(apic) == apic->vcpu->vcpu_id)
+               return;
+
+       kvm_set_apicv_inhibit(apic->vcpu->kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
+}
+
 static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 {
        int ret = 0;
@@ -2047,10 +2060,12 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 
        switch (reg) {
        case APIC_ID:           /* Local APIC ID */
-               if (!apic_x2apic_mode(apic))
+               if (!apic_x2apic_mode(apic)) {
                        kvm_apic_set_xapic_id(apic, val >> 24);
-               else
+                       kvm_lapic_xapic_id_updated(apic);
+               } else {
                        ret = 1;
+               }
                break;
 
        case APIC_TASKPRI:
@@ -2336,8 +2351,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
                             MSR_IA32_APICBASE_BASE;
 
        if ((value & MSR_IA32_APICBASE_ENABLE) &&
-            apic->base_address != APIC_DEFAULT_PHYS_BASE)
-               pr_warn_once("APIC base relocation is unsupported by KVM");
+            apic->base_address != APIC_DEFAULT_PHYS_BASE) {
+               kvm_set_apicv_inhibit(apic->vcpu->kvm,
+                                     APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
+       }
 }
 
 void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
@@ -2648,6 +2665,8 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
                        icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
                        __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
                }
+       } else {
+               kvm_lapic_xapic_id_updated(vcpu->arch.apic);
        }
 
        return 0;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index e826ee9138fa895dcc48ce5b7d784dd202c0c4bd..17252f39bd7c2ef466b1519e6c97de025f0f6562 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3411,7 +3411,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
                        root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
                                              i << 30, PT32_ROOT_LEVEL, true);
                        mmu->pae_root[i] = root | PT_PRESENT_MASK |
-                                          shadow_me_mask;
+                                          shadow_me_value;
                }
                mmu->root.hpa = __pa(mmu->pae_root);
        } else {
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 54fe03714f8a6af9cc929f595d2f0b2ff86cb439..d1bc5820ea469349bedb6f61cc8d209cdb75190f 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -291,58 +291,91 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu)
 static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
                                       u32 icrl, u32 icrh, u32 index)
 {
-       u32 dest, apic_id;
-       struct kvm_vcpu *vcpu;
+       u32 l1_physical_id, dest;
+       struct kvm_vcpu *target_vcpu;
        int dest_mode = icrl & APIC_DEST_MASK;
        int shorthand = icrl & APIC_SHORT_MASK;
        struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
-       u32 *avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page);
 
        if (shorthand != APIC_DEST_NOSHORT)
                return -EINVAL;
 
-       /*
-        * The AVIC incomplete IPI #vmexit info provides index into
-        * the physical APIC ID table, which can be used to derive
-        * guest physical APIC ID.
-        */
+       if (apic_x2apic_mode(source))
+               dest = icrh;
+       else
+               dest = GET_APIC_DEST_FIELD(icrh);
+
        if (dest_mode == APIC_DEST_PHYSICAL) {
-               apic_id = index;
+               /* broadcast destination, use slow path */
+               if (apic_x2apic_mode(source) && dest == X2APIC_BROADCAST)
+                       return -EINVAL;
+               if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST)
+                       return -EINVAL;
+
+               l1_physical_id = dest;
+
+               if (WARN_ON_ONCE(l1_physical_id != index))
+                       return -EINVAL;
+
        } else {
-               if (!apic_x2apic_mode(source)) {
-                       /* For xAPIC logical mode, the index is for logical APIC table. */
-                       apic_id = avic_logical_id_table[index] & 0x1ff;
+               u32 bitmap, cluster;
+               int logid_index;
+
+               if (apic_x2apic_mode(source)) {
+                       /* 16 bit dest mask, 16 bit cluster id */
+                       bitmap = dest & 0xFFFF0000;
+                       cluster = (dest >> 16) << 4;
+               } else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) {
+                       /* 8 bit dest mask */
+                       bitmap = dest;
+                       cluster = 0;
                } else {
-                       return -EINVAL;
+                       /* 4 bit dest mask, 4 bit cluster id */
+                       bitmap = dest & 0xF;
+                       cluster = (dest >> 4) << 2;
                }
-       }
 
-       /*
-        * Assuming vcpu ID is the same as physical apic ID,
-        * and use it to retrieve the target vCPU.
-        */
-       vcpu = kvm_get_vcpu_by_id(kvm, apic_id);
-       if (!vcpu)
-               return -EINVAL;
+               if (unlikely(!bitmap))
+                       /* guest bug: nobody to send the logical interrupt to */
+                       return 0;
 
-       if (apic_x2apic_mode(vcpu->arch.apic))
-               dest = icrh;
-       else
-               dest = GET_APIC_DEST_FIELD(icrh);
+               if (!is_power_of_2(bitmap))
+                       /* multiple logical destinations, use slow path */
+                       return -EINVAL;
 
-       /*
-        * Try matching the destination APIC ID with the vCPU.
-        */
-       if (kvm_apic_match_dest(vcpu, source, shorthand, dest, dest_mode)) {
-               vcpu->arch.apic->irr_pending = true;
-               svm_complete_interrupt_delivery(vcpu,
-                                               icrl & APIC_MODE_MASK,
-                                               icrl & APIC_INT_LEVELTRIG,
-                                               icrl & APIC_VECTOR_MASK);
-               return 0;
+               logid_index = cluster + __ffs(bitmap);
+
+               if (apic_x2apic_mode(source)) {
+                       l1_physical_id = logid_index;
+               } else {
+                       u32 *avic_logical_id_table =
+                               page_address(kvm_svm->avic_logical_id_table_page);
+
+                       u32 logid_entry = avic_logical_id_table[logid_index];
+
+                       if (WARN_ON_ONCE(index != logid_index))
+                               return -EINVAL;
+
+                       /* guest bug: non existing/reserved logical destination */
+                       if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
+                               return 0;
+
+                       l1_physical_id = logid_entry &
+                                        AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
+               }
        }
 
-       return -EINVAL;
+       target_vcpu = kvm_get_vcpu_by_id(kvm, l1_physical_id);
+       if (unlikely(!target_vcpu))
+               /* guest bug: non-existing vCPU is a target of this IPI */
+               return 0;
+
+       target_vcpu->arch.apic->irr_pending = true;
+       svm_complete_interrupt_delivery(target_vcpu,
+                                       icrl & APIC_MODE_MASK,
+                                       icrl & APIC_INT_LEVELTRIG,
+                                       icrl & APIC_VECTOR_MASK);
+       return 0;
 }
 
 static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
@@ -508,35 +541,6 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
        return ret;
 }
 
-static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
-{
-       u64 *old, *new;
-       struct vcpu_svm *svm = to_svm(vcpu);
-       u32 id = kvm_xapic_id(vcpu->arch.apic);
-
-       if (vcpu->vcpu_id == id)
-               return 0;
-
-       old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id);
-       new = avic_get_physical_id_entry(vcpu, id);
-       if (!new || !old)
-               return 1;
-
-       /* We need to move physical_id_entry to new offset */
-       *new = *old;
-       *old = 0ULL;
-       to_svm(vcpu)->avic_physical_id_cache = new;
-
-       /*
-        * Also update the guest physical APIC ID in the logical
-        * APIC ID table entry if already setup the LDR.
-        */
-       if (svm->ldr_reg)
-               avic_handle_ldr_update(vcpu);
-
-       return 0;
-}
-
 static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -555,10 +559,6 @@ static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu)
                                AVIC_UNACCEL_ACCESS_OFFSET_MASK;
 
        switch (offset) {
-       case APIC_ID:
-               if (avic_handle_apic_id_update(vcpu))
-                       return 0;
-               break;
        case APIC_LDR:
                if (avic_handle_ldr_update(vcpu))
                        return 0;
@@ -650,8 +650,6 @@ int avic_init_vcpu(struct vcpu_svm *svm)
 
 void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
 {
-       if (avic_handle_apic_id_update(vcpu) != 0)
-               return;
        avic_handle_dfr_update(vcpu);
        avic_handle_ldr_update(vcpu);
 }
@@ -910,7 +908,9 @@ bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
                          BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
                          BIT(APICV_INHIBIT_REASON_X2APIC) |
                          BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
-                         BIT(APICV_INHIBIT_REASON_SEV);
+                         BIT(APICV_INHIBIT_REASON_SEV)      |
+                         BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
+                         BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
 
        return supported & BIT(reason);
 }
@@ -946,7 +946,7 @@ out:
        return ret;
 }
 
-void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
        u64 entry;
        int h_physical_id = kvm_cpu_get_apicid(cpu);
@@ -978,7 +978,7 @@ void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
        avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
 }
 
-void __avic_vcpu_put(struct kvm_vcpu *vcpu)
+void avic_vcpu_put(struct kvm_vcpu *vcpu)
 {
        u64 entry;
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -997,25 +997,6 @@ void __avic_vcpu_put(struct kvm_vcpu *vcpu)
        WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
 }
 
-static void avic_vcpu_load(struct kvm_vcpu *vcpu)
-{
-       int cpu = get_cpu();
-
-       WARN_ON(cpu != vcpu->cpu);
-
-       __avic_vcpu_load(vcpu, cpu);
-
-       put_cpu();
-}
-
-static void avic_vcpu_put(struct kvm_vcpu *vcpu)
-{
-       preempt_disable();
-
-       __avic_vcpu_put(vcpu);
-
-       preempt_enable();
-}
 
 void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
 {
@@ -1042,7 +1023,7 @@ void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
        vmcb_mark_dirty(vmcb, VMCB_AVIC);
 
        if (activated)
-               avic_vcpu_load(vcpu);
+               avic_vcpu_load(vcpu, vcpu->cpu);
        else
                avic_vcpu_put(vcpu);
 
@@ -1075,5 +1056,5 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
        if (!kvm_vcpu_apicv_active(vcpu))
                return;
 
-       avic_vcpu_load(vcpu);
+       avic_vcpu_load(vcpu, vcpu->cpu);
 }
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 3361258640a27e15f3ab04f46e6daedbfb09079a..ba7cd26f438fc8a1eb7bb87e053a7e2117d2eb3b 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -616,6 +616,8 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
        struct kvm_vcpu *vcpu = &svm->vcpu;
        struct vmcb *vmcb01 = svm->vmcb01.ptr;
        struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
+       u32 pause_count12;
+       u32 pause_thresh12;
 
        /*
         * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2,
@@ -671,27 +673,25 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
        if (!nested_vmcb_needs_vls_intercept(svm))
                vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
 
+       pause_count12 = svm->pause_filter_enabled ? svm->nested.ctl.pause_filter_count : 0;
+       pause_thresh12 = svm->pause_threshold_enabled ? svm->nested.ctl.pause_filter_thresh : 0;
        if (kvm_pause_in_guest(svm->vcpu.kvm)) {
-               /* use guest values since host doesn't use them */
-               vmcb02->control.pause_filter_count =
-                               svm->pause_filter_enabled ?
-                               svm->nested.ctl.pause_filter_count : 0;
+               /* use guest values since host doesn't intercept PAUSE */
+               vmcb02->control.pause_filter_count = pause_count12;
+               vmcb02->control.pause_filter_thresh = pause_thresh12;
 
-               vmcb02->control.pause_filter_thresh =
-                               svm->pause_threshold_enabled ?
-                               svm->nested.ctl.pause_filter_thresh : 0;
-
-       } else if (!vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) {
-               /* use host values when guest doesn't use them */
+       } else {
+               /* start from host values otherwise */
                vmcb02->control.pause_filter_count = vmcb01->control.pause_filter_count;
                vmcb02->control.pause_filter_thresh = vmcb01->control.pause_filter_thresh;
-       } else {
-               /*
-                * Intercept every PAUSE otherwise and
-                * ignore both host and guest values
-                */
-               vmcb02->control.pause_filter_count = 0;
-               vmcb02->control.pause_filter_thresh = 0;
+
+               /* ... but ensure filtering is disabled if so requested.  */
+               if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) {
+                       if (!pause_count12)
+                               vmcb02->control.pause_filter_count = 0;
+                       if (!pause_thresh12)
+                               vmcb02->control.pause_filter_thresh = 0;
+               }
        }
 
        nested_svm_transition_tlb_flush(vcpu);
@@ -951,8 +951,11 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
        vmcb12->control.event_inj         = svm->nested.ctl.event_inj;
        vmcb12->control.event_inj_err     = svm->nested.ctl.event_inj_err;
 
-       if (!kvm_pause_in_guest(vcpu->kvm) && vmcb02->control.pause_filter_count)
+       if (!kvm_pause_in_guest(vcpu->kvm)) {
                vmcb01->control.pause_filter_count = vmcb02->control.pause_filter_count;
+               vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS);
+
+       }
 
        nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
 
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 1dc02cdf69602e19934f9ac08f7cdcc2ce7c25ee..87da90360bc76215ceb0342c612ef264b0acf1d0 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -921,7 +921,7 @@ static void grow_ple_window(struct kvm_vcpu *vcpu)
        struct vmcb_control_area *control = &svm->vmcb->control;
        int old = control->pause_filter_count;
 
-       if (kvm_pause_in_guest(vcpu->kvm) || !old)
+       if (kvm_pause_in_guest(vcpu->kvm))
                return;
 
        control->pause_filter_count = __grow_ple_window(old,
@@ -942,7 +942,7 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
        struct vmcb_control_area *control = &svm->vmcb->control;
        int old = control->pause_filter_count;
 
-       if (kvm_pause_in_guest(vcpu->kvm) || !old)
+       if (kvm_pause_in_guest(vcpu->kvm))
                return;
 
        control->pause_filter_count =
@@ -1400,13 +1400,13 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                indirect_branch_prediction_barrier();
        }
        if (kvm_vcpu_apicv_active(vcpu))
-               __avic_vcpu_load(vcpu, cpu);
+               avic_vcpu_load(vcpu, cpu);
 }
 
 static void svm_vcpu_put(struct kvm_vcpu *vcpu)
 {
        if (kvm_vcpu_apicv_active(vcpu))
-               __avic_vcpu_put(vcpu);
+               avic_vcpu_put(vcpu);
 
        svm_prepare_host_switch(vcpu);
 
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 500348c1cb350871b0d996d3154a1a509662ad70..1bddd336a27e088c16d747237e369dd3f13d73fd 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -610,8 +610,8 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb);
 int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu);
 int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu);
 int avic_init_vcpu(struct vcpu_svm *svm);
-void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
-void __avic_vcpu_put(struct kvm_vcpu *vcpu);
+void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void avic_vcpu_put(struct kvm_vcpu *vcpu);
 void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu);
 void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
 void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 6df17ef81905f5a14f60ab0eda9ea8787b81b53c..3a919e49129bf5458692ea1c058f2bc28971bdf0 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7779,7 +7779,9 @@ static bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
        ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
                          BIT(APICV_INHIBIT_REASON_ABSENT) |
                          BIT(APICV_INHIBIT_REASON_HYPERV) |
-                         BIT(APICV_INHIBIT_REASON_BLOCKIRQ);
+                         BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
+                         BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
+                         BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
 
        return supported & BIT(reason);
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5e78d5889e5e0f92b3ddff19712c6caaa10ee0e6..1910e1e78b1534346cbf0fd85a35ad05c4c31003 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9853,6 +9853,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
                return;
 
        down_read(&vcpu->kvm->arch.apicv_update_lock);
+       preempt_disable();
 
        activate = kvm_vcpu_apicv_activated(vcpu);
 
@@ -9873,6 +9874,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
                kvm_make_request(KVM_REQ_EVENT, vcpu);
 
 out:
+       preempt_enable();
        up_read(&vcpu->kvm->arch.apicv_update_lock);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 81470a99ed1c04ab4bfeee5ca438b170327573e8..22423c871ed6173a354faf3d3dc6cb5103515938 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -37,11 +37,38 @@ ifeq ($(ARCH),riscv)
        UNAME_M := riscv
 endif
 
-LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/rbtree.c lib/sparsebit.c lib/test_util.c lib/guest_modes.c lib/perf_test_util.c
-LIBKVM_x86_64 = lib/x86_64/apic.c lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/svm.c lib/x86_64/ucall.c lib/x86_64/handlers.S
-LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c lib/aarch64/handlers.S lib/aarch64/spinlock.c lib/aarch64/gic.c lib/aarch64/gic_v3.c lib/aarch64/vgic.c
-LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_handler.c
-LIBKVM_riscv = lib/riscv/processor.c lib/riscv/ucall.c
+LIBKVM += lib/assert.c
+LIBKVM += lib/elf.c
+LIBKVM += lib/guest_modes.c
+LIBKVM += lib/io.c
+LIBKVM += lib/kvm_util.c
+LIBKVM += lib/perf_test_util.c
+LIBKVM += lib/rbtree.c
+LIBKVM += lib/sparsebit.c
+LIBKVM += lib/test_util.c
+
+LIBKVM_x86_64 += lib/x86_64/apic.c
+LIBKVM_x86_64 += lib/x86_64/handlers.S
+LIBKVM_x86_64 += lib/x86_64/perf_test_util.c
+LIBKVM_x86_64 += lib/x86_64/processor.c
+LIBKVM_x86_64 += lib/x86_64/svm.c
+LIBKVM_x86_64 += lib/x86_64/ucall.c
+LIBKVM_x86_64 += lib/x86_64/vmx.c
+
+LIBKVM_aarch64 += lib/aarch64/gic.c
+LIBKVM_aarch64 += lib/aarch64/gic_v3.c
+LIBKVM_aarch64 += lib/aarch64/handlers.S
+LIBKVM_aarch64 += lib/aarch64/processor.c
+LIBKVM_aarch64 += lib/aarch64/spinlock.c
+LIBKVM_aarch64 += lib/aarch64/ucall.c
+LIBKVM_aarch64 += lib/aarch64/vgic.c
+
+LIBKVM_s390x += lib/s390x/diag318_test_handler.c
+LIBKVM_s390x += lib/s390x/processor.c
+LIBKVM_s390x += lib/s390x/ucall.c
+
+LIBKVM_riscv += lib/riscv/processor.c
+LIBKVM_riscv += lib/riscv/ucall.c
 
 TEST_GEN_PROGS_x86_64 = x86_64/cpuid_test
 TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test
@@ -173,12 +200,13 @@ LDFLAGS += -pthread $(no-pie-option) $(pgste-option)
 # $(TEST_GEN_PROGS) starts with $(OUTPUT)/
 include ../lib.mk
 
-STATIC_LIBS := $(OUTPUT)/libkvm.a
 LIBKVM_C := $(filter %.c,$(LIBKVM))
 LIBKVM_S := $(filter %.S,$(LIBKVM))
 LIBKVM_C_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_C))
 LIBKVM_S_OBJ := $(patsubst %.S, $(OUTPUT)/%.o, $(LIBKVM_S))
-EXTRA_CLEAN += $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) $(STATIC_LIBS) cscope.*
+LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ)
+
+EXTRA_CLEAN += $(LIBKVM_OBJS) cscope.*
 
 x := $(shell mkdir -p $(sort $(dir $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ))))
 $(LIBKVM_C_OBJ): $(OUTPUT)/%.o: %.c
@@ -187,13 +215,8 @@ $(LIBKVM_C_OBJ): $(OUTPUT)/%.o: %.c
 $(LIBKVM_S_OBJ): $(OUTPUT)/%.o: %.S
        $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@
 
-LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ)
-$(OUTPUT)/libkvm.a: $(LIBKVM_OBJS)
-       $(AR) crs $@ $^
-
 x := $(shell mkdir -p $(sort $(dir $(TEST_GEN_PROGS))))
-all: $(STATIC_LIBS)
-$(TEST_GEN_PROGS): $(STATIC_LIBS)
+$(TEST_GEN_PROGS): $(LIBKVM_OBJS)
 
 cscope: include_paths = $(LINUX_TOOL_INCLUDE) $(LINUX_HDR_PATH) include lib ..
 cscope:
diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c
index 7b47ae4f952e68a5003092321814c3265d3bc08f..d60a34cdfaee1fb58195c71e07cf20efbd038166 100644
--- a/tools/testing/selftests/kvm/dirty_log_perf_test.c
+++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c
@@ -336,8 +336,8 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 static void help(char *name)
 {
        puts("");
-       printf("usage: %s [-h] [-i iterations] [-p offset] [-g]"
-              "[-m mode] [-b vcpu bytes] [-v vcpus] [-o] [-s mem type]"
+       printf("usage: %s [-h] [-i iterations] [-p offset] [-g] "
+              "[-m mode] [-n] [-b vcpu bytes] [-v vcpus] [-o] [-s mem type]"
               "[-x memslots]\n", name);
        puts("");
        printf(" -i: specify iteration counts (default: %"PRIu64")\n",
@@ -351,6 +351,7 @@ static void help(char *name)
        printf(" -p: specify guest physical test memory offset\n"
               "     Warning: a low offset can conflict with the loaded test code.\n");
        guest_modes_help();
+       printf(" -n: Run the vCPUs in nested mode (L2)\n");
        printf(" -b: specify the size of the memory region which should be\n"
               "     dirtied by each vCPU. e.g. 10M or 3G.\n"
               "     (default: 1G)\n");
@@ -387,7 +388,7 @@ int main(int argc, char *argv[])
 
        guest_modes_append_default();
 
-       while ((opt = getopt(argc, argv, "ghi:p:m:b:f:v:os:x:")) != -1) {
+       while ((opt = getopt(argc, argv, "ghi:p:m:nb:f:v:os:x:")) != -1) {
                switch (opt) {
                case 'g':
                        dirty_log_manual_caps = 0;
@@ -401,6 +402,9 @@ int main(int argc, char *argv[])
                case 'm':
                        guest_modes_cmdline(optarg);
                        break;
+               case 'n':
+                       perf_test_args.nested = true;
+                       break;
                case 'b':
                        guest_percpu_mem_size = parse_size(optarg);
                        break;
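
With the plumbing above in place, a nested run of the test looks like any
other invocation plus the new flag, e.g. "./dirty_log_perf_test -n -v 4 -b 1G"
to dirty-log four vCPUs running in L2 (the vCPU count and size here are only
illustrative).
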
diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h
index a86f953d8d36569389f786797f54b76dcb153114..d822cb670f1cd46e9078ea1613b3abc38106c29c 100644
--- a/tools/testing/selftests/kvm/include/perf_test_util.h
+++ b/tools/testing/selftests/kvm/include/perf_test_util.h
@@ -30,10 +30,15 @@ struct perf_test_vcpu_args {
 
 struct perf_test_args {
        struct kvm_vm *vm;
+       /* The starting address and size of the guest test region. */
        uint64_t gpa;
+       uint64_t size;
        uint64_t guest_page_size;
        int wr_fract;
 
+       /* Run vCPUs in L2 instead of L1, if the architecture supports it. */
+       bool nested;
+
        struct perf_test_vcpu_args vcpu_args[KVM_MAX_VCPUS];
 };
 
@@ -49,5 +54,9 @@ void perf_test_set_wr_fract(struct kvm_vm *vm, int wr_fract);
 
 void perf_test_start_vcpu_threads(int vcpus, void (*vcpu_fn)(struct perf_test_vcpu_args *));
 void perf_test_join_vcpu_threads(int vcpus);
+void perf_test_guest_code(uint32_t vcpu_id);
+
+uint64_t perf_test_nested_pages(int nr_vcpus);
+void perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus);
 
 #endif /* SELFTEST_KVM_PERF_TEST_UTIL_H */
diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index d0d51adec76eb88f12564fa38e52d0df2ba535f2..6ce185449259f329cac0f80669466bd49a942547 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -482,13 +482,23 @@ void vcpu_set_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid);
 struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid);
 void vm_xsave_req_perm(int bit);
 
-enum x86_page_size {
-       X86_PAGE_SIZE_4K = 0,
-       X86_PAGE_SIZE_2M,
-       X86_PAGE_SIZE_1G,
+enum pg_level {
+       PG_LEVEL_NONE,
+       PG_LEVEL_4K,
+       PG_LEVEL_2M,
+       PG_LEVEL_1G,
+       PG_LEVEL_512G,
+       PG_LEVEL_NUM
 };
-void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
-                  enum x86_page_size page_size);
+
+#define PG_LEVEL_SHIFT(_level) ((_level - 1) * 9 + 12)
+#define PG_LEVEL_SIZE(_level) (1ull << PG_LEVEL_SHIFT(_level))
+
+#define PG_SIZE_4K PG_LEVEL_SIZE(PG_LEVEL_4K)
+#define PG_SIZE_2M PG_LEVEL_SIZE(PG_LEVEL_2M)
+#define PG_SIZE_1G PG_LEVEL_SIZE(PG_LEVEL_1G)
+
+void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level);
 
 /*
  * Basic CPU control in CR0
@@ -505,9 +515,6 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
 #define X86_CR0_CD          (1UL<<30) /* Cache Disable */
 #define X86_CR0_PG          (1UL<<31) /* Paging */
 
-/* VMX_EPT_VPID_CAP bits */
-#define VMX_EPT_VPID_CAP_AD_BITS       (1ULL << 21)
-
 #define XSTATE_XTILE_CFG_BIT           17
 #define XSTATE_XTILE_DATA_BIT          18
 
diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h
index 583ceb0d14574ab456c14cf12290a216ca7b32a0..cc3604f8f1d3c52c74ba0f307b69bf7bf7e78f0e 100644
--- a/tools/testing/selftests/kvm/include/x86_64/vmx.h
+++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h
@@ -96,6 +96,9 @@
 #define VMX_MISC_PREEMPTION_TIMER_RATE_MASK    0x0000001f
 #define VMX_MISC_SAVE_EFER_LMA                 0x00000020
 
+#define VMX_EPT_VPID_CAP_1G_PAGES              0x00020000
+#define VMX_EPT_VPID_CAP_AD_BITS               0x00200000
+
 #define EXIT_REASON_FAILED_VMENTRY     0x80000000
 #define EXIT_REASON_EXCEPTION_NMI      0
 #define EXIT_REASON_EXTERNAL_INTERRUPT 1
@@ -606,6 +609,7 @@ bool load_vmcs(struct vmx_pages *vmx);
 
 bool nested_vmx_supported(void);
 void nested_vmx_check_supported(void);
+bool ept_1g_pages_supported(void);
 
 void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm,
                   uint64_t nested_paddr, uint64_t paddr);
@@ -613,6 +617,8 @@ void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm,
                 uint64_t nested_paddr, uint64_t paddr, uint64_t size);
 void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm,
                        uint32_t memslot);
+void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm,
+                           uint64_t addr, uint64_t size);
 void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm,
                  uint32_t eptp_memslot);
 void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm);
diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c
index 722df3a28791c3b7eb3fd967c9f1ea195f815796..f989ff91f022abf2f84f595f7fea48250bb8f0eb 100644
--- a/tools/testing/selftests/kvm/lib/perf_test_util.c
+++ b/tools/testing/selftests/kvm/lib/perf_test_util.c
@@ -40,7 +40,7 @@ static bool all_vcpu_threads_running;
  * Continuously write to the first 8 bytes of each page in the
  * specified region.
  */
-static void guest_code(uint32_t vcpu_id)
+void perf_test_guest_code(uint32_t vcpu_id)
 {
        struct perf_test_args *pta = &perf_test_args;
        struct perf_test_vcpu_args *vcpu_args = &pta->vcpu_args[vcpu_id];
@@ -108,8 +108,9 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus,
 {
        struct perf_test_args *pta = &perf_test_args;
        struct kvm_vm *vm;
-       uint64_t guest_num_pages;
+       uint64_t guest_num_pages, slot0_pages = DEFAULT_GUEST_PHY_PAGES;
        uint64_t backing_src_pagesz = get_backing_src_pagesz(backing_src);
+       uint64_t region_end_gfn;
        int i;
 
        pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));
@@ -134,34 +135,54 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus,
                    "Guest memory cannot be evenly divided into %d slots.",
                    slots);
 
+       /*
+        * If using nested, allocate extra pages for the nested page tables and
+        * in-memory data structures.
+        */
+       if (pta->nested)
+               slot0_pages += perf_test_nested_pages(vcpus);
+
        /*
         * Pass guest_num_pages to populate the page tables for test memory.
         * The memory is also added to memslot 0, but that's a benign side
         * effect as KVM allows aliasing HVAs in memslots.
         */
-       vm = vm_create_with_vcpus(mode, vcpus, DEFAULT_GUEST_PHY_PAGES,
-                                 guest_num_pages, 0, guest_code, NULL);
+       vm = vm_create_with_vcpus(mode, vcpus, slot0_pages, guest_num_pages, 0,
+                                 perf_test_guest_code, NULL);
 
        pta->vm = vm;
 
+       /* Put the test region at the top of guest physical memory. */
+       region_end_gfn = vm_get_max_gfn(vm) + 1;
+
+#ifdef __x86_64__
+       /*
+        * When running vCPUs in L2, restrict the test region to 48 bits to
+        * avoid needing 5-level page tables to identity map L2.
+        */
+       if (pta->nested)
+               region_end_gfn = min(region_end_gfn, (1UL << 48) / pta->guest_page_size);
+#endif
        /*
         * If there should be more memory in the guest test region than there
         * can be pages in the guest, it will definitely cause problems.
         */
-       TEST_ASSERT(guest_num_pages < vm_get_max_gfn(vm),
+       TEST_ASSERT(guest_num_pages < region_end_gfn,
                    "Requested more guest memory than address space allows.\n"
                    "    guest pages: %" PRIx64 " max gfn: %" PRIx64
                    " vcpus: %d wss: %" PRIx64 "]\n",
-                   guest_num_pages, vm_get_max_gfn(vm), vcpus,
+                   guest_num_pages, region_end_gfn - 1, vcpus,
                    vcpu_memory_bytes);
 
-       pta->gpa = (vm_get_max_gfn(vm) - guest_num_pages) * pta->guest_page_size;
+       pta->gpa = (region_end_gfn - guest_num_pages) * pta->guest_page_size;
        pta->gpa = align_down(pta->gpa, backing_src_pagesz);
 #ifdef __s390x__
        /* Align to 1M (segment size) */
        pta->gpa = align_down(pta->gpa, 1 << 20);
 #endif
-       pr_info("guest physical test memory offset: 0x%lx\n", pta->gpa);
+       pta->size = guest_num_pages * pta->guest_page_size;
+       pr_info("guest physical test memory: [0x%lx, 0x%lx)\n",
+               pta->gpa, pta->gpa + pta->size);
 
        /* Add extra memory slots for testing */
        for (i = 0; i < slots; i++) {
@@ -178,6 +199,11 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus,
 
        perf_test_setup_vcpus(vm, vcpus, vcpu_memory_bytes, partition_vcpu_memory_access);
 
+       if (pta->nested) {
+               pr_info("Configuring vCPUs to run in L2 (nested).\n");
+               perf_test_setup_nested(vm, vcpus);
+       }
+
        ucall_init(vm, NULL);
 
        /* Export the shared variables to the guest. */
@@ -198,6 +224,17 @@ void perf_test_set_wr_fract(struct kvm_vm *vm, int wr_fract)
        sync_global_to_guest(vm, perf_test_args);
 }
 
+uint64_t __weak perf_test_nested_pages(int nr_vcpus)
+{
+       return 0;
+}
+
+void __weak perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus)
+{
+       pr_info("%s() not supported on this architecture, skipping.\n", __func__);
+       exit(KSFT_SKIP);
+}
+
 static void *vcpu_thread_main(void *data)
 {
        struct vcpu_thread *vcpu = data;
diff --git a/tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c b/tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c
new file mode 100644
index 0000000..e258524
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * x86_64-specific extensions to perf_test_util.c.
+ *
+ * Copyright (C) 2022, Google, Inc.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "perf_test_util.h"
+#include "../kvm_util_internal.h"
+#include "processor.h"
+#include "vmx.h"
+
+void perf_test_l2_guest_code(uint64_t vcpu_id)
+{
+       perf_test_guest_code(vcpu_id);
+       vmcall();
+}
+
+extern char perf_test_l2_guest_entry[];
+__asm__(
+"perf_test_l2_guest_entry:"
+"      mov (%rsp), %rdi;"
+"      call perf_test_l2_guest_code;"
+"      ud2;"
+);
+
+static void perf_test_l1_guest_code(struct vmx_pages *vmx, uint64_t vcpu_id)
+{
+#define L2_GUEST_STACK_SIZE 64
+       unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+       unsigned long *rsp;
+
+       GUEST_ASSERT(vmx->vmcs_gpa);
+       GUEST_ASSERT(prepare_for_vmx_operation(vmx));
+       GUEST_ASSERT(load_vmcs(vmx));
+       GUEST_ASSERT(ept_1g_pages_supported());
+
+       rsp = &l2_guest_stack[L2_GUEST_STACK_SIZE - 1];
+       *rsp = vcpu_id;
+       prepare_vmcs(vmx, perf_test_l2_guest_entry, rsp);
+
+       GUEST_ASSERT(!vmlaunch());
+       GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+       GUEST_DONE();
+}
+
+uint64_t perf_test_nested_pages(int nr_vcpus)
+{
+       /*
+        * 513 page tables is enough to identity-map 256 TiB of L2 with 1G
+        * pages and 4-level paging, plus a few pages per-vCPU for data
+        * structures such as the VMCS.
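+        * (Arithmetic: 1 PML4 page + 512 PDPT pages = 513, and each PDPT
+        * maps 512 GiB via its 1G entries, so 512 * 512 GiB = 256 TiB.)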
+        */
+       return 513 + 10 * nr_vcpus;
+}
+
+void perf_test_setup_ept(struct vmx_pages *vmx, struct kvm_vm *vm)
+{
+       uint64_t start, end;
+
+       prepare_eptp(vmx, vm, 0);
+
+       /*
+        * Identity map the first 4G and the test region with 1G pages so that
+        * KVM can shadow the EPT12 with the maximum huge page size supported
+        * by the backing source.
+        */
+       nested_identity_map_1g(vmx, vm, 0, 0x100000000ULL);
+
+       start = align_down(perf_test_args.gpa, PG_SIZE_1G);
+       end = align_up(perf_test_args.gpa + perf_test_args.size, PG_SIZE_1G);
+       nested_identity_map_1g(vmx, vm, start, end - start);
+}
+
+void perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus)
+{
+       struct vmx_pages *vmx, *vmx0 = NULL;
+       struct kvm_regs regs;
+       vm_vaddr_t vmx_gva;
+       int vcpu_id;
+
+       nested_vmx_check_supported();
+
+       for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
+               vmx = vcpu_alloc_vmx(vm, &vmx_gva);
+
+               if (vcpu_id == 0) {
+                       perf_test_setup_ept(vmx, vm);
+                       vmx0 = vmx;
+               } else {
+                       /* Share the same EPT table across all vCPUs. */
+                       vmx->eptp = vmx0->eptp;
+                       vmx->eptp_hva = vmx0->eptp_hva;
+                       vmx->eptp_gpa = vmx0->eptp_gpa;
+               }
+
+               /*
+                * Override the vCPU to run perf_test_l1_guest_code() which will
+                * bounce it into L2 before calling perf_test_guest_code().
+                */
+               vcpu_regs_get(vm, vcpu_id, &regs);
+               regs.rip = (unsigned long) perf_test_l1_guest_code;
+               vcpu_regs_set(vm, vcpu_id, &regs);
+               vcpu_args_set(vm, vcpu_id, 2, vmx_gva, vcpu_id);
+       }
+}
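With the library pieces in place, a test opts in by setting the global flag before creating the VM; in this series dirty_log_perf_test wires it to a new command-line switch, roughly as follows (the exact plumbing lives in dirty_log_perf_test.c, not shown here):

	/* In the test's getopt loop, before perf_test_create_vm(): */
	case 'n':
		perf_test_args.nested = true;
		break;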
index 33ea5e9955d9bddbe844a36cdbf886d6637f8a53..ead7011ee8f615cc2ea6017bd33925f4772439eb 100644 (file)
@@ -158,7 +158,7 @@ static void *virt_get_pte(struct kvm_vm *vm, uint64_t pt_pfn, uint64_t vaddr,
                          int level)
 {
        uint64_t *page_table = addr_gpa2hva(vm, pt_pfn << vm->page_shift);
-       int index = vaddr >> (vm->page_shift + level * 9) & 0x1ffu;
+       int index = (vaddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu;
 
        return &page_table[index];
 }
@@ -167,14 +167,14 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm,
                                       uint64_t pt_pfn,
                                       uint64_t vaddr,
                                       uint64_t paddr,
-                                      int level,
-                                      enum x86_page_size page_size)
+                                      int current_level,
+                                      int target_level)
 {
-       uint64_t *pte = virt_get_pte(vm, pt_pfn, vaddr, level);
+       uint64_t *pte = virt_get_pte(vm, pt_pfn, vaddr, current_level);
 
        if (!(*pte & PTE_PRESENT_MASK)) {
                *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK;
-               if (level == page_size)
+               if (current_level == target_level)
                        *pte |= PTE_LARGE_MASK | (paddr & PHYSICAL_PAGE_MASK);
                else
                        *pte |= vm_alloc_page_table(vm) & PHYSICAL_PAGE_MASK;
@@ -184,20 +184,19 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm,
                 * a hugepage at this level, and that there isn't a hugepage at
                 * this level.
                 */
-               TEST_ASSERT(level != page_size,
+               TEST_ASSERT(current_level != target_level,
                            "Cannot create hugepage at level: %u, vaddr: 0x%lx\n",
-                           page_size, vaddr);
+                           current_level, vaddr);
                TEST_ASSERT(!(*pte & PTE_LARGE_MASK),
                            "Cannot create page table at level: %u, vaddr: 0x%lx\n",
-                           level, vaddr);
+                           current_level, vaddr);
        }
        return pte;
 }
 
-void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
-                  enum x86_page_size page_size)
+void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level)
 {
-       const uint64_t pg_size = 1ull << ((page_size * 9) + 12);
+       const uint64_t pg_size = PG_LEVEL_SIZE(level);
        uint64_t *pml4e, *pdpe, *pde;
        uint64_t *pte;
 
@@ -222,20 +221,20 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
         * early if a hugepage was created.
         */
        pml4e = virt_create_upper_pte(vm, vm->pgd >> vm->page_shift,
-                                     vaddr, paddr, 3, page_size);
+                                     vaddr, paddr, PG_LEVEL_512G, level);
        if (*pml4e & PTE_LARGE_MASK)
                return;
 
-       pdpe = virt_create_upper_pte(vm, PTE_GET_PFN(*pml4e), vaddr, paddr, 2, page_size);
+       pdpe = virt_create_upper_pte(vm, PTE_GET_PFN(*pml4e), vaddr, paddr, PG_LEVEL_1G, level);
        if (*pdpe & PTE_LARGE_MASK)
                return;
 
-       pde = virt_create_upper_pte(vm, PTE_GET_PFN(*pdpe), vaddr, paddr, 1, page_size);
+       pde = virt_create_upper_pte(vm, PTE_GET_PFN(*pdpe), vaddr, paddr, PG_LEVEL_2M, level);
        if (*pde & PTE_LARGE_MASK)
                return;
 
        /* Fill in page table entry. */
-       pte = virt_get_pte(vm, PTE_GET_PFN(*pde), vaddr, 0);
+       pte = virt_get_pte(vm, PTE_GET_PFN(*pde), vaddr, PG_LEVEL_4K);
        TEST_ASSERT(!(*pte & PTE_PRESENT_MASK),
                    "PTE already present for 4k page at vaddr: 0x%lx\n", vaddr);
        *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK);
@@ -243,7 +242,7 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
 
 void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
 {
-       __virt_pg_map(vm, vaddr, paddr, X86_PAGE_SIZE_4K);
+       __virt_pg_map(vm, vaddr, paddr, PG_LEVEL_4K);
 }
 
 static uint64_t *_vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid,
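For reference, the PG_LEVEL_* names replace the old enum x86_page_size; this series defines them in the selftests' processor.h along these lines (levels count up from the 4K leaf, nine bits of index per level):

	enum pg_level {
		PG_LEVEL_NONE,
		PG_LEVEL_4K,	/* 1: shift 12 */
		PG_LEVEL_2M,	/* 2: shift 21 */
		PG_LEVEL_1G,	/* 3: shift 30 */
		PG_LEVEL_512G,	/* 4: shift 39 */
		PG_LEVEL_NUM
	};

	#define PG_LEVEL_SHIFT(_level)	(((_level) - 1) * 9 + 12)
	#define PG_LEVEL_SIZE(_level)	(1ull << PG_LEVEL_SHIFT(_level))

That is why the walk above passes PG_LEVEL_512G, PG_LEVEL_1G and PG_LEVEL_2M for the PML4, PDPT and PD steps, and why virt_get_pte() computes an index with a single shift.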
index d089d8b850b5c0daa879718ea18b043c64664ded..b77a01d0a27139b94845286361dc5750417a8f0c 100644 (file)
@@ -198,6 +198,16 @@ bool load_vmcs(struct vmx_pages *vmx)
        return true;
 }
 
+static bool ept_vpid_cap_supported(uint64_t mask)
+{
+       return rdmsr(MSR_IA32_VMX_EPT_VPID_CAP) & mask;
+}
+
+bool ept_1g_pages_supported(void)
+{
+       return ept_vpid_cap_supported(VMX_EPT_VPID_CAP_1G_PAGES);
+}
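The capability helper reads the IA32_VMX_EPT_VPID_CAP MSR, where the SDM reports 2M EPT page support in bit 16 and 1G support in bit 17. The macro value below is an assumption consistent with that layout, not copied from the patch:

	/* Assumed encoding, per the SDM's IA32_VMX_EPT_VPID_CAP layout. */
	#define VMX_EPT_VPID_CAP_1G_PAGES	(1ULL << 17)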
+
 /*
  * Initialize the control fields to the most basic settings possible.
  */
@@ -215,7 +225,7 @@ static inline void init_vmcs_control_fields(struct vmx_pages *vmx)
                struct eptPageTablePointer eptp = {
                        .memory_type = VMX_BASIC_MEM_TYPE_WB,
                        .page_walk_length = 3, /* + 1 */
-                       .ad_enabled = !!(rdmsr(MSR_IA32_VMX_EPT_VPID_CAP) & VMX_EPT_VPID_CAP_AD_BITS),
+                       .ad_enabled = ept_vpid_cap_supported(VMX_EPT_VPID_CAP_AD_BITS),
                        .address = vmx->eptp_gpa >> PAGE_SHIFT_4K,
                };
 
@@ -392,80 +402,93 @@ void nested_vmx_check_supported(void)
        }
 }
 
-void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm,
-                  uint64_t nested_paddr, uint64_t paddr)
+static void nested_create_pte(struct kvm_vm *vm,
+                             struct eptPageTableEntry *pte,
+                             uint64_t nested_paddr,
+                             uint64_t paddr,
+                             int current_level,
+                             int target_level)
+{
+       if (!pte->readable) {
+               pte->writable = true;
+               pte->readable = true;
+               pte->executable = true;
+               pte->page_size = (current_level == target_level);
+               if (pte->page_size)
+                       pte->address = paddr >> vm->page_shift;
+               else
+                       pte->address = vm_alloc_page_table(vm) >> vm->page_shift;
+       } else {
+               /*
+                * Entry already present.  Assert that the caller doesn't want
+                * a hugepage at this level, and that there isn't a hugepage at
+                * this level.
+                */
+               TEST_ASSERT(current_level != target_level,
+                           "Cannot create hugepage at level: %u, nested_paddr: 0x%lx\n",
+                           current_level, nested_paddr);
+               TEST_ASSERT(!pte->page_size,
+                           "Cannot create page table at level: %u, nested_paddr: 0x%lx\n",
+                           current_level, nested_paddr);
+       }
+}
+
+void __nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm,
+                    uint64_t nested_paddr, uint64_t paddr, int target_level)
 {
-       uint16_t index[4];
-       struct eptPageTableEntry *pml4e;
+       const uint64_t page_size = PG_LEVEL_SIZE(target_level);
+       struct eptPageTableEntry *pt = vmx->eptp_hva, *pte;
+       uint16_t index;
 
        TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
                    "unknown or unsupported guest mode, mode: 0x%x", vm->mode);
 
-       TEST_ASSERT((nested_paddr % vm->page_size) == 0,
+       TEST_ASSERT((nested_paddr >> 48) == 0,
+                   "Nested physical address 0x%lx requires 5-level paging",
+                   nested_paddr);
+       TEST_ASSERT((nested_paddr % page_size) == 0,
                    "Nested physical address not on page boundary,\n"
-                   "  nested_paddr: 0x%lx vm->page_size: 0x%x",
-                   nested_paddr, vm->page_size);
+                   "  nested_paddr: 0x%lx page_size: 0x%lx",
+                   nested_paddr, page_size);
        TEST_ASSERT((nested_paddr >> vm->page_shift) <= vm->max_gfn,
                    "Physical address beyond beyond maximum supported,\n"
                    "  nested_paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x",
                    paddr, vm->max_gfn, vm->page_size);
-       TEST_ASSERT((paddr % vm->page_size) == 0,
+       TEST_ASSERT((paddr % page_size) == 0,
                    "Physical address not on page boundary,\n"
-                   "  paddr: 0x%lx vm->page_size: 0x%x",
-                   paddr, vm->page_size);
+                   "  paddr: 0x%lx page_size: 0x%lx",
+                   paddr, page_size);
        TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn,
                    "Physical address beyond beyond maximum supported,\n"
                    "  paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x",
                    paddr, vm->max_gfn, vm->page_size);
 
-       index[0] = (nested_paddr >> 12) & 0x1ffu;
-       index[1] = (nested_paddr >> 21) & 0x1ffu;
-       index[2] = (nested_paddr >> 30) & 0x1ffu;
-       index[3] = (nested_paddr >> 39) & 0x1ffu;
-
-       /* Allocate page directory pointer table if not present. */
-       pml4e = vmx->eptp_hva;
-       if (!pml4e[index[3]].readable) {
-               pml4e[index[3]].address = vm_alloc_page_table(vm) >> vm->page_shift;
-               pml4e[index[3]].writable = true;
-               pml4e[index[3]].readable = true;
-               pml4e[index[3]].executable = true;
-       }
+       for (int level = PG_LEVEL_512G; level >= PG_LEVEL_4K; level--) {
+               index = (nested_paddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu;
+               pte = &pt[index];
 
-       /* Allocate page directory table if not present. */
-       struct eptPageTableEntry *pdpe;
-       pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size);
-       if (!pdpe[index[2]].readable) {
-               pdpe[index[2]].address = vm_alloc_page_table(vm) >> vm->page_shift;
-               pdpe[index[2]].writable = true;
-               pdpe[index[2]].readable = true;
-               pdpe[index[2]].executable = true;
-       }
+               nested_create_pte(vm, pte, nested_paddr, paddr, level, target_level);
 
-       /* Allocate page table if not present. */
-       struct eptPageTableEntry *pde;
-       pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size);
-       if (!pde[index[1]].readable) {
-               pde[index[1]].address = vm_alloc_page_table(vm) >> vm->page_shift;
-               pde[index[1]].writable = true;
-               pde[index[1]].readable = true;
-               pde[index[1]].executable = true;
-       }
+               if (pte->page_size)
+                       break;
 
-       /* Fill in page table entry. */
-       struct eptPageTableEntry *pte;
-       pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size);
-       pte[index[0]].address = paddr >> vm->page_shift;
-       pte[index[0]].writable = true;
-       pte[index[0]].readable = true;
-       pte[index[0]].executable = true;
+               pt = addr_gpa2hva(vm, pte->address * vm->page_size);
+       }
 
        /*
         * For now mark these as accessed and dirty because the only
         * testcase we have needs that.  Can be reconsidered later.
         */
-       pte[index[0]].accessed = true;
-       pte[index[0]].dirty = true;
+       pte->accessed = true;
+       pte->dirty = true;
+}
+
+void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm,
+                  uint64_t nested_paddr, uint64_t paddr)
+{
+       __nested_pg_map(vmx, vm, nested_paddr, paddr, PG_LEVEL_4K);
 }
 
 /*
@@ -476,7 +499,7 @@ void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm,
  *   nested_paddr - Nested guest physical address to map
  *   paddr - VM Physical Address
  *   size - The size of the range to map
- *   eptp_memslot - Memory region slot for new virtual translation tables
+ *   level - The level at which to map the range
  *
  * Output Args: None
  *
@@ -485,22 +508,29 @@ void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm,
  * Within the VM given by vm, creates nested guest translations that map the
  * page range starting at nested_paddr onto the page range starting at paddr.
  */
-void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm,
-               uint64_t nested_paddr, uint64_t paddr, uint64_t size)
+void __nested_map(struct vmx_pages *vmx, struct kvm_vm *vm,
+                 uint64_t nested_paddr, uint64_t paddr, uint64_t size,
+                 int level)
 {
-       size_t page_size = vm->page_size;
+       size_t page_size = PG_LEVEL_SIZE(level);
        size_t npages = size / page_size;
 
        TEST_ASSERT(nested_paddr + size > nested_paddr, "Nested paddr overflow");
        TEST_ASSERT(paddr + size > paddr, "Paddr overflow");
 
        while (npages--) {
-               nested_pg_map(vmx, vm, nested_paddr, paddr);
+               __nested_pg_map(vmx, vm, nested_paddr, paddr, level);
                nested_paddr += page_size;
                paddr += page_size;
        }
 }
 
+void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm,
+               uint64_t nested_paddr, uint64_t paddr, uint64_t size)
+{
+       __nested_map(vmx, vm, nested_paddr, paddr, size, PG_LEVEL_4K);
+}
+
 /* Prepare an identity extended page table that maps all the
  * physical pages in VM.
  */
@@ -525,6 +555,13 @@ void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm,
        }
 }
 
+/* Identity map a region with 1GiB pages. */
+void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm,
+                           uint64_t addr, uint64_t size)
+{
+       __nested_map(vmx, vm, addr, addr, size, PG_LEVEL_1G);
+}
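Each 1G chunk costs one walk, so identity mapping the low 4 GiB, as perf_test_setup_ept() does above, touches one PML4 entry, one PDPT page and four leaf PDPTEs. A sketch of the equivalent call:

	/* Mirrors the nested_identity_map_1g(vmx, vm, 0, 0x100000000ULL)
	 * call in perf_test_setup_ept(): four 1G EPT leaves. */
	nested_identity_map_1g(vmx, vm, 0, 4ull << 30);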
+
 void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm,
                  uint32_t eptp_memslot)
 {
index 3875c4b23a04f37fd67f88c8774e6aa91caf09a2..15f046e19cb2c21a72cbaeba1ed26382fb690aea 100644 (file)
@@ -244,7 +244,7 @@ int main(int argc, char *argv[])
 #ifdef __x86_64__
                /* Identity map memory in the guest using 1GiB pages. */
                for (i = 0; i < slot_size; i += size_1gb)
-                       __virt_pg_map(vm, gpa + i, gpa + i, X86_PAGE_SIZE_1G);
+                       __virt_pg_map(vm, gpa + i, gpa + i, PG_LEVEL_1G);
 #else
                for (i = 0; i < slot_size; i += vm_get_page_size(vm))
                        virt_pg_map(vm, gpa + i, gpa + i);
index da2325fcad87bf52b65da4652f7c72e536f524a7..bdecd532f9356fa761974d259664c24fed271ef1 100644 (file)
@@ -35,7 +35,7 @@ static void mmu_role_test(u32 *cpuid_reg, u32 evil_cpuid_val)
        run = vcpu_state(vm, VCPU_ID);
 
        /* Map 1GiB page without a backing memslot. */
-       __virt_pg_map(vm, MMIO_GPA, MMIO_GPA, X86_PAGE_SIZE_1G);
+       __virt_pg_map(vm, MMIO_GPA, MMIO_GPA, PG_LEVEL_1G);
 
        r = _vcpu_run(vm, VCPU_ID);
 
index 44c47670447aa4cc250f255ae9e5ec31ad1df377..a49df8988cd6a46660c222f19ae2fb654a873e65 100644 (file)
@@ -3328,9 +3328,11 @@ bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
 
        vcpu->stat.generic.blocking = 1;
 
+       preempt_disable();
        kvm_arch_vcpu_blocking(vcpu);
-
        prepare_to_rcuwait(wait);
+       preempt_enable();
+
        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE);
 
@@ -3340,9 +3342,11 @@ bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
                waited = true;
                schedule();
        }
-       finish_rcuwait(wait);
 
+       preempt_disable();
+       finish_rcuwait(wait);
        kvm_arch_vcpu_unblocking(vcpu);
+       preempt_enable();
 
        vcpu->stat.generic.blocking = 0;
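Finally, the kvm_main.c hunks shrink the preemptible window in kvm_vcpu_block(): the arch blocking/unblocking hooks now run with preemption disabled, while the schedulable wait loop between them is unchanged. The motivation, per the AVIC fixes in this tag, is that the SVM hooks manipulate per-CPU APIC virtualization state, which is only safe on a stable CPU. A contrived sketch of the hazard, with hypothetical per-CPU data:

	/* Hypothetical bookkeeping, illustrating why the caller must
	 * hold off preemption around the arch hooks. */
	static DEFINE_PER_CPU(int, blocked_vcpu_id);

	static void arch_vcpu_blocking(struct kvm_vcpu *vcpu)
	{
		/*
		 * Records state for the CPU we are on right now; if the
		 * task is migrated as soon as this returns, the matching
		 * unblocking hook runs elsewhere and cleans up the wrong
		 * per-CPU slot.
		 */
		this_cpu_write(blocked_vcpu_id, vcpu->vcpu_id);
	}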