From: Greg Kroah-Hartman Date: Wed, 12 May 2021 10:53:26 +0000 (+0200) Subject: 5.11-stable patches X-Git-Tag: v5.4.119~41 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=81dddb2fb8924a1b34eac21eebd878acad51ebd4;p=thirdparty%2Fkernel%2Fstable-queue.git 5.11-stable patches added patches: kvm-arm-arm64-fix-kvm_vgic_v3_addr_type_redist-read.patch kvm-arm64-fix-kvm_vgic_v3_addr_type_redist_region-read.patch kvm-arm64-fully-zero-the-vcpu-state-on-reset.patch kvm-destroy-i-o-bus-devices-on-unregister-failure-_after_-sync-ing-srcu.patch kvm-nsvm-set-the-shadow-root-level-to-the-tdp-level-for-nested-npt.patch kvm-nvmx-defer-the-mmu-reload-to-the-normal-path-on-an-eptp-switch.patch kvm-nvmx-truncate-base-index-gpr-value-on-address-calc-in-64-bit.patch kvm-nvmx-truncate-bits-63-32-of-vmcs-field-on-nested-check-in-64-bit.patch kvm-selftests-always-run-vcpu-thread-with-blocked-sig_ipi.patch kvm-selftests-sync-data-verify-of-dirty-logging-with-guest-sync.patch kvm-stop-looking-for-coalesced-mmio-zones-if-the-bus-is-destroyed.patch kvm-svm-do-not-allow-sev-sev-es-initialization-after-vcpus-are-created.patch kvm-svm-do-not-set-sev-es_active-until-kvm_sev_es_init-completes.patch kvm-svm-don-t-strip-the-c-bit-from-cr2-on-pf-interception.patch kvm-svm-inject-gp-on-guest-msr_tsc_aux-accesses-if-rdtscp-unsupported.patch kvm-svm-use-online_vcpus-not-created_vcpus-to-iterate-over-vcpus.patch kvm-x86-fix-failure-to-boost-kernel-lock-holder-candidate-in-sev-es-guests.patch kvm-x86-mmu-alloc-page-for-pdptes-when-shadowing-32-bit-npt-with-64-bit.patch kvm-x86-remove-emulator-s-broken-checks-on-cr0-cr3-cr4-loads.patch --- diff --git a/queue-5.11/kvm-arm-arm64-fix-kvm_vgic_v3_addr_type_redist-read.patch b/queue-5.11/kvm-arm-arm64-fix-kvm_vgic_v3_addr_type_redist-read.patch new file mode 100644 index 00000000000..6e0d832b99d --- /dev/null +++ b/queue-5.11/kvm-arm-arm64-fix-kvm_vgic_v3_addr_type_redist-read.patch @@ -0,0 +1,42 @@ +From 94ac0835391efc1a30feda6fc908913ec012951e Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Mon, 12 Apr 2021 17:00:34 +0200 +Subject: KVM: arm/arm64: Fix KVM_VGIC_V3_ADDR_TYPE_REDIST read + +From: Eric Auger + +commit 94ac0835391efc1a30feda6fc908913ec012951e upstream. + +When reading the base address of the a REDIST region +through KVM_VGIC_V3_ADDR_TYPE_REDIST we expect the +redistributor region list to be populated with a single +element. + +However list_first_entry() expects the list to be non empty. +Instead we should use list_first_entry_or_null which effectively +returns NULL if the list is empty. 
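+
+As a minimal sketch of the difference (hypothetical struct foo, not
+the vgic code):
+
+  /*
+   * list_first_entry() just does container_of() on head->next, so on
+   * an empty list it returns a bogus pointer derived from the list
+   * head itself; list_first_entry_or_null() returns NULL instead.
+   */
+  struct foo *f = list_first_entry_or_null(&head, struct foo, node);
+
+  if (!f)
+          return -ENOENT; /* empty list handled safely */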
+ +Fixes: dbd9733ab674 ("KVM: arm/arm64: Replace the single rdist region by a list") +Cc: # v4.18+ +Signed-off-by: Eric Auger +Reported-by: Gavin Shan +Signed-off-by: Marc Zyngier +Link: https://lore.kernel.org/r/20210412150034.29185-1-eric.auger@redhat.com +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/kvm/vgic/vgic-kvm-device.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/arch/arm64/kvm/vgic/vgic-kvm-device.c ++++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c +@@ -87,8 +87,8 @@ int kvm_vgic_addr(struct kvm *kvm, unsig + r = vgic_v3_set_redist_base(kvm, 0, *addr, 0); + goto out; + } +- rdreg = list_first_entry(&vgic->rd_regions, +- struct vgic_redist_region, list); ++ rdreg = list_first_entry_or_null(&vgic->rd_regions, ++ struct vgic_redist_region, list); + if (!rdreg) + addr_ptr = &undef_value; + else diff --git a/queue-5.11/kvm-arm64-fix-kvm_vgic_v3_addr_type_redist_region-read.patch b/queue-5.11/kvm-arm64-fix-kvm_vgic_v3_addr_type_redist_region-read.patch new file mode 100644 index 00000000000..dddcaf36d2f --- /dev/null +++ b/queue-5.11/kvm-arm64-fix-kvm_vgic_v3_addr_type_redist_region-read.patch @@ -0,0 +1,39 @@ +From 53b16dd6ba5cf64ed147ac3523ec34651d553cb0 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Mon, 5 Apr 2021 18:39:34 +0200 +Subject: KVM: arm64: Fix KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION read + +From: Eric Auger + +commit 53b16dd6ba5cf64ed147ac3523ec34651d553cb0 upstream. + +The doc says: +"The characteristics of a specific redistributor region can + be read by presetting the index field in the attr data. + Only valid for KVM_DEV_TYPE_ARM_VGIC_V3" + +Unfortunately the existing code fails to read the input attr data. + +Fixes: 04c110932225 ("KVM: arm/arm64: Implement KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION") +Cc: stable@vger.kernel.org#v4.17+ +Signed-off-by: Eric Auger +Reviewed-by: Alexandru Elisei +Signed-off-by: Marc Zyngier +Link: https://lore.kernel.org/r/20210405163941.510258-3-eric.auger@redhat.com +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/kvm/vgic/vgic-kvm-device.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/arch/arm64/kvm/vgic/vgic-kvm-device.c ++++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c +@@ -226,6 +226,9 @@ static int vgic_get_common_attr(struct k + u64 addr; + unsigned long type = (unsigned long)attr->attr; + ++ if (copy_from_user(&addr, uaddr, sizeof(addr))) ++ return -EFAULT; ++ + r = kvm_vgic_addr(dev->kvm, type, &addr, false); + if (r) + return (r == -ENODEV) ? -ENXIO : r; diff --git a/queue-5.11/kvm-arm64-fully-zero-the-vcpu-state-on-reset.patch b/queue-5.11/kvm-arm64-fully-zero-the-vcpu-state-on-reset.patch new file mode 100644 index 00000000000..d4030876c26 --- /dev/null +++ b/queue-5.11/kvm-arm64-fully-zero-the-vcpu-state-on-reset.patch @@ -0,0 +1,40 @@ +From 85d703746154cdc6794b6654b587b0b0354c97e9 Mon Sep 17 00:00:00 2001 +From: Marc Zyngier +Date: Wed, 7 Apr 2021 18:54:16 +0100 +Subject: KVM: arm64: Fully zero the vcpu state on reset + +From: Marc Zyngier + +commit 85d703746154cdc6794b6654b587b0b0354c97e9 upstream. + +On vcpu reset, we expect all the registers to be brought back +to their initial state, which happens to be a bunch of zeroes. + +However, some recent commit broke this, and is now leaving a bunch +of registers (such as the FP state) with whatever was left by the +guest. My bad. + +Zero the reset of the state (32bit SPSRs and FPSIMD state). 
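+
+A hedged userspace sketch of the invariant this restores (assumes an
+already-initialized arm64 vcpu_fd, plus <linux/kvm.h>, <sys/ioctl.h>
+and <assert.h>; V0 is the first FPSIMD register in the core register
+block):
+
+  struct kvm_one_reg reg;
+  __uint128_t v0 = ~(__uint128_t)0;
+
+  reg.id   = KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE |
+             KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]);
+  reg.addr = (__u64)&v0;
+
+  /* After a vcpu reset the register must read back as zero; before
+   * this fix it could still hold stale guest FP state. */
+  ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
+  assert(v0 == 0);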
+ +Cc: stable@vger.kernel.org +Fixes: e47c2055c68e ("KVM: arm64: Make struct kvm_regs userspace-only") +Signed-off-by: Marc Zyngier +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/kvm/reset.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/arch/arm64/kvm/reset.c ++++ b/arch/arm64/kvm/reset.c +@@ -242,6 +242,11 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu + + /* Reset core registers */ + memset(vcpu_gp_regs(vcpu), 0, sizeof(*vcpu_gp_regs(vcpu))); ++ memset(&vcpu->arch.ctxt.fp_regs, 0, sizeof(vcpu->arch.ctxt.fp_regs)); ++ vcpu->arch.ctxt.spsr_abt = 0; ++ vcpu->arch.ctxt.spsr_und = 0; ++ vcpu->arch.ctxt.spsr_irq = 0; ++ vcpu->arch.ctxt.spsr_fiq = 0; + vcpu_gp_regs(vcpu)->pstate = pstate; + + /* Reset system registers */ diff --git a/queue-5.11/kvm-destroy-i-o-bus-devices-on-unregister-failure-_after_-sync-ing-srcu.patch b/queue-5.11/kvm-destroy-i-o-bus-devices-on-unregister-failure-_after_-sync-ing-srcu.patch new file mode 100644 index 00000000000..05898f09e0c --- /dev/null +++ b/queue-5.11/kvm-destroy-i-o-bus-devices-on-unregister-failure-_after_-sync-ing-srcu.patch @@ -0,0 +1,51 @@ +From 2ee3757424be7c1cd1d0bbfa6db29a7edd82a250 Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Mon, 12 Apr 2021 15:20:48 -0700 +Subject: KVM: Destroy I/O bus devices on unregister failure _after_ sync'ing SRCU + +From: Sean Christopherson + +commit 2ee3757424be7c1cd1d0bbfa6db29a7edd82a250 upstream. + +If allocating a new instance of an I/O bus fails when unregistering a +device, wait to destroy the device until after all readers are guaranteed +to see the new null bus. Destroying devices before the bus is nullified +could lead to use-after-free since readers expect the devices on their +reference of the bus to remain valid. + +Fixes: f65886606c2d ("KVM: fix memory leak in kvm_io_bus_unregister_dev()") +Cc: stable@vger.kernel.org +Signed-off-by: Sean Christopherson +Message-Id: <20210412222050.876100-2-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + virt/kvm/kvm_main.c | 10 +++++++--- + 1 file changed, 7 insertions(+), 3 deletions(-) + +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -4487,7 +4487,13 @@ void kvm_io_bus_unregister_dev(struct kv + new_bus->dev_count--; + memcpy(new_bus->range + i, bus->range + i + 1, + flex_array_size(new_bus, range, new_bus->dev_count - i)); +- } else { ++ } ++ ++ rcu_assign_pointer(kvm->buses[bus_idx], new_bus); ++ synchronize_srcu_expedited(&kvm->srcu); ++ ++ /* Destroy the old bus _after_ installing the (null) bus. */ ++ if (!new_bus) { + pr_err("kvm: failed to shrink bus, removing it completely\n"); + for (j = 0; j < bus->dev_count; j++) { + if (j == i) +@@ -4496,8 +4502,6 @@ void kvm_io_bus_unregister_dev(struct kv + } + } + +- rcu_assign_pointer(kvm->buses[bus_idx], new_bus); +- synchronize_srcu_expedited(&kvm->srcu); + kfree(bus); + return; + } diff --git a/queue-5.11/kvm-nsvm-set-the-shadow-root-level-to-the-tdp-level-for-nested-npt.patch b/queue-5.11/kvm-nsvm-set-the-shadow-root-level-to-the-tdp-level-for-nested-npt.patch new file mode 100644 index 00000000000..b3082c98612 --- /dev/null +++ b/queue-5.11/kvm-nsvm-set-the-shadow-root-level-to-the-tdp-level-for-nested-npt.patch @@ -0,0 +1,46 @@ +From a3322d5cd87fef5ec0037fd1b14068a533f9a60f Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Thu, 4 Mar 2021 17:10:45 -0800 +Subject: KVM: nSVM: Set the shadow root level to the TDP level for nested NPT + +From: Sean Christopherson + +commit a3322d5cd87fef5ec0037fd1b14068a533f9a60f upstream. 
+ +Override the shadow root level in the MMU context when configuring +NPT for shadowing nested NPT. The level is always tied to the TDP level +of the host, not whatever level the guest happens to be using. + +Fixes: 096586fda522 ("KVM: nSVM: Correctly set the shadow NPT root level in its MMU role") +Cc: stable@vger.kernel.org +Signed-off-by: Sean Christopherson +Message-Id: <20210305011101.3597423-2-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/mmu/mmu.c | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -4630,12 +4630,17 @@ void kvm_init_shadow_npt_mmu(struct kvm_ + struct kvm_mmu *context = &vcpu->arch.guest_mmu; + union kvm_mmu_role new_role = kvm_calc_shadow_npt_root_page_role(vcpu); + +- context->shadow_root_level = new_role.base.level; +- + __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base, false, false); + +- if (new_role.as_u64 != context->mmu_role.as_u64) ++ if (new_role.as_u64 != context->mmu_role.as_u64) { + shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role); ++ ++ /* ++ * Override the level set by the common init helper, nested TDP ++ * always uses the host's TDP configuration. ++ */ ++ context->shadow_root_level = new_role.base.level; ++ } + } + EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu); + diff --git a/queue-5.11/kvm-nvmx-defer-the-mmu-reload-to-the-normal-path-on-an-eptp-switch.patch b/queue-5.11/kvm-nvmx-defer-the-mmu-reload-to-the-normal-path-on-an-eptp-switch.patch new file mode 100644 index 00000000000..aa6182013bc --- /dev/null +++ b/queue-5.11/kvm-nvmx-defer-the-mmu-reload-to-the-normal-path-on-an-eptp-switch.patch @@ -0,0 +1,47 @@ +From c805f5d5585ab5e0cdac6b1ccf7086eb120fb7db Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Thu, 4 Mar 2021 17:10:57 -0800 +Subject: KVM: nVMX: Defer the MMU reload to the normal path on an EPTP switch + +From: Sean Christopherson + +commit c805f5d5585ab5e0cdac6b1ccf7086eb120fb7db upstream. + +Defer reloading the MMU after a EPTP successful EPTP switch. The VMFUNC +instruction itself is executed in the previous EPTP context, any side +effects, e.g. updating RIP, should occur in the old context. Practically +speaking, this bug is benign as VMX doesn't touch the MMU when skipping +an emulated instruction, nor does queuing a single-step #DB. No other +post-switch side effects exist. + +Fixes: 41ab93727467 ("KVM: nVMX: Emulate EPTP switching for the L1 hypervisor") +Cc: stable@vger.kernel.org +Signed-off-by: Sean Christopherson +Message-Id: <20210305011101.3597423-14-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/vmx/nested.c | 9 ++------- + 1 file changed, 2 insertions(+), 7 deletions(-) + +--- a/arch/x86/kvm/vmx/nested.c ++++ b/arch/x86/kvm/vmx/nested.c +@@ -5517,16 +5517,11 @@ static int nested_vmx_eptp_switching(str + if (!nested_vmx_check_eptp(vcpu, new_eptp)) + return 1; + +- kvm_mmu_unload(vcpu); + mmu->ept_ad = accessed_dirty; + mmu->mmu_role.base.ad_disabled = !accessed_dirty; + vmcs12->ept_pointer = new_eptp; +- /* +- * TODO: Check what's the correct approach in case +- * mmu reload fails. 
Currently, we just let the next +- * reload potentially fail +- */ +- kvm_mmu_reload(vcpu); ++ ++ kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); + } + + return 0; diff --git a/queue-5.11/kvm-nvmx-truncate-base-index-gpr-value-on-address-calc-in-64-bit.patch b/queue-5.11/kvm-nvmx-truncate-base-index-gpr-value-on-address-calc-in-64-bit.patch new file mode 100644 index 00000000000..58f7fc9aa43 --- /dev/null +++ b/queue-5.11/kvm-nvmx-truncate-base-index-gpr-value-on-address-calc-in-64-bit.patch @@ -0,0 +1,37 @@ +From 82277eeed65eed6c6ee5b8f97bd978763eab148f Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Wed, 21 Apr 2021 19:21:25 -0700 +Subject: KVM: nVMX: Truncate base/index GPR value on address calc in !64-bit + +From: Sean Christopherson + +commit 82277eeed65eed6c6ee5b8f97bd978763eab148f upstream. + +Drop bits 63:32 of the base and/or index GPRs when calculating the +effective address of a VMX instruction memory operand. Outside of 64-bit +mode, memory encodings are strictly limited to E*X and below. + +Fixes: 064aea774768 ("KVM: nVMX: Decoding memory operands of VMX instructions") +Cc: stable@vger.kernel.org +Signed-off-by: Sean Christopherson +Message-Id: <20210422022128.3464144-7-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/vmx/nested.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/arch/x86/kvm/vmx/nested.c ++++ b/arch/x86/kvm/vmx/nested.c +@@ -4639,9 +4639,9 @@ int get_vmx_mem_address(struct kvm_vcpu + else if (addr_size == 0) + off = (gva_t)sign_extend64(off, 15); + if (base_is_valid) +- off += kvm_register_read(vcpu, base_reg); ++ off += kvm_register_readl(vcpu, base_reg); + if (index_is_valid) +- off += kvm_register_read(vcpu, index_reg) << scaling; ++ off += kvm_register_readl(vcpu, index_reg) << scaling; + vmx_get_segment(vcpu, &s, seg_reg); + + /* diff --git a/queue-5.11/kvm-nvmx-truncate-bits-63-32-of-vmcs-field-on-nested-check-in-64-bit.patch b/queue-5.11/kvm-nvmx-truncate-bits-63-32-of-vmcs-field-on-nested-check-in-64-bit.patch new file mode 100644 index 00000000000..a835bf07569 --- /dev/null +++ b/queue-5.11/kvm-nvmx-truncate-bits-63-32-of-vmcs-field-on-nested-check-in-64-bit.patch @@ -0,0 +1,38 @@ +From ee050a577523dfd5fac95e6cc182ebe0293ead59 Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Wed, 21 Apr 2021 19:21:24 -0700 +Subject: KVM: nVMX: Truncate bits 63:32 of VMCS field on nested check in !64-bit + +From: Sean Christopherson + +commit ee050a577523dfd5fac95e6cc182ebe0293ead59 upstream. + +Drop bits 63:32 of the VMCS field encoding when checking for a nested +VM-Exit on VMREAD/VMWRITE in !64-bit mode. VMREAD and VMWRITE always +use 32-bit operands outside of 64-bit mode. + +The actual emulation of VMREAD/VMWRITE does the right thing, this bug is +purely limited to incorrectly causing a nested VM-Exit if a GPR happens +to have bits 63:32 set outside of 64-bit mode. 
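+
+For reference, kvm_register_readl() in this tree is roughly:
+
+  static inline unsigned long kvm_register_readl(struct kvm_vcpu *vcpu,
+                                                 int reg)
+  {
+          unsigned long val = kvm_register_read(vcpu, reg);
+
+          /* Outside 64-bit mode only bits 31:0 of a GPR exist
+           * architecturally, so drop bits 63:32. */
+          return is_64_bit_mode(vcpu) ? val : (u32)val;
+  }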
+ +Fixes: a7cde481b6e8 ("KVM: nVMX: Do not forward VMREAD/VMWRITE VMExits to L1 if required so by vmcs12 vmread/vmwrite bitmaps") +Cc: stable@vger.kernel.org +Signed-off-by: Sean Christopherson +Message-Id: <20210422022128.3464144-6-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/vmx/nested.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/x86/kvm/vmx/nested.c ++++ b/arch/x86/kvm/vmx/nested.c +@@ -5750,7 +5750,7 @@ static bool nested_vmx_exit_handled_vmcs + + /* Decode instruction info and find the field to access */ + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); +- field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); ++ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + + /* Out-of-range fields always cause a VM exit from L2 to L1 */ + if (field >> 15) diff --git a/queue-5.11/kvm-selftests-always-run-vcpu-thread-with-blocked-sig_ipi.patch b/queue-5.11/kvm-selftests-always-run-vcpu-thread-with-blocked-sig_ipi.patch new file mode 100644 index 00000000000..0fd602529dc --- /dev/null +++ b/queue-5.11/kvm-selftests-always-run-vcpu-thread-with-blocked-sig_ipi.patch @@ -0,0 +1,57 @@ +From bf1e15a82e3b74ee86bb119d6038b41e1ed2b319 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Tue, 20 Apr 2021 04:13:03 -0400 +Subject: KVM: selftests: Always run vCPU thread with blocked SIG_IPI + +From: Paolo Bonzini + +commit bf1e15a82e3b74ee86bb119d6038b41e1ed2b319 upstream. + +The main thread could start to send SIG_IPI at any time, even before signal +blocked on vcpu thread. Therefore, start the vcpu thread with the signal +blocked. + +Without this patch, on very busy cores the dirty_log_test could fail directly +on receiving a SIGUSR1 without a handler (when vcpu runs far slower than main). + +Reported-by: Peter Xu +Cc: stable@vger.kernel.org +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + tools/testing/selftests/kvm/dirty_log_test.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +--- a/tools/testing/selftests/kvm/dirty_log_test.c ++++ b/tools/testing/selftests/kvm/dirty_log_test.c +@@ -527,9 +527,8 @@ static void *vcpu_worker(void *data) + */ + sigmask->len = 8; + pthread_sigmask(0, NULL, sigset); ++ sigdelset(sigset, SIG_IPI); + vcpu_ioctl(vm, VCPU_ID, KVM_SET_SIGNAL_MASK, sigmask); +- sigaddset(sigset, SIG_IPI); +- pthread_sigmask(SIG_BLOCK, sigset, NULL); + + sigemptyset(sigset); + sigaddset(sigset, SIG_IPI); +@@ -858,6 +857,7 @@ int main(int argc, char *argv[]) + .interval = TEST_HOST_LOOP_INTERVAL, + }; + int opt, i; ++ sigset_t sigset; + + sem_init(&sem_vcpu_stop, 0, 0); + sem_init(&sem_vcpu_cont, 0, 0); +@@ -916,6 +916,11 @@ int main(int argc, char *argv[]) + + srandom(time(0)); + ++ /* Ensure that vCPU threads start with SIG_IPI blocked. 
*/ ++ sigemptyset(&sigset); ++ sigaddset(&sigset, SIG_IPI); ++ pthread_sigmask(SIG_BLOCK, &sigset, NULL); ++ + if (host_log_mode_option == LOG_MODE_ALL) { + /* Run each log mode */ + for (i = 0; i < LOG_MODE_NUM; i++) { diff --git a/queue-5.11/kvm-selftests-sync-data-verify-of-dirty-logging-with-guest-sync.patch b/queue-5.11/kvm-selftests-sync-data-verify-of-dirty-logging-with-guest-sync.patch new file mode 100644 index 00000000000..a6628e094d1 --- /dev/null +++ b/queue-5.11/kvm-selftests-sync-data-verify-of-dirty-logging-with-guest-sync.patch @@ -0,0 +1,212 @@ +From 016ff1a442d9a8f36dcb3beca0bcdfc35e281e18 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Sat, 17 Apr 2021 10:36:01 -0400 +Subject: KVM: selftests: Sync data verify of dirty logging with guest sync + +From: Peter Xu + +commit 016ff1a442d9a8f36dcb3beca0bcdfc35e281e18 upstream. + +This fixes a bug that can trigger with e.g. "taskset -c 0 ./dirty_log_test" or +when the testing host is very busy. + +A similar previous attempt is done [1] but that is not enough, the reason is +stated in the reply [2]. + +As a summary (partly quotting from [2]): + +The problem is I think one guest memory write operation (of this specific test) +contains a few micro-steps when page is during kvm dirty tracking (here I'm +only considering write-protect rather than pml but pml should be similar at +least when the log buffer is full): + + (1) Guest read 'iteration' number into register, prepare to write, page fault + (2) Set dirty bit in either dirty bitmap or dirty ring + (3) Return to guest, data written + +When we verify the data, we assumed that all these steps are "atomic", say, +when (1) happened for this page, we assume (2) & (3) must have happened. We +had some trick to workaround "un-atomicity" of above three steps, as previous +version of this patch wanted to fix atomicity of step (2)+(3) by explicitly +letting the main thread wait for at least one vmenter of vcpu thread, which +should work. However what I overlooked is probably that we still have race +when (1) and (2) can be interrupted. + +One example calltrace when it could happen that we read an old interation, got +interrupted before even setting the dirty bit and flushing data: + + __schedule+1742 + __cond_resched+52 + __get_user_pages+530 + get_user_pages_unlocked+197 + hva_to_pfn+206 + try_async_pf+132 + direct_page_fault+320 + kvm_mmu_page_fault+103 + vmx_handle_exit+288 + vcpu_enter_guest+2460 + kvm_arch_vcpu_ioctl_run+325 + kvm_vcpu_ioctl+526 + __x64_sys_ioctl+131 + do_syscall_64+51 + entry_SYSCALL_64_after_hwframe+68 + +It means iteration number cached in vcpu register can be very old when dirty +bit set and data flushed. + +So far I don't see an easy way to guarantee all steps 1-3 atomicity but to sync +at the GUEST_SYNC() point of guest code when we do verification of the dirty +bits as what this patch does. 
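+
+Condensed sketch of the handshake this patch adds (same primitives as
+the test itself):
+
+  /* vcpu thread, at every GUEST_SYNC: */
+  if (atomic_read(&vcpu_sync_stop_requested)) {
+          atomic_set(&vcpu_sync_stop_requested, false);
+          sem_post(&sem_vcpu_stop);       /* parked at a safe point */
+          sem_wait_until(&sem_vcpu_cont);
+  }
+
+  /* main thread, around each verification pass: */
+  atomic_set(&vcpu_sync_stop_requested, true);
+  sem_wait_until(&sem_vcpu_stop);
+  vm_dirty_log_verify(mode, bmap);
+  sem_post(&sem_vcpu_cont);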
+ +[1] https://lore.kernel.org/lkml/20210413213641.23742-1-peterx@redhat.com/ +[2] https://lore.kernel.org/lkml/20210417140956.GV4440@xz-x1/ + +Cc: Paolo Bonzini +Cc: Sean Christopherson +Cc: Andrew Jones +Cc: stable@vger.kernel.org +Signed-off-by: Peter Xu +Message-Id: <20210417143602.215059-2-peterx@redhat.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + tools/testing/selftests/kvm/dirty_log_test.c | 60 ++++++++++++++++++++++----- + 1 file changed, 50 insertions(+), 10 deletions(-) + +--- a/tools/testing/selftests/kvm/dirty_log_test.c ++++ b/tools/testing/selftests/kvm/dirty_log_test.c +@@ -17,6 +17,7 @@ + #include + #include + #include ++#include + + #include "kvm_util.h" + #include "test_util.h" +@@ -137,12 +138,20 @@ static uint64_t host_clear_count; + static uint64_t host_track_next_count; + + /* Whether dirty ring reset is requested, or finished */ +-static sem_t dirty_ring_vcpu_stop; +-static sem_t dirty_ring_vcpu_cont; ++static sem_t sem_vcpu_stop; ++static sem_t sem_vcpu_cont; ++/* ++ * This is only set by main thread, and only cleared by vcpu thread. It is ++ * used to request vcpu thread to stop at the next GUEST_SYNC, since GUEST_SYNC ++ * is the only place that we'll guarantee both "dirty bit" and "dirty data" ++ * will match. E.g., SIG_IPI won't guarantee that if the vcpu is interrupted ++ * after setting dirty bit but before the data is written. ++ */ ++static atomic_t vcpu_sync_stop_requested; + /* + * This is updated by the vcpu thread to tell the host whether it's a + * ring-full event. It should only be read until a sem_wait() of +- * dirty_ring_vcpu_stop and before vcpu continues to run. ++ * sem_vcpu_stop and before vcpu continues to run. + */ + static bool dirty_ring_vcpu_ring_full; + /* +@@ -234,6 +243,17 @@ static void clear_log_collect_dirty_page + kvm_vm_clear_dirty_log(vm, slot, bitmap, 0, num_pages); + } + ++/* Should only be called after a GUEST_SYNC */ ++static void vcpu_handle_sync_stop(void) ++{ ++ if (atomic_read(&vcpu_sync_stop_requested)) { ++ /* It means main thread is sleeping waiting */ ++ atomic_set(&vcpu_sync_stop_requested, false); ++ sem_post(&sem_vcpu_stop); ++ sem_wait_until(&sem_vcpu_cont); ++ } ++} ++ + static void default_after_vcpu_run(struct kvm_vm *vm, int ret, int err) + { + struct kvm_run *run = vcpu_state(vm, VCPU_ID); +@@ -244,6 +264,8 @@ static void default_after_vcpu_run(struc + TEST_ASSERT(get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC, + "Invalid guest sync status: exit_reason=%s\n", + exit_reason_str(run->exit_reason)); ++ ++ vcpu_handle_sync_stop(); + } + + static bool dirty_ring_supported(void) +@@ -301,13 +323,13 @@ static void dirty_ring_wait_vcpu(void) + { + /* This makes sure that hardware PML cache flushed */ + vcpu_kick(); +- sem_wait_until(&dirty_ring_vcpu_stop); ++ sem_wait_until(&sem_vcpu_stop); + } + + static void dirty_ring_continue_vcpu(void) + { + pr_info("Notifying vcpu to continue\n"); +- sem_post(&dirty_ring_vcpu_cont); ++ sem_post(&sem_vcpu_cont); + } + + static void dirty_ring_collect_dirty_pages(struct kvm_vm *vm, int slot, +@@ -361,11 +383,11 @@ static void dirty_ring_after_vcpu_run(st + /* Update the flag first before pause */ + WRITE_ONCE(dirty_ring_vcpu_ring_full, + run->exit_reason == KVM_EXIT_DIRTY_RING_FULL); +- sem_post(&dirty_ring_vcpu_stop); ++ sem_post(&sem_vcpu_stop); + pr_info("vcpu stops because %s...\n", + dirty_ring_vcpu_ring_full ? 
+ "dirty ring is full" : "vcpu is kicked out"); +- sem_wait_until(&dirty_ring_vcpu_cont); ++ sem_wait_until(&sem_vcpu_cont); + pr_info("vcpu continues now.\n"); + } else { + TEST_ASSERT(false, "Invalid guest sync status: " +@@ -377,7 +399,7 @@ static void dirty_ring_after_vcpu_run(st + static void dirty_ring_before_vcpu_join(void) + { + /* Kick another round of vcpu just to make sure it will quit */ +- sem_post(&dirty_ring_vcpu_cont); ++ sem_post(&sem_vcpu_cont); + } + + struct log_mode { +@@ -768,7 +790,25 @@ static void run_test(enum vm_guest_mode + usleep(p->interval * 1000); + log_mode_collect_dirty_pages(vm, TEST_MEM_SLOT_INDEX, + bmap, host_num_pages); ++ ++ /* ++ * See vcpu_sync_stop_requested definition for details on why ++ * we need to stop vcpu when verify data. ++ */ ++ atomic_set(&vcpu_sync_stop_requested, true); ++ sem_wait_until(&sem_vcpu_stop); ++ /* ++ * NOTE: for dirty ring, it's possible that we didn't stop at ++ * GUEST_SYNC but instead we stopped because ring is full; ++ * that's okay too because ring full means we're only missing ++ * the flush of the last page, and since we handle the last ++ * page specially verification will succeed anyway. ++ */ ++ assert(host_log_mode == LOG_MODE_DIRTY_RING || ++ atomic_read(&vcpu_sync_stop_requested) == false); + vm_dirty_log_verify(mode, bmap); ++ sem_post(&sem_vcpu_cont); ++ + iteration++; + sync_global_to_guest(vm, iteration); + } +@@ -819,8 +859,8 @@ int main(int argc, char *argv[]) + }; + int opt, i; + +- sem_init(&dirty_ring_vcpu_stop, 0, 0); +- sem_init(&dirty_ring_vcpu_cont, 0, 0); ++ sem_init(&sem_vcpu_stop, 0, 0); ++ sem_init(&sem_vcpu_cont, 0, 0); + + guest_modes_append_default(); + diff --git a/queue-5.11/kvm-stop-looking-for-coalesced-mmio-zones-if-the-bus-is-destroyed.patch b/queue-5.11/kvm-stop-looking-for-coalesced-mmio-zones-if-the-bus-is-destroyed.patch new file mode 100644 index 00000000000..63cdae35209 --- /dev/null +++ b/queue-5.11/kvm-stop-looking-for-coalesced-mmio-zones-if-the-bus-is-destroyed.patch @@ -0,0 +1,128 @@ +From 5d3c4c79384af06e3c8e25b7770b6247496b4417 Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Mon, 12 Apr 2021 15:20:49 -0700 +Subject: KVM: Stop looking for coalesced MMIO zones if the bus is destroyed + +From: Sean Christopherson + +commit 5d3c4c79384af06e3c8e25b7770b6247496b4417 upstream. + +Abort the walk of coalesced MMIO zones if kvm_io_bus_unregister_dev() +fails to allocate memory for the new instance of the bus. If it can't +instantiate a new bus, unregister_dev() destroys all devices _except_ the +target device. But, it doesn't tell the caller that it obliterated the +bus and invoked the destructor for all devices that were on the bus. In +the coalesced MMIO case, this can result in a deleted list entry +dereference due to attempting to continue iterating on coalesced_zones +after future entries (in the walk) have been deleted. + +Opportunistically add curly braces to the for-loop, which encompasses +many lines but sneaks by without braces due to the guts being a single +if statement. 
+ +Fixes: f65886606c2d ("KVM: fix memory leak in kvm_io_bus_unregister_dev()") +Cc: stable@vger.kernel.org +Reported-by: Hao Sun +Signed-off-by: Sean Christopherson +Message-Id: <20210412222050.876100-3-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/kvm_host.h | 4 ++-- + virt/kvm/coalesced_mmio.c | 19 +++++++++++++++++-- + virt/kvm/kvm_main.c | 10 +++++----- + 3 files changed, 24 insertions(+), 9 deletions(-) + +--- a/include/linux/kvm_host.h ++++ b/include/linux/kvm_host.h +@@ -191,8 +191,8 @@ int kvm_io_bus_read(struct kvm_vcpu *vcp + int len, void *val); + int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, + int len, struct kvm_io_device *dev); +-void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, +- struct kvm_io_device *dev); ++int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, ++ struct kvm_io_device *dev); + struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, + gpa_t addr); + +--- a/virt/kvm/coalesced_mmio.c ++++ b/virt/kvm/coalesced_mmio.c +@@ -174,21 +174,36 @@ int kvm_vm_ioctl_unregister_coalesced_mm + struct kvm_coalesced_mmio_zone *zone) + { + struct kvm_coalesced_mmio_dev *dev, *tmp; ++ int r; + + if (zone->pio != 1 && zone->pio != 0) + return -EINVAL; + + mutex_lock(&kvm->slots_lock); + +- list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list) ++ list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list) { + if (zone->pio == dev->zone.pio && + coalesced_mmio_in_range(dev, zone->addr, zone->size)) { +- kvm_io_bus_unregister_dev(kvm, ++ r = kvm_io_bus_unregister_dev(kvm, + zone->pio ? KVM_PIO_BUS : KVM_MMIO_BUS, &dev->dev); + kvm_iodevice_destructor(&dev->dev); ++ ++ /* ++ * On failure, unregister destroys all devices on the ++ * bus _except_ the target device, i.e. coalesced_zones ++ * has been modified. No need to restart the walk as ++ * there aren't any zones left. ++ */ ++ if (r) ++ break; + } ++ } + + mutex_unlock(&kvm->slots_lock); + ++ /* ++ * Ignore the result of kvm_io_bus_unregister_dev(), from userspace's ++ * perspective, the coalesced MMIO is most definitely unregistered. ++ */ + return 0; + } +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -4462,15 +4462,15 @@ int kvm_io_bus_register_dev(struct kvm * + } + + /* Caller must hold slots_lock. */ +-void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, +- struct kvm_io_device *dev) ++int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, ++ struct kvm_io_device *dev) + { + int i, j; + struct kvm_io_bus *new_bus, *bus; + + bus = kvm_get_bus(kvm, bus_idx); + if (!bus) +- return; ++ return 0; + + for (i = 0; i < bus->dev_count; i++) + if (bus->range[i].dev == dev) { +@@ -4478,7 +4478,7 @@ void kvm_io_bus_unregister_dev(struct kv + } + + if (i == bus->dev_count) +- return; ++ return 0; + + new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1), + GFP_KERNEL_ACCOUNT); +@@ -4503,7 +4503,7 @@ void kvm_io_bus_unregister_dev(struct kv + } + + kfree(bus); +- return; ++ return new_bus ? 
0 : -ENOMEM; + } + + struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, diff --git a/queue-5.11/kvm-svm-do-not-allow-sev-sev-es-initialization-after-vcpus-are-created.patch b/queue-5.11/kvm-svm-do-not-allow-sev-sev-es-initialization-after-vcpus-are-created.patch new file mode 100644 index 00000000000..26e89584f82 --- /dev/null +++ b/queue-5.11/kvm-svm-do-not-allow-sev-sev-es-initialization-after-vcpus-are-created.patch @@ -0,0 +1,42 @@ +From 8727906fde6ea665b52e68ddc58833772537f40a Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Tue, 30 Mar 2021 20:19:36 -0700 +Subject: KVM: SVM: Do not allow SEV/SEV-ES initialization after vCPUs are created + +From: Sean Christopherson + +commit 8727906fde6ea665b52e68ddc58833772537f40a upstream. + +Reject KVM_SEV_INIT and KVM_SEV_ES_INIT if they are attempted after one +or more vCPUs have been created. KVM assumes a VM is tagged SEV/SEV-ES +prior to vCPU creation, e.g. init_vmcb() needs to mark the VMCB as SEV +enabled, and svm_create_vcpu() needs to allocate the VMSA. At best, +creating vCPUs before SEV/SEV-ES init will lead to unexpected errors +and/or behavior, and at worst it will crash the host, e.g. +sev_launch_update_vmsa() will dereference a null svm->vmsa pointer. + +Fixes: 1654efcbc431 ("KVM: SVM: Add KVM_SEV_INIT command") +Fixes: ad73109ae7ec ("KVM: SVM: Provide support to launch and run an SEV-ES guest") +Cc: stable@vger.kernel.org +Cc: Brijesh Singh +Cc: Tom Lendacky +Signed-off-by: Sean Christopherson +Message-Id: <20210331031936.2495277-4-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/svm/sev.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -181,6 +181,9 @@ static int sev_guest_init(struct kvm *kv + bool es_active = argp->id == KVM_SEV_ES_INIT; + int asid, ret; + ++ if (kvm->created_vcpus) ++ return -EINVAL; ++ + ret = -EBUSY; + if (unlikely(sev->active)) + return ret; diff --git a/queue-5.11/kvm-svm-do-not-set-sev-es_active-until-kvm_sev_es_init-completes.patch b/queue-5.11/kvm-svm-do-not-set-sev-es_active-until-kvm_sev_es_init-completes.patch new file mode 100644 index 00000000000..9b3be5be00c --- /dev/null +++ b/queue-5.11/kvm-svm-do-not-set-sev-es_active-until-kvm_sev_es_init-completes.patch @@ -0,0 +1,113 @@ +From 9fa1521daafb58d878d03d75f6863a11312fae22 Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Tue, 30 Mar 2021 20:19:35 -0700 +Subject: KVM: SVM: Do not set sev->es_active until KVM_SEV_ES_INIT completes + +From: Sean Christopherson + +commit 9fa1521daafb58d878d03d75f6863a11312fae22 upstream. + +Set sev->es_active only after the guts of KVM_SEV_ES_INIT succeeds. If +the command fails, e.g. because SEV is already active or there are no +available ASIDs, then es_active will be left set even though the VM is +not fully SEV-ES capable. + +Refactor the code so that "es_active" is passed on the stack instead of +being prematurely shoved into sev_info, both to avoid having to unwind +sev_info and so that it's more obvious what actually consumes es_active +in sev_guest_init() and its helpers. 
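+
+The idiom, in brief: keep fallible state in locals and commit to the
+long-lived struct only once nothing can fail (sketch):
+
+  bool es_active = argp->id == KVM_SEV_ES_INIT;
+  int asid = sev_asid_new(es_active);
+
+  if (asid < 0)
+          return asid;
+  /* ... remaining fallible setup ... */
+  sev->active    = true;
+  sev->es_active = es_active;     /* set only on success */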
+ +Fixes: ad73109ae7ec ("KVM: SVM: Provide support to launch and run an SEV-ES guest") +Cc: stable@vger.kernel.org +Cc: Brijesh Singh +Cc: Tom Lendacky +Signed-off-by: Sean Christopherson +Message-Id: <20210331031936.2495277-3-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/svm/sev.c | 29 ++++++++++++----------------- + 1 file changed, 12 insertions(+), 17 deletions(-) + +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -86,7 +86,7 @@ static bool __sev_recycle_asids(int min_ + return true; + } + +-static int sev_asid_new(struct kvm_sev_info *sev) ++static int sev_asid_new(bool es_active) + { + int pos, min_asid, max_asid; + bool retry = true; +@@ -97,8 +97,8 @@ static int sev_asid_new(struct kvm_sev_i + * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid. + * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1. + */ +- min_asid = sev->es_active ? 0 : min_sev_asid - 1; +- max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid; ++ min_asid = es_active ? 0 : min_sev_asid - 1; ++ max_asid = es_active ? min_sev_asid - 1 : max_sev_asid; + again: + pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_asid); + if (pos >= max_asid) { +@@ -178,13 +178,14 @@ static void sev_unbind_asid(struct kvm * + static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) + { + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; ++ bool es_active = argp->id == KVM_SEV_ES_INIT; + int asid, ret; + + ret = -EBUSY; + if (unlikely(sev->active)) + return ret; + +- asid = sev_asid_new(sev); ++ asid = sev_asid_new(es_active); + if (asid < 0) + return ret; + +@@ -193,6 +194,7 @@ static int sev_guest_init(struct kvm *kv + goto e_free; + + sev->active = true; ++ sev->es_active = es_active; + sev->asid = asid; + INIT_LIST_HEAD(&sev->regions_list); + +@@ -203,16 +205,6 @@ e_free: + return ret; + } + +-static int sev_es_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) +-{ +- if (!sev_es) +- return -ENOTTY; +- +- to_kvm_svm(kvm)->sev_info.es_active = true; +- +- return sev_guest_init(kvm, argp); +-} +- + static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) + { + struct sev_data_activate *data; +@@ -1059,12 +1051,15 @@ int svm_mem_enc_op(struct kvm *kvm, void + mutex_lock(&kvm->lock); + + switch (sev_cmd.id) { ++ case KVM_SEV_ES_INIT: ++ if (!sev_es) { ++ r = -ENOTTY; ++ goto out; ++ } ++ fallthrough; + case KVM_SEV_INIT: + r = sev_guest_init(kvm, &sev_cmd); + break; +- case KVM_SEV_ES_INIT: +- r = sev_es_guest_init(kvm, &sev_cmd); +- break; + case KVM_SEV_LAUNCH_START: + r = sev_launch_start(kvm, &sev_cmd); + break; diff --git a/queue-5.11/kvm-svm-don-t-strip-the-c-bit-from-cr2-on-pf-interception.patch b/queue-5.11/kvm-svm-don-t-strip-the-c-bit-from-cr2-on-pf-interception.patch new file mode 100644 index 00000000000..afdbdcb24c5 --- /dev/null +++ b/queue-5.11/kvm-svm-don-t-strip-the-c-bit-from-cr2-on-pf-interception.patch @@ -0,0 +1,35 @@ +From 6d1b867d045699d6ce0dfa0ef35d1b87dd36db56 Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Thu, 4 Mar 2021 17:10:56 -0800 +Subject: KVM: SVM: Don't strip the C-bit from CR2 on #PF interception + +From: Sean Christopherson + +commit 6d1b867d045699d6ce0dfa0ef35d1b87dd36db56 upstream. + +Don't strip the C-bit from the faulting address on an intercepted #PF, +the address is a virtual address, not a physical address. 
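+
+Concrete illustration, assuming (say) a C-bit at bit 47:
+
+  u64 va  = 0xffff888000001000ULL;  /* faulting guest VA */
+  u64 bad = va & ~(1ULL << 47);     /* what __sme_clr() did */
+
+  /* bad == 0xffff088000001000: no longer the address that faulted.
+   * The C-bit only has meaning in *physical* addresses. */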
+ +Fixes: 0ede79e13224 ("KVM: SVM: Clear C-bit from the page fault address") +Cc: stable@vger.kernel.org +Cc: Brijesh Singh +Cc: Tom Lendacky +Signed-off-by: Sean Christopherson +Message-Id: <20210305011101.3597423-13-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/svm/svm.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -1888,7 +1888,7 @@ static void svm_set_dr7(struct kvm_vcpu + + static int pf_interception(struct vcpu_svm *svm) + { +- u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2); ++ u64 fault_address = svm->vmcb->control.exit_info_2; + u64 error_code = svm->vmcb->control.exit_info_1; + + return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address, diff --git a/queue-5.11/kvm-svm-inject-gp-on-guest-msr_tsc_aux-accesses-if-rdtscp-unsupported.patch b/queue-5.11/kvm-svm-inject-gp-on-guest-msr_tsc_aux-accesses-if-rdtscp-unsupported.patch new file mode 100644 index 00000000000..728591f6c00 --- /dev/null +++ b/queue-5.11/kvm-svm-inject-gp-on-guest-msr_tsc_aux-accesses-if-rdtscp-unsupported.patch @@ -0,0 +1,46 @@ +From 6f2b296aa6432d8274e258cc3220047ca04f5de0 Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Fri, 23 Apr 2021 15:34:01 -0700 +Subject: KVM: SVM: Inject #GP on guest MSR_TSC_AUX accesses if RDTSCP unsupported + +From: Sean Christopherson + +commit 6f2b296aa6432d8274e258cc3220047ca04f5de0 upstream. + +Inject #GP on guest accesses to MSR_TSC_AUX if RDTSCP is unsupported in +the guest's CPUID model. + +Fixes: 46896c73c1a4 ("KVM: svm: add support for RDTSCP") +Cc: stable@vger.kernel.org +Signed-off-by: Sean Christopherson +Message-Id: <20210423223404.3860547-2-seanjc@google.com> +Reviewed-by: Vitaly Kuznetsov +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/svm/svm.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +--- a/arch/x86/kvm/svm/svm.c ++++ b/arch/x86/kvm/svm/svm.c +@@ -2651,6 +2651,9 @@ static int svm_get_msr(struct kvm_vcpu * + case MSR_TSC_AUX: + if (!boot_cpu_has(X86_FEATURE_RDTSCP)) + return 1; ++ if (!msr_info->host_initiated && ++ !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) ++ return 1; + msr_info->data = svm->tsc_aux; + break; + /* +@@ -2859,6 +2862,10 @@ static int svm_set_msr(struct kvm_vcpu * + if (!boot_cpu_has(X86_FEATURE_RDTSCP)) + return 1; + ++ if (!msr->host_initiated && ++ !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) ++ return 1; ++ + /* + * This is rare, so we update the MSR here instead of using + * direct_access_msrs. Doing that would require a rdmsr in diff --git a/queue-5.11/kvm-svm-use-online_vcpus-not-created_vcpus-to-iterate-over-vcpus.patch b/queue-5.11/kvm-svm-use-online_vcpus-not-created_vcpus-to-iterate-over-vcpus.patch new file mode 100644 index 00000000000..0eed7b0995f --- /dev/null +++ b/queue-5.11/kvm-svm-use-online_vcpus-not-created_vcpus-to-iterate-over-vcpus.patch @@ -0,0 +1,50 @@ +From c36b16d29f3af5f32fc1b2a3401bf48f71cabee1 Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Tue, 30 Mar 2021 20:19:34 -0700 +Subject: KVM: SVM: Use online_vcpus, not created_vcpus, to iterate over vCPUs + +From: Sean Christopherson + +commit c36b16d29f3af5f32fc1b2a3401bf48f71cabee1 upstream. + +Use the kvm_for_each_vcpu() helper to iterate over vCPUs when encrypting +VMSAs for SEV, which effectively switches to use online_vcpus instead of +created_vcpus. 
This fixes a possible null-pointer dereference as +created_vcpus does not guarantee a vCPU exists, since it is updated at +the very beginning of KVM_CREATE_VCPU. created_vcpus exists to allow the +bulk of vCPU creation to run in parallel, while still correctly +restricting the max number of max vCPUs. + +Fixes: ad73109ae7ec ("KVM: SVM: Provide support to launch and run an SEV-ES guest") +Cc: stable@vger.kernel.org +Cc: Brijesh Singh +Cc: Tom Lendacky +Signed-off-by: Sean Christopherson +Message-Id: <20210331031936.2495277-2-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/svm/sev.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +--- a/arch/x86/kvm/svm/sev.c ++++ b/arch/x86/kvm/svm/sev.c +@@ -563,6 +563,7 @@ static int sev_launch_update_vmsa(struct + { + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_launch_update_vmsa *vmsa; ++ struct kvm_vcpu *vcpu; + int i, ret; + + if (!sev_es_guest(kvm)) +@@ -572,8 +573,8 @@ static int sev_launch_update_vmsa(struct + if (!vmsa) + return -ENOMEM; + +- for (i = 0; i < kvm->created_vcpus; i++) { +- struct vcpu_svm *svm = to_svm(kvm->vcpus[i]); ++ kvm_for_each_vcpu(i, vcpu, kvm) { ++ struct vcpu_svm *svm = to_svm(vcpu); + + /* Perform some pre-encryption checks against the VMSA */ + ret = sev_es_sync_vmsa(svm); diff --git a/queue-5.11/kvm-x86-fix-failure-to-boost-kernel-lock-holder-candidate-in-sev-es-guests.patch b/queue-5.11/kvm-x86-fix-failure-to-boost-kernel-lock-holder-candidate-in-sev-es-guests.patch new file mode 100644 index 00000000000..be4b661fd77 --- /dev/null +++ b/queue-5.11/kvm-x86-fix-failure-to-boost-kernel-lock-holder-candidate-in-sev-es-guests.patch @@ -0,0 +1,39 @@ +From b86bb11e3a79ac0db9a6786b1fe80f74321cb076 Mon Sep 17 00:00:00 2001 +From: Wanpeng Li +Date: Thu, 22 Apr 2021 16:34:19 +0800 +Subject: KVM: X86: Fix failure to boost kernel lock holder candidate in SEV-ES guests + +From: Wanpeng Li + +commit b86bb11e3a79ac0db9a6786b1fe80f74321cb076 upstream. + +Commit f1c6366e3043 ("KVM: SVM: Add required changes to support intercepts under +SEV-ES") prevents hypervisor accesses guest register state when the guest is +running under SEV-ES. The initial value of vcpu->arch.guest_state_protected +is false, it will not be updated in preemption notifiers after this commit which +means that the kernel spinlock lock holder will always be skipped to boost. Let's +fix it by always treating preempted is in the guest kernel mode, false positive +is better than skip completely. 
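+
+Where the helper feeds in, roughly (the candidate filter in
+kvm_vcpu_on_spin()):
+
+  if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
+      !kvm_arch_vcpu_in_kernel(vcpu))
+          continue;       /* user-mode lock holders are not boosted */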
+ +Fixes: f1c6366e3043 (KVM: SVM: Add required changes to support intercepts under SEV-ES) +Signed-off-by: Wanpeng Li +Message-Id: <1619080459-30032-1-git-send-email-wanpengli@tencent.com> +Cc: stable@vger.kernel.org +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/x86.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -10888,6 +10888,9 @@ bool kvm_arch_dy_runnable(struct kvm_vcp + + bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) + { ++ if (vcpu->arch.guest_state_protected) ++ return true; ++ + return vcpu->arch.preempted_in_kernel; + } + diff --git a/queue-5.11/kvm-x86-mmu-alloc-page-for-pdptes-when-shadowing-32-bit-npt-with-64-bit.patch b/queue-5.11/kvm-x86-mmu-alloc-page-for-pdptes-when-shadowing-32-bit-npt-with-64-bit.patch new file mode 100644 index 00000000000..f9bad8bb243 --- /dev/null +++ b/queue-5.11/kvm-x86-mmu-alloc-page-for-pdptes-when-shadowing-32-bit-npt-with-64-bit.patch @@ -0,0 +1,118 @@ +From 04d45551a1eefbea42655da52f56e846c0af721a Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Thu, 4 Mar 2021 17:10:46 -0800 +Subject: KVM: x86/mmu: Alloc page for PDPTEs when shadowing 32-bit NPT with 64-bit + +From: Sean Christopherson + +commit 04d45551a1eefbea42655da52f56e846c0af721a upstream. + +Allocate the so called pae_root page on-demand, along with the lm_root +page, when shadowing 32-bit NPT with 64-bit NPT, i.e. when running a +32-bit L1. KVM currently only allocates the page when NPT is disabled, +or when L0 is 32-bit (using PAE paging). + +Note, there is an existing memory leak involving the MMU roots, as KVM +fails to free the PAE roots on failure. This will be addressed in a +future commit. + +Fixes: ee6268ba3a68 ("KVM: x86: Skip pae_root shadow allocation if tdp enabled") +Fixes: b6b80c78af83 ("KVM: x86/mmu: Allocate PAE root array when using SVM's 32-bit NPT") +Cc: stable@vger.kernel.org +Reviewed-by: Ben Gardon +Signed-off-by: Sean Christopherson +Message-Id: <20210305011101.3597423-3-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/mmu/mmu.c | 44 +++++++++++++++++++++++++++++--------------- + 1 file changed, 29 insertions(+), 15 deletions(-) + +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -3203,14 +3203,14 @@ void kvm_mmu_free_roots(struct kvm_vcpu + if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && + (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) { + mmu_free_root_page(kvm, &mmu->root_hpa, &invalid_list); +- } else { ++ } else if (mmu->pae_root) { + for (i = 0; i < 4; ++i) + if (mmu->pae_root[i] != 0) + mmu_free_root_page(kvm, + &mmu->pae_root[i], + &invalid_list); +- mmu->root_hpa = INVALID_PAGE; + } ++ mmu->root_hpa = INVALID_PAGE; + mmu->root_pgd = 0; + } + +@@ -3322,9 +3322,23 @@ static int mmu_alloc_shadow_roots(struct + * the shadow page table may be a PAE or a long mode page table. + */ + pm_mask = PT_PRESENT_MASK; +- if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) ++ if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) { + pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; + ++ /* ++ * Allocate the page for the PDPTEs when shadowing 32-bit NPT ++ * with 64-bit only when needed. Unlike 32-bit NPT, it doesn't ++ * need to be in low mem. See also lm_root below. 
++ */ ++ if (!vcpu->arch.mmu->pae_root) { ++ WARN_ON_ONCE(!tdp_enabled); ++ ++ vcpu->arch.mmu->pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); ++ if (!vcpu->arch.mmu->pae_root) ++ return -ENOMEM; ++ } ++ } ++ + for (i = 0; i < 4; ++i) { + MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i])); + if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) { +@@ -3347,21 +3361,19 @@ static int mmu_alloc_shadow_roots(struct + vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root); + + /* +- * If we shadow a 32 bit page table with a long mode page +- * table we enter this path. ++ * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP ++ * tables are allocated and initialized at MMU creation as there is no ++ * equivalent level in the guest's NPT to shadow. Allocate the tables ++ * on demand, as running a 32-bit L1 VMM is very rare. The PDP is ++ * handled above (to share logic with PAE), deal with the PML4 here. + */ + if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) { + if (vcpu->arch.mmu->lm_root == NULL) { +- /* +- * The additional page necessary for this is only +- * allocated on demand. +- */ +- + u64 *lm_root; + + lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT); +- if (lm_root == NULL) +- return 1; ++ if (!lm_root) ++ return -ENOMEM; + + lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask; + +@@ -5310,9 +5322,11 @@ static int __kvm_mmu_create(struct kvm_v + * while the PDP table is a per-vCPU construct that's allocated at MMU + * creation. When emulating 32-bit mode, cr3 is only 32 bits even on + * x86_64. Therefore we need to allocate the PDP table in the first +- * 4GB of memory, which happens to fit the DMA32 zone. Except for +- * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can +- * skip allocating the PDP table. ++ * 4GB of memory, which happens to fit the DMA32 zone. TDP paging ++ * generally doesn't use PAE paging and can skip allocating the PDP ++ * table. The main exception, handled here, is SVM's 32-bit NPT. The ++ * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit ++ * KVM; that horror is handled on-demand by mmu_alloc_shadow_roots(). + */ + if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL) + return 0; diff --git a/queue-5.11/kvm-x86-remove-emulator-s-broken-checks-on-cr0-cr3-cr4-loads.patch b/queue-5.11/kvm-x86-remove-emulator-s-broken-checks-on-cr0-cr3-cr4-loads.patch new file mode 100644 index 00000000000..25a127e6337 --- /dev/null +++ b/queue-5.11/kvm-x86-remove-emulator-s-broken-checks-on-cr0-cr3-cr4-loads.patch @@ -0,0 +1,132 @@ +From d0fe7b6404408835ed60232cb3bf28324b2f95db Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Wed, 21 Apr 2021 19:21:20 -0700 +Subject: KVM: x86: Remove emulator's broken checks on CR0/CR3/CR4 loads + +From: Sean Christopherson + +commit d0fe7b6404408835ed60232cb3bf28324b2f95db upstream. + +Remove the emulator's checks for illegal CR0, CR3, and CR4 values, as +the checks are redundant, outdated, and in the case of SEV's C-bit, +broken. The emulator manually calculates MAXPHYADDR from CPUID and +neglects to mask off the C-bit. For all other checks, kvm_set_cr*() are +a superset of the emulator checks, e.g. see CR4.LA57. + +Fixes: a780a3ea6282 ("KVM: X86: Fix reserved bits check for MOV to CR3") +Cc: Babu Moger +Signed-off-by: Sean Christopherson +Message-Id: <20210422022128.3464144-2-seanjc@google.com> +Cc: stable@vger.kernel.org +[Unify check_cr_read and check_cr_write. 
- Paolo] +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/emulate.c | 80 +------------------------------------------------ + 1 file changed, 3 insertions(+), 77 deletions(-) + +--- a/arch/x86/kvm/emulate.c ++++ b/arch/x86/kvm/emulate.c +@@ -4220,7 +4220,7 @@ static bool valid_cr(int nr) + } + } + +-static int check_cr_read(struct x86_emulate_ctxt *ctxt) ++static int check_cr_access(struct x86_emulate_ctxt *ctxt) + { + if (!valid_cr(ctxt->modrm_reg)) + return emulate_ud(ctxt); +@@ -4228,80 +4228,6 @@ static int check_cr_read(struct x86_emul + return X86EMUL_CONTINUE; + } + +-static int check_cr_write(struct x86_emulate_ctxt *ctxt) +-{ +- u64 new_val = ctxt->src.val64; +- int cr = ctxt->modrm_reg; +- u64 efer = 0; +- +- static u64 cr_reserved_bits[] = { +- 0xffffffff00000000ULL, +- 0, 0, 0, /* CR3 checked later */ +- CR4_RESERVED_BITS, +- 0, 0, 0, +- CR8_RESERVED_BITS, +- }; +- +- if (!valid_cr(cr)) +- return emulate_ud(ctxt); +- +- if (new_val & cr_reserved_bits[cr]) +- return emulate_gp(ctxt, 0); +- +- switch (cr) { +- case 0: { +- u64 cr4; +- if (((new_val & X86_CR0_PG) && !(new_val & X86_CR0_PE)) || +- ((new_val & X86_CR0_NW) && !(new_val & X86_CR0_CD))) +- return emulate_gp(ctxt, 0); +- +- cr4 = ctxt->ops->get_cr(ctxt, 4); +- ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); +- +- if ((new_val & X86_CR0_PG) && (efer & EFER_LME) && +- !(cr4 & X86_CR4_PAE)) +- return emulate_gp(ctxt, 0); +- +- break; +- } +- case 3: { +- u64 rsvd = 0; +- +- ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); +- if (efer & EFER_LMA) { +- u64 maxphyaddr; +- u32 eax, ebx, ecx, edx; +- +- eax = 0x80000008; +- ecx = 0; +- if (ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, +- &edx, true)) +- maxphyaddr = eax & 0xff; +- else +- maxphyaddr = 36; +- rsvd = rsvd_bits(maxphyaddr, 63); +- if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PCIDE) +- rsvd &= ~X86_CR3_PCID_NOFLUSH; +- } +- +- if (new_val & rsvd) +- return emulate_gp(ctxt, 0); +- +- break; +- } +- case 4: { +- ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); +- +- if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE)) +- return emulate_gp(ctxt, 0); +- +- break; +- } +- } +- +- return X86EMUL_CONTINUE; +-} +- + static int check_dr7_gd(struct x86_emulate_ctxt *ctxt) + { + unsigned long dr7; +@@ -4841,10 +4767,10 @@ static const struct opcode twobyte_table + D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 8 * reserved NOP */ + D(ImplicitOps | ModRM | SrcMem | NoAccess), /* NOP + 7 * reserved NOP */ + /* 0x20 - 0x2F */ +- DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_read), ++ DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_access), + DIP(ModRM | DstMem | Priv | Op3264 | NoMod, dr_read, check_dr_read), + IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_cr_write, cr_write, +- check_cr_write), ++ check_cr_access), + IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_dr_write, dr_write, + check_dr_write), + N, N, N, N, diff --git a/queue-5.11/series b/queue-5.11/series index 3c8aba1f842..6bd50483d30 100644 --- a/queue-5.11/series +++ b/queue-5.11/series @@ -93,3 +93,22 @@ s390-fix-detection-of-vector-enhancements-facility-1-vs.-vector-packed-decimal-f kvm-s390-vsie-fix-mvpg-handling-for-prefixing-and-mso.patch kvm-s390-split-kvm_s390_real_to_abs.patch kvm-s390-extend-kvm_s390_shadow_fault-to-return-entry-pointer.patch +kvm-x86-mmu-alloc-page-for-pdptes-when-shadowing-32-bit-npt-with-64-bit.patch +kvm-x86-fix-failure-to-boost-kernel-lock-holder-candidate-in-sev-es-guests.patch 
+kvm-x86-remove-emulator-s-broken-checks-on-cr0-cr3-cr4-loads.patch +kvm-nsvm-set-the-shadow-root-level-to-the-tdp-level-for-nested-npt.patch +kvm-svm-don-t-strip-the-c-bit-from-cr2-on-pf-interception.patch +kvm-svm-use-online_vcpus-not-created_vcpus-to-iterate-over-vcpus.patch +kvm-svm-do-not-set-sev-es_active-until-kvm_sev_es_init-completes.patch +kvm-svm-do-not-allow-sev-sev-es-initialization-after-vcpus-are-created.patch +kvm-svm-inject-gp-on-guest-msr_tsc_aux-accesses-if-rdtscp-unsupported.patch +kvm-nvmx-defer-the-mmu-reload-to-the-normal-path-on-an-eptp-switch.patch +kvm-nvmx-truncate-bits-63-32-of-vmcs-field-on-nested-check-in-64-bit.patch +kvm-nvmx-truncate-base-index-gpr-value-on-address-calc-in-64-bit.patch +kvm-arm-arm64-fix-kvm_vgic_v3_addr_type_redist-read.patch +kvm-destroy-i-o-bus-devices-on-unregister-failure-_after_-sync-ing-srcu.patch +kvm-stop-looking-for-coalesced-mmio-zones-if-the-bus-is-destroyed.patch +kvm-arm64-fully-zero-the-vcpu-state-on-reset.patch +kvm-arm64-fix-kvm_vgic_v3_addr_type_redist_region-read.patch +kvm-selftests-sync-data-verify-of-dirty-logging-with-guest-sync.patch +kvm-selftests-always-run-vcpu-thread-with-blocked-sig_ipi.patch