From 78c85d531a091f76202c233831016b8343e7f9d6 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 9 Feb 2020 13:43:37 +0100 Subject: [PATCH] 5.5-stable patches added patches: kvm-s390-do-not-clobber-registers-during-guest-reset-store-status.patch kvm-vmx-add-non-canonical-check-on-writes-to-rtit-address-msrs.patch kvm-x86-don-t-let-userspace-set-host-reserved-cr4-bits.patch kvm-x86-ensure-guest-s-fpu-state-is-loaded-when-accessing-for-emulation.patch kvm-x86-fix-overlap-between-spte_mmio_mask-and-generation.patch kvm-x86-free-wbinvd_dirty_mask-if-vcpu-creation-fails.patch kvm-x86-handle-tif_need_fpu_load-in-kvm_-load-put-_guest_fpu.patch kvm-x86-reorganize-pvclock_gtod_data-members.patch kvm-x86-revert-kvm-x86-fix-fpu-state-crash-in-kvm-guest.patch kvm-x86-use-cpuid-to-locate-host-page-table-reserved-bits.patch kvm-x86-use-gpa_t-for-cr2-gpa-to-fix-tdp-support-on-32-bit-kvm.patch kvm-x86-use-raw-clock-values-consistently.patch mm-page_alloc.c-fix-uninitialized-memmaps-on-a-partially-populated-last-section.patch ocfs2-fix-oops-when-writing-cloned-file.patch x86-kvm-be-careful-not-to-clear-kvm_vcpu_flush_tlb-bit.patch x86-kvm-cache-gfn-to-pfn-translation.patch x86-kvm-clean-up-host-s-steal-time-structure.patch x86-kvm-introduce-kvm_-un-map_gfn.patch x86-kvm-make-sure-kvm_vcpu_flush_tlb-flag-is-not-missed.patch --- ...ters-during-guest-reset-store-status.patch | 51 ++ ...check-on-writes-to-rtit-address-msrs.patch | 34 + ...userspace-set-host-reserved-cr4-bits.patch | 122 ++++ ...-loaded-when-accessing-for-emulation.patch | 184 +++++ ...etween-spte_mmio_mask-and-generation.patch | 68 ++ ...vd_dirty_mask-if-vcpu-creation-fails.patch | 37 + ...fpu_load-in-kvm_-load-put-_guest_fpu.patch | 67 ++ ...reorganize-pvclock_gtod_data-members.patch | 112 +++ ...x86-fix-fpu-state-crash-in-kvm-guest.patch | 136 ++++ ...locate-host-page-table-reserved-bits.patch | 53 ++ ...gpa-to-fix-tdp-support-on-32-bit-kvm.patch | 650 ++++++++++++++++++ ...86-use-raw-clock-values-consistently.patch | 138 ++++ ...n-a-partially-populated-last-section.patch | 134 ++++ ...s2-fix-oops-when-writing-cloned-file.patch | 139 ++++ queue-5.5/series | 19 + ...-not-to-clear-kvm_vcpu_flush_tlb-bit.patch | 39 ++ ...x86-kvm-cache-gfn-to-pfn-translation.patch | 285 ++++++++ ...clean-up-host-s-steal-time-structure.patch | 81 +++ .../x86-kvm-introduce-kvm_-un-map_gfn.patch | 109 +++ ...vm_vcpu_flush_tlb-flag-is-not-missed.patch | 129 ++++ 20 files changed, 2587 insertions(+) create mode 100644 queue-5.5/kvm-s390-do-not-clobber-registers-during-guest-reset-store-status.patch create mode 100644 queue-5.5/kvm-vmx-add-non-canonical-check-on-writes-to-rtit-address-msrs.patch create mode 100644 queue-5.5/kvm-x86-don-t-let-userspace-set-host-reserved-cr4-bits.patch create mode 100644 queue-5.5/kvm-x86-ensure-guest-s-fpu-state-is-loaded-when-accessing-for-emulation.patch create mode 100644 queue-5.5/kvm-x86-fix-overlap-between-spte_mmio_mask-and-generation.patch create mode 100644 queue-5.5/kvm-x86-free-wbinvd_dirty_mask-if-vcpu-creation-fails.patch create mode 100644 queue-5.5/kvm-x86-handle-tif_need_fpu_load-in-kvm_-load-put-_guest_fpu.patch create mode 100644 queue-5.5/kvm-x86-reorganize-pvclock_gtod_data-members.patch create mode 100644 queue-5.5/kvm-x86-revert-kvm-x86-fix-fpu-state-crash-in-kvm-guest.patch create mode 100644 queue-5.5/kvm-x86-use-cpuid-to-locate-host-page-table-reserved-bits.patch create mode 100644 queue-5.5/kvm-x86-use-gpa_t-for-cr2-gpa-to-fix-tdp-support-on-32-bit-kvm.patch create mode 100644 
queue-5.5/kvm-x86-use-raw-clock-values-consistently.patch create mode 100644 queue-5.5/mm-page_alloc.c-fix-uninitialized-memmaps-on-a-partially-populated-last-section.patch create mode 100644 queue-5.5/ocfs2-fix-oops-when-writing-cloned-file.patch create mode 100644 queue-5.5/x86-kvm-be-careful-not-to-clear-kvm_vcpu_flush_tlb-bit.patch create mode 100644 queue-5.5/x86-kvm-cache-gfn-to-pfn-translation.patch create mode 100644 queue-5.5/x86-kvm-clean-up-host-s-steal-time-structure.patch create mode 100644 queue-5.5/x86-kvm-introduce-kvm_-un-map_gfn.patch create mode 100644 queue-5.5/x86-kvm-make-sure-kvm_vcpu_flush_tlb-flag-is-not-missed.patch diff --git a/queue-5.5/kvm-s390-do-not-clobber-registers-during-guest-reset-store-status.patch b/queue-5.5/kvm-s390-do-not-clobber-registers-during-guest-reset-store-status.patch new file mode 100644 index 00000000000..c7d7d00c9db --- /dev/null +++ b/queue-5.5/kvm-s390-do-not-clobber-registers-during-guest-reset-store-status.patch @@ -0,0 +1,51 @@ +From 55680890ea78be0df5e1384989f1be835043c084 Mon Sep 17 00:00:00 2001 +From: Christian Borntraeger +Date: Fri, 31 Jan 2020 05:02:00 -0500 +Subject: KVM: s390: do not clobber registers during guest reset/store status + +From: Christian Borntraeger + +commit 55680890ea78be0df5e1384989f1be835043c084 upstream. + +The initial CPU reset clobbers the userspace fpc and the store status +ioctl clobbers the guest acrs + fpr. As these calls are only done via +ioctl (and not via vcpu_run), no CPU context is loaded, so we can (and +must) act directly on the sync regs, not on the thread context. + +Cc: stable@kernel.org +Fixes: e1788bb995be ("KVM: s390: handle floating point registers in the run ioctl not in vcpu_put/load") +Fixes: 31d8b8d41a7e ("KVM: s390: handle access registers in the run ioctl not in vcpu_put/load") +Signed-off-by: Christian Borntraeger +Reviewed-by: David Hildenbrand +Reviewed-by: Cornelia Huck +Signed-off-by: Janosch Frank +Link: https://lore.kernel.org/r/20200131100205.74720-2-frankja@linux.ibm.com +Signed-off-by: Christian Borntraeger +Signed-off-by: Greg Kroah-Hartman + +--- + arch/s390/kvm/kvm-s390.c | 6 ++---- + 1 file changed, 2 insertions(+), 4 deletions(-) + +--- a/arch/s390/kvm/kvm-s390.c ++++ b/arch/s390/kvm/kvm-s390.c +@@ -2860,9 +2860,7 @@ static void kvm_s390_vcpu_initial_reset( + vcpu->arch.sie_block->gcr[14] = CR14_UNUSED_32 | + CR14_UNUSED_33 | + CR14_EXTERNAL_DAMAGE_SUBMASK; +- /* make sure the new fpc will be lazily loaded */ +- save_fpu_regs(); +- current->thread.fpu.fpc = 0; ++ vcpu->run->s.regs.fpc = 0; + vcpu->arch.sie_block->gbea = 1; + vcpu->arch.sie_block->pp = 0; + vcpu->arch.sie_block->fpf &= ~FPF_BPBC; +@@ -4351,7 +4349,7 @@ long kvm_arch_vcpu_ioctl(struct file *fi + switch (ioctl) { + case KVM_S390_STORE_STATUS: + idx = srcu_read_lock(&vcpu->kvm->srcu); +- r = kvm_s390_vcpu_store_status(vcpu, arg); ++ r = kvm_s390_store_status_unloaded(vcpu, arg); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + break; + case KVM_S390_SET_INITIAL_PSW: { diff --git a/queue-5.5/kvm-vmx-add-non-canonical-check-on-writes-to-rtit-address-msrs.patch b/queue-5.5/kvm-vmx-add-non-canonical-check-on-writes-to-rtit-address-msrs.patch new file mode 100644 index 00000000000..fe36fbdcdf8 --- /dev/null +++ b/queue-5.5/kvm-vmx-add-non-canonical-check-on-writes-to-rtit-address-msrs.patch @@ -0,0 +1,34 @@ +From fe6ed369fca98e99df55c932b85782a5687526b5 Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Tue, 10 Dec 2019 15:24:32 -0800 +Subject: KVM: VMX: Add non-canonical check on writes to RTIT 
address MSRs + +From: Sean Christopherson + +commit fe6ed369fca98e99df55c932b85782a5687526b5 upstream. + +Reject writes to RTIT address MSRs if the data being written is a +non-canonical address as the MSRs are subject to canonical checks, e.g. +KVM will trigger an unchecked #GP when loading the values to hardware +during pt_guest_enter(). + +Cc: stable@vger.kernel.org +Signed-off-by: Sean Christopherson +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/vmx/vmx.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -2146,6 +2146,8 @@ static int vmx_set_msr(struct kvm_vcpu * + (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_num_address_ranges))) + return 1; ++ if (is_noncanonical_address(data, vcpu)) ++ return 1; + if (index % 2) + vmx->pt_desc.guest.addr_b[index / 2] = data; + else diff --git a/queue-5.5/kvm-x86-don-t-let-userspace-set-host-reserved-cr4-bits.patch b/queue-5.5/kvm-x86-don-t-let-userspace-set-host-reserved-cr4-bits.patch new file mode 100644 index 00000000000..8451ad13b9f --- /dev/null +++ b/queue-5.5/kvm-x86-don-t-let-userspace-set-host-reserved-cr4-bits.patch @@ -0,0 +1,122 @@ +From b11306b53b2540c6ba068c4deddb6a17d9f8d95b Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Tue, 10 Dec 2019 14:44:13 -0800 +Subject: KVM: x86: Don't let userspace set host-reserved cr4 bits + +From: Sean Christopherson + +commit b11306b53b2540c6ba068c4deddb6a17d9f8d95b upstream. + +Calculate the host-reserved cr4 bits at runtime based on the system's +capabilities (using logic similar to __do_cpuid_func()), and use the +dynamically generated mask for the reserved bit check in kvm_set_cr4() +instead using of the static CR4_RESERVED_BITS define. This prevents +userspace from "enabling" features in cr4 that are not supported by the +system, e.g. by ignoring KVM_GET_SUPPORTED_CPUID and specifying a bogus +CPUID for the vCPU. + +Allowing userspace to set unsupported bits in cr4 can lead to a variety +of undesirable behavior, e.g. failed VM-Enter, and in general increases +KVM's attack surface. A crafty userspace can even abuse CR4.LA57 to +induce an unchecked #GP on a WRMSR. + +On a platform without LA57 support: + + KVM_SET_CPUID2 // CPUID_7_0_ECX.LA57 = 1 + KVM_SET_SREGS // CR4.LA57 = 1 + KVM_SET_MSRS // KERNEL_GS_BASE = 0x0004000000000000 + KVM_RUN + +leads to a #GP when writing KERNEL_GS_BASE into hardware: + + unchecked MSR access error: WRMSR to 0xc0000102 (tried to write 0x0004000000000000) + at rIP: 0xffffffffa00f239a (vmx_prepare_switch_to_guest+0x10a/0x1d0 [kvm_intel]) + Call Trace: + kvm_arch_vcpu_ioctl_run+0x671/0x1c70 [kvm] + kvm_vcpu_ioctl+0x36b/0x5d0 [kvm] + do_vfs_ioctl+0xa1/0x620 + ksys_ioctl+0x66/0x70 + __x64_sys_ioctl+0x16/0x20 + do_syscall_64+0x4c/0x170 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + RIP: 0033:0x7fc08133bf47 + +Note, the above sequence fails VM-Enter due to invalid guest state. +Userspace can allow VM-Enter to succeed (after the WRMSR #GP) by adding +a KVM_SET_SREGS w/ CR4.LA57=0 after KVM_SET_MSRS, in which case KVM will +technically leak the host's KERNEL_GS_BASE into the guest. But, as +KERNEL_GS_BASE is a userspace-defined value/address, the leak is largely +benign as a malicious userspace would simply be exposing its own data to +the guest, and attacking a benevolent userspace would require multiple +bugs in the userspace VMM. 
+ +Cc: stable@vger.kernel.org +Cc: Jun Nakajima +Signed-off-by: Sean Christopherson +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/x86.c | 35 ++++++++++++++++++++++++++++++++++- + 1 file changed, 34 insertions(+), 1 deletion(-) + +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -93,6 +93,8 @@ u64 __read_mostly efer_reserved_bits = ~ + static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); + #endif + ++static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; ++ + #define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ + #define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ + +@@ -879,9 +881,38 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u + } + EXPORT_SYMBOL_GPL(kvm_set_xcr); + ++static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c) ++{ ++ u64 reserved_bits = CR4_RESERVED_BITS; ++ ++ if (!cpu_has(c, X86_FEATURE_XSAVE)) ++ reserved_bits |= X86_CR4_OSXSAVE; ++ ++ if (!cpu_has(c, X86_FEATURE_SMEP)) ++ reserved_bits |= X86_CR4_SMEP; ++ ++ if (!cpu_has(c, X86_FEATURE_SMAP)) ++ reserved_bits |= X86_CR4_SMAP; ++ ++ if (!cpu_has(c, X86_FEATURE_FSGSBASE)) ++ reserved_bits |= X86_CR4_FSGSBASE; ++ ++ if (!cpu_has(c, X86_FEATURE_PKU)) ++ reserved_bits |= X86_CR4_PKE; ++ ++ if (!cpu_has(c, X86_FEATURE_LA57) && ++ !(cpuid_ecx(0x7) & bit(X86_FEATURE_LA57))) ++ reserved_bits |= X86_CR4_LA57; ++ ++ if (!cpu_has(c, X86_FEATURE_UMIP) && !kvm_x86_ops->umip_emulated()) ++ reserved_bits |= X86_CR4_UMIP; ++ ++ return reserved_bits; ++} ++ + static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) + { +- if (cr4 & CR4_RESERVED_BITS) ++ if (cr4 & cr4_reserved_bits) + return -EINVAL; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE)) +@@ -9369,6 +9400,8 @@ int kvm_arch_hardware_setup(void) + if (r != 0) + return r; + ++ cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data); ++ + if (kvm_has_tsc_control) { + /* + * Make sure the user can only configure tsc_khz values that diff --git a/queue-5.5/kvm-x86-ensure-guest-s-fpu-state-is-loaded-when-accessing-for-emulation.patch b/queue-5.5/kvm-x86-ensure-guest-s-fpu-state-is-loaded-when-accessing-for-emulation.patch new file mode 100644 index 00000000000..a5db339931a --- /dev/null +++ b/queue-5.5/kvm-x86-ensure-guest-s-fpu-state-is-loaded-when-accessing-for-emulation.patch @@ -0,0 +1,184 @@ +From a7baead7e312f5a05381d68585fb6dc68e19e90f Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Fri, 17 Jan 2020 11:30:50 -0800 +Subject: KVM: x86: Ensure guest's FPU state is loaded when accessing for emulation + +From: Sean Christopherson + +commit a7baead7e312f5a05381d68585fb6dc68e19e90f upstream. + +Lock the FPU regs and reload the current thread's FPU state, which holds +the guest's FPU state, to the CPU registers if necessary prior to +accessing guest FPU state as part of emulation. kernel_fpu_begin() can +be called from softirq context, therefore KVM must ensure softirqs are +disabled (locking the FPU regs disables softirqs) when touching CPU FPU +state. + +Note, for all intents and purposes this reverts commit 6ab0b9feb82a7 +("x86,kvm: remove KVM emulator get_fpu / put_fpu"), but at the time it +was applied, removing get/put_fpu() was correct. The re-introduction +of {get,put}_fpu() is necessitated by the deferring of FPU state load. 
+ +Fixes: 5f409e20b7945 ("x86/fpu: Defer FPU state load until return to userspace") +Cc: stable@vger.kernel.org +Signed-off-by: Sean Christopherson +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/emulate.c | 39 +++++++++++++++++++++++++++++++++++++++ + 1 file changed, 39 insertions(+) + +--- a/arch/x86/kvm/emulate.c ++++ b/arch/x86/kvm/emulate.c +@@ -22,6 +22,7 @@ + #include "kvm_cache_regs.h" + #include + #include ++#include + #include + #include + +@@ -1075,8 +1076,23 @@ static void fetch_register_operand(struc + } + } + ++static void emulator_get_fpu(void) ++{ ++ fpregs_lock(); ++ ++ fpregs_assert_state_consistent(); ++ if (test_thread_flag(TIF_NEED_FPU_LOAD)) ++ switch_fpu_return(); ++} ++ ++static void emulator_put_fpu(void) ++{ ++ fpregs_unlock(); ++} ++ + static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) + { ++ emulator_get_fpu(); + switch (reg) { + case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break; + case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break; +@@ -1098,11 +1114,13 @@ static void read_sse_reg(struct x86_emul + #endif + default: BUG(); + } ++ emulator_put_fpu(); + } + + static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, + int reg) + { ++ emulator_get_fpu(); + switch (reg) { + case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break; + case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break; +@@ -1124,10 +1142,12 @@ static void write_sse_reg(struct x86_emu + #endif + default: BUG(); + } ++ emulator_put_fpu(); + } + + static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) + { ++ emulator_get_fpu(); + switch (reg) { + case 0: asm("movq %%mm0, %0" : "=m"(*data)); break; + case 1: asm("movq %%mm1, %0" : "=m"(*data)); break; +@@ -1139,10 +1159,12 @@ static void read_mmx_reg(struct x86_emul + case 7: asm("movq %%mm7, %0" : "=m"(*data)); break; + default: BUG(); + } ++ emulator_put_fpu(); + } + + static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) + { ++ emulator_get_fpu(); + switch (reg) { + case 0: asm("movq %0, %%mm0" : : "m"(*data)); break; + case 1: asm("movq %0, %%mm1" : : "m"(*data)); break; +@@ -1154,6 +1176,7 @@ static void write_mmx_reg(struct x86_emu + case 7: asm("movq %0, %%mm7" : : "m"(*data)); break; + default: BUG(); + } ++ emulator_put_fpu(); + } + + static int em_fninit(struct x86_emulate_ctxt *ctxt) +@@ -1161,7 +1184,9 @@ static int em_fninit(struct x86_emulate_ + if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) + return emulate_nm(ctxt); + ++ emulator_get_fpu(); + asm volatile("fninit"); ++ emulator_put_fpu(); + return X86EMUL_CONTINUE; + } + +@@ -1172,7 +1197,9 @@ static int em_fnstcw(struct x86_emulate_ + if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) + return emulate_nm(ctxt); + ++ emulator_get_fpu(); + asm volatile("fnstcw %0": "+m"(fcw)); ++ emulator_put_fpu(); + + ctxt->dst.val = fcw; + +@@ -1186,7 +1213,9 @@ static int em_fnstsw(struct x86_emulate_ + if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) + return emulate_nm(ctxt); + ++ emulator_get_fpu(); + asm volatile("fnstsw %0": "+m"(fsw)); ++ emulator_put_fpu(); + + ctxt->dst.val = fsw; + +@@ -4092,8 +4121,12 @@ static int em_fxsave(struct x86_emulate_ + if (rc != X86EMUL_CONTINUE) + return rc; + ++ emulator_get_fpu(); ++ + rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state)); + ++ emulator_put_fpu(); ++ + if (rc != X86EMUL_CONTINUE) + return rc; + +@@ -4136,6 +4169,8 @@ static int em_fxrstor(struct x86_emulate + if (rc != 
X86EMUL_CONTINUE) + return rc; + ++ emulator_get_fpu(); ++ + if (size < __fxstate_size(16)) { + rc = fxregs_fixup(&fx_state, size); + if (rc != X86EMUL_CONTINUE) +@@ -4151,6 +4186,8 @@ static int em_fxrstor(struct x86_emulate + rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state)); + + out: ++ emulator_put_fpu(); ++ + return rc; + } + +@@ -5465,7 +5502,9 @@ static int flush_pending_x87_faults(stru + { + int rc; + ++ emulator_get_fpu(); + rc = asm_safe("fwait"); ++ emulator_put_fpu(); + + if (unlikely(rc != X86EMUL_CONTINUE)) + return emulate_exception(ctxt, MF_VECTOR, 0, false); diff --git a/queue-5.5/kvm-x86-fix-overlap-between-spte_mmio_mask-and-generation.patch b/queue-5.5/kvm-x86-fix-overlap-between-spte_mmio_mask-and-generation.patch new file mode 100644 index 00000000000..3d3b2e8d5c5 --- /dev/null +++ b/queue-5.5/kvm-x86-fix-overlap-between-spte_mmio_mask-and-generation.patch @@ -0,0 +1,68 @@ +From 56871d444bc4d7ea66708775e62e2e0926384dbc Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Sat, 18 Jan 2020 20:09:03 +0100 +Subject: KVM: x86: fix overlap between SPTE_MMIO_MASK and generation + +From: Paolo Bonzini + +commit 56871d444bc4d7ea66708775e62e2e0926384dbc upstream. + +The SPTE_MMIO_MASK overlaps with the bits used to track MMIO +generation number. A high enough generation number would overwrite the +SPTE_SPECIAL_MASK region and cause the MMIO SPTE to be misinterpreted. + +Likewise, setting bits 52 and 53 would also cause an incorrect generation +number to be read from the PTE, though this was partially mitigated by the +(useless if it weren't for the bug) removal of SPTE_SPECIAL_MASK from +the spte in get_mmio_spte_generation. Drop that removal, and replace +it with a compile-time assertion. + +Fixes: 6eeb4ef049e7 ("KVM: x86: assign two bits to track SPTE kinds") +Reported-by: Ben Gardon +Cc: stable@vger.kernel.org +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/mmu/mmu.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -418,22 +418,24 @@ static inline bool is_access_track_spte( + * requires a full MMU zap). The flag is instead explicitly queried when + * checking for MMIO spte cache hits. 
+ */ +-#define MMIO_SPTE_GEN_MASK GENMASK_ULL(18, 0) ++#define MMIO_SPTE_GEN_MASK GENMASK_ULL(17, 0) + + #define MMIO_SPTE_GEN_LOW_START 3 + #define MMIO_SPTE_GEN_LOW_END 11 + #define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \ + MMIO_SPTE_GEN_LOW_START) + +-#define MMIO_SPTE_GEN_HIGH_START 52 +-#define MMIO_SPTE_GEN_HIGH_END 61 ++#define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT ++#define MMIO_SPTE_GEN_HIGH_END 62 + #define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \ + MMIO_SPTE_GEN_HIGH_START) ++ + static u64 generation_mmio_spte_mask(u64 gen) + { + u64 mask; + + WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); ++ BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK); + + mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK; + mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK; +@@ -444,8 +446,6 @@ static u64 get_mmio_spte_generation(u64 + { + u64 gen; + +- spte &= ~shadow_mmio_mask; +- + gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START; + gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START; + return gen; diff --git a/queue-5.5/kvm-x86-free-wbinvd_dirty_mask-if-vcpu-creation-fails.patch b/queue-5.5/kvm-x86-free-wbinvd_dirty_mask-if-vcpu-creation-fails.patch new file mode 100644 index 00000000000..848afac2dac --- /dev/null +++ b/queue-5.5/kvm-x86-free-wbinvd_dirty_mask-if-vcpu-creation-fails.patch @@ -0,0 +1,37 @@ +From 16be9ddea268ad841457a59109963fff8c9de38d Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Wed, 18 Dec 2019 13:54:48 -0800 +Subject: KVM: x86: Free wbinvd_dirty_mask if vCPU creation fails + +From: Sean Christopherson + +commit 16be9ddea268ad841457a59109963fff8c9de38d upstream. + +Free the vCPU's wbinvd_dirty_mask if vCPU creation fails after +kvm_arch_vcpu_init(), e.g. when installing the vCPU's file descriptor. +Do the freeing by calling kvm_arch_vcpu_free() instead of open coding +the freeing. This adds a likely superfluous, but ultimately harmless, +call to kvmclock_reset(), which only clears vcpu->arch.pv_time_enabled. +Using kvm_arch_vcpu_free() allows for additional cleanup in the future. + +Fixes: f5f48ee15c2ee ("KVM: VMX: Execute WBINVD to keep data consistency with assigned devices") +Cc: stable@vger.kernel.org +Signed-off-by: Sean Christopherson +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/x86.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -9208,7 +9208,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vc + kvm_mmu_unload(vcpu); + vcpu_put(vcpu); + +- kvm_x86_ops->vcpu_free(vcpu); ++ kvm_arch_vcpu_free(vcpu); + } + + void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) diff --git a/queue-5.5/kvm-x86-handle-tif_need_fpu_load-in-kvm_-load-put-_guest_fpu.patch b/queue-5.5/kvm-x86-handle-tif_need_fpu_load-in-kvm_-load-put-_guest_fpu.patch new file mode 100644 index 00000000000..bcc5ab41605 --- /dev/null +++ b/queue-5.5/kvm-x86-handle-tif_need_fpu_load-in-kvm_-load-put-_guest_fpu.patch @@ -0,0 +1,67 @@ +From c9aef3b85f425d1f6635382ec210ee5a7ef55d7d Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Fri, 17 Jan 2020 11:30:49 -0800 +Subject: KVM: x86: Handle TIF_NEED_FPU_LOAD in kvm_{load,put}_guest_fpu() + +From: Sean Christopherson + +commit c9aef3b85f425d1f6635382ec210ee5a7ef55d7d upstream. + +Handle TIF_NEED_FPU_LOAD similar to how fpu__copy() handles the flag +when duplicating FPU state to a new task struct. 
TIF_NEED_FPU_LOAD can +be set any time control is transferred out of KVM, be it voluntarily, +e.g. if I/O is triggered during a KVM call to get_user_pages, or +involuntarily, e.g. if softirq runs after an IRQ occurs. Therefore, +KVM must account for TIF_NEED_FPU_LOAD whenever it is (potentially) +accessing CPU FPU state. + +Fixes: 5f409e20b7945 ("x86/fpu: Defer FPU state load until return to userspace") +Cc: stable@vger.kernel.org +Signed-off-by: Sean Christopherson +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/x86.c | 19 +++++++++++++++++-- + 1 file changed, 17 insertions(+), 2 deletions(-) + +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -8517,12 +8517,26 @@ static int complete_emulated_mmio(struct + return 0; + } + ++static void kvm_save_current_fpu(struct fpu *fpu) ++{ ++ /* ++ * If the target FPU state is not resident in the CPU registers, just ++ * memcpy() from current, else save CPU state directly to the target. ++ */ ++ if (test_thread_flag(TIF_NEED_FPU_LOAD)) ++ memcpy(&fpu->state, ¤t->thread.fpu.state, ++ fpu_kernel_xstate_size); ++ else ++ copy_fpregs_to_fpstate(fpu); ++} ++ + /* Swap (qemu) user FPU context for the guest FPU context. */ + static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) + { + fpregs_lock(); + +- copy_fpregs_to_fpstate(vcpu->arch.user_fpu); ++ kvm_save_current_fpu(vcpu->arch.user_fpu); ++ + /* PKRU is separately restored in kvm_x86_ops->run. */ + __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, + ~XFEATURE_MASK_PKRU); +@@ -8538,7 +8552,8 @@ static void kvm_put_guest_fpu(struct kvm + { + fpregs_lock(); + +- copy_fpregs_to_fpstate(vcpu->arch.guest_fpu); ++ kvm_save_current_fpu(vcpu->arch.guest_fpu); ++ + copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state); + + fpregs_mark_activate(); diff --git a/queue-5.5/kvm-x86-reorganize-pvclock_gtod_data-members.patch b/queue-5.5/kvm-x86-reorganize-pvclock_gtod_data-members.patch new file mode 100644 index 00000000000..c4dc2217ce6 --- /dev/null +++ b/queue-5.5/kvm-x86-reorganize-pvclock_gtod_data-members.patch @@ -0,0 +1,112 @@ +From 917f9475c0a8ab8958db7f22a5d495b9a1d51be6 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Wed, 22 Jan 2020 14:32:20 +0100 +Subject: KVM: x86: reorganize pvclock_gtod_data members + +From: Paolo Bonzini + +commit 917f9475c0a8ab8958db7f22a5d495b9a1d51be6 upstream. + +We will need a copy of tk->offs_boot in the next patch. Store it and +cleanup the struct: instead of storing tk->tkr_xxx.base with the tk->offs_boot +included, store the raw value in struct pvclock_clock and sum it in +do_monotonic_raw and do_realtime. tk->tkr_xxx.xtime_nsec also moves +to struct pvclock_clock. + +While at it, fix a (usually harmless) typo in do_monotonic_raw, which +was using gtod->clock.shift instead of gtod->raw_clock.shift. 
+ +Fixes: 53fafdbb8b21f ("KVM: x86: switch KVMCLOCK base to monotonic raw clock") +Cc: stable@vger.kernel.org +Reviewed-by: Vitaly Kuznetsov +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/x86.c | 29 ++++++++++++----------------- + 1 file changed, 12 insertions(+), 17 deletions(-) + +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -1580,6 +1580,8 @@ struct pvclock_clock { + u64 mask; + u32 mult; + u32 shift; ++ u64 base_cycles; ++ u64 offset; + }; + + struct pvclock_gtod_data { +@@ -1588,11 +1590,8 @@ struct pvclock_gtod_data { + struct pvclock_clock clock; /* extract of a clocksource struct */ + struct pvclock_clock raw_clock; /* extract of a clocksource struct */ + +- u64 boot_ns_raw; +- u64 boot_ns; +- u64 nsec_base; ++ ktime_t offs_boot; + u64 wall_time_sec; +- u64 monotonic_raw_nsec; + }; + + static struct pvclock_gtod_data pvclock_gtod_data; +@@ -1600,10 +1599,6 @@ static struct pvclock_gtod_data pvclock_ + static void update_pvclock_gtod(struct timekeeper *tk) + { + struct pvclock_gtod_data *vdata = &pvclock_gtod_data; +- u64 boot_ns, boot_ns_raw; +- +- boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot)); +- boot_ns_raw = ktime_to_ns(ktime_add(tk->tkr_raw.base, tk->offs_boot)); + + write_seqcount_begin(&vdata->seq); + +@@ -1613,20 +1608,20 @@ static void update_pvclock_gtod(struct t + vdata->clock.mask = tk->tkr_mono.mask; + vdata->clock.mult = tk->tkr_mono.mult; + vdata->clock.shift = tk->tkr_mono.shift; ++ vdata->clock.base_cycles = tk->tkr_mono.xtime_nsec; ++ vdata->clock.offset = tk->tkr_mono.base; + + vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->archdata.vclock_mode; + vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last; + vdata->raw_clock.mask = tk->tkr_raw.mask; + vdata->raw_clock.mult = tk->tkr_raw.mult; + vdata->raw_clock.shift = tk->tkr_raw.shift; +- +- vdata->boot_ns = boot_ns; +- vdata->nsec_base = tk->tkr_mono.xtime_nsec; ++ vdata->raw_clock.base_cycles = tk->tkr_raw.xtime_nsec; ++ vdata->raw_clock.offset = tk->tkr_raw.base; + + vdata->wall_time_sec = tk->xtime_sec; + +- vdata->boot_ns_raw = boot_ns_raw; +- vdata->monotonic_raw_nsec = tk->tkr_raw.xtime_nsec; ++ vdata->offs_boot = tk->offs_boot; + + write_seqcount_end(&vdata->seq); + } +@@ -2096,10 +2091,10 @@ static int do_monotonic_raw(s64 *t, u64 + + do { + seq = read_seqcount_begin(>od->seq); +- ns = gtod->monotonic_raw_nsec; ++ ns = gtod->raw_clock.base_cycles; + ns += vgettsc(>od->raw_clock, tsc_timestamp, &mode); +- ns >>= gtod->clock.shift; +- ns += gtod->boot_ns_raw; ++ ns >>= gtod->raw_clock.shift; ++ ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot)); + } while (unlikely(read_seqcount_retry(>od->seq, seq))); + *t = ns; + +@@ -2116,7 +2111,7 @@ static int do_realtime(struct timespec64 + do { + seq = read_seqcount_begin(>od->seq); + ts->tv_sec = gtod->wall_time_sec; +- ns = gtod->nsec_base; ++ ns = gtod->clock.base_cycles; + ns += vgettsc(>od->clock, tsc_timestamp, &mode); + ns >>= gtod->clock.shift; + } while (unlikely(read_seqcount_retry(>od->seq, seq))); diff --git a/queue-5.5/kvm-x86-revert-kvm-x86-fix-fpu-state-crash-in-kvm-guest.patch b/queue-5.5/kvm-x86-revert-kvm-x86-fix-fpu-state-crash-in-kvm-guest.patch new file mode 100644 index 00000000000..ea6f7f0c0cc --- /dev/null +++ b/queue-5.5/kvm-x86-revert-kvm-x86-fix-fpu-state-crash-in-kvm-guest.patch @@ -0,0 +1,136 @@ +From 2620fe268e80d667a94553cd37a94ccaa2cb8c83 Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Fri, 17 Jan 2020 11:30:51 -0800 +Subject: KVM: x86: 
Revert "KVM: X86: Fix fpu state crash in kvm guest" + +From: Sean Christopherson + +commit 2620fe268e80d667a94553cd37a94ccaa2cb8c83 upstream. + +Reload the current thread's FPU state, which contains the guest's FPU +state, to the CPU registers if necessary during vcpu_enter_guest(). +TIF_NEED_FPU_LOAD can be set any time control is transferred out of KVM, +e.g. if I/O is triggered during a KVM call to get_user_pages() or if a +softirq occurs while KVM is scheduled in. + +Moving the handling of TIF_NEED_FPU_LOAD from vcpu_enter_guest() to +kvm_arch_vcpu_load(), effectively kvm_sched_in(), papered over a bug +where kvm_put_guest_fpu() failed to account for TIF_NEED_FPU_LOAD. The +easiest way to the kvm_put_guest_fpu() bug was to run with involuntary +preemption enable, thus handling TIF_NEED_FPU_LOAD during kvm_sched_in() +made the bug go away. But, removing the handling in vcpu_enter_guest() +exposed KVM to the rare case of a softirq triggering kernel_fpu_begin() +between vcpu_load() and vcpu_enter_guest(). + +Now that kvm_{load,put}_guest_fpu() correctly handle TIF_NEED_FPU_LOAD, +revert the commit to both restore the vcpu_enter_guest() behavior and +eliminate the superfluous switch_fpu_return() in kvm_arch_vcpu_load(). + +Note, leaving the handling in kvm_arch_vcpu_load() isn't wrong per se, +but it is unnecessary, and most critically, makes it extremely difficult +to find bugs such as the kvm_put_guest_fpu() issue due to shrinking the +window where a softirq can corrupt state. + +A sample trace triggered by warning if TIF_NEED_FPU_LOAD is set while +vcpu state is loaded: + + + gcmaes_crypt_by_sg.constprop.12+0x26e/0x660 + ? 0xffffffffc024547d + ? __qdisc_run+0x83/0x510 + ? __dev_queue_xmit+0x45e/0x990 + ? ip_finish_output2+0x1a8/0x570 + ? fib4_rule_action+0x61/0x70 + ? fib4_rule_action+0x70/0x70 + ? fib_rules_lookup+0x13f/0x1c0 + ? helper_rfc4106_decrypt+0x82/0xa0 + ? crypto_aead_decrypt+0x40/0x70 + ? crypto_aead_decrypt+0x40/0x70 + ? crypto_aead_decrypt+0x40/0x70 + ? esp_output_tail+0x8f4/0xa5a [esp4] + ? skb_ext_add+0xd3/0x170 + ? xfrm_input+0x7a6/0x12c0 + ? xfrm4_rcv_encap+0xae/0xd0 + ? xfrm4_transport_finish+0x200/0x200 + ? udp_queue_rcv_one_skb+0x1ba/0x460 + ? udp_unicast_rcv_skb.isra.63+0x72/0x90 + ? __udp4_lib_rcv+0x51b/0xb00 + ? ip_protocol_deliver_rcu+0xd2/0x1c0 + ? ip_local_deliver_finish+0x44/0x50 + ? ip_local_deliver+0xe0/0xf0 + ? ip_protocol_deliver_rcu+0x1c0/0x1c0 + ? ip_rcv+0xbc/0xd0 + ? ip_rcv_finish_core.isra.19+0x380/0x380 + ? __netif_receive_skb_one_core+0x7e/0x90 + ? netif_receive_skb_internal+0x3d/0xb0 + ? napi_gro_receive+0xed/0x150 + ? 0xffffffffc0243c77 + ? net_rx_action+0x149/0x3b0 + ? __do_softirq+0xe4/0x2f8 + ? handle_irq_event_percpu+0x6a/0x80 + ? irq_exit+0xe6/0xf0 + ? do_IRQ+0x7f/0xd0 + ? common_interrupt+0xf/0xf + + ? irq_entries_start+0x20/0x660 + ? vmx_get_interrupt_shadow+0x2f0/0x710 [kvm_intel] + ? kvm_set_msr_common+0xfc7/0x2380 [kvm] + ? recalibrate_cpu_khz+0x10/0x10 + ? ktime_get+0x3a/0xa0 + ? kvm_arch_vcpu_ioctl_run+0x107/0x560 [kvm] + ? kvm_init+0x6bf/0xd00 [kvm] + ? __seccomp_filter+0x7a/0x680 + ? do_vfs_ioctl+0xa4/0x630 + ? security_file_ioctl+0x32/0x50 + ? ksys_ioctl+0x60/0x90 + ? __x64_sys_ioctl+0x16/0x20 + ? do_syscall_64+0x5f/0x1a0 + ? entry_SYSCALL_64_after_hwframe+0x44/0xa9 +---[ end trace 9564a1ccad733a90 ]--- + +This reverts commit e751732486eb3f159089a64d1901992b1357e7cc. 
+ +Fixes: e751732486eb3 ("KVM: X86: Fix fpu state crash in kvm guest") +Reported-by: Derek Yerger +Reported-by: kernel@najdan.com +Cc: Wanpeng Li +Cc: Thomas Lambertz +Cc: Rik van Riel +Cc: Sebastian Andrzej Siewior +Cc: Borislav Petkov +Cc: Dave Hansen +Cc: Thomas Gleixner +Cc: stable@vger.kernel.org +Signed-off-by: Sean Christopherson +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/x86.c | 9 +++------ + 1 file changed, 3 insertions(+), 6 deletions(-) + +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -3496,10 +3496,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu + + kvm_x86_ops->vcpu_load(vcpu, cpu); + +- fpregs_assert_state_consistent(); +- if (test_thread_flag(TIF_NEED_FPU_LOAD)) +- switch_fpu_return(); +- + /* Apply any externally detected TSC adjustments (due to suspend) */ + if (unlikely(vcpu->arch.tsc_offset_adjustment)) { + adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment); +@@ -8244,8 +8240,9 @@ static int vcpu_enter_guest(struct kvm_v + trace_kvm_entry(vcpu->vcpu_id); + guest_enter_irqoff(); + +- /* The preempt notifier should have taken care of the FPU already. */ +- WARN_ON_ONCE(test_thread_flag(TIF_NEED_FPU_LOAD)); ++ fpregs_assert_state_consistent(); ++ if (test_thread_flag(TIF_NEED_FPU_LOAD)) ++ switch_fpu_return(); + + if (unlikely(vcpu->arch.switch_db_regs)) { + set_debugreg(0, 7); diff --git a/queue-5.5/kvm-x86-use-cpuid-to-locate-host-page-table-reserved-bits.patch b/queue-5.5/kvm-x86-use-cpuid-to-locate-host-page-table-reserved-bits.patch new file mode 100644 index 00000000000..408f8668283 --- /dev/null +++ b/queue-5.5/kvm-x86-use-cpuid-to-locate-host-page-table-reserved-bits.patch @@ -0,0 +1,53 @@ +From 7adacf5eb2d2048045d9fd8fdab861fd9e7e2e96 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Wed, 4 Dec 2019 15:50:27 +0100 +Subject: KVM: x86: use CPUID to locate host page table reserved bits + +From: Paolo Bonzini + +commit 7adacf5eb2d2048045d9fd8fdab861fd9e7e2e96 upstream. + +The comment in kvm_get_shadow_phys_bits refers to MKTME, but the same is actually +true of SME and SEV. Just use CPUID[0x8000_0008].EAX[7:0] unconditionally if +available, it is simplest and works even if memory is not encrypted. + +Cc: stable@vger.kernel.org +Reported-by: Tom Lendacky +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/mmu/mmu.c | 20 ++++++++++++-------- + 1 file changed, 12 insertions(+), 8 deletions(-) + +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -538,16 +538,20 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes) + static u8 kvm_get_shadow_phys_bits(void) + { + /* +- * boot_cpu_data.x86_phys_bits is reduced when MKTME is detected +- * in CPU detection code, but MKTME treats those reduced bits as +- * 'keyID' thus they are not reserved bits. Therefore for MKTME +- * we should still return physical address bits reported by CPUID. ++ * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected ++ * in CPU detection code, but the processor treats those reduced bits as ++ * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at ++ * the physical address bits reported by CPUID. 
+ */ +- if (!boot_cpu_has(X86_FEATURE_TME) || +- WARN_ON_ONCE(boot_cpu_data.extended_cpuid_level < 0x80000008)) +- return boot_cpu_data.x86_phys_bits; ++ if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008)) ++ return cpuid_eax(0x80000008) & 0xff; + +- return cpuid_eax(0x80000008) & 0xff; ++ /* ++ * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with ++ * custom CPUID. Proceed with whatever the kernel found since these features ++ * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008). ++ */ ++ return boot_cpu_data.x86_phys_bits; + } + + static void kvm_mmu_reset_all_pte_masks(void) diff --git a/queue-5.5/kvm-x86-use-gpa_t-for-cr2-gpa-to-fix-tdp-support-on-32-bit-kvm.patch b/queue-5.5/kvm-x86-use-gpa_t-for-cr2-gpa-to-fix-tdp-support-on-32-bit-kvm.patch new file mode 100644 index 00000000000..9e2ade1ebe9 --- /dev/null +++ b/queue-5.5/kvm-x86-use-gpa_t-for-cr2-gpa-to-fix-tdp-support-on-32-bit-kvm.patch @@ -0,0 +1,650 @@ +From 736c291c9f36b07f8889c61764c28edce20e715d Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Fri, 6 Dec 2019 15:57:14 -0800 +Subject: KVM: x86: Use gpa_t for cr2/gpa to fix TDP support on 32-bit KVM + +From: Sean Christopherson + +commit 736c291c9f36b07f8889c61764c28edce20e715d upstream. + +Convert a plethora of parameters and variables in the MMU and page fault +flows from type gva_t to gpa_t to properly handle TDP on 32-bit KVM. + +Thanks to PSE and PAE paging, 32-bit kernels can access 64-bit physical +addresses. When TDP is enabled, the fault address is a guest physical +address and thus can be a 64-bit value, even when both KVM and its guest +are using 32-bit virtual addressing, e.g. VMX's VMCS.GUEST_PHYSICAL is a +64-bit field, not a natural width field. + +Using a gva_t for the fault address means KVM will incorrectly drop the +upper 32-bits of the GPA. Ditto for gva_to_gpa() when it is used to +translate L2 GPAs to L1 GPAs. + +Opportunistically rename variables and parameters to better reflect the +dual address modes, e.g. use "cr2_or_gpa" for fault addresses and plain +"addr" instead of "vaddr" when the address may be either a GVA or an L2 +GPA. Similarly, use "gpa" in the nonpaging_page_fault() flows to avoid +a confusing "gpa_t gva" declaration; this also sets the stage for a +future patch to combing nonpaging_page_fault() and tdp_page_fault() with +minimal churn. + +Sprinkle in a few comments to document flows where an address is known +to be a GVA and thus can be safely truncated to a 32-bit value. Add +WARNs in kvm_handle_page_fault() and FNAME(gva_to_gpa_nested)() to help +document such cases and detect bugs. 
+ +Cc: stable@vger.kernel.org +Signed-off-by: Sean Christopherson +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/include/asm/kvm_host.h | 8 ++-- + arch/x86/kvm/mmu/mmu.c | 69 +++++++++++++++++++++++----------------- + arch/x86/kvm/mmu/paging_tmpl.h | 25 +++++++++----- + arch/x86/kvm/mmutrace.h | 12 +++--- + arch/x86/kvm/x86.c | 40 +++++++++++------------ + arch/x86/kvm/x86.h | 2 - + include/linux/kvm_host.h | 6 +-- + virt/kvm/async_pf.c | 10 ++--- + 8 files changed, 94 insertions(+), 78 deletions(-) + +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -378,12 +378,12 @@ struct kvm_mmu { + void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); + unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); + u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index); +- int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err, ++ int (*page_fault)(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 err, + bool prefault); + void (*inject_page_fault)(struct kvm_vcpu *vcpu, + struct x86_exception *fault); +- gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, +- struct x86_exception *exception); ++ gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t gva_or_gpa, ++ u32 access, struct x86_exception *exception); + gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, + struct x86_exception *exception); + int (*sync_page)(struct kvm_vcpu *vcpu, +@@ -1469,7 +1469,7 @@ void kvm_vcpu_deactivate_apicv(struct kv + + int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); + +-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code, ++int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, + void *insn, int insn_len); + void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); + void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -3532,7 +3532,7 @@ static bool is_access_allowed(u32 fault_ + * - true: let the vcpu to access on the same address again. + * - false: let the real page fault path to fix it. 
+ */ +-static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, ++static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int level, + u32 error_code) + { + struct kvm_shadow_walk_iterator iterator; +@@ -3552,7 +3552,7 @@ static bool fast_page_fault(struct kvm_v + do { + u64 new_spte; + +- for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) ++ for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte) + if (!is_shadow_present_pte(spte) || + iterator.level < level) + break; +@@ -3630,7 +3630,7 @@ static bool fast_page_fault(struct kvm_v + + } while (true); + +- trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, ++ trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep, + spte, fault_handled); + walk_shadow_page_lockless_end(vcpu); + +@@ -3638,10 +3638,11 @@ static bool fast_page_fault(struct kvm_v + } + + static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, +- gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable); ++ gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write, ++ bool *writable); + static int make_mmu_pages_available(struct kvm_vcpu *vcpu); + +-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, ++static int nonpaging_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, + gfn_t gfn, bool prefault) + { + int r; +@@ -3667,16 +3668,16 @@ static int nonpaging_map(struct kvm_vcpu + gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + } + +- if (fast_page_fault(vcpu, v, level, error_code)) ++ if (fast_page_fault(vcpu, gpa, level, error_code)) + return RET_PF_RETRY; + + mmu_seq = vcpu->kvm->mmu_notifier_seq; + smp_rmb(); + +- if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) ++ if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) + return RET_PF_RETRY; + +- if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r)) ++ if (handle_abnormal_pfn(vcpu, gpa, gfn, pfn, ACC_ALL, &r)) + return r; + + r = RET_PF_RETRY; +@@ -3687,7 +3688,7 @@ static int nonpaging_map(struct kvm_vcpu + goto out_unlock; + if (likely(!force_pt_level)) + transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); +- r = __direct_map(vcpu, v, write, map_writable, level, pfn, ++ r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, + prefault, false); + out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); +@@ -3985,7 +3986,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu + } + EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots); + +-static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, ++static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr, + u32 access, struct x86_exception *exception) + { + if (exception) +@@ -3993,7 +3994,7 @@ static gpa_t nonpaging_gva_to_gpa(struct + return vaddr; + } + +-static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, ++static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr, + u32 access, + struct x86_exception *exception) + { +@@ -4153,13 +4154,14 @@ static void shadow_page_table_clear_floo + walk_shadow_page_lockless_end(vcpu); + } + +-static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, ++static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, + u32 error_code, bool prefault) + { +- gfn_t gfn = gva >> PAGE_SHIFT; ++ gfn_t gfn = gpa >> PAGE_SHIFT; + int r; + +- pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); ++ /* Note, paging is disabled, ergo gva == gpa. 
*/ ++ pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code); + + if (page_fault_handle_page_track(vcpu, error_code, gfn)) + return RET_PF_EMULATE; +@@ -4171,11 +4173,12 @@ static int nonpaging_page_fault(struct k + MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); + + +- return nonpaging_map(vcpu, gva & PAGE_MASK, ++ return nonpaging_map(vcpu, gpa & PAGE_MASK, + error_code, gfn, prefault); + } + +-static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) ++static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, ++ gfn_t gfn) + { + struct kvm_arch_async_pf arch; + +@@ -4184,11 +4187,13 @@ static int kvm_arch_setup_async_pf(struc + arch.direct_map = vcpu->arch.mmu->direct_map; + arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu); + +- return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); ++ return kvm_setup_async_pf(vcpu, cr2_or_gpa, ++ kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); + } + + static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, +- gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable) ++ gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write, ++ bool *writable) + { + struct kvm_memory_slot *slot; + bool async; +@@ -4208,12 +4213,12 @@ static bool try_async_pf(struct kvm_vcpu + return false; /* *pfn has correct page already */ + + if (!prefault && kvm_can_do_async_pf(vcpu)) { +- trace_kvm_try_async_get_page(gva, gfn); ++ trace_kvm_try_async_get_page(cr2_or_gpa, gfn); + if (kvm_find_async_pf_gfn(vcpu, gfn)) { +- trace_kvm_async_pf_doublefault(gva, gfn); ++ trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn); + kvm_make_request(KVM_REQ_APF_HALT, vcpu); + return true; +- } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn)) ++ } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn)) + return true; + } + +@@ -4226,6 +4231,12 @@ int kvm_handle_page_fault(struct kvm_vcp + { + int r = 1; + ++#ifndef CONFIG_X86_64 ++ /* A 64-bit CR2 should be impossible on 32-bit KVM. */ ++ if (WARN_ON_ONCE(fault_address >> 32)) ++ return -EFAULT; ++#endif ++ + vcpu->arch.l1tf_flush_l1d = true; + switch (vcpu->arch.apf.host_apf_reason) { + default: +@@ -4263,7 +4274,7 @@ check_hugepage_cache_consistency(struct + return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num); + } + +-static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, ++static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, + bool prefault) + { + kvm_pfn_t pfn; +@@ -5520,7 +5531,7 @@ static int make_mmu_pages_available(stru + return 0; + } + +-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, ++int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, + void *insn, int insn_len) + { + int r, emulation_type = 0; +@@ -5529,18 +5540,18 @@ int kvm_mmu_page_fault(struct kvm_vcpu * + /* With shadow page tables, fault_address contains a GVA or nGPA. 
*/ + if (vcpu->arch.mmu->direct_map) { + vcpu->arch.gpa_available = true; +- vcpu->arch.gpa_val = cr2; ++ vcpu->arch.gpa_val = cr2_or_gpa; + } + + r = RET_PF_INVALID; + if (unlikely(error_code & PFERR_RSVD_MASK)) { +- r = handle_mmio_page_fault(vcpu, cr2, direct); ++ r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct); + if (r == RET_PF_EMULATE) + goto emulate; + } + + if (r == RET_PF_INVALID) { +- r = vcpu->arch.mmu->page_fault(vcpu, cr2, ++ r = vcpu->arch.mmu->page_fault(vcpu, cr2_or_gpa, + lower_32_bits(error_code), + false); + WARN_ON(r == RET_PF_INVALID); +@@ -5560,7 +5571,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu * + */ + if (vcpu->arch.mmu->direct_map && + (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) { +- kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2)); ++ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)); + return 1; + } + +@@ -5575,7 +5586,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu * + * explicitly shadowing L1's page tables, i.e. unprotecting something + * for L1 isn't going to magically fix whatever issue cause L2 to fail. + */ +- if (!mmio_info_in_cache(vcpu, cr2, direct) && !is_guest_mode(vcpu)) ++ if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu)) + emulation_type = EMULTYPE_ALLOW_RETRY; + emulate: + /* +@@ -5590,7 +5601,7 @@ emulate: + return 1; + } + +- return x86_emulate_instruction(vcpu, cr2, emulation_type, insn, ++ return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, + insn_len); + } + EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); +--- a/arch/x86/kvm/mmu/paging_tmpl.h ++++ b/arch/x86/kvm/mmu/paging_tmpl.h +@@ -291,11 +291,11 @@ static inline unsigned FNAME(gpte_pkeys) + } + + /* +- * Fetch a guest pte for a guest virtual address ++ * Fetch a guest pte for a guest virtual address, or for an L2's GPA. + */ + static int FNAME(walk_addr_generic)(struct guest_walker *walker, + struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, +- gva_t addr, u32 access) ++ gpa_t addr, u32 access) + { + int ret; + pt_element_t pte; +@@ -496,7 +496,7 @@ error: + } + + static int FNAME(walk_addr)(struct guest_walker *walker, +- struct kvm_vcpu *vcpu, gva_t addr, u32 access) ++ struct kvm_vcpu *vcpu, gpa_t addr, u32 access) + { + return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr, + access); +@@ -611,7 +611,7 @@ static void FNAME(pte_prefetch)(struct k + * If the guest tries to write a write-protected page, we need to + * emulate this operation, return 1 to indicate this case. + */ +-static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, ++static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, + struct guest_walker *gw, + int write_fault, int hlevel, + kvm_pfn_t pfn, bool map_writable, bool prefault, +@@ -765,7 +765,7 @@ FNAME(is_self_change_mapping)(struct kvm + * Returns: 1 if we need to emulate the instruction, 0 otherwise, or + * a negative value on error. + */ +-static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, ++static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, + bool prefault) + { + int write_fault = error_code & PFERR_WRITE_MASK; +@@ -945,18 +945,19 @@ static void FNAME(invlpg)(struct kvm_vcp + spin_unlock(&vcpu->kvm->mmu_lock); + } + +-static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, ++/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. 
*/ ++static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t addr, u32 access, + struct x86_exception *exception) + { + struct guest_walker walker; + gpa_t gpa = UNMAPPED_GVA; + int r; + +- r = FNAME(walk_addr)(&walker, vcpu, vaddr, access); ++ r = FNAME(walk_addr)(&walker, vcpu, addr, access); + + if (r) { + gpa = gfn_to_gpa(walker.gfn); +- gpa |= vaddr & ~PAGE_MASK; ++ gpa |= addr & ~PAGE_MASK; + } else if (exception) + *exception = walker.fault; + +@@ -964,7 +965,8 @@ static gpa_t FNAME(gva_to_gpa)(struct kv + } + + #if PTTYPE != PTTYPE_EPT +-static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, ++/* Note, gva_to_gpa_nested() is only used to translate L2 GVAs. */ ++static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr, + u32 access, + struct x86_exception *exception) + { +@@ -972,6 +974,11 @@ static gpa_t FNAME(gva_to_gpa_nested)(st + gpa_t gpa = UNMAPPED_GVA; + int r; + ++#ifndef CONFIG_X86_64 ++ /* A 64-bit GVA should be impossible on 32-bit KVM. */ ++ WARN_ON_ONCE(vaddr >> 32); ++#endif ++ + r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access); + + if (r) { +--- a/arch/x86/kvm/mmutrace.h ++++ b/arch/x86/kvm/mmutrace.h +@@ -249,13 +249,13 @@ TRACE_EVENT( + + TRACE_EVENT( + fast_page_fault, +- TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, ++ TP_PROTO(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code, + u64 *sptep, u64 old_spte, bool retry), +- TP_ARGS(vcpu, gva, error_code, sptep, old_spte, retry), ++ TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, retry), + + TP_STRUCT__entry( + __field(int, vcpu_id) +- __field(gva_t, gva) ++ __field(gpa_t, cr2_or_gpa) + __field(u32, error_code) + __field(u64 *, sptep) + __field(u64, old_spte) +@@ -265,7 +265,7 @@ TRACE_EVENT( + + TP_fast_assign( + __entry->vcpu_id = vcpu->vcpu_id; +- __entry->gva = gva; ++ __entry->cr2_or_gpa = cr2_or_gpa; + __entry->error_code = error_code; + __entry->sptep = sptep; + __entry->old_spte = old_spte; +@@ -273,9 +273,9 @@ TRACE_EVENT( + __entry->retry = retry; + ), + +- TP_printk("vcpu %d gva %lx error_code %s sptep %p old %#llx" ++ TP_printk("vcpu %d gva %llx error_code %s sptep %p old %#llx" + " new %llx spurious %d fixed %d", __entry->vcpu_id, +- __entry->gva, __print_flags(__entry->error_code, "|", ++ __entry->cr2_or_gpa, __print_flags(__entry->error_code, "|", + kvm_mmu_trace_pferr_flags), __entry->sptep, + __entry->old_spte, __entry->new_spte, + __spte_satisfied(old_spte), __spte_satisfied(new_spte) +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -6396,11 +6396,11 @@ static int handle_emulation_failure(stru + return 1; + } + +-static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, ++static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + bool write_fault_to_shadow_pgtable, + int emulation_type) + { +- gpa_t gpa = cr2; ++ gpa_t gpa = cr2_or_gpa; + kvm_pfn_t pfn; + + if (!(emulation_type & EMULTYPE_ALLOW_RETRY)) +@@ -6414,7 +6414,7 @@ static bool reexecute_instruction(struct + * Write permission should be allowed since only + * write access need to be emulated. 
+ */ +- gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); ++ gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); + + /* + * If the mapping is invalid in guest, let cpu retry +@@ -6471,10 +6471,10 @@ static bool reexecute_instruction(struct + } + + static bool retry_instruction(struct x86_emulate_ctxt *ctxt, +- unsigned long cr2, int emulation_type) ++ gpa_t cr2_or_gpa, int emulation_type) + { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); +- unsigned long last_retry_eip, last_retry_addr, gpa = cr2; ++ unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa; + + last_retry_eip = vcpu->arch.last_retry_eip; + last_retry_addr = vcpu->arch.last_retry_addr; +@@ -6503,14 +6503,14 @@ static bool retry_instruction(struct x86 + if (x86_page_table_writing_insn(ctxt)) + return false; + +- if (ctxt->eip == last_retry_eip && last_retry_addr == cr2) ++ if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa) + return false; + + vcpu->arch.last_retry_eip = ctxt->eip; +- vcpu->arch.last_retry_addr = cr2; ++ vcpu->arch.last_retry_addr = cr2_or_gpa; + + if (!vcpu->arch.mmu->direct_map) +- gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); ++ gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); + + kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); + +@@ -6656,11 +6656,8 @@ static bool is_vmware_backdoor_opcode(st + return false; + } + +-int x86_emulate_instruction(struct kvm_vcpu *vcpu, +- unsigned long cr2, +- int emulation_type, +- void *insn, +- int insn_len) ++int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, ++ int emulation_type, void *insn, int insn_len) + { + int r; + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; +@@ -6706,8 +6703,9 @@ int x86_emulate_instruction(struct kvm_v + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } +- if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, +- emulation_type)) ++ if (reexecute_instruction(vcpu, cr2_or_gpa, ++ write_fault_to_spt, ++ emulation_type)) + return 1; + if (ctxt->have_exception) { + /* +@@ -6741,7 +6739,7 @@ int x86_emulate_instruction(struct kvm_v + return 1; + } + +- if (retry_instruction(ctxt, cr2, emulation_type)) ++ if (retry_instruction(ctxt, cr2_or_gpa, emulation_type)) + return 1; + + /* this is needed for vmware backdoor interface to work since it +@@ -6753,7 +6751,7 @@ int x86_emulate_instruction(struct kvm_v + + restart: + /* Save the faulting GPA (cr2) in the address field */ +- ctxt->exception.address = cr2; ++ ctxt->exception.address = cr2_or_gpa; + + r = x86_emulate_insn(ctxt); + +@@ -6761,7 +6759,7 @@ restart: + return 1; + + if (r == EMULATION_FAILED) { +- if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, ++ if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt, + emulation_type)) + return 1; + +@@ -10045,7 +10043,7 @@ void kvm_arch_async_page_ready(struct kv + work->arch.cr3 != vcpu->arch.mmu->get_cr3(vcpu)) + return; + +- vcpu->arch.mmu->page_fault(vcpu, work->gva, 0, true); ++ vcpu->arch.mmu->page_fault(vcpu, work->cr2_or_gpa, 0, true); + } + + static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) +@@ -10158,7 +10156,7 @@ void kvm_arch_async_page_not_present(str + { + struct x86_exception fault; + +- trace_kvm_async_pf_not_present(work->arch.token, work->gva); ++ trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa); + kvm_add_async_pf_gfn(vcpu, work->arch.gfn); + + if (kvm_can_deliver_async_pf(vcpu) && +@@ -10193,7 +10191,7 @@ void kvm_arch_async_page_present(struct + work->arch.token = ~0; /* broadcast wakeup */ + else + 
kvm_del_async_pf_gfn(vcpu, work->arch.gfn); +- trace_kvm_async_pf_ready(work->arch.token, work->gva); ++ trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa); + + if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED && + !apf_get_user(vcpu, &val)) { +--- a/arch/x86/kvm/x86.h ++++ b/arch/x86/kvm/x86.h +@@ -289,7 +289,7 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vc + bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, + int page_num); + bool kvm_vector_hashing_enabled(void); +-int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, ++int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + int emulation_type, void *insn, int insn_len); + + #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ +--- a/include/linux/kvm_host.h ++++ b/include/linux/kvm_host.h +@@ -204,7 +204,7 @@ struct kvm_async_pf { + struct list_head queue; + struct kvm_vcpu *vcpu; + struct mm_struct *mm; +- gva_t gva; ++ gpa_t cr2_or_gpa; + unsigned long addr; + struct kvm_arch_async_pf arch; + bool wakeup_all; +@@ -212,8 +212,8 @@ struct kvm_async_pf { + + void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu); + void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu); +-int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva, +- struct kvm_arch_async_pf *arch); ++int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, ++ unsigned long hva, struct kvm_arch_async_pf *arch); + int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); + #endif + +--- a/virt/kvm/async_pf.c ++++ b/virt/kvm/async_pf.c +@@ -64,7 +64,7 @@ static void async_pf_execute(struct work + struct mm_struct *mm = apf->mm; + struct kvm_vcpu *vcpu = apf->vcpu; + unsigned long addr = apf->addr; +- gva_t gva = apf->gva; ++ gpa_t cr2_or_gpa = apf->cr2_or_gpa; + int locked = 1; + + might_sleep(); +@@ -92,7 +92,7 @@ static void async_pf_execute(struct work + * this point + */ + +- trace_kvm_async_pf_completed(addr, gva); ++ trace_kvm_async_pf_completed(addr, cr2_or_gpa); + + if (swq_has_sleeper(&vcpu->wq)) + swake_up_one(&vcpu->wq); +@@ -165,8 +165,8 @@ void kvm_check_async_pf_completion(struc + } + } + +-int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva, +- struct kvm_arch_async_pf *arch) ++int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, ++ unsigned long hva, struct kvm_arch_async_pf *arch) + { + struct kvm_async_pf *work; + +@@ -185,7 +185,7 @@ int kvm_setup_async_pf(struct kvm_vcpu * + + work->wakeup_all = false; + work->vcpu = vcpu; +- work->gva = gva; ++ work->cr2_or_gpa = cr2_or_gpa; + work->addr = hva; + work->arch = *arch; + work->mm = current->mm; diff --git a/queue-5.5/kvm-x86-use-raw-clock-values-consistently.patch b/queue-5.5/kvm-x86-use-raw-clock-values-consistently.patch new file mode 100644 index 00000000000..f8d2aecba93 --- /dev/null +++ b/queue-5.5/kvm-x86-use-raw-clock-values-consistently.patch @@ -0,0 +1,138 @@ +From 8171cd68806bd2fc28ef688e32fb2a3b3deb04e5 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Wed, 22 Jan 2020 14:36:09 +0100 +Subject: KVM: x86: use raw clock values consistently + +From: Paolo Bonzini + +commit 8171cd68806bd2fc28ef688e32fb2a3b3deb04e5 upstream. + +Commit 53fafdbb8b21f ("KVM: x86: switch KVMCLOCK base to monotonic raw +clock") changed kvmclock to use tkr_raw instead of tkr_mono. 
However, +the default kvmclock_offset for the VM was still based on the monotonic +clock and, if the raw clock drifted enough from the monotonic clock, +this could cause a negative system_time to be written to the guest's +struct pvclock. RHEL5 does not like it and (if it boots fast enough to +observe a negative time value) it hangs. + +There is another thing to be careful about: getboottime64 returns the +host boot time with tkr_mono frequency, and subtracting the tkr_raw-based +kvmclock value will cause the wallclock to be off if tkr_raw drifts +from tkr_mono. To avoid this, compute the wallclock delta from the +current time instead of being clever and using getboottime64. + +Fixes: 53fafdbb8b21f ("KVM: x86: switch KVMCLOCK base to monotonic raw clock") +Cc: stable@vger.kernel.org +Reviewed-by: Vitaly Kuznetsov +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/x86.c | 38 +++++++++++++++++++++++--------------- + 1 file changed, 23 insertions(+), 15 deletions(-) + +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -1625,6 +1625,18 @@ static void update_pvclock_gtod(struct t + + write_seqcount_end(&vdata->seq); + } ++ ++static s64 get_kvmclock_base_ns(void) ++{ ++ /* Count up from boot time, but with the frequency of the raw clock. */ ++ return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot)); ++} ++#else ++static s64 get_kvmclock_base_ns(void) ++{ ++ /* Master clock not used, so we can just use CLOCK_BOOTTIME. */ ++ return ktime_get_boottime_ns(); ++} + #endif + + void kvm_set_pending_timer(struct kvm_vcpu *vcpu) +@@ -1638,7 +1650,7 @@ static void kvm_write_wall_clock(struct + int version; + int r; + struct pvclock_wall_clock wc; +- struct timespec64 boot; ++ u64 wall_nsec; + + if (!wall_clock) + return; +@@ -1658,17 +1670,12 @@ static void kvm_write_wall_clock(struct + /* + * The guest calculates current wall clock time by adding + * system time (updated by kvm_guest_time_update below) to the +- * wall clock specified here. guest system time equals host +- * system time for us, thus we must fill in host boot time here. ++ * wall clock specified here. We do the reverse here. 
+ */ +- getboottime64(&boot); ++ wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm); + +- if (kvm->arch.kvmclock_offset) { +- struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset); +- boot = timespec64_sub(boot, ts); +- } +- wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */ +- wc.nsec = boot.tv_nsec; ++ wc.nsec = do_div(wall_nsec, 1000000000); ++ wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */ + wc.version = version; + + kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); +@@ -1916,7 +1923,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu + + raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); + offset = kvm_compute_tsc_offset(vcpu, data); +- ns = ktime_get_boottime_ns(); ++ ns = get_kvmclock_base_ns(); + elapsed = ns - kvm->arch.last_tsc_nsec; + + if (vcpu->arch.virtual_tsc_khz) { +@@ -2254,7 +2261,7 @@ u64 get_kvmclock_ns(struct kvm *kvm) + spin_lock(&ka->pvclock_gtod_sync_lock); + if (!ka->use_master_clock) { + spin_unlock(&ka->pvclock_gtod_sync_lock); +- return ktime_get_boottime_ns() + ka->kvmclock_offset; ++ return get_kvmclock_base_ns() + ka->kvmclock_offset; + } + + hv_clock.tsc_timestamp = ka->master_cycle_now; +@@ -2270,7 +2277,7 @@ u64 get_kvmclock_ns(struct kvm *kvm) + &hv_clock.tsc_to_system_mul); + ret = __pvclock_read_cycles(&hv_clock, rdtsc()); + } else +- ret = ktime_get_boottime_ns() + ka->kvmclock_offset; ++ ret = get_kvmclock_base_ns() + ka->kvmclock_offset; + + put_cpu(); + +@@ -2369,7 +2376,7 @@ static int kvm_guest_time_update(struct + } + if (!use_master_clock) { + host_tsc = rdtsc(); +- kernel_ns = ktime_get_boottime_ns(); ++ kernel_ns = get_kvmclock_base_ns(); + } + + tsc_timestamp = kvm_read_l1_tsc(v, host_tsc); +@@ -2409,6 +2416,7 @@ static int kvm_guest_time_update(struct + vcpu->hv_clock.tsc_timestamp = tsc_timestamp; + vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; + vcpu->last_guest_tsc = tsc_timestamp; ++ WARN_ON(vcpu->hv_clock.system_time < 0); + + /* If the host uses TSC clocksource, then it is stable */ + pvclock_flags = 0; +@@ -9580,7 +9588,7 @@ int kvm_arch_init_vm(struct kvm *kvm, un + mutex_init(&kvm->arch.apic_map_lock); + spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); + +- kvm->arch.kvmclock_offset = -ktime_get_boottime_ns(); ++ kvm->arch.kvmclock_offset = -get_kvmclock_base_ns(); + pvclock_update_vm_gtod_copy(kvm); + + kvm->arch.guest_can_read_msr_platform_info = true; diff --git a/queue-5.5/mm-page_alloc.c-fix-uninitialized-memmaps-on-a-partially-populated-last-section.patch b/queue-5.5/mm-page_alloc.c-fix-uninitialized-memmaps-on-a-partially-populated-last-section.patch new file mode 100644 index 00000000000..185453a90f9 --- /dev/null +++ b/queue-5.5/mm-page_alloc.c-fix-uninitialized-memmaps-on-a-partially-populated-last-section.patch @@ -0,0 +1,134 @@ +From e822969cab48b786b64246aad1a3ba2a774f5d23 Mon Sep 17 00:00:00 2001 +From: David Hildenbrand +Date: Mon, 3 Feb 2020 17:33:48 -0800 +Subject: mm/page_alloc.c: fix uninitialized memmaps on a partially populated last section + +From: David Hildenbrand + +commit e822969cab48b786b64246aad1a3ba2a774f5d23 upstream. + +Patch series "mm: fix max_pfn not falling on section boundary", v2. + +Playing with different memory sizes for a x86-64 guest, I discovered that +some memmaps (highest section if max_mem does not fall on the section +boundary) are marked as being valid and online, but contain garbage. We +have to properly initialize these memmaps. 
+ +Looking at /proc/kpageflags and friends, I found some more issues, +partially related to this. + +This patch (of 3): + +If max_pfn is not aligned to a section boundary, we can easily run into +BUGs. This can e.g., be triggered on x86-64 under QEMU by specifying a +memory size that is not a multiple of 128MB (e.g., 4097MB, but also +4160MB). I was told that on real HW, we can easily have this scenario +(esp., one of the main reasons sub-section hotadd of devmem was added). + +The issue is, that we have a valid memmap (pfn_valid()) for the whole +section, and the whole section will be marked "online". +pfn_to_online_page() will succeed, but the memmap contains garbage. + +E.g., doing a "./page-types -r -a 0x144001" when QEMU was started with "-m +4160M" - (see tools/vm/page-types.c): + +[ 200.476376] BUG: unable to handle page fault for address: fffffffffffffffe +[ 200.477500] #PF: supervisor read access in kernel mode +[ 200.478334] #PF: error_code(0x0000) - not-present page +[ 200.479076] PGD 59614067 P4D 59614067 PUD 59616067 PMD 0 +[ 200.479557] Oops: 0000 [#4] SMP NOPTI +[ 200.479875] CPU: 0 PID: 603 Comm: page-types Tainted: G D W 5.5.0-rc1-next-20191209 #93 +[ 200.480646] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu4 +[ 200.481648] RIP: 0010:stable_page_flags+0x4d/0x410 +[ 200.482061] Code: f3 ff 41 89 c0 48 b8 00 00 00 00 01 00 00 00 45 84 c0 0f 85 cd 02 00 00 48 8b 53 08 48 8b 2b 48f +[ 200.483644] RSP: 0018:ffffb139401cbe60 EFLAGS: 00010202 +[ 200.484091] RAX: fffffffffffffffe RBX: fffffbeec5100040 RCX: 0000000000000000 +[ 200.484697] RDX: 0000000000000001 RSI: ffffffff9535c7cd RDI: 0000000000000246 +[ 200.485313] RBP: ffffffffffffffff R08: 0000000000000000 R09: 0000000000000000 +[ 200.485917] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000144001 +[ 200.486523] R13: 00007ffd6ba55f48 R14: 00007ffd6ba55f40 R15: ffffb139401cbf08 +[ 200.487130] FS: 00007f68df717580(0000) GS:ffff9ec77fa00000(0000) knlGS:0000000000000000 +[ 200.487804] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[ 200.488295] CR2: fffffffffffffffe CR3: 0000000135d48000 CR4: 00000000000006f0 +[ 200.488897] Call Trace: +[ 200.489115] kpageflags_read+0xe9/0x140 +[ 200.489447] proc_reg_read+0x3c/0x60 +[ 200.489755] vfs_read+0xc2/0x170 +[ 200.490037] ksys_pread64+0x65/0xa0 +[ 200.490352] do_syscall_64+0x5c/0xa0 +[ 200.490665] entry_SYSCALL_64_after_hwframe+0x49/0xbe + +But it can be triggered much easier via "cat /proc/kpageflags > /dev/null" +after cold/hot plugging a DIMM to such a system: + +[root@localhost ~]# cat /proc/kpageflags > /dev/null +[ 111.517275] BUG: unable to handle page fault for address: fffffffffffffffe +[ 111.517907] #PF: supervisor read access in kernel mode +[ 111.518333] #PF: error_code(0x0000) - not-present page +[ 111.518771] PGD a240e067 P4D a240e067 PUD a2410067 PMD 0 + +This patch fixes that by at least zero-ing out that memmap (so e.g., +page_to_pfn() will not crash). Commit 907ec5fca3dc ("mm: zero remaining +unavailable struct pages") tried to fix a similar issue, but forgot to +consider this special case. + +After this patch, there are still problems to solve. E.g., not all of +these pages falling into a memory hole will actually get initialized later +and set PageReserved - they are only zeroed out - but at least the +immediate crashes are gone. A follow-up patch will take care of this. 
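For readers following the arithmetic: the fix in the hunk below only extends the zeroed range from max_pfn up to the next section boundary. A minimal user-space sketch of that rounding, assuming the x86-64 defaults of 4 KiB pages and 128 MiB sections (the constants here are illustrative stand-ins, not taken from kernel headers):

/* Illustrates why "-m 4097M" leaves a tail of the last section's memmap
 * uninitialized unless the zeroed range is rounded up to the section end. */
#include <stdio.h>

#define PAGE_SHIFT        12                  /* 4 KiB pages     */
#define PAGES_PER_SECTION (1UL << 15)         /* 128 MiB / 4 KiB */

static unsigned long round_up_pfn(unsigned long pfn, unsigned long align)
{
	return (pfn + align - 1) & ~(align - 1);
}

int main(void)
{
	unsigned long max_pfn = (4097UL << 20) >> PAGE_SHIFT;  /* 4097 MiB RAM */
	unsigned long section_end = round_up_pfn(max_pfn, PAGES_PER_SECTION);

	/* pfn_valid() is true for the whole last section, but these pages have
	 * no backing memory; their struct pages must at least be zeroed. */
	printf("pages between max_pfn and section end: %lu\n",
	       section_end - max_pfn);
	return 0;
}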
+ +Link: http://lkml.kernel.org/r/20191211163201.17179-2-david@redhat.com +Fixes: f7f99100d8d9 ("mm: stop zeroing memory during allocation in vmemmap") +Signed-off-by: David Hildenbrand +Tested-by: Daniel Jordan +Cc: Naoya Horiguchi +Cc: Pavel Tatashin +Cc: Andrew Morton +Cc: Steven Sistare +Cc: Michal Hocko +Cc: Daniel Jordan +Cc: Bob Picco +Cc: Oscar Salvador +Cc: Alexey Dobriyan +Cc: Dan Williams +Cc: Michal Hocko +Cc: Stephen Rothwell +Cc: [4.15+] +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/page_alloc.c | 14 ++++++++++++-- + 1 file changed, 12 insertions(+), 2 deletions(-) + +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -6921,7 +6921,8 @@ static u64 zero_pfn_range(unsigned long + * This function also addresses a similar issue where struct pages are left + * uninitialized because the physical address range is not covered by + * memblock.memory or memblock.reserved. That could happen when memblock +- * layout is manually configured via memmap=. ++ * layout is manually configured via memmap=, or when the highest physical ++ * address (max_pfn) does not end on a section boundary. + */ + void __init zero_resv_unavail(void) + { +@@ -6939,7 +6940,16 @@ void __init zero_resv_unavail(void) + pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start)); + next = end; + } +- pgcnt += zero_pfn_range(PFN_DOWN(next), max_pfn); ++ ++ /* ++ * Early sections always have a fully populated memmap for the whole ++ * section - see pfn_valid(). If the last section has holes at the ++ * end and that section is marked "online", the memmap will be ++ * considered initialized. Make sure that memmap has a well defined ++ * state. ++ */ ++ pgcnt += zero_pfn_range(PFN_DOWN(next), ++ round_up(max_pfn, PAGES_PER_SECTION)); + + /* + * Struct pages that do not have backing memory. This could be because diff --git a/queue-5.5/ocfs2-fix-oops-when-writing-cloned-file.patch b/queue-5.5/ocfs2-fix-oops-when-writing-cloned-file.patch new file mode 100644 index 00000000000..fde63c4a031 --- /dev/null +++ b/queue-5.5/ocfs2-fix-oops-when-writing-cloned-file.patch @@ -0,0 +1,139 @@ +From 2d797e9ff95ecbcf0a83d657928ed20579444857 Mon Sep 17 00:00:00 2001 +From: Gang He +Date: Mon, 3 Feb 2020 17:33:45 -0800 +Subject: ocfs2: fix oops when writing cloned file + +From: Gang He + +commit 2d797e9ff95ecbcf0a83d657928ed20579444857 upstream. + +Writing a cloned file triggers a kernel oops and the user-space command +process is also killed by the system. The bug can be reproduced stably +via: + +1) create a file under ocfs2 file system directory. + + journalctl -b > aa.txt + +2) create a cloned file for this file. + + reflink aa.txt bb.txt + +3) write the cloned file with dd command. + + dd if=/dev/zero of=bb.txt bs=512 count=1 conv=notrunc + +The dd command is killed by the kernel, then you can see the oops message +via dmesg command. 
+ +[ 463.875404] BUG: kernel NULL pointer dereference, address: 0000000000000028 +[ 463.875413] #PF: supervisor read access in kernel mode +[ 463.875416] #PF: error_code(0x0000) - not-present page +[ 463.875418] PGD 0 P4D 0 +[ 463.875425] Oops: 0000 [#1] SMP PTI +[ 463.875431] CPU: 1 PID: 2291 Comm: dd Tainted: G OE 5.3.16-2-default +[ 463.875433] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 +[ 463.875500] RIP: 0010:ocfs2_refcount_cow+0xa4/0x5d0 [ocfs2] +[ 463.875505] Code: 06 89 6c 24 38 89 eb f6 44 24 3c 02 74 be 49 8b 47 28 +[ 463.875508] RSP: 0018:ffffa2cb409dfce8 EFLAGS: 00010202 +[ 463.875512] RAX: ffff8b1ebdca8000 RBX: 0000000000000001 RCX: ffff8b1eb73a9df0 +[ 463.875515] RDX: 0000000000056a01 RSI: 0000000000000000 RDI: 0000000000000000 +[ 463.875517] RBP: 0000000000000001 R08: ffff8b1eb73a9de0 R09: 0000000000000000 +[ 463.875520] R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000000 +[ 463.875522] R13: ffff8b1eb922f048 R14: 0000000000000000 R15: ffff8b1eb922f048 +[ 463.875526] FS: 00007f8f44d15540(0000) GS:ffff8b1ebeb00000(0000) knlGS:0000000000000000 +[ 463.875529] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[ 463.875532] CR2: 0000000000000028 CR3: 000000003c17a000 CR4: 00000000000006e0 +[ 463.875546] Call Trace: +[ 463.875596] ? ocfs2_inode_lock_full_nested+0x18b/0x960 [ocfs2] +[ 463.875648] ocfs2_file_write_iter+0xaf8/0xc70 [ocfs2] +[ 463.875672] new_sync_write+0x12d/0x1d0 +[ 463.875688] vfs_write+0xad/0x1a0 +[ 463.875697] ksys_write+0xa1/0xe0 +[ 463.875710] do_syscall_64+0x60/0x1f0 +[ 463.875743] entry_SYSCALL_64_after_hwframe+0x49/0xbe +[ 463.875758] RIP: 0033:0x7f8f4482ed44 +[ 463.875762] Code: 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 80 00 00 00 +[ 463.875765] RSP: 002b:00007fff300a79d8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 +[ 463.875769] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f8f4482ed44 +[ 463.875771] RDX: 0000000000000200 RSI: 000055f771b5c000 RDI: 0000000000000001 +[ 463.875774] RBP: 0000000000000200 R08: 00007f8f44af9c78 R09: 0000000000000003 +[ 463.875776] R10: 000000000000089f R11: 0000000000000246 R12: 000055f771b5c000 +[ 463.875779] R13: 0000000000000200 R14: 0000000000000000 R15: 000055f771b5c000 + +This regression problem was introduced by commit e74540b28556 ("ocfs2: +protect extent tree in ocfs2_prepare_inode_for_write()"). + +Link: http://lkml.kernel.org/r/20200121050153.13290-1-ghe@suse.com +Fixes: e74540b28556 ("ocfs2: protect extent tree in ocfs2_prepare_inode_for_write()"). +Signed-off-by: Gang He +Reviewed-by: Joseph Qi +Cc: Mark Fasheh +Cc: Joel Becker +Cc: Junxiao Bi +Cc: Changwei Ge +Cc: Jun Piao +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ocfs2/file.c | 14 ++++++-------- + 1 file changed, 6 insertions(+), 8 deletions(-) + +--- a/fs/ocfs2/file.c ++++ b/fs/ocfs2/file.c +@@ -2101,17 +2101,15 @@ static int ocfs2_is_io_unaligned(struct + static int ocfs2_inode_lock_for_extent_tree(struct inode *inode, + struct buffer_head **di_bh, + int meta_level, +- int overwrite_io, + int write_sem, + int wait) + { + int ret = 0; + + if (wait) +- ret = ocfs2_inode_lock(inode, NULL, meta_level); ++ ret = ocfs2_inode_lock(inode, di_bh, meta_level); + else +- ret = ocfs2_try_inode_lock(inode, +- overwrite_io ? 
NULL : di_bh, meta_level); ++ ret = ocfs2_try_inode_lock(inode, di_bh, meta_level); + if (ret < 0) + goto out; + +@@ -2136,6 +2134,7 @@ static int ocfs2_inode_lock_for_extent_t + + out_unlock: + brelse(*di_bh); ++ *di_bh = NULL; + ocfs2_inode_unlock(inode, meta_level); + out: + return ret; +@@ -2177,7 +2176,6 @@ static int ocfs2_prepare_inode_for_write + ret = ocfs2_inode_lock_for_extent_tree(inode, + &di_bh, + meta_level, +- overwrite_io, + write_sem, + wait); + if (ret < 0) { +@@ -2233,13 +2231,13 @@ static int ocfs2_prepare_inode_for_write + &di_bh, + meta_level, + write_sem); ++ meta_level = 1; ++ write_sem = 1; + ret = ocfs2_inode_lock_for_extent_tree(inode, + &di_bh, + meta_level, +- overwrite_io, +- 1, ++ write_sem, + wait); +- write_sem = 1; + if (ret < 0) { + if (ret != -EAGAIN) + mlog_errno(ret); diff --git a/queue-5.5/series b/queue-5.5/series index f7041a62f0c..ea60440d134 100644 --- a/queue-5.5/series +++ b/queue-5.5/series @@ -241,3 +241,22 @@ kvm-ppc-book3s-hv-uninit-vcpu-if-vcore-creation-fails.patch kvm-ppc-book3s-pr-free-shared-page-if-mmu-initialization-fails.patch kvm-svm-pku-not-currently-supported.patch kvm-x86-mmu-apply-max-pa-check-for-mmio-sptes-to-32-bit-kvm.patch +x86-kvm-be-careful-not-to-clear-kvm_vcpu_flush_tlb-bit.patch +kvm-x86-use-cpuid-to-locate-host-page-table-reserved-bits.patch +x86-kvm-introduce-kvm_-un-map_gfn.patch +x86-kvm-make-sure-kvm_vcpu_flush_tlb-flag-is-not-missed.patch +x86-kvm-cache-gfn-to-pfn-translation.patch +x86-kvm-clean-up-host-s-steal-time-structure.patch +kvm-x86-use-gpa_t-for-cr2-gpa-to-fix-tdp-support-on-32-bit-kvm.patch +kvm-vmx-add-non-canonical-check-on-writes-to-rtit-address-msrs.patch +kvm-x86-don-t-let-userspace-set-host-reserved-cr4-bits.patch +kvm-x86-free-wbinvd_dirty_mask-if-vcpu-creation-fails.patch +kvm-x86-fix-overlap-between-spte_mmio_mask-and-generation.patch +kvm-x86-handle-tif_need_fpu_load-in-kvm_-load-put-_guest_fpu.patch +kvm-x86-ensure-guest-s-fpu-state-is-loaded-when-accessing-for-emulation.patch +kvm-x86-revert-kvm-x86-fix-fpu-state-crash-in-kvm-guest.patch +kvm-s390-do-not-clobber-registers-during-guest-reset-store-status.patch +kvm-x86-reorganize-pvclock_gtod_data-members.patch +kvm-x86-use-raw-clock-values-consistently.patch +ocfs2-fix-oops-when-writing-cloned-file.patch +mm-page_alloc.c-fix-uninitialized-memmaps-on-a-partially-populated-last-section.patch diff --git a/queue-5.5/x86-kvm-be-careful-not-to-clear-kvm_vcpu_flush_tlb-bit.patch b/queue-5.5/x86-kvm-be-careful-not-to-clear-kvm_vcpu_flush_tlb-bit.patch new file mode 100644 index 00000000000..f5fe6438cd9 --- /dev/null +++ b/queue-5.5/x86-kvm-be-careful-not-to-clear-kvm_vcpu_flush_tlb-bit.patch @@ -0,0 +1,39 @@ +From 8c6de56a42e0c657955e12b882a81ef07d1d073e Mon Sep 17 00:00:00 2001 +From: Boris Ostrovsky +Date: Wed, 30 Oct 2019 19:01:31 +0000 +Subject: x86/kvm: Be careful not to clear KVM_VCPU_FLUSH_TLB bit + +From: Boris Ostrovsky + +commit 8c6de56a42e0c657955e12b882a81ef07d1d073e upstream. + +kvm_steal_time_set_preempted() may accidentally clear KVM_VCPU_FLUSH_TLB +bit if it is called more than once while VCPU is preempted. + +This is part of CVE-2019-3016. 
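The mechanism is easiest to see with the flag values from the x86 kvm_para UAPI header. A stripped-down sketch of the pattern the hunk below introduces (the real code guards on the host-side copy of the field before it is written out to guest memory):

#define KVM_VCPU_PREEMPTED  (1 << 0)
#define KVM_VCPU_FLUSH_TLB  (1 << 1)

/* Called each time the vCPU is scheduled out.  Without the early return,
 * a second call while the vCPU is still preempted would store PREEMPTED
 * again and wipe out a FLUSH_TLB request the guest set in the meantime. */
static void set_preempted(unsigned char *preempted)
{
	if (*preempted & KVM_VCPU_PREEMPTED)
		return;			/* already marked, don't overwrite */

	*preempted = KVM_VCPU_PREEMPTED;
}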
+ +(This bug was also independently discovered by Jim Mattson +) + +Signed-off-by: Boris Ostrovsky +Reviewed-by: Joao Martins +Cc: stable@vger.kernel.org +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/x86.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -3514,6 +3514,9 @@ static void kvm_steal_time_set_preempted + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) + return; + ++ if (vcpu->arch.st.steal.preempted) ++ return; ++ + vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED; + + kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime, diff --git a/queue-5.5/x86-kvm-cache-gfn-to-pfn-translation.patch b/queue-5.5/x86-kvm-cache-gfn-to-pfn-translation.patch new file mode 100644 index 00000000000..ebffaa46ee3 --- /dev/null +++ b/queue-5.5/x86-kvm-cache-gfn-to-pfn-translation.patch @@ -0,0 +1,285 @@ +From 917248144db5d7320655dbb41d3af0b8a0f3d589 Mon Sep 17 00:00:00 2001 +From: Boris Ostrovsky +Date: Thu, 5 Dec 2019 01:30:51 +0000 +Subject: x86/kvm: Cache gfn to pfn translation + +From: Boris Ostrovsky + +commit 917248144db5d7320655dbb41d3af0b8a0f3d589 upstream. + +__kvm_map_gfn()'s call to gfn_to_pfn_memslot() is +* relatively expensive +* in certain cases (such as when done from atomic context) cannot be called + +Stashing gfn-to-pfn mapping should help with both cases. + +This is part of CVE-2019-3016. + +Signed-off-by: Boris Ostrovsky +Reviewed-by: Joao Martins +Cc: stable@vger.kernel.org +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 10 ++++ + include/linux/kvm_host.h | 7 ++ + include/linux/kvm_types.h | 9 +++ + virt/kvm/kvm_main.c | 98 ++++++++++++++++++++++++++++++++-------- + 5 files changed, 103 insertions(+), 22 deletions(-) + +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -689,6 +689,7 @@ struct kvm_vcpu_arch { + u64 last_steal; + struct gfn_to_hva_cache stime; + struct kvm_steal_time steal; ++ struct gfn_to_pfn_cache cache; + } st; + + u64 tsc_offset; +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -9111,6 +9111,9 @@ static void fx_init(struct kvm_vcpu *vcp + void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) + { + void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask; ++ struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache; ++ ++ kvm_release_pfn(cache->pfn, cache->dirty, cache); + + kvmclock_reset(vcpu); + +@@ -9784,11 +9787,18 @@ out_free: + + void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) + { ++ struct kvm_vcpu *vcpu; ++ int i; ++ + /* + * memslots->generation has been incremented. + * mmio generation may have reached its maximum value. 
+ */ + kvm_mmu_invalidate_mmio_sptes(kvm, gen); ++ ++ /* Force re-initialization of steal_time cache */ ++ kvm_for_each_vcpu(i, vcpu, kvm) ++ kvm_vcpu_kick(vcpu); + } + + int kvm_arch_prepare_memory_region(struct kvm *kvm, +--- a/include/linux/kvm_host.h ++++ b/include/linux/kvm_host.h +@@ -723,6 +723,7 @@ void kvm_set_pfn_dirty(kvm_pfn_t pfn); + void kvm_set_pfn_accessed(kvm_pfn_t pfn); + void kvm_get_pfn(kvm_pfn_t pfn); + ++void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache); + int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, + int len); + int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, +@@ -775,10 +776,12 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_ + kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn); + kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); + int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map); +-int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map); ++int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, ++ struct gfn_to_pfn_cache *cache, bool atomic); + struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn); + void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty); +-int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty); ++int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, ++ struct gfn_to_pfn_cache *cache, bool dirty, bool atomic); + unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn); + unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable); + int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, +--- a/include/linux/kvm_types.h ++++ b/include/linux/kvm_types.h +@@ -18,7 +18,7 @@ struct kvm_memslots; + + enum kvm_mr_change; + +-#include ++#include + + /* + * Address types: +@@ -51,4 +51,11 @@ struct gfn_to_hva_cache { + struct kvm_memory_slot *memslot; + }; + ++struct gfn_to_pfn_cache { ++ u64 generation; ++ gfn_t gfn; ++ kvm_pfn_t pfn; ++ bool dirty; ++}; ++ + #endif /* __KVM_TYPES_H__ */ +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -1821,27 +1821,72 @@ struct page *gfn_to_page(struct kvm *kvm + } + EXPORT_SYMBOL_GPL(gfn_to_page); + ++void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache) ++{ ++ if (pfn == 0) ++ return; ++ ++ if (cache) ++ cache->pfn = cache->gfn = 0; ++ ++ if (dirty) ++ kvm_release_pfn_dirty(pfn); ++ else ++ kvm_release_pfn_clean(pfn); ++} ++ ++static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn, ++ struct gfn_to_pfn_cache *cache, u64 gen) ++{ ++ kvm_release_pfn(cache->pfn, cache->dirty, cache); ++ ++ cache->pfn = gfn_to_pfn_memslot(slot, gfn); ++ cache->gfn = gfn; ++ cache->dirty = false; ++ cache->generation = gen; ++} ++ + static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn, +- struct kvm_host_map *map) ++ struct kvm_host_map *map, ++ struct gfn_to_pfn_cache *cache, ++ bool atomic) + { + kvm_pfn_t pfn; + void *hva = NULL; + struct page *page = KVM_UNMAPPED_PAGE; + struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn); ++ u64 gen = slots->generation; + + if (!map) + return -EINVAL; + +- pfn = gfn_to_pfn_memslot(slot, gfn); ++ if (cache) { ++ if (!cache->pfn || cache->gfn != gfn || ++ cache->generation != gen) { ++ if (atomic) ++ return -EAGAIN; ++ kvm_cache_gfn_to_pfn(slot, gfn, cache, gen); ++ } ++ pfn = cache->pfn; ++ } else { ++ if 
(atomic) ++ return -EAGAIN; ++ pfn = gfn_to_pfn_memslot(slot, gfn); ++ } + if (is_error_noslot_pfn(pfn)) + return -EINVAL; + + if (pfn_valid(pfn)) { + page = pfn_to_page(pfn); +- hva = kmap(page); ++ if (atomic) ++ hva = kmap_atomic(page); ++ else ++ hva = kmap(page); + #ifdef CONFIG_HAS_IOMEM +- } else { ++ } else if (!atomic) { + hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB); ++ } else { ++ return -EINVAL; + #endif + } + +@@ -1856,20 +1901,25 @@ static int __kvm_map_gfn(struct kvm_mems + return 0; + } + +-int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) ++int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, ++ struct gfn_to_pfn_cache *cache, bool atomic) + { +- return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map); ++ return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map, ++ cache, atomic); + } + EXPORT_SYMBOL_GPL(kvm_map_gfn); + + int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) + { +- return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map); ++ return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map, ++ NULL, false); + } + EXPORT_SYMBOL_GPL(kvm_vcpu_map); + + static void __kvm_unmap_gfn(struct kvm_memory_slot *memslot, +- struct kvm_host_map *map, bool dirty) ++ struct kvm_host_map *map, ++ struct gfn_to_pfn_cache *cache, ++ bool dirty, bool atomic) + { + if (!map) + return; +@@ -1877,34 +1927,44 @@ static void __kvm_unmap_gfn(struct kvm_m + if (!map->hva) + return; + +- if (map->page != KVM_UNMAPPED_PAGE) +- kunmap(map->page); ++ if (map->page != KVM_UNMAPPED_PAGE) { ++ if (atomic) ++ kunmap_atomic(map->hva); ++ else ++ kunmap(map->page); ++ } + #ifdef CONFIG_HAS_IOMEM +- else ++ else if (!atomic) + memunmap(map->hva); ++ else ++ WARN_ONCE(1, "Unexpected unmapping in atomic context"); + #endif + +- if (dirty) { ++ if (dirty) + mark_page_dirty_in_slot(memslot, map->gfn); +- kvm_release_pfn_dirty(map->pfn); +- } else { +- kvm_release_pfn_clean(map->pfn); +- } ++ ++ if (cache) ++ cache->dirty |= dirty; ++ else ++ kvm_release_pfn(map->pfn, dirty, NULL); + + map->hva = NULL; + map->page = NULL; + } + +-int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) ++int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, ++ struct gfn_to_pfn_cache *cache, bool dirty, bool atomic) + { +- __kvm_unmap_gfn(gfn_to_memslot(vcpu->kvm, map->gfn), map, dirty); ++ __kvm_unmap_gfn(gfn_to_memslot(vcpu->kvm, map->gfn), map, ++ cache, dirty, atomic); + return 0; + } + EXPORT_SYMBOL_GPL(kvm_unmap_gfn); + + void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) + { +- __kvm_unmap_gfn(kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), map, dirty); ++ __kvm_unmap_gfn(kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), map, NULL, ++ dirty, false); + } + EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); + diff --git a/queue-5.5/x86-kvm-clean-up-host-s-steal-time-structure.patch b/queue-5.5/x86-kvm-clean-up-host-s-steal-time-structure.patch new file mode 100644 index 00000000000..2817d668509 --- /dev/null +++ b/queue-5.5/x86-kvm-clean-up-host-s-steal-time-structure.patch @@ -0,0 +1,81 @@ +From a6bd811f1209fe1c64c9f6fd578101d6436c6b6e Mon Sep 17 00:00:00 2001 +From: Boris Ostrovsky +Date: Fri, 6 Dec 2019 15:36:12 +0000 +Subject: x86/KVM: Clean up host's steal time structure + +From: Boris Ostrovsky + +commit a6bd811f1209fe1c64c9f6fd578101d6436c6b6e upstream. + +Now that we are mapping kvm_steal_time from the guest directly we +don't need keep a copy of it in kvm_vcpu_arch.st. 
The same is true +for the stime field. + +This is part of CVE-2019-3016. + +Signed-off-by: Boris Ostrovsky +Reviewed-by: Joao Martins +Cc: stable@vger.kernel.org +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/include/asm/kvm_host.h | 3 +-- + arch/x86/kvm/x86.c | 11 +++-------- + 2 files changed, 4 insertions(+), 10 deletions(-) + +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -685,10 +685,9 @@ struct kvm_vcpu_arch { + bool pvclock_set_guest_stopped_request; + + struct { ++ u8 preempted; + u64 msr_val; + u64 last_steal; +- struct gfn_to_hva_cache stime; +- struct kvm_steal_time steal; + struct gfn_to_pfn_cache cache; + } st; + +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -2611,7 +2611,7 @@ static void record_steal_time(struct kvm + if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) + kvm_vcpu_flush_tlb(vcpu, false); + +- vcpu->arch.st.steal.preempted = 0; ++ vcpu->arch.st.preempted = 0; + + if (st->version & 1) + st->version += 1; /* first time write, random junk */ +@@ -2795,11 +2795,6 @@ int kvm_set_msr_common(struct kvm_vcpu * + if (data & KVM_STEAL_RESERVED_MASK) + return 1; + +- if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, +- data & KVM_STEAL_VALID_BITS, +- sizeof(struct kvm_steal_time))) +- return 1; +- + vcpu->arch.st.msr_val = data; + + if (!(data & KVM_MSR_ENABLED)) +@@ -3519,7 +3514,7 @@ static void kvm_steal_time_set_preempted + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) + return; + +- if (vcpu->arch.st.steal.preempted) ++ if (vcpu->arch.st.preempted) + return; + + if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map, +@@ -3529,7 +3524,7 @@ static void kvm_steal_time_set_preempted + st = map.hva + + offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); + +- st->preempted = vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED; ++ st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; + + kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true); + } diff --git a/queue-5.5/x86-kvm-introduce-kvm_-un-map_gfn.patch b/queue-5.5/x86-kvm-introduce-kvm_-un-map_gfn.patch new file mode 100644 index 00000000000..0e6c864e60d --- /dev/null +++ b/queue-5.5/x86-kvm-introduce-kvm_-un-map_gfn.patch @@ -0,0 +1,109 @@ +From 1eff70a9abd46f175defafd29bc17ad456f398a7 Mon Sep 17 00:00:00 2001 +From: Boris Ostrovsky +Date: Tue, 12 Nov 2019 16:35:06 +0000 +Subject: x86/kvm: Introduce kvm_(un)map_gfn() + +From: Boris Ostrovsky + +commit 1eff70a9abd46f175defafd29bc17ad456f398a7 upstream. + +kvm_vcpu_(un)map operates on gfns from any current address space. +In certain cases we want to make sure we are not mapping SMRAM +and for that we can use kvm_(un)map_gfn() that we are introducing +in this patch. + +This is part of CVE-2019-3016. 
+ +Signed-off-by: Boris Ostrovsky +Reviewed-by: Joao Martins +Cc: stable@vger.kernel.org +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/kvm_host.h | 2 ++ + virt/kvm/kvm_main.c | 29 ++++++++++++++++++++++++----- + 2 files changed, 26 insertions(+), 5 deletions(-) + +--- a/include/linux/kvm_host.h ++++ b/include/linux/kvm_host.h +@@ -775,8 +775,10 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_ + kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn); + kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); + int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map); ++int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map); + struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn); + void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty); ++int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty); + unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn); + unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable); + int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -1821,12 +1821,13 @@ struct page *gfn_to_page(struct kvm *kvm + } + EXPORT_SYMBOL_GPL(gfn_to_page); + +-static int __kvm_map_gfn(struct kvm_memory_slot *slot, gfn_t gfn, ++static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn, + struct kvm_host_map *map) + { + kvm_pfn_t pfn; + void *hva = NULL; + struct page *page = KVM_UNMAPPED_PAGE; ++ struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn); + + if (!map) + return -EINVAL; +@@ -1855,14 +1856,20 @@ static int __kvm_map_gfn(struct kvm_memo + return 0; + } + ++int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) ++{ ++ return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map); ++} ++EXPORT_SYMBOL_GPL(kvm_map_gfn); ++ + int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) + { +- return __kvm_map_gfn(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, map); ++ return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map); + } + EXPORT_SYMBOL_GPL(kvm_vcpu_map); + +-void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, +- bool dirty) ++static void __kvm_unmap_gfn(struct kvm_memory_slot *memslot, ++ struct kvm_host_map *map, bool dirty) + { + if (!map) + return; +@@ -1878,7 +1885,7 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcp + #endif + + if (dirty) { +- kvm_vcpu_mark_page_dirty(vcpu, map->gfn); ++ mark_page_dirty_in_slot(memslot, map->gfn); + kvm_release_pfn_dirty(map->pfn); + } else { + kvm_release_pfn_clean(map->pfn); +@@ -1887,6 +1894,18 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcp + map->hva = NULL; + map->page = NULL; + } ++ ++int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) ++{ ++ __kvm_unmap_gfn(gfn_to_memslot(vcpu->kvm, map->gfn), map, dirty); ++ return 0; ++} ++EXPORT_SYMBOL_GPL(kvm_unmap_gfn); ++ ++void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) ++{ ++ __kvm_unmap_gfn(kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), map, dirty); ++} + EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); + + struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn) diff --git a/queue-5.5/x86-kvm-make-sure-kvm_vcpu_flush_tlb-flag-is-not-missed.patch b/queue-5.5/x86-kvm-make-sure-kvm_vcpu_flush_tlb-flag-is-not-missed.patch new file mode 100644 index 00000000000..a3687deea55 --- /dev/null +++ 
b/queue-5.5/x86-kvm-make-sure-kvm_vcpu_flush_tlb-flag-is-not-missed.patch @@ -0,0 +1,129 @@ +From b043138246a41064527cf019a3d51d9f015e9796 Mon Sep 17 00:00:00 2001 +From: Boris Ostrovsky +Date: Thu, 5 Dec 2019 03:45:32 +0000 +Subject: x86/KVM: Make sure KVM_VCPU_FLUSH_TLB flag is not missed + +From: Boris Ostrovsky + +commit b043138246a41064527cf019a3d51d9f015e9796 upstream. + +There is a potential race in record_steal_time() between setting +host-local vcpu->arch.st.steal.preempted to zero (i.e. clearing +KVM_VCPU_PREEMPTED) and propagating this value to the guest with +kvm_write_guest_cached(). Between those two events the guest may +still see KVM_VCPU_PREEMPTED in its copy of kvm_steal_time, set +KVM_VCPU_FLUSH_TLB and assume that hypervisor will do the right +thing. Which it won't. + +Instad of copying, we should map kvm_steal_time and that will +guarantee atomicity of accesses to @preempted. + +This is part of CVE-2019-3016. + +Signed-off-by: Boris Ostrovsky +Reviewed-by: Joao Martins +Cc: stable@vger.kernel.org +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/x86.c | 51 ++++++++++++++++++++++++++++++--------------------- + 1 file changed, 30 insertions(+), 21 deletions(-) + +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -2588,45 +2588,47 @@ static void kvm_vcpu_flush_tlb(struct kv + + static void record_steal_time(struct kvm_vcpu *vcpu) + { ++ struct kvm_host_map map; ++ struct kvm_steal_time *st; ++ + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) + return; + +- if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, +- &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) ++ /* -EAGAIN is returned in atomic context so we can just return. */ ++ if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, ++ &map, &vcpu->arch.st.cache, false)) + return; + ++ st = map.hva + ++ offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); ++ + /* + * Doing a TLB flush here, on the guest's behalf, can avoid + * expensive IPIs. 
+ */ + trace_kvm_pv_tlb_flush(vcpu->vcpu_id, +- vcpu->arch.st.steal.preempted & KVM_VCPU_FLUSH_TLB); +- if (xchg(&vcpu->arch.st.steal.preempted, 0) & KVM_VCPU_FLUSH_TLB) ++ st->preempted & KVM_VCPU_FLUSH_TLB); ++ if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) + kvm_vcpu_flush_tlb(vcpu, false); + +- if (vcpu->arch.st.steal.version & 1) +- vcpu->arch.st.steal.version += 1; /* first time write, random junk */ ++ vcpu->arch.st.steal.preempted = 0; + +- vcpu->arch.st.steal.version += 1; ++ if (st->version & 1) ++ st->version += 1; /* first time write, random junk */ + +- kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, +- &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); ++ st->version += 1; + + smp_wmb(); + +- vcpu->arch.st.steal.steal += current->sched_info.run_delay - ++ st->steal += current->sched_info.run_delay - + vcpu->arch.st.last_steal; + vcpu->arch.st.last_steal = current->sched_info.run_delay; + +- kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, +- &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); +- + smp_wmb(); + +- vcpu->arch.st.steal.version += 1; ++ st->version += 1; + +- kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, +- &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); ++ kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false); + } + + int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +@@ -3511,18 +3513,25 @@ void kvm_arch_vcpu_load(struct kvm_vcpu + + static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) + { ++ struct kvm_host_map map; ++ struct kvm_steal_time *st; ++ + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) + return; + + if (vcpu->arch.st.steal.preempted) + return; + +- vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED; ++ if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map, ++ &vcpu->arch.st.cache, true)) ++ return; ++ ++ st = map.hva + ++ offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); ++ ++ st->preempted = vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED; + +- kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime, +- &vcpu->arch.st.steal.preempted, +- offsetof(struct kvm_steal_time, preempted), +- sizeof(vcpu->arch.st.steal.preempted)); ++ kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true); + } + + void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) -- 2.47.3
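Taken together, the record_steal_time() rework above boils down to operating on the mapped guest field directly, so that the xchg() really is atomic with respect to the guest. A self-contained user-space model of that access pattern, using the flag values from the x86 kvm_para UAPI header (the mapping and vCPU machinery are of course elided):

#include <stdatomic.h>
#include <stdio.h>

#define KVM_VCPU_PREEMPTED  (1 << 0)
#define KVM_VCPU_FLUSH_TLB  (1 << 1)

int main(void)
{
	/* Stands for kvm_steal_time.preempted as seen by both guest and host. */
	_Atomic unsigned char preempted = KVM_VCPU_PREEMPTED;

	/* Guest side: still marked preempted, so ask the host to flush. */
	atomic_fetch_or(&preempted, KVM_VCPU_FLUSH_TLB);

	/* Host side, new scheme: a single exchange on the shared field.  The
	 * returned value keeps any concurrently set flush request, which a
	 * write-back of a stale host-local copy could silently drop. */
	unsigned char old = atomic_exchange(&preempted, 0);
	if (old & KVM_VCPU_FLUSH_TLB)
		printf("flush guest TLB before the next VM entry\n");

	return 0;
}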