--- /dev/null
+From 55680890ea78be0df5e1384989f1be835043c084 Mon Sep 17 00:00:00 2001
+From: Christian Borntraeger <borntraeger@de.ibm.com>
+Date: Fri, 31 Jan 2020 05:02:00 -0500
+Subject: KVM: s390: do not clobber registers during guest reset/store status
+
+From: Christian Borntraeger <borntraeger@de.ibm.com>
+
+commit 55680890ea78be0df5e1384989f1be835043c084 upstream.
+
+The initial CPU reset clobbers the userspace fpc and the store status
+ioctl clobbers the guest acrs + fpr. As these calls are only done via
+ioctl (and not via vcpu_run), no CPU context is loaded, so we can (and
+must) act directly on the sync regs, not on the thread context.
+
+Cc: stable@kernel.org
+Fixes: e1788bb995be ("KVM: s390: handle floating point registers in the run ioctl not in vcpu_put/load")
+Fixes: 31d8b8d41a7e ("KVM: s390: handle access registers in the run ioctl not in vcpu_put/load")
+Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
+Reviewed-by: David Hildenbrand <david@redhat.com>
+Reviewed-by: Cornelia Huck <cohuck@redhat.com>
+Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
+Link: https://lore.kernel.org/r/20200131100205.74720-2-frankja@linux.ibm.com
+Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/s390/kvm/kvm-s390.c | 6 ++----
+ 1 file changed, 2 insertions(+), 4 deletions(-)
+
+--- a/arch/s390/kvm/kvm-s390.c
++++ b/arch/s390/kvm/kvm-s390.c
+@@ -2863,9 +2863,7 @@ static void kvm_s390_vcpu_initial_reset(
+ vcpu->arch.sie_block->gcr[14] = CR14_UNUSED_32 |
+ CR14_UNUSED_33 |
+ CR14_EXTERNAL_DAMAGE_SUBMASK;
+- /* make sure the new fpc will be lazily loaded */
+- save_fpu_regs();
+- current->thread.fpu.fpc = 0;
++ vcpu->run->s.regs.fpc = 0;
+ vcpu->arch.sie_block->gbea = 1;
+ vcpu->arch.sie_block->pp = 0;
+ vcpu->arch.sie_block->fpf &= ~FPF_BPBC;
+@@ -4354,7 +4352,7 @@ long kvm_arch_vcpu_ioctl(struct file *fi
+ switch (ioctl) {
+ case KVM_S390_STORE_STATUS:
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+- r = kvm_s390_vcpu_store_status(vcpu, arg);
++ r = kvm_s390_store_status_unloaded(vcpu, arg);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ break;
+ case KVM_S390_SET_INITIAL_PSW: {
--- /dev/null
+From fe6ed369fca98e99df55c932b85782a5687526b5 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+Date: Tue, 10 Dec 2019 15:24:32 -0800
+Subject: KVM: VMX: Add non-canonical check on writes to RTIT address MSRs
+
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+
+commit fe6ed369fca98e99df55c932b85782a5687526b5 upstream.
+
+Reject writes to RTIT address MSRs if the data being written is a
+non-canonical address as the MSRs are subject to canonical checks, e.g.
+KVM will trigger an unchecked #GP when loading the values to hardware
+during pt_guest_enter().
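+
+As a rough sketch of the check being added (illustration only, not the
+kernel's actual helper): an address is canonical for a given
+virtual-address width if sign-extending it from the top implemented
+bit reproduces the original value, and anything else must be rejected
+before it reaches the MSR load:
+
+	/* hypothetical helper, assuming vaddr_bits is 48 or 57 */
+	static inline bool addr_is_noncanonical(u64 addr, int vaddr_bits)
+	{
+		return ((s64)addr << (64 - vaddr_bits) >>
+			(64 - vaddr_bits)) != addr;
+	}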
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/vmx/vmx.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2140,6 +2140,8 @@ static int vmx_set_msr(struct kvm_vcpu *
+ (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_num_address_ranges)))
+ return 1;
++ if (is_noncanonical_address(data, vcpu))
++ return 1;
+ if (index % 2)
+ vmx->pt_desc.guest.addr_b[index / 2] = data;
+ else
--- /dev/null
+From b11306b53b2540c6ba068c4deddb6a17d9f8d95b Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+Date: Tue, 10 Dec 2019 14:44:13 -0800
+Subject: KVM: x86: Don't let userspace set host-reserved cr4 bits
+
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+
+commit b11306b53b2540c6ba068c4deddb6a17d9f8d95b upstream.
+
+Calculate the host-reserved cr4 bits at runtime based on the system's
+capabilities (using logic similar to __do_cpuid_func()), and use the
+dynamically generated mask for the reserved bit check in kvm_set_cr4()
+instead of using the static CR4_RESERVED_BITS define. This prevents
+userspace from "enabling" features in cr4 that are not supported by the
+system, e.g. by ignoring KVM_GET_SUPPORTED_CPUID and specifying a bogus
+CPUID for the vCPU.
+
+Allowing userspace to set unsupported bits in cr4 can lead to a variety
+of undesirable behavior, e.g. failed VM-Enter, and in general increases
+KVM's attack surface. A crafty userspace can even abuse CR4.LA57 to
+induce an unchecked #GP on a WRMSR.
+
+On a platform without LA57 support:
+
+ KVM_SET_CPUID2 // CPUID_7_0_ECX.LA57 = 1
+ KVM_SET_SREGS // CR4.LA57 = 1
+ KVM_SET_MSRS // KERNEL_GS_BASE = 0x0004000000000000
+ KVM_RUN
+
+leads to a #GP when writing KERNEL_GS_BASE into hardware:
+
+ unchecked MSR access error: WRMSR to 0xc0000102 (tried to write 0x0004000000000000)
+ at rIP: 0xffffffffa00f239a (vmx_prepare_switch_to_guest+0x10a/0x1d0 [kvm_intel])
+ Call Trace:
+ kvm_arch_vcpu_ioctl_run+0x671/0x1c70 [kvm]
+ kvm_vcpu_ioctl+0x36b/0x5d0 [kvm]
+ do_vfs_ioctl+0xa1/0x620
+ ksys_ioctl+0x66/0x70
+ __x64_sys_ioctl+0x16/0x20
+ do_syscall_64+0x4c/0x170
+ entry_SYSCALL_64_after_hwframe+0x44/0xa9
+ RIP: 0033:0x7fc08133bf47
+
+Note, the above sequence fails VM-Enter due to invalid guest state.
+Userspace can allow VM-Enter to succeed (after the WRMSR #GP) by adding
+a KVM_SET_SREGS w/ CR4.LA57=0 after KVM_SET_MSRS, in which case KVM will
+technically leak the host's KERNEL_GS_BASE into the guest. But, as
+KERNEL_GS_BASE is a userspace-defined value/address, the leak is largely
+benign as a malicious userspace would simply be exposing its own data to
+the guest, and attacking a benevolent userspace would require multiple
+bugs in the userspace VMM.
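+
+A minimal sketch of the idea (boot_cpu_has() is used here purely for
+illustration; the actual helper added below takes a cpuinfo_x86
+pointer): start from the architectural reserved mask, additionally
+reserve every CR4 bit whose backing CPU feature the host lacks, and
+reject any cr4 value that overlaps the result.
+
+	/* hypothetical illustration of building and using the host mask */
+	u64 mask = CR4_RESERVED_BITS;
+
+	if (!boot_cpu_has(X86_FEATURE_LA57))
+		mask |= X86_CR4_LA57;
+	if (!boot_cpu_has(X86_FEATURE_PKU))
+		mask |= X86_CR4_PKE;
+	/* ... one clause per feature-controlled CR4 bit ... */
+
+	if (cr4 & mask)
+		return -EINVAL;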
+
+Cc: stable@vger.kernel.org
+Cc: Jun Nakajima <jun.nakajima@intel.com>
+Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/x86.c | 35 ++++++++++++++++++++++++++++++++++-
+ 1 file changed, 34 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -92,6 +92,8 @@ u64 __read_mostly efer_reserved_bits = ~
+ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
+ #endif
+
++static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
++
+ #define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__
+ #define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__
+
+@@ -886,9 +888,38 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u
+ }
+ EXPORT_SYMBOL_GPL(kvm_set_xcr);
+
++static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c)
++{
++ u64 reserved_bits = CR4_RESERVED_BITS;
++
++ if (!cpu_has(c, X86_FEATURE_XSAVE))
++ reserved_bits |= X86_CR4_OSXSAVE;
++
++ if (!cpu_has(c, X86_FEATURE_SMEP))
++ reserved_bits |= X86_CR4_SMEP;
++
++ if (!cpu_has(c, X86_FEATURE_SMAP))
++ reserved_bits |= X86_CR4_SMAP;
++
++ if (!cpu_has(c, X86_FEATURE_FSGSBASE))
++ reserved_bits |= X86_CR4_FSGSBASE;
++
++ if (!cpu_has(c, X86_FEATURE_PKU))
++ reserved_bits |= X86_CR4_PKE;
++
++ if (!cpu_has(c, X86_FEATURE_LA57) &&
++ !(cpuid_ecx(0x7) & bit(X86_FEATURE_LA57)))
++ reserved_bits |= X86_CR4_LA57;
++
++ if (!cpu_has(c, X86_FEATURE_UMIP) && !kvm_x86_ops->umip_emulated())
++ reserved_bits |= X86_CR4_UMIP;
++
++ return reserved_bits;
++}
++
+ static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+ {
+- if (cr4 & CR4_RESERVED_BITS)
++ if (cr4 & cr4_reserved_bits)
+ return -EINVAL;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
+@@ -9341,6 +9372,8 @@ int kvm_arch_hardware_setup(void)
+ if (r != 0)
+ return r;
+
++ cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data);
++
+ if (kvm_has_tsc_control) {
+ /*
+ * Make sure the user can only configure tsc_khz values that
--- /dev/null
+From a7baead7e312f5a05381d68585fb6dc68e19e90f Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+Date: Fri, 17 Jan 2020 11:30:50 -0800
+Subject: KVM: x86: Ensure guest's FPU state is loaded when accessing for emulation
+
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+
+commit a7baead7e312f5a05381d68585fb6dc68e19e90f upstream.
+
+Lock the FPU regs and reload the current thread's FPU state, which holds
+the guest's FPU state, to the CPU registers if necessary prior to
+accessing guest FPU state as part of emulation. kernel_fpu_begin() can
+be called from softirq context, therefore KVM must ensure softirqs are
+disabled (locking the FPU regs disables softirqs) when touching CPU FPU
+state.
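+
+The resulting usage pattern (helper names match the ones introduced
+below; the SSE access shown is just one example from the patch): every
+emulator access to x87/SSE/MMX registers is bracketed so the guest's
+state is resident and a softirq cannot clobber it halfway through.
+
+	emulator_get_fpu();	/* fpregs_lock() + reload if TIF_NEED_FPU_LOAD */
+	asm volatile("movdqa %%xmm0, %0" : "=m"(*data));
+	emulator_put_fpu();	/* fpregs_unlock() */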
+
+Note, for all intents and purposes this reverts commit 6ab0b9feb82a7
+("x86,kvm: remove KVM emulator get_fpu / put_fpu"), but at the time it
+was applied, removing get/put_fpu() was correct. The re-introduction
+of {get,put}_fpu() is necessitated by the deferring of FPU state load.
+
+Fixes: 5f409e20b7945 ("x86/fpu: Defer FPU state load until return to userspace")
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/emulate.c | 39 +++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 39 insertions(+)
+
+--- a/arch/x86/kvm/emulate.c
++++ b/arch/x86/kvm/emulate.c
+@@ -22,6 +22,7 @@
+ #include "kvm_cache_regs.h"
+ #include <asm/kvm_emulate.h>
+ #include <linux/stringify.h>
++#include <asm/fpu/api.h>
+ #include <asm/debugreg.h>
+ #include <asm/nospec-branch.h>
+
+@@ -1075,8 +1076,23 @@ static void fetch_register_operand(struc
+ }
+ }
+
++static void emulator_get_fpu(void)
++{
++ fpregs_lock();
++
++ fpregs_assert_state_consistent();
++ if (test_thread_flag(TIF_NEED_FPU_LOAD))
++ switch_fpu_return();
++}
++
++static void emulator_put_fpu(void)
++{
++ fpregs_unlock();
++}
++
+ static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg)
+ {
++ emulator_get_fpu();
+ switch (reg) {
+ case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break;
+ case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break;
+@@ -1098,11 +1114,13 @@ static void read_sse_reg(struct x86_emul
+ #endif
+ default: BUG();
+ }
++ emulator_put_fpu();
+ }
+
+ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
+ int reg)
+ {
++ emulator_get_fpu();
+ switch (reg) {
+ case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break;
+ case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break;
+@@ -1124,10 +1142,12 @@ static void write_sse_reg(struct x86_emu
+ #endif
+ default: BUG();
+ }
++ emulator_put_fpu();
+ }
+
+ static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
+ {
++ emulator_get_fpu();
+ switch (reg) {
+ case 0: asm("movq %%mm0, %0" : "=m"(*data)); break;
+ case 1: asm("movq %%mm1, %0" : "=m"(*data)); break;
+@@ -1139,10 +1159,12 @@ static void read_mmx_reg(struct x86_emul
+ case 7: asm("movq %%mm7, %0" : "=m"(*data)); break;
+ default: BUG();
+ }
++ emulator_put_fpu();
+ }
+
+ static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
+ {
++ emulator_get_fpu();
+ switch (reg) {
+ case 0: asm("movq %0, %%mm0" : : "m"(*data)); break;
+ case 1: asm("movq %0, %%mm1" : : "m"(*data)); break;
+@@ -1154,6 +1176,7 @@ static void write_mmx_reg(struct x86_emu
+ case 7: asm("movq %0, %%mm7" : : "m"(*data)); break;
+ default: BUG();
+ }
++ emulator_put_fpu();
+ }
+
+ static int em_fninit(struct x86_emulate_ctxt *ctxt)
+@@ -1161,7 +1184,9 @@ static int em_fninit(struct x86_emulate_
+ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+ return emulate_nm(ctxt);
+
++ emulator_get_fpu();
+ asm volatile("fninit");
++ emulator_put_fpu();
+ return X86EMUL_CONTINUE;
+ }
+
+@@ -1172,7 +1197,9 @@ static int em_fnstcw(struct x86_emulate_
+ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+ return emulate_nm(ctxt);
+
++ emulator_get_fpu();
+ asm volatile("fnstcw %0": "+m"(fcw));
++ emulator_put_fpu();
+
+ ctxt->dst.val = fcw;
+
+@@ -1186,7 +1213,9 @@ static int em_fnstsw(struct x86_emulate_
+ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+ return emulate_nm(ctxt);
+
++ emulator_get_fpu();
+ asm volatile("fnstsw %0": "+m"(fsw));
++ emulator_put_fpu();
+
+ ctxt->dst.val = fsw;
+
+@@ -4094,8 +4123,12 @@ static int em_fxsave(struct x86_emulate_
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
++ emulator_get_fpu();
++
+ rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state));
+
++ emulator_put_fpu();
++
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+@@ -4138,6 +4171,8 @@ static int em_fxrstor(struct x86_emulate
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
++ emulator_get_fpu();
++
+ if (size < __fxstate_size(16)) {
+ rc = fxregs_fixup(&fx_state, size);
+ if (rc != X86EMUL_CONTINUE)
+@@ -4153,6 +4188,8 @@ static int em_fxrstor(struct x86_emulate
+ rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state));
+
+ out:
++ emulator_put_fpu();
++
+ return rc;
+ }
+
+@@ -5467,7 +5504,9 @@ static int flush_pending_x87_faults(stru
+ {
+ int rc;
+
++ emulator_get_fpu();
+ rc = asm_safe("fwait");
++ emulator_put_fpu();
+
+ if (unlikely(rc != X86EMUL_CONTINUE))
+ return emulate_exception(ctxt, MF_VECTOR, 0, false);
--- /dev/null
+From 16be9ddea268ad841457a59109963fff8c9de38d Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+Date: Wed, 18 Dec 2019 13:54:48 -0800
+Subject: KVM: x86: Free wbinvd_dirty_mask if vCPU creation fails
+
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+
+commit 16be9ddea268ad841457a59109963fff8c9de38d upstream.
+
+Free the vCPU's wbinvd_dirty_mask if vCPU creation fails after
+kvm_arch_vcpu_init(), e.g. when installing the vCPU's file descriptor.
+Do the freeing by calling kvm_arch_vcpu_free() instead of open coding
+the freeing. This adds a likely superfluous, but ultimately harmless,
+call to kvmclock_reset(), which only clears vcpu->arch.pv_time_enabled.
+Using kvm_arch_vcpu_free() allows for additional cleanup in the future.
+
+Fixes: f5f48ee15c2ee ("KVM: VMX: Execute WBINVD to keep data consistency with assigned devices")
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/x86.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -9180,7 +9180,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vc
+ kvm_mmu_unload(vcpu);
+ vcpu_put(vcpu);
+
+- kvm_x86_ops->vcpu_free(vcpu);
++ kvm_arch_vcpu_free(vcpu);
+ }
+
+ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
--- /dev/null
+From c9aef3b85f425d1f6635382ec210ee5a7ef55d7d Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+Date: Fri, 17 Jan 2020 11:30:49 -0800
+Subject: KVM: x86: Handle TIF_NEED_FPU_LOAD in kvm_{load,put}_guest_fpu()
+
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+
+commit c9aef3b85f425d1f6635382ec210ee5a7ef55d7d upstream.
+
+Handle TIF_NEED_FPU_LOAD similar to how fpu__copy() handles the flag
+when duplicating FPU state to a new task struct. TIF_NEED_FPU_LOAD can
+be set any time control is transferred out of KVM, be it voluntarily,
+e.g. if I/O is triggered during a KVM call to get_user_pages, or
+involuntarily, e.g. if softirq runs after an IRQ occurs. Therefore,
+KVM must account for TIF_NEED_FPU_LOAD whenever it is (potentially)
+accessing CPU FPU state.
+
+Fixes: 5f409e20b7945 ("x86/fpu: Defer FPU state load until return to userspace")
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/x86.c | 19 +++++++++++++++++--
+ 1 file changed, 17 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -8493,12 +8493,26 @@ static int complete_emulated_mmio(struct
+ return 0;
+ }
+
++static void kvm_save_current_fpu(struct fpu *fpu)
++{
++ /*
++ * If the target FPU state is not resident in the CPU registers, just
++ * memcpy() from current, else save CPU state directly to the target.
++ */
++ if (test_thread_flag(TIF_NEED_FPU_LOAD))
++ memcpy(&fpu->state, &current->thread.fpu.state,
++ fpu_kernel_xstate_size);
++ else
++ copy_fpregs_to_fpstate(fpu);
++}
++
+ /* Swap (qemu) user FPU context for the guest FPU context. */
+ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
+ {
+ fpregs_lock();
+
+- copy_fpregs_to_fpstate(vcpu->arch.user_fpu);
++ kvm_save_current_fpu(vcpu->arch.user_fpu);
++
+ /* PKRU is separately restored in kvm_x86_ops->run. */
+ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
+ ~XFEATURE_MASK_PKRU);
+@@ -8514,7 +8528,8 @@ static void kvm_put_guest_fpu(struct kvm
+ {
+ fpregs_lock();
+
+- copy_fpregs_to_fpstate(vcpu->arch.guest_fpu);
++ kvm_save_current_fpu(vcpu->arch.guest_fpu);
++
+ copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state);
+
+ fpregs_mark_activate();
--- /dev/null
+From 2620fe268e80d667a94553cd37a94ccaa2cb8c83 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+Date: Fri, 17 Jan 2020 11:30:51 -0800
+Subject: KVM: x86: Revert "KVM: X86: Fix fpu state crash in kvm guest"
+
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+
+commit 2620fe268e80d667a94553cd37a94ccaa2cb8c83 upstream.
+
+Reload the current thread's FPU state, which contains the guest's FPU
+state, to the CPU registers if necessary during vcpu_enter_guest().
+TIF_NEED_FPU_LOAD can be set any time control is transferred out of KVM,
+e.g. if I/O is triggered during a KVM call to get_user_pages() or if a
+softirq occurs while KVM is scheduled in.
+
+Moving the handling of TIF_NEED_FPU_LOAD from vcpu_enter_guest() to
+kvm_arch_vcpu_load(), effectively kvm_sched_in(), papered over a bug
+where kvm_put_guest_fpu() failed to account for TIF_NEED_FPU_LOAD. The
+easiest way to hit the kvm_put_guest_fpu() bug was to run with involuntary
+preemption enabled, so handling TIF_NEED_FPU_LOAD during kvm_sched_in()
+made the bug go away. But, removing the handling in vcpu_enter_guest()
+exposed KVM to the rare case of a softirq triggering kernel_fpu_begin()
+between vcpu_load() and vcpu_enter_guest().
+
+Now that kvm_{load,put}_guest_fpu() correctly handle TIF_NEED_FPU_LOAD,
+revert the commit to both restore the vcpu_enter_guest() behavior and
+eliminate the superfluous switch_fpu_return() in kvm_arch_vcpu_load().
+
+Note, leaving the handling in kvm_arch_vcpu_load() isn't wrong per se,
+but it is unnecessary, and most critically, makes it extremely difficult
+to find bugs such as the kvm_put_guest_fpu() issue due to shrinking the
+window where a softirq can corrupt state.
+
+A sample trace triggered by warning if TIF_NEED_FPU_LOAD is set while
+vcpu state is loaded:
+
+ <IRQ>
+ gcmaes_crypt_by_sg.constprop.12+0x26e/0x660
+ ? 0xffffffffc024547d
+ ? __qdisc_run+0x83/0x510
+ ? __dev_queue_xmit+0x45e/0x990
+ ? ip_finish_output2+0x1a8/0x570
+ ? fib4_rule_action+0x61/0x70
+ ? fib4_rule_action+0x70/0x70
+ ? fib_rules_lookup+0x13f/0x1c0
+ ? helper_rfc4106_decrypt+0x82/0xa0
+ ? crypto_aead_decrypt+0x40/0x70
+ ? crypto_aead_decrypt+0x40/0x70
+ ? crypto_aead_decrypt+0x40/0x70
+ ? esp_output_tail+0x8f4/0xa5a [esp4]
+ ? skb_ext_add+0xd3/0x170
+ ? xfrm_input+0x7a6/0x12c0
+ ? xfrm4_rcv_encap+0xae/0xd0
+ ? xfrm4_transport_finish+0x200/0x200
+ ? udp_queue_rcv_one_skb+0x1ba/0x460
+ ? udp_unicast_rcv_skb.isra.63+0x72/0x90
+ ? __udp4_lib_rcv+0x51b/0xb00
+ ? ip_protocol_deliver_rcu+0xd2/0x1c0
+ ? ip_local_deliver_finish+0x44/0x50
+ ? ip_local_deliver+0xe0/0xf0
+ ? ip_protocol_deliver_rcu+0x1c0/0x1c0
+ ? ip_rcv+0xbc/0xd0
+ ? ip_rcv_finish_core.isra.19+0x380/0x380
+ ? __netif_receive_skb_one_core+0x7e/0x90
+ ? netif_receive_skb_internal+0x3d/0xb0
+ ? napi_gro_receive+0xed/0x150
+ ? 0xffffffffc0243c77
+ ? net_rx_action+0x149/0x3b0
+ ? __do_softirq+0xe4/0x2f8
+ ? handle_irq_event_percpu+0x6a/0x80
+ ? irq_exit+0xe6/0xf0
+ ? do_IRQ+0x7f/0xd0
+ ? common_interrupt+0xf/0xf
+ </IRQ>
+ ? irq_entries_start+0x20/0x660
+ ? vmx_get_interrupt_shadow+0x2f0/0x710 [kvm_intel]
+ ? kvm_set_msr_common+0xfc7/0x2380 [kvm]
+ ? recalibrate_cpu_khz+0x10/0x10
+ ? ktime_get+0x3a/0xa0
+ ? kvm_arch_vcpu_ioctl_run+0x107/0x560 [kvm]
+ ? kvm_init+0x6bf/0xd00 [kvm]
+ ? __seccomp_filter+0x7a/0x680
+ ? do_vfs_ioctl+0xa4/0x630
+ ? security_file_ioctl+0x32/0x50
+ ? ksys_ioctl+0x60/0x90
+ ? __x64_sys_ioctl+0x16/0x20
+ ? do_syscall_64+0x5f/0x1a0
+ ? entry_SYSCALL_64_after_hwframe+0x44/0xa9
+---[ end trace 9564a1ccad733a90 ]---
+
+This reverts commit e751732486eb3f159089a64d1901992b1357e7cc.
+
+Fixes: e751732486eb3 ("KVM: X86: Fix fpu state crash in kvm guest")
+Reported-by: Derek Yerger <derek@djy.llc>
+Reported-by: kernel@najdan.com
+Cc: Wanpeng Li <wanpengli@tencent.com>
+Cc: Thomas Lambertz <mail@thomaslambertz.de>
+Cc: Rik van Riel <riel@surriel.com>
+Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Cc: Borislav Petkov <bp@suse.de>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/x86.c | 9 +++------
+ 1 file changed, 3 insertions(+), 6 deletions(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -3481,10 +3481,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu
+
+ kvm_x86_ops->vcpu_load(vcpu, cpu);
+
+- fpregs_assert_state_consistent();
+- if (test_thread_flag(TIF_NEED_FPU_LOAD))
+- switch_fpu_return();
+-
+ /* Apply any externally detected TSC adjustments (due to suspend) */
+ if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
+ adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
+@@ -8220,8 +8216,9 @@ static int vcpu_enter_guest(struct kvm_v
+ trace_kvm_entry(vcpu->vcpu_id);
+ guest_enter_irqoff();
+
+- /* The preempt notifier should have taken care of the FPU already. */
+- WARN_ON_ONCE(test_thread_flag(TIF_NEED_FPU_LOAD));
++ fpregs_assert_state_consistent();
++ if (test_thread_flag(TIF_NEED_FPU_LOAD))
++ switch_fpu_return();
+
+ if (unlikely(vcpu->arch.switch_db_regs)) {
+ set_debugreg(0, 7);
--- /dev/null
+From e822969cab48b786b64246aad1a3ba2a774f5d23 Mon Sep 17 00:00:00 2001
+From: David Hildenbrand <david@redhat.com>
+Date: Mon, 3 Feb 2020 17:33:48 -0800
+Subject: mm/page_alloc.c: fix uninitialized memmaps on a partially populated last section
+
+From: David Hildenbrand <david@redhat.com>
+
+commit e822969cab48b786b64246aad1a3ba2a774f5d23 upstream.
+
+Patch series "mm: fix max_pfn not falling on section boundary", v2.
+
+Playing with different memory sizes for an x86-64 guest, I discovered that
+some memmaps (highest section if max_mem does not fall on the section
+boundary) are marked as being valid and online, but contain garbage. We
+have to properly initialize these memmaps.
+
+Looking at /proc/kpageflags and friends, I found some more issues,
+partially related to this.
+
+This patch (of 3):
+
+If max_pfn is not aligned to a section boundary, we can easily run into
+BUGs. This can e.g., be triggered on x86-64 under QEMU by specifying a
+memory size that is not a multiple of 128MB (e.g., 4097MB, but also
+4160MB). I was told that on real HW, we can easily have this scenario
+(esp., one of the main reasons sub-section hotadd of devmem was added).
+
+The issue is, that we have a valid memmap (pfn_valid()) for the whole
+section, and the whole section will be marked "online".
+pfn_to_online_page() will succeed, but the memmap contains garbage.
+
+E.g., doing a "./page-types -r -a 0x144001" when QEMU was started with "-m
+4160M" - (see tools/vm/page-types.c):
+
+[ 200.476376] BUG: unable to handle page fault for address: fffffffffffffffe
+[ 200.477500] #PF: supervisor read access in kernel mode
+[ 200.478334] #PF: error_code(0x0000) - not-present page
+[ 200.479076] PGD 59614067 P4D 59614067 PUD 59616067 PMD 0
+[ 200.479557] Oops: 0000 [#4] SMP NOPTI
+[ 200.479875] CPU: 0 PID: 603 Comm: page-types Tainted: G D W 5.5.0-rc1-next-20191209 #93
+[ 200.480646] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu4
+[ 200.481648] RIP: 0010:stable_page_flags+0x4d/0x410
+[ 200.482061] Code: f3 ff 41 89 c0 48 b8 00 00 00 00 01 00 00 00 45 84 c0 0f 85 cd 02 00 00 48 8b 53 08 48 8b 2b 48f
+[ 200.483644] RSP: 0018:ffffb139401cbe60 EFLAGS: 00010202
+[ 200.484091] RAX: fffffffffffffffe RBX: fffffbeec5100040 RCX: 0000000000000000
+[ 200.484697] RDX: 0000000000000001 RSI: ffffffff9535c7cd RDI: 0000000000000246
+[ 200.485313] RBP: ffffffffffffffff R08: 0000000000000000 R09: 0000000000000000
+[ 200.485917] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000144001
+[ 200.486523] R13: 00007ffd6ba55f48 R14: 00007ffd6ba55f40 R15: ffffb139401cbf08
+[ 200.487130] FS: 00007f68df717580(0000) GS:ffff9ec77fa00000(0000) knlGS:0000000000000000
+[ 200.487804] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[ 200.488295] CR2: fffffffffffffffe CR3: 0000000135d48000 CR4: 00000000000006f0
+[ 200.488897] Call Trace:
+[ 200.489115] kpageflags_read+0xe9/0x140
+[ 200.489447] proc_reg_read+0x3c/0x60
+[ 200.489755] vfs_read+0xc2/0x170
+[ 200.490037] ksys_pread64+0x65/0xa0
+[ 200.490352] do_syscall_64+0x5c/0xa0
+[ 200.490665] entry_SYSCALL_64_after_hwframe+0x49/0xbe
+
+But it can be triggered much easier via "cat /proc/kpageflags > /dev/null"
+after cold/hot plugging a DIMM to such a system:
+
+[root@localhost ~]# cat /proc/kpageflags > /dev/null
+[ 111.517275] BUG: unable to handle page fault for address: fffffffffffffffe
+[ 111.517907] #PF: supervisor read access in kernel mode
+[ 111.518333] #PF: error_code(0x0000) - not-present page
+[ 111.518771] PGD a240e067 P4D a240e067 PUD a2410067 PMD 0
+
+This patch fixes that by at least zero-ing out that memmap (so e.g.,
+page_to_pfn() will not crash). Commit 907ec5fca3dc ("mm: zero remaining
+unavailable struct pages") tried to fix a similar issue, but forgot to
+consider this special case.
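+
+As a worked example, assuming 4 KiB pages and 128 MiB sections on
+x86-64 (PAGES_PER_SECTION == 32768) and the "-m 4097M" case above:
+
+	max_pfn                              = 4097 MiB / 4 KiB = 1048832
+	PAGES_PER_SECTION                    = 128 MiB / 4 KiB  = 32768
+	round_up(max_pfn, PAGES_PER_SECTION) = 33 * 32768       = 1081344
+
+so the zeroing now extends 32512 pfns past max_pfn to cover the
+trailing hole of the last, partially populated section.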
+
+After this patch, there are still problems to solve. E.g., not all of
+these pages falling into a memory hole will actually get initialized later
+and set PageReserved - they are only zeroed out - but at least the
+immediate crashes are gone. A follow-up patch will take care of this.
+
+Link: http://lkml.kernel.org/r/20191211163201.17179-2-david@redhat.com
+Fixes: f7f99100d8d9 ("mm: stop zeroing memory during allocation in vmemmap")
+Signed-off-by: David Hildenbrand <david@redhat.com>
+Tested-by: Daniel Jordan <daniel.m.jordan@oracle.com>
+Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Cc: Pavel Tatashin <pasha.tatashin@oracle.com>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Steven Sistare <steven.sistare@oracle.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Daniel Jordan <daniel.m.jordan@oracle.com>
+Cc: Bob Picco <bob.picco@oracle.com>
+Cc: Oscar Salvador <osalvador@suse.de>
+Cc: Alexey Dobriyan <adobriyan@gmail.com>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Stephen Rothwell <sfr@canb.auug.org.au>
+Cc: <stable@vger.kernel.org> [4.15+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/page_alloc.c | 14 ++++++++++++--
+ 1 file changed, 12 insertions(+), 2 deletions(-)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -6933,7 +6933,8 @@ static u64 zero_pfn_range(unsigned long
+ * This function also addresses a similar issue where struct pages are left
+ * uninitialized because the physical address range is not covered by
+ * memblock.memory or memblock.reserved. That could happen when memblock
+- * layout is manually configured via memmap=.
++ * layout is manually configured via memmap=, or when the highest physical
++ * address (max_pfn) does not end on a section boundary.
+ */
+ void __init zero_resv_unavail(void)
+ {
+@@ -6951,7 +6952,16 @@ void __init zero_resv_unavail(void)
+ pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start));
+ next = end;
+ }
+- pgcnt += zero_pfn_range(PFN_DOWN(next), max_pfn);
++
++ /*
++ * Early sections always have a fully populated memmap for the whole
++ * section - see pfn_valid(). If the last section has holes at the
++ * end and that section is marked "online", the memmap will be
++ * considered initialized. Make sure that memmap has a well defined
++ * state.
++ */
++ pgcnt += zero_pfn_range(PFN_DOWN(next),
++ round_up(max_pfn, PAGES_PER_SECTION));
+
+ /*
+ * Struct pages that do not have backing memory. This could be because
--- /dev/null
+From 2d797e9ff95ecbcf0a83d657928ed20579444857 Mon Sep 17 00:00:00 2001
+From: Gang He <GHe@suse.com>
+Date: Mon, 3 Feb 2020 17:33:45 -0800
+Subject: ocfs2: fix oops when writing cloned file
+
+From: Gang He <GHe@suse.com>
+
+commit 2d797e9ff95ecbcf0a83d657928ed20579444857 upstream.
+
+Writing a cloned file triggers a kernel oops and the user-space command
+process is also killed by the system. The bug can be reproduced stably
+via:
+
+1) create a file under ocfs2 file system directory.
+
+ journalctl -b > aa.txt
+
+2) create a cloned file for this file.
+
+ reflink aa.txt bb.txt
+
+3) write the cloned file with dd command.
+
+ dd if=/dev/zero of=bb.txt bs=512 count=1 conv=notrunc
+
+The dd command is killed by the kernel; the oops message can then be seen
+via the dmesg command.
+
+[ 463.875404] BUG: kernel NULL pointer dereference, address: 0000000000000028
+[ 463.875413] #PF: supervisor read access in kernel mode
+[ 463.875416] #PF: error_code(0x0000) - not-present page
+[ 463.875418] PGD 0 P4D 0
+[ 463.875425] Oops: 0000 [#1] SMP PTI
+[ 463.875431] CPU: 1 PID: 2291 Comm: dd Tainted: G OE 5.3.16-2-default
+[ 463.875433] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
+[ 463.875500] RIP: 0010:ocfs2_refcount_cow+0xa4/0x5d0 [ocfs2]
+[ 463.875505] Code: 06 89 6c 24 38 89 eb f6 44 24 3c 02 74 be 49 8b 47 28
+[ 463.875508] RSP: 0018:ffffa2cb409dfce8 EFLAGS: 00010202
+[ 463.875512] RAX: ffff8b1ebdca8000 RBX: 0000000000000001 RCX: ffff8b1eb73a9df0
+[ 463.875515] RDX: 0000000000056a01 RSI: 0000000000000000 RDI: 0000000000000000
+[ 463.875517] RBP: 0000000000000001 R08: ffff8b1eb73a9de0 R09: 0000000000000000
+[ 463.875520] R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000000
+[ 463.875522] R13: ffff8b1eb922f048 R14: 0000000000000000 R15: ffff8b1eb922f048
+[ 463.875526] FS: 00007f8f44d15540(0000) GS:ffff8b1ebeb00000(0000) knlGS:0000000000000000
+[ 463.875529] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[ 463.875532] CR2: 0000000000000028 CR3: 000000003c17a000 CR4: 00000000000006e0
+[ 463.875546] Call Trace:
+[ 463.875596] ? ocfs2_inode_lock_full_nested+0x18b/0x960 [ocfs2]
+[ 463.875648] ocfs2_file_write_iter+0xaf8/0xc70 [ocfs2]
+[ 463.875672] new_sync_write+0x12d/0x1d0
+[ 463.875688] vfs_write+0xad/0x1a0
+[ 463.875697] ksys_write+0xa1/0xe0
+[ 463.875710] do_syscall_64+0x60/0x1f0
+[ 463.875743] entry_SYSCALL_64_after_hwframe+0x49/0xbe
+[ 463.875758] RIP: 0033:0x7f8f4482ed44
+[ 463.875762] Code: 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 80 00 00 00
+[ 463.875765] RSP: 002b:00007fff300a79d8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
+[ 463.875769] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f8f4482ed44
+[ 463.875771] RDX: 0000000000000200 RSI: 000055f771b5c000 RDI: 0000000000000001
+[ 463.875774] RBP: 0000000000000200 R08: 00007f8f44af9c78 R09: 0000000000000003
+[ 463.875776] R10: 000000000000089f R11: 0000000000000246 R12: 000055f771b5c000
+[ 463.875779] R13: 0000000000000200 R14: 0000000000000000 R15: 000055f771b5c000
+
+This regression problem was introduced by commit e74540b28556 ("ocfs2:
+protect extent tree in ocfs2_prepare_inode_for_write()").
+
+Link: http://lkml.kernel.org/r/20200121050153.13290-1-ghe@suse.com
+Fixes: e74540b28556 ("ocfs2: protect extent tree in ocfs2_prepare_inode_for_write()").
+Signed-off-by: Gang He <ghe@suse.com>
+Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
+Cc: Mark Fasheh <mark@fasheh.com>
+Cc: Joel Becker <jlbec@evilplan.org>
+Cc: Junxiao Bi <junxiao.bi@oracle.com>
+Cc: Changwei Ge <gechangwei@live.cn>
+Cc: Jun Piao <piaojun@huawei.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ocfs2/file.c | 14 ++++++--------
+ 1 file changed, 6 insertions(+), 8 deletions(-)
+
+--- a/fs/ocfs2/file.c
++++ b/fs/ocfs2/file.c
+@@ -2101,17 +2101,15 @@ static int ocfs2_is_io_unaligned(struct
+ static int ocfs2_inode_lock_for_extent_tree(struct inode *inode,
+ struct buffer_head **di_bh,
+ int meta_level,
+- int overwrite_io,
+ int write_sem,
+ int wait)
+ {
+ int ret = 0;
+
+ if (wait)
+- ret = ocfs2_inode_lock(inode, NULL, meta_level);
++ ret = ocfs2_inode_lock(inode, di_bh, meta_level);
+ else
+- ret = ocfs2_try_inode_lock(inode,
+- overwrite_io ? NULL : di_bh, meta_level);
++ ret = ocfs2_try_inode_lock(inode, di_bh, meta_level);
+ if (ret < 0)
+ goto out;
+
+@@ -2136,6 +2134,7 @@ static int ocfs2_inode_lock_for_extent_t
+
+ out_unlock:
+ brelse(*di_bh);
++ *di_bh = NULL;
+ ocfs2_inode_unlock(inode, meta_level);
+ out:
+ return ret;
+@@ -2177,7 +2176,6 @@ static int ocfs2_prepare_inode_for_write
+ ret = ocfs2_inode_lock_for_extent_tree(inode,
+ &di_bh,
+ meta_level,
+- overwrite_io,
+ write_sem,
+ wait);
+ if (ret < 0) {
+@@ -2233,13 +2231,13 @@ static int ocfs2_prepare_inode_for_write
+ &di_bh,
+ meta_level,
+ write_sem);
++ meta_level = 1;
++ write_sem = 1;
+ ret = ocfs2_inode_lock_for_extent_tree(inode,
+ &di_bh,
+ meta_level,
+- overwrite_io,
+- 1,
++ write_sem,
+ wait);
+- write_sem = 1;
+ if (ret < 0) {
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
kvm-ppc-book3s-hv-uninit-vcpu-if-vcore-creation-fails.patch
kvm-ppc-book3s-pr-free-shared-page-if-mmu-initialization-fails.patch
kvm-svm-pku-not-currently-supported.patch
+x86-kvm-be-careful-not-to-clear-kvm_vcpu_flush_tlb-bit.patch
+x86-kvm-introduce-kvm_-un-map_gfn.patch
+x86-kvm-make-sure-kvm_vcpu_flush_tlb-flag-is-not-missed.patch
+x86-kvm-cache-gfn-to-pfn-translation.patch
+x86-kvm-clean-up-host-s-steal-time-structure.patch
+kvm-vmx-add-non-canonical-check-on-writes-to-rtit-address-msrs.patch
+kvm-x86-don-t-let-userspace-set-host-reserved-cr4-bits.patch
+kvm-x86-free-wbinvd_dirty_mask-if-vcpu-creation-fails.patch
+kvm-x86-handle-tif_need_fpu_load-in-kvm_-load-put-_guest_fpu.patch
+kvm-x86-ensure-guest-s-fpu-state-is-loaded-when-accessing-for-emulation.patch
+kvm-x86-revert-kvm-x86-fix-fpu-state-crash-in-kvm-guest.patch
+kvm-s390-do-not-clobber-registers-during-guest-reset-store-status.patch
+ocfs2-fix-oops-when-writing-cloned-file.patch
+mm-page_alloc.c-fix-uninitialized-memmaps-on-a-partially-populated-last-section.patch
--- /dev/null
+From 8c6de56a42e0c657955e12b882a81ef07d1d073e Mon Sep 17 00:00:00 2001
+From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Date: Wed, 30 Oct 2019 19:01:31 +0000
+Subject: x86/kvm: Be careful not to clear KVM_VCPU_FLUSH_TLB bit
+
+From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+
+commit 8c6de56a42e0c657955e12b882a81ef07d1d073e upstream.
+
+kvm_steal_time_set_preempted() may accidentally clear KVM_VCPU_FLUSH_TLB
+bit if it is called more than once while VCPU is preempted.
+
+This is part of CVE-2019-3016.
+
+(This bug was also independently discovered by Jim Mattson
+<jmattson@google.com>)
+
+Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Reviewed-by: Joao Martins <joao.m.martins@oracle.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/x86.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -3499,6 +3499,9 @@ static void kvm_steal_time_set_preempted
+ if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+ return;
+
++ if (vcpu->arch.st.steal.preempted)
++ return;
++
+ vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED;
+
+ kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
--- /dev/null
+From 917248144db5d7320655dbb41d3af0b8a0f3d589 Mon Sep 17 00:00:00 2001
+From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Date: Thu, 5 Dec 2019 01:30:51 +0000
+Subject: x86/kvm: Cache gfn to pfn translation
+
+From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+
+commit 917248144db5d7320655dbb41d3af0b8a0f3d589 upstream.
+
+__kvm_map_gfn()'s call to gfn_to_pfn_memslot() is
+* relatively expensive
+* in certain cases (such as when done from atomic context) cannot be called
+
+Stashing gfn-to-pfn mapping should help with both cases.
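+
+A condensed sketch of the cache logic added to __kvm_map_gfn() below
+(names follow the new gfn_to_pfn_cache): reuse the stashed pfn while
+the gfn and the memslot generation still match, and fall back to a
+fresh gfn_to_pfn_memslot() lookup only outside atomic context.
+
+	if (!cache->pfn || cache->gfn != gfn || cache->generation != gen) {
+		if (atomic)
+			return -EAGAIN;	/* refill needs a sleepable lookup */
+		kvm_cache_gfn_to_pfn(slot, gfn, cache, gen);	/* refill */
+	}
+	pfn = cache->pfn;	/* fast path: no memslot walk */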
+
+This is part of CVE-2019-3016.
+
+Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Reviewed-by: Joao Martins <joao.m.martins@oracle.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/kvm_host.h | 1
+ arch/x86/kvm/x86.c | 10 ++++
+ include/linux/kvm_host.h | 7 ++
+ include/linux/kvm_types.h | 9 +++
+ virt/kvm/kvm_main.c | 98 ++++++++++++++++++++++++++++++++--------
+ 5 files changed, 103 insertions(+), 22 deletions(-)
+
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -671,6 +671,7 @@ struct kvm_vcpu_arch {
+ u64 last_steal;
+ struct gfn_to_hva_cache stime;
+ struct kvm_steal_time steal;
++ struct gfn_to_pfn_cache cache;
+ } st;
+
+ u64 tsc_offset;
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -9081,6 +9081,9 @@ static void fx_init(struct kvm_vcpu *vcp
+ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
+ {
+ void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask;
++ struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
++
++ kvm_release_pfn(cache->pfn, cache->dirty, cache);
+
+ kvmclock_reset(vcpu);
+
+@@ -9745,11 +9748,18 @@ out_free:
+
+ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
+ {
++ struct kvm_vcpu *vcpu;
++ int i;
++
+ /*
+ * memslots->generation has been incremented.
+ * mmio generation may have reached its maximum value.
+ */
+ kvm_mmu_invalidate_mmio_sptes(kvm, gen);
++
++ /* Force re-initialization of steal_time cache */
++ kvm_for_each_vcpu(i, vcpu, kvm)
++ kvm_vcpu_kick(vcpu);
+ }
+
+ int kvm_arch_prepare_memory_region(struct kvm *kvm,
+--- a/include/linux/kvm_host.h
++++ b/include/linux/kvm_host.h
+@@ -728,6 +728,7 @@ void kvm_set_pfn_dirty(kvm_pfn_t pfn);
+ void kvm_set_pfn_accessed(kvm_pfn_t pfn);
+ void kvm_get_pfn(kvm_pfn_t pfn);
+
++void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache);
+ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
+ int len);
+ int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
+@@ -758,10 +759,12 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_
+ kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn);
+ kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+ int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map);
+-int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map);
++int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
++ struct gfn_to_pfn_cache *cache, bool atomic);
+ struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn);
+ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty);
+-int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty);
++int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map,
++ struct gfn_to_pfn_cache *cache, bool dirty, bool atomic);
+ unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn);
+ unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable);
+ int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset,
+--- a/include/linux/kvm_types.h
++++ b/include/linux/kvm_types.h
+@@ -18,7 +18,7 @@ struct kvm_memslots;
+
+ enum kvm_mr_change;
+
+-#include <asm/types.h>
++#include <linux/types.h>
+
+ /*
+ * Address types:
+@@ -49,4 +49,11 @@ struct gfn_to_hva_cache {
+ struct kvm_memory_slot *memslot;
+ };
+
++struct gfn_to_pfn_cache {
++ u64 generation;
++ gfn_t gfn;
++ kvm_pfn_t pfn;
++ bool dirty;
++};
++
+ #endif /* __KVM_TYPES_H__ */
+--- a/virt/kvm/kvm_main.c
++++ b/virt/kvm/kvm_main.c
+@@ -1809,27 +1809,72 @@ struct page *gfn_to_page(struct kvm *kvm
+ }
+ EXPORT_SYMBOL_GPL(gfn_to_page);
+
++void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache)
++{
++ if (pfn == 0)
++ return;
++
++ if (cache)
++ cache->pfn = cache->gfn = 0;
++
++ if (dirty)
++ kvm_release_pfn_dirty(pfn);
++ else
++ kvm_release_pfn_clean(pfn);
++}
++
++static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn,
++ struct gfn_to_pfn_cache *cache, u64 gen)
++{
++ kvm_release_pfn(cache->pfn, cache->dirty, cache);
++
++ cache->pfn = gfn_to_pfn_memslot(slot, gfn);
++ cache->gfn = gfn;
++ cache->dirty = false;
++ cache->generation = gen;
++}
++
+ static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn,
+- struct kvm_host_map *map)
++ struct kvm_host_map *map,
++ struct gfn_to_pfn_cache *cache,
++ bool atomic)
+ {
+ kvm_pfn_t pfn;
+ void *hva = NULL;
+ struct page *page = KVM_UNMAPPED_PAGE;
+ struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn);
++ u64 gen = slots->generation;
+
+ if (!map)
+ return -EINVAL;
+
+- pfn = gfn_to_pfn_memslot(slot, gfn);
++ if (cache) {
++ if (!cache->pfn || cache->gfn != gfn ||
++ cache->generation != gen) {
++ if (atomic)
++ return -EAGAIN;
++ kvm_cache_gfn_to_pfn(slot, gfn, cache, gen);
++ }
++ pfn = cache->pfn;
++ } else {
++ if (atomic)
++ return -EAGAIN;
++ pfn = gfn_to_pfn_memslot(slot, gfn);
++ }
+ if (is_error_noslot_pfn(pfn))
+ return -EINVAL;
+
+ if (pfn_valid(pfn)) {
+ page = pfn_to_page(pfn);
+- hva = kmap(page);
++ if (atomic)
++ hva = kmap_atomic(page);
++ else
++ hva = kmap(page);
+ #ifdef CONFIG_HAS_IOMEM
+- } else {
++ } else if (!atomic) {
+ hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
++ } else {
++ return -EINVAL;
+ #endif
+ }
+
+@@ -1844,20 +1889,25 @@ static int __kvm_map_gfn(struct kvm_mems
+ return 0;
+ }
+
+-int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
++int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
++ struct gfn_to_pfn_cache *cache, bool atomic)
+ {
+- return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map);
++ return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map,
++ cache, atomic);
+ }
+ EXPORT_SYMBOL_GPL(kvm_map_gfn);
+
+ int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
+ {
+- return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map);
++ return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map,
++ NULL, false);
+ }
+ EXPORT_SYMBOL_GPL(kvm_vcpu_map);
+
+ static void __kvm_unmap_gfn(struct kvm_memory_slot *memslot,
+- struct kvm_host_map *map, bool dirty)
++ struct kvm_host_map *map,
++ struct gfn_to_pfn_cache *cache,
++ bool dirty, bool atomic)
+ {
+ if (!map)
+ return;
+@@ -1865,34 +1915,44 @@ static void __kvm_unmap_gfn(struct kvm_m
+ if (!map->hva)
+ return;
+
+- if (map->page != KVM_UNMAPPED_PAGE)
+- kunmap(map->page);
++ if (map->page != KVM_UNMAPPED_PAGE) {
++ if (atomic)
++ kunmap_atomic(map->hva);
++ else
++ kunmap(map->page);
++ }
+ #ifdef CONFIG_HAS_IOMEM
+- else
++ else if (!atomic)
+ memunmap(map->hva);
++ else
++ WARN_ONCE(1, "Unexpected unmapping in atomic context");
+ #endif
+
+- if (dirty) {
++ if (dirty)
+ mark_page_dirty_in_slot(memslot, map->gfn);
+- kvm_release_pfn_dirty(map->pfn);
+- } else {
+- kvm_release_pfn_clean(map->pfn);
+- }
++
++ if (cache)
++ cache->dirty |= dirty;
++ else
++ kvm_release_pfn(map->pfn, dirty, NULL);
+
+ map->hva = NULL;
+ map->page = NULL;
+ }
+
+-int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
++int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map,
++ struct gfn_to_pfn_cache *cache, bool dirty, bool atomic)
+ {
+- __kvm_unmap_gfn(gfn_to_memslot(vcpu->kvm, map->gfn), map, dirty);
++ __kvm_unmap_gfn(gfn_to_memslot(vcpu->kvm, map->gfn), map,
++ cache, dirty, atomic);
+ return 0;
+ }
+ EXPORT_SYMBOL_GPL(kvm_unmap_gfn);
+
+ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
+ {
+- __kvm_unmap_gfn(kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), map, dirty);
++ __kvm_unmap_gfn(kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), map, NULL,
++ dirty, false);
+ }
+ EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
+
--- /dev/null
+From a6bd811f1209fe1c64c9f6fd578101d6436c6b6e Mon Sep 17 00:00:00 2001
+From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Date: Fri, 6 Dec 2019 15:36:12 +0000
+Subject: x86/KVM: Clean up host's steal time structure
+
+From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+
+commit a6bd811f1209fe1c64c9f6fd578101d6436c6b6e upstream.
+
+Now that we are mapping kvm_steal_time from the guest directly we
+don't need to keep a copy of it in kvm_vcpu_arch.st. The same is true
+for the stime field.
+
+This is part of CVE-2019-3016.
+
+Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Reviewed-by: Joao Martins <joao.m.martins@oracle.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/kvm_host.h | 3 +--
+ arch/x86/kvm/x86.c | 11 +++--------
+ 2 files changed, 4 insertions(+), 10 deletions(-)
+
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -667,10 +667,9 @@ struct kvm_vcpu_arch {
+ bool pvclock_set_guest_stopped_request;
+
+ struct {
++ u8 preempted;
+ u64 msr_val;
+ u64 last_steal;
+- struct gfn_to_hva_cache stime;
+- struct kvm_steal_time steal;
+ struct gfn_to_pfn_cache cache;
+ } st;
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -2616,7 +2616,7 @@ static void record_steal_time(struct kvm
+ if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
+ kvm_vcpu_flush_tlb(vcpu, false);
+
+- vcpu->arch.st.steal.preempted = 0;
++ vcpu->arch.st.preempted = 0;
+
+ if (st->version & 1)
+ st->version += 1; /* first time write, random junk */
+@@ -2786,11 +2786,6 @@ int kvm_set_msr_common(struct kvm_vcpu *
+ if (data & KVM_STEAL_RESERVED_MASK)
+ return 1;
+
+- if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
+- data & KVM_STEAL_VALID_BITS,
+- sizeof(struct kvm_steal_time)))
+- return 1;
+-
+ vcpu->arch.st.msr_val = data;
+
+ if (!(data & KVM_MSR_ENABLED))
+@@ -3504,7 +3499,7 @@ static void kvm_steal_time_set_preempted
+ if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+ return;
+
+- if (vcpu->arch.st.steal.preempted)
++ if (vcpu->arch.st.preempted)
+ return;
+
+ if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
+@@ -3514,7 +3509,7 @@ static void kvm_steal_time_set_preempted
+ st = map.hva +
+ offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+
+- st->preempted = vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED;
++ st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+
+ kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
+ }
--- /dev/null
+From 1eff70a9abd46f175defafd29bc17ad456f398a7 Mon Sep 17 00:00:00 2001
+From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Date: Tue, 12 Nov 2019 16:35:06 +0000
+Subject: x86/kvm: Introduce kvm_(un)map_gfn()
+
+From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+
+commit 1eff70a9abd46f175defafd29bc17ad456f398a7 upstream.
+
+kvm_vcpu_(un)map operates on gfns from any current address space.
+In certain cases we want to make sure we are not mapping SMRAM
+and for that we can use kvm_(un)map_gfn() that we are introducing
+in this patch.
+
+This is part of CVE-2019-3016.
+
+Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Reviewed-by: Joao Martins <joao.m.martins@oracle.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/kvm_host.h | 2 ++
+ virt/kvm/kvm_main.c | 29 ++++++++++++++++++++++++-----
+ 2 files changed, 26 insertions(+), 5 deletions(-)
+
+--- a/include/linux/kvm_host.h
++++ b/include/linux/kvm_host.h
+@@ -758,8 +758,10 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_
+ kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn);
+ kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+ int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map);
++int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map);
+ struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn);
+ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty);
++int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty);
+ unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn);
+ unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable);
+ int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset,
+--- a/virt/kvm/kvm_main.c
++++ b/virt/kvm/kvm_main.c
+@@ -1809,12 +1809,13 @@ struct page *gfn_to_page(struct kvm *kvm
+ }
+ EXPORT_SYMBOL_GPL(gfn_to_page);
+
+-static int __kvm_map_gfn(struct kvm_memory_slot *slot, gfn_t gfn,
++static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn,
+ struct kvm_host_map *map)
+ {
+ kvm_pfn_t pfn;
+ void *hva = NULL;
+ struct page *page = KVM_UNMAPPED_PAGE;
++ struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn);
+
+ if (!map)
+ return -EINVAL;
+@@ -1843,14 +1844,20 @@ static int __kvm_map_gfn(struct kvm_memo
+ return 0;
+ }
+
++int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
++{
++ return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map);
++}
++EXPORT_SYMBOL_GPL(kvm_map_gfn);
++
+ int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
+ {
+- return __kvm_map_gfn(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, map);
++ return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map);
+ }
+ EXPORT_SYMBOL_GPL(kvm_vcpu_map);
+
+-void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map,
+- bool dirty)
++static void __kvm_unmap_gfn(struct kvm_memory_slot *memslot,
++ struct kvm_host_map *map, bool dirty)
+ {
+ if (!map)
+ return;
+@@ -1866,7 +1873,7 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcp
+ #endif
+
+ if (dirty) {
+- kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
++ mark_page_dirty_in_slot(memslot, map->gfn);
+ kvm_release_pfn_dirty(map->pfn);
+ } else {
+ kvm_release_pfn_clean(map->pfn);
+@@ -1875,6 +1882,18 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcp
+ map->hva = NULL;
+ map->page = NULL;
+ }
++
++int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
++{
++ __kvm_unmap_gfn(gfn_to_memslot(vcpu->kvm, map->gfn), map, dirty);
++ return 0;
++}
++EXPORT_SYMBOL_GPL(kvm_unmap_gfn);
++
++void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
++{
++ __kvm_unmap_gfn(kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), map, dirty);
++}
+ EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
+
+ struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
--- /dev/null
+From b043138246a41064527cf019a3d51d9f015e9796 Mon Sep 17 00:00:00 2001
+From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Date: Thu, 5 Dec 2019 03:45:32 +0000
+Subject: x86/KVM: Make sure KVM_VCPU_FLUSH_TLB flag is not missed
+
+From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+
+commit b043138246a41064527cf019a3d51d9f015e9796 upstream.
+
+There is a potential race in record_steal_time() between setting
+host-local vcpu->arch.st.steal.preempted to zero (i.e. clearing
+KVM_VCPU_PREEMPTED) and propagating this value to the guest with
+kvm_write_guest_cached(). Between those two events the guest may
+still see KVM_VCPU_PREEMPTED in its copy of kvm_steal_time, set
+KVM_VCPU_FLUSH_TLB and assume that the hypervisor will do the right
+thing. Which it won't.
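+
+Schematically, the lost update looks like this (illustration only):
+
+	host: record_steal_time()        guest
+	-------------------------        -----
+	local steal.preempted = 0
+	                                 still sees KVM_VCPU_PREEMPTED in
+	                                 the shared structure, sets
+	                                 KVM_VCPU_FLUSH_TLB and skips its
+	                                 own local TLB flush
+	kvm_write_guest_cached() copies
+	the local 0 over the shared field
+	                                 -> the flush request is lost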
+
+Instead of copying, we should map kvm_steal_time and that will
+guarantee atomicity of accesses to @preempted.
+
+This is part of CVE-2019-3016.
+
+Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Reviewed-by: Joao Martins <joao.m.martins@oracle.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/x86.c | 51 ++++++++++++++++++++++++++++++---------------------
+ 1 file changed, 30 insertions(+), 21 deletions(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -2593,45 +2593,47 @@ static void kvm_vcpu_flush_tlb(struct kv
+
+ static void record_steal_time(struct kvm_vcpu *vcpu)
+ {
++ struct kvm_host_map map;
++ struct kvm_steal_time *st;
++
+ if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+ return;
+
+- if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+- &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
++ /* -EAGAIN is returned in atomic context so we can just return. */
++ if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
++ &map, &vcpu->arch.st.cache, false))
+ return;
+
++ st = map.hva +
++ offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
++
+ /*
+ * Doing a TLB flush here, on the guest's behalf, can avoid
+ * expensive IPIs.
+ */
+ trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
+- vcpu->arch.st.steal.preempted & KVM_VCPU_FLUSH_TLB);
+- if (xchg(&vcpu->arch.st.steal.preempted, 0) & KVM_VCPU_FLUSH_TLB)
++ st->preempted & KVM_VCPU_FLUSH_TLB);
++ if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
+ kvm_vcpu_flush_tlb(vcpu, false);
+
+- if (vcpu->arch.st.steal.version & 1)
+- vcpu->arch.st.steal.version += 1; /* first time write, random junk */
++ vcpu->arch.st.steal.preempted = 0;
+
+- vcpu->arch.st.steal.version += 1;
++ if (st->version & 1)
++ st->version += 1; /* first time write, random junk */
+
+- kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+- &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
++ st->version += 1;
+
+ smp_wmb();
+
+- vcpu->arch.st.steal.steal += current->sched_info.run_delay -
++ st->steal += current->sched_info.run_delay -
+ vcpu->arch.st.last_steal;
+ vcpu->arch.st.last_steal = current->sched_info.run_delay;
+
+- kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+- &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
+-
+ smp_wmb();
+
+- vcpu->arch.st.steal.version += 1;
++ st->version += 1;
+
+- kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+- &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
++ kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
+ }
+
+ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+@@ -3496,18 +3498,25 @@ void kvm_arch_vcpu_load(struct kvm_vcpu
+
+ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
+ {
++ struct kvm_host_map map;
++ struct kvm_steal_time *st;
++
+ if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+ return;
+
+ if (vcpu->arch.st.steal.preempted)
+ return;
+
+- vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED;
++ if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
++ &vcpu->arch.st.cache, true))
++ return;
++
++ st = map.hva +
++ offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
++
++ st->preempted = vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED;
+
+- kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
+- &vcpu->arch.st.steal.preempted,
+- offsetof(struct kvm_steal_time, preempted),
+- sizeof(vcpu->arch.st.steal.preempted));
++ kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
+ }
+
+ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)