git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
5.5-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sun, 9 Feb 2020 12:43:37 +0000 (13:43 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sun, 9 Feb 2020 12:43:37 +0000 (13:43 +0100)
added patches:
kvm-s390-do-not-clobber-registers-during-guest-reset-store-status.patch
kvm-vmx-add-non-canonical-check-on-writes-to-rtit-address-msrs.patch
kvm-x86-don-t-let-userspace-set-host-reserved-cr4-bits.patch
kvm-x86-ensure-guest-s-fpu-state-is-loaded-when-accessing-for-emulation.patch
kvm-x86-fix-overlap-between-spte_mmio_mask-and-generation.patch
kvm-x86-free-wbinvd_dirty_mask-if-vcpu-creation-fails.patch
kvm-x86-handle-tif_need_fpu_load-in-kvm_-load-put-_guest_fpu.patch
kvm-x86-reorganize-pvclock_gtod_data-members.patch
kvm-x86-revert-kvm-x86-fix-fpu-state-crash-in-kvm-guest.patch
kvm-x86-use-cpuid-to-locate-host-page-table-reserved-bits.patch
kvm-x86-use-gpa_t-for-cr2-gpa-to-fix-tdp-support-on-32-bit-kvm.patch
kvm-x86-use-raw-clock-values-consistently.patch
mm-page_alloc.c-fix-uninitialized-memmaps-on-a-partially-populated-last-section.patch
ocfs2-fix-oops-when-writing-cloned-file.patch
x86-kvm-be-careful-not-to-clear-kvm_vcpu_flush_tlb-bit.patch
x86-kvm-cache-gfn-to-pfn-translation.patch
x86-kvm-clean-up-host-s-steal-time-structure.patch
x86-kvm-introduce-kvm_-un-map_gfn.patch
x86-kvm-make-sure-kvm_vcpu_flush_tlb-flag-is-not-missed.patch

20 files changed:
queue-5.5/kvm-s390-do-not-clobber-registers-during-guest-reset-store-status.patch [new file with mode: 0644]
queue-5.5/kvm-vmx-add-non-canonical-check-on-writes-to-rtit-address-msrs.patch [new file with mode: 0644]
queue-5.5/kvm-x86-don-t-let-userspace-set-host-reserved-cr4-bits.patch [new file with mode: 0644]
queue-5.5/kvm-x86-ensure-guest-s-fpu-state-is-loaded-when-accessing-for-emulation.patch [new file with mode: 0644]
queue-5.5/kvm-x86-fix-overlap-between-spte_mmio_mask-and-generation.patch [new file with mode: 0644]
queue-5.5/kvm-x86-free-wbinvd_dirty_mask-if-vcpu-creation-fails.patch [new file with mode: 0644]
queue-5.5/kvm-x86-handle-tif_need_fpu_load-in-kvm_-load-put-_guest_fpu.patch [new file with mode: 0644]
queue-5.5/kvm-x86-reorganize-pvclock_gtod_data-members.patch [new file with mode: 0644]
queue-5.5/kvm-x86-revert-kvm-x86-fix-fpu-state-crash-in-kvm-guest.patch [new file with mode: 0644]
queue-5.5/kvm-x86-use-cpuid-to-locate-host-page-table-reserved-bits.patch [new file with mode: 0644]
queue-5.5/kvm-x86-use-gpa_t-for-cr2-gpa-to-fix-tdp-support-on-32-bit-kvm.patch [new file with mode: 0644]
queue-5.5/kvm-x86-use-raw-clock-values-consistently.patch [new file with mode: 0644]
queue-5.5/mm-page_alloc.c-fix-uninitialized-memmaps-on-a-partially-populated-last-section.patch [new file with mode: 0644]
queue-5.5/ocfs2-fix-oops-when-writing-cloned-file.patch [new file with mode: 0644]
queue-5.5/series
queue-5.5/x86-kvm-be-careful-not-to-clear-kvm_vcpu_flush_tlb-bit.patch [new file with mode: 0644]
queue-5.5/x86-kvm-cache-gfn-to-pfn-translation.patch [new file with mode: 0644]
queue-5.5/x86-kvm-clean-up-host-s-steal-time-structure.patch [new file with mode: 0644]
queue-5.5/x86-kvm-introduce-kvm_-un-map_gfn.patch [new file with mode: 0644]
queue-5.5/x86-kvm-make-sure-kvm_vcpu_flush_tlb-flag-is-not-missed.patch [new file with mode: 0644]

diff --git a/queue-5.5/kvm-s390-do-not-clobber-registers-during-guest-reset-store-status.patch b/queue-5.5/kvm-s390-do-not-clobber-registers-during-guest-reset-store-status.patch
new file mode 100644 (file)
index 0000000..c7d7d00
--- /dev/null
@@ -0,0 +1,51 @@
+From 55680890ea78be0df5e1384989f1be835043c084 Mon Sep 17 00:00:00 2001
+From: Christian Borntraeger <borntraeger@de.ibm.com>
+Date: Fri, 31 Jan 2020 05:02:00 -0500
+Subject: KVM: s390: do not clobber registers during guest reset/store status
+
+From: Christian Borntraeger <borntraeger@de.ibm.com>
+
+commit 55680890ea78be0df5e1384989f1be835043c084 upstream.
+
+The initial CPU reset clobbers the userspace fpc and the store status
+ioctl clobbers the guest acrs + fpr.  As these calls are only done via
+ioctl (and not via vcpu_run), no CPU context is loaded, so we can (and
+must) act directly on the sync regs, not on the thread context.
+
+Cc: stable@kernel.org
+Fixes: e1788bb995be ("KVM: s390: handle floating point registers in the run ioctl not in vcpu_put/load")
+Fixes: 31d8b8d41a7e ("KVM: s390: handle access registers in the run ioctl not in vcpu_put/load")
+Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
+Reviewed-by: David Hildenbrand <david@redhat.com>
+Reviewed-by: Cornelia Huck <cohuck@redhat.com>
+Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
+Link: https://lore.kernel.org/r/20200131100205.74720-2-frankja@linux.ibm.com
+Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/s390/kvm/kvm-s390.c |    6 ++----
+ 1 file changed, 2 insertions(+), 4 deletions(-)
+
+--- a/arch/s390/kvm/kvm-s390.c
++++ b/arch/s390/kvm/kvm-s390.c
+@@ -2860,9 +2860,7 @@ static void kvm_s390_vcpu_initial_reset(
+       vcpu->arch.sie_block->gcr[14] = CR14_UNUSED_32 |
+                                       CR14_UNUSED_33 |
+                                       CR14_EXTERNAL_DAMAGE_SUBMASK;
+-      /* make sure the new fpc will be lazily loaded */
+-      save_fpu_regs();
+-      current->thread.fpu.fpc = 0;
++      vcpu->run->s.regs.fpc = 0;
+       vcpu->arch.sie_block->gbea = 1;
+       vcpu->arch.sie_block->pp = 0;
+       vcpu->arch.sie_block->fpf &= ~FPF_BPBC;
+@@ -4351,7 +4349,7 @@ long kvm_arch_vcpu_ioctl(struct file *fi
+       switch (ioctl) {
+       case KVM_S390_STORE_STATUS:
+               idx = srcu_read_lock(&vcpu->kvm->srcu);
+-              r = kvm_s390_vcpu_store_status(vcpu, arg);
++              r = kvm_s390_store_status_unloaded(vcpu, arg);
+               srcu_read_unlock(&vcpu->kvm->srcu, idx);
+               break;
+       case KVM_S390_SET_INITIAL_PSW: {
diff --git a/queue-5.5/kvm-vmx-add-non-canonical-check-on-writes-to-rtit-address-msrs.patch b/queue-5.5/kvm-vmx-add-non-canonical-check-on-writes-to-rtit-address-msrs.patch
new file mode 100644 (file)
index 0000000..fe36fbd
--- /dev/null
@@ -0,0 +1,34 @@
+From fe6ed369fca98e99df55c932b85782a5687526b5 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+Date: Tue, 10 Dec 2019 15:24:32 -0800
+Subject: KVM: VMX: Add non-canonical check on writes to RTIT address MSRs
+
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+
+commit fe6ed369fca98e99df55c932b85782a5687526b5 upstream.
+
+Reject writes to RTIT address MSRs if the data being written is a
+non-canonical address as the MSRs are subject to canonical checks, e.g.
+KVM will trigger an unchecked #GP when loading the values to hardware
+during pt_guest_enter().
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/vmx/vmx.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -2146,6 +2146,8 @@ static int vmx_set_msr(struct kvm_vcpu *
+                       (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
+                                       PT_CAP_num_address_ranges)))
+                       return 1;
++              if (is_noncanonical_address(data, vcpu))
++                      return 1;
+               if (index % 2)
+                       vmx->pt_desc.guest.addr_b[index / 2] = data;
+               else
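
For reference, the canonical-address rule the new check enforces: on x86-64 an address is canonical when bits 63:N are a sign extension of bit N-1 (N = 48 with 4-level paging, 57 with LA57). Below is a minimal standalone sketch of that rule; the helper name and the hard-coded 48-bit width are illustrative assumptions, whereas KVM's is_noncanonical_address() derives the width from the vCPU's CPUID.

  /* Standalone illustration of the canonicality rule, not KVM code. */
  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  static bool is_canonical(uint64_t addr, unsigned int vaddr_bits)
  {
          unsigned int shift = 64 - vaddr_bits;

          /* Canonical <=> sign-extending bit (vaddr_bits - 1) reproduces the address. */
          return (uint64_t)((int64_t)(addr << shift) >> shift) == addr;
  }

  int main(void)
  {
          printf("%d\n", is_canonical(0x00007fffffffffffULL, 48)); /* 1 */
          printf("%d\n", is_canonical(0xffff800000000000ULL, 48)); /* 1 */
          /* Non-canonical for 48 bits: the kind of value the patch now rejects. */
          printf("%d\n", is_canonical(0x0000800000000000ULL, 48)); /* 0 */
          return 0;
  }
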
diff --git a/queue-5.5/kvm-x86-don-t-let-userspace-set-host-reserved-cr4-bits.patch b/queue-5.5/kvm-x86-don-t-let-userspace-set-host-reserved-cr4-bits.patch
new file mode 100644 (file)
index 0000000..8451ad1
--- /dev/null
@@ -0,0 +1,122 @@
+From b11306b53b2540c6ba068c4deddb6a17d9f8d95b Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+Date: Tue, 10 Dec 2019 14:44:13 -0800
+Subject: KVM: x86: Don't let userspace set host-reserved cr4 bits
+
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+
+commit b11306b53b2540c6ba068c4deddb6a17d9f8d95b upstream.
+
+Calculate the host-reserved cr4 bits at runtime based on the system's
+capabilities (using logic similar to __do_cpuid_func()), and use the
+dynamically generated mask for the reserved bit check in kvm_set_cr4()
+instead of using the static CR4_RESERVED_BITS define.  This prevents
+userspace from "enabling" features in cr4 that are not supported by the
+system, e.g. by ignoring KVM_GET_SUPPORTED_CPUID and specifying a bogus
+CPUID for the vCPU.
+
+Allowing userspace to set unsupported bits in cr4 can lead to a variety
+of undesirable behavior, e.g. failed VM-Enter, and in general increases
+KVM's attack surface.  A crafty userspace can even abuse CR4.LA57 to
+induce an unchecked #GP on a WRMSR.
+
+On a platform without LA57 support:
+
+  KVM_SET_CPUID2 // CPUID_7_0_ECX.LA57 = 1
+  KVM_SET_SREGS  // CR4.LA57 = 1
+  KVM_SET_MSRS   // KERNEL_GS_BASE = 0x0004000000000000
+  KVM_RUN
+
+leads to a #GP when writing KERNEL_GS_BASE into hardware:
+
+  unchecked MSR access error: WRMSR to 0xc0000102 (tried to write 0x0004000000000000)
+  at rIP: 0xffffffffa00f239a (vmx_prepare_switch_to_guest+0x10a/0x1d0 [kvm_intel])
+  Call Trace:
+   kvm_arch_vcpu_ioctl_run+0x671/0x1c70 [kvm]
+   kvm_vcpu_ioctl+0x36b/0x5d0 [kvm]
+   do_vfs_ioctl+0xa1/0x620
+   ksys_ioctl+0x66/0x70
+   __x64_sys_ioctl+0x16/0x20
+   do_syscall_64+0x4c/0x170
+   entry_SYSCALL_64_after_hwframe+0x44/0xa9
+  RIP: 0033:0x7fc08133bf47
+
+Note, the above sequence fails VM-Enter due to invalid guest state.
+Userspace can allow VM-Enter to succeed (after the WRMSR #GP) by adding
+a KVM_SET_SREGS w/ CR4.LA57=0 after KVM_SET_MSRS, in which case KVM will
+technically leak the host's KERNEL_GS_BASE into the guest.  But, as
+KERNEL_GS_BASE is a userspace-defined value/address, the leak is largely
+benign as a malicious userspace would simply be exposing its own data to
+the guest, and attacking a benevolent userspace would require multiple
+bugs in the userspace VMM.
+
+Cc: stable@vger.kernel.org
+Cc: Jun Nakajima <jun.nakajima@intel.com>
+Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/x86.c |   35 ++++++++++++++++++++++++++++++++++-
+ 1 file changed, 34 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -93,6 +93,8 @@ u64 __read_mostly efer_reserved_bits = ~
+ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
+ #endif
++static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
++
+ #define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__
+ #define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__
+@@ -879,9 +881,38 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u
+ }
+ EXPORT_SYMBOL_GPL(kvm_set_xcr);
++static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c)
++{
++      u64 reserved_bits = CR4_RESERVED_BITS;
++
++      if (!cpu_has(c, X86_FEATURE_XSAVE))
++              reserved_bits |= X86_CR4_OSXSAVE;
++
++      if (!cpu_has(c, X86_FEATURE_SMEP))
++              reserved_bits |= X86_CR4_SMEP;
++
++      if (!cpu_has(c, X86_FEATURE_SMAP))
++              reserved_bits |= X86_CR4_SMAP;
++
++      if (!cpu_has(c, X86_FEATURE_FSGSBASE))
++              reserved_bits |= X86_CR4_FSGSBASE;
++
++      if (!cpu_has(c, X86_FEATURE_PKU))
++              reserved_bits |= X86_CR4_PKE;
++
++      if (!cpu_has(c, X86_FEATURE_LA57) &&
++          !(cpuid_ecx(0x7) & bit(X86_FEATURE_LA57)))
++              reserved_bits |= X86_CR4_LA57;
++
++      if (!cpu_has(c, X86_FEATURE_UMIP) && !kvm_x86_ops->umip_emulated())
++              reserved_bits |= X86_CR4_UMIP;
++
++      return reserved_bits;
++}
++
+ static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+ {
+-      if (cr4 & CR4_RESERVED_BITS)
++      if (cr4 & cr4_reserved_bits)
+               return -EINVAL;
+       if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
+@@ -9369,6 +9400,8 @@ int kvm_arch_hardware_setup(void)
+       if (r != 0)
+               return r;
++      cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data);
++
+       if (kvm_has_tsc_control) {
+               /*
+                * Make sure the user can only configure tsc_khz values that
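
The idea of the patch in isolation: build the reserved-bit mask from what the host actually supports, then reject any CR4 value that touches a reserved bit. A stripped-down userspace sketch follows; the two architectural bit positions are real (CR4.LA57 is bit 12, CR4.PKE is bit 22), everything else is simplified for illustration.

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  #define X86_CR4_LA57 (1ULL << 12)   /* 57-bit linear addresses */
  #define X86_CR4_PKE  (1ULL << 22)   /* protection keys */

  /* Start from the static reserved mask and add every feature bit the
   * host lacks, mirroring kvm_host_cr4_reserved_bits() in spirit. */
  static uint64_t host_cr4_reserved_bits(uint64_t static_reserved,
                                         bool has_la57, bool has_pku)
  {
          uint64_t reserved = static_reserved;

          if (!has_la57)
                  reserved |= X86_CR4_LA57;
          if (!has_pku)
                  reserved |= X86_CR4_PKE;
          return reserved;
  }

  int main(void)
  {
          /* Host without LA57: CR4.LA57 becomes a reserved bit. */
          uint64_t reserved = host_cr4_reserved_bits(0, false, true);

          /* Same shape as the kvm_valid_cr4() check after the patch. */
          printf("CR4.LA57 request: %s\n",
                 (X86_CR4_LA57 & reserved) ? "rejected (-EINVAL)" : "accepted");
          return 0;
  }
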
diff --git a/queue-5.5/kvm-x86-ensure-guest-s-fpu-state-is-loaded-when-accessing-for-emulation.patch b/queue-5.5/kvm-x86-ensure-guest-s-fpu-state-is-loaded-when-accessing-for-emulation.patch
new file mode 100644 (file)
index 0000000..a5db339
--- /dev/null
@@ -0,0 +1,184 @@
+From a7baead7e312f5a05381d68585fb6dc68e19e90f Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+Date: Fri, 17 Jan 2020 11:30:50 -0800
+Subject: KVM: x86: Ensure guest's FPU state is loaded when accessing for emulation
+
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+
+commit a7baead7e312f5a05381d68585fb6dc68e19e90f upstream.
+
+Lock the FPU regs and reload the current thread's FPU state, which holds
+the guest's FPU state, to the CPU registers if necessary prior to
+accessing guest FPU state as part of emulation.  kernel_fpu_begin() can
+be called from softirq context, therefore KVM must ensure softirqs are
+disabled (locking the FPU regs disables softirqs) when touching CPU FPU
+state.
+
+Note, for all intents and purposes this reverts commit 6ab0b9feb82a7
+("x86,kvm: remove KVM emulator get_fpu / put_fpu"), but at the time it
+was applied, removing get/put_fpu() was correct.  The re-introduction
+of {get,put}_fpu() is necessitated by the deferring of FPU state load.
+
+Fixes: 5f409e20b7945 ("x86/fpu: Defer FPU state load until return to userspace")
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/emulate.c |   39 +++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 39 insertions(+)
+
+--- a/arch/x86/kvm/emulate.c
++++ b/arch/x86/kvm/emulate.c
+@@ -22,6 +22,7 @@
+ #include "kvm_cache_regs.h"
+ #include <asm/kvm_emulate.h>
+ #include <linux/stringify.h>
++#include <asm/fpu/api.h>
+ #include <asm/debugreg.h>
+ #include <asm/nospec-branch.h>
+@@ -1075,8 +1076,23 @@ static void fetch_register_operand(struc
+       }
+ }
++static void emulator_get_fpu(void)
++{
++      fpregs_lock();
++
++      fpregs_assert_state_consistent();
++      if (test_thread_flag(TIF_NEED_FPU_LOAD))
++              switch_fpu_return();
++}
++
++static void emulator_put_fpu(void)
++{
++      fpregs_unlock();
++}
++
+ static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg)
+ {
++      emulator_get_fpu();
+       switch (reg) {
+       case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break;
+       case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break;
+@@ -1098,11 +1114,13 @@ static void read_sse_reg(struct x86_emul
+ #endif
+       default: BUG();
+       }
++      emulator_put_fpu();
+ }
+ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
+                         int reg)
+ {
++      emulator_get_fpu();
+       switch (reg) {
+       case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break;
+       case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break;
+@@ -1124,10 +1142,12 @@ static void write_sse_reg(struct x86_emu
+ #endif
+       default: BUG();
+       }
++      emulator_put_fpu();
+ }
+ static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
+ {
++      emulator_get_fpu();
+       switch (reg) {
+       case 0: asm("movq %%mm0, %0" : "=m"(*data)); break;
+       case 1: asm("movq %%mm1, %0" : "=m"(*data)); break;
+@@ -1139,10 +1159,12 @@ static void read_mmx_reg(struct x86_emul
+       case 7: asm("movq %%mm7, %0" : "=m"(*data)); break;
+       default: BUG();
+       }
++      emulator_put_fpu();
+ }
+ static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
+ {
++      emulator_get_fpu();
+       switch (reg) {
+       case 0: asm("movq %0, %%mm0" : : "m"(*data)); break;
+       case 1: asm("movq %0, %%mm1" : : "m"(*data)); break;
+@@ -1154,6 +1176,7 @@ static void write_mmx_reg(struct x86_emu
+       case 7: asm("movq %0, %%mm7" : : "m"(*data)); break;
+       default: BUG();
+       }
++      emulator_put_fpu();
+ }
+ static int em_fninit(struct x86_emulate_ctxt *ctxt)
+@@ -1161,7 +1184,9 @@ static int em_fninit(struct x86_emulate_
+       if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+               return emulate_nm(ctxt);
++      emulator_get_fpu();
+       asm volatile("fninit");
++      emulator_put_fpu();
+       return X86EMUL_CONTINUE;
+ }
+@@ -1172,7 +1197,9 @@ static int em_fnstcw(struct x86_emulate_
+       if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+               return emulate_nm(ctxt);
++      emulator_get_fpu();
+       asm volatile("fnstcw %0": "+m"(fcw));
++      emulator_put_fpu();
+       ctxt->dst.val = fcw;
+@@ -1186,7 +1213,9 @@ static int em_fnstsw(struct x86_emulate_
+       if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+               return emulate_nm(ctxt);
++      emulator_get_fpu();
+       asm volatile("fnstsw %0": "+m"(fsw));
++      emulator_put_fpu();
+       ctxt->dst.val = fsw;
+@@ -4092,8 +4121,12 @@ static int em_fxsave(struct x86_emulate_
+       if (rc != X86EMUL_CONTINUE)
+               return rc;
++      emulator_get_fpu();
++
+       rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state));
++      emulator_put_fpu();
++
+       if (rc != X86EMUL_CONTINUE)
+               return rc;
+@@ -4136,6 +4169,8 @@ static int em_fxrstor(struct x86_emulate
+       if (rc != X86EMUL_CONTINUE)
+               return rc;
++      emulator_get_fpu();
++
+       if (size < __fxstate_size(16)) {
+               rc = fxregs_fixup(&fx_state, size);
+               if (rc != X86EMUL_CONTINUE)
+@@ -4151,6 +4186,8 @@ static int em_fxrstor(struct x86_emulate
+               rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state));
+ out:
++      emulator_put_fpu();
++
+       return rc;
+ }
+@@ -5465,7 +5502,9 @@ static int flush_pending_x87_faults(stru
+ {
+       int rc;
++      emulator_get_fpu();
+       rc = asm_safe("fwait");
++      emulator_put_fpu();
+       if (unlikely(rc != X86EMUL_CONTINUE))
+               return emulate_exception(ctxt, MF_VECTOR, 0, false);
diff --git a/queue-5.5/kvm-x86-fix-overlap-between-spte_mmio_mask-and-generation.patch b/queue-5.5/kvm-x86-fix-overlap-between-spte_mmio_mask-and-generation.patch
new file mode 100644 (file)
index 0000000..3d3b2e8
--- /dev/null
@@ -0,0 +1,68 @@
+From 56871d444bc4d7ea66708775e62e2e0926384dbc Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Sat, 18 Jan 2020 20:09:03 +0100
+Subject: KVM: x86: fix overlap between SPTE_MMIO_MASK and generation
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit 56871d444bc4d7ea66708775e62e2e0926384dbc upstream.
+
+The SPTE_MMIO_MASK overlaps with the bits used to track MMIO
+generation number.  A high enough generation number would overwrite the
+SPTE_SPECIAL_MASK region and cause the MMIO SPTE to be misinterpreted.
+
+Likewise, setting bits 52 and 53 would also cause an incorrect generation
+number to be read from the PTE, though this was partially mitigated by the
+(useless if it weren't for the bug) removal of SPTE_SPECIAL_MASK from
+the spte in get_mmio_spte_generation.  Drop that removal, and replace
+it with a compile-time assertion.
+
+Fixes: 6eeb4ef049e7 ("KVM: x86: assign two bits to track SPTE kinds")
+Reported-by: Ben Gardon <bgardon@google.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/mmu/mmu.c |   10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+--- a/arch/x86/kvm/mmu/mmu.c
++++ b/arch/x86/kvm/mmu/mmu.c
+@@ -418,22 +418,24 @@ static inline bool is_access_track_spte(
+  * requires a full MMU zap).  The flag is instead explicitly queried when
+  * checking for MMIO spte cache hits.
+  */
+-#define MMIO_SPTE_GEN_MASK            GENMASK_ULL(18, 0)
++#define MMIO_SPTE_GEN_MASK            GENMASK_ULL(17, 0)
+ #define MMIO_SPTE_GEN_LOW_START               3
+ #define MMIO_SPTE_GEN_LOW_END         11
+ #define MMIO_SPTE_GEN_LOW_MASK                GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
+                                                   MMIO_SPTE_GEN_LOW_START)
+-#define MMIO_SPTE_GEN_HIGH_START      52
+-#define MMIO_SPTE_GEN_HIGH_END                61
++#define MMIO_SPTE_GEN_HIGH_START      PT64_SECOND_AVAIL_BITS_SHIFT
++#define MMIO_SPTE_GEN_HIGH_END                62
+ #define MMIO_SPTE_GEN_HIGH_MASK               GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
+                                                   MMIO_SPTE_GEN_HIGH_START)
++
+ static u64 generation_mmio_spte_mask(u64 gen)
+ {
+       u64 mask;
+       WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
++      BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK);
+       mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
+       mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
+@@ -444,8 +446,6 @@ static u64 get_mmio_spte_generation(u64
+ {
+       u64 gen;
+-      spte &= ~shadow_mmio_mask;
+-
+       gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START;
+       gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START;
+       return gen;
diff --git a/queue-5.5/kvm-x86-free-wbinvd_dirty_mask-if-vcpu-creation-fails.patch b/queue-5.5/kvm-x86-free-wbinvd_dirty_mask-if-vcpu-creation-fails.patch
new file mode 100644 (file)
index 0000000..848afac
--- /dev/null
@@ -0,0 +1,37 @@
+From 16be9ddea268ad841457a59109963fff8c9de38d Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+Date: Wed, 18 Dec 2019 13:54:48 -0800
+Subject: KVM: x86: Free wbinvd_dirty_mask if vCPU creation fails
+
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+
+commit 16be9ddea268ad841457a59109963fff8c9de38d upstream.
+
+Free the vCPU's wbinvd_dirty_mask if vCPU creation fails after
+kvm_arch_vcpu_init(), e.g. when installing the vCPU's file descriptor.
+Do the freeing by calling kvm_arch_vcpu_free() instead of open coding
+the freeing.  This adds a likely superfluous, but ultimately harmless,
+call to kvmclock_reset(), which only clears vcpu->arch.pv_time_enabled.
+Using kvm_arch_vcpu_free() allows for additional cleanup in the future.
+
+Fixes: f5f48ee15c2ee ("KVM: VMX: Execute WBINVD to keep data consistency with assigned devices")
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/x86.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -9208,7 +9208,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vc
+       kvm_mmu_unload(vcpu);
+       vcpu_put(vcpu);
+-      kvm_x86_ops->vcpu_free(vcpu);
++      kvm_arch_vcpu_free(vcpu);
+ }
+ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
diff --git a/queue-5.5/kvm-x86-handle-tif_need_fpu_load-in-kvm_-load-put-_guest_fpu.patch b/queue-5.5/kvm-x86-handle-tif_need_fpu_load-in-kvm_-load-put-_guest_fpu.patch
new file mode 100644 (file)
index 0000000..bcc5ab4
--- /dev/null
@@ -0,0 +1,67 @@
+From c9aef3b85f425d1f6635382ec210ee5a7ef55d7d Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+Date: Fri, 17 Jan 2020 11:30:49 -0800
+Subject: KVM: x86: Handle TIF_NEED_FPU_LOAD in kvm_{load,put}_guest_fpu()
+
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+
+commit c9aef3b85f425d1f6635382ec210ee5a7ef55d7d upstream.
+
+Handle TIF_NEED_FPU_LOAD similar to how fpu__copy() handles the flag
+when duplicating FPU state to a new task struct.  TIF_NEED_FPU_LOAD can
+be set any time control is transferred out of KVM, be it voluntarily,
+e.g. if I/O is triggered during a KVM call to get_user_pages, or
+involuntarily, e.g. if softirq runs after an IRQ occurs.  Therefore,
+KVM must account for TIF_NEED_FPU_LOAD whenever it is (potentially)
+accessing CPU FPU state.
+
+Fixes: 5f409e20b7945 ("x86/fpu: Defer FPU state load until return to userspace")
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/x86.c |   19 +++++++++++++++++--
+ 1 file changed, 17 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -8517,12 +8517,26 @@ static int complete_emulated_mmio(struct
+       return 0;
+ }
++static void kvm_save_current_fpu(struct fpu *fpu)
++{
++      /*
++       * If the target FPU state is not resident in the CPU registers, just
++       * memcpy() from current, else save CPU state directly to the target.
++       */
++      if (test_thread_flag(TIF_NEED_FPU_LOAD))
++              memcpy(&fpu->state, &current->thread.fpu.state,
++                     fpu_kernel_xstate_size);
++      else
++              copy_fpregs_to_fpstate(fpu);
++}
++
+ /* Swap (qemu) user FPU context for the guest FPU context. */
+ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
+ {
+       fpregs_lock();
+-      copy_fpregs_to_fpstate(vcpu->arch.user_fpu);
++      kvm_save_current_fpu(vcpu->arch.user_fpu);
++
+       /* PKRU is separately restored in kvm_x86_ops->run.  */
+       __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
+                               ~XFEATURE_MASK_PKRU);
+@@ -8538,7 +8552,8 @@ static void kvm_put_guest_fpu(struct kvm
+ {
+       fpregs_lock();
+-      copy_fpregs_to_fpstate(vcpu->arch.guest_fpu);
++      kvm_save_current_fpu(vcpu->arch.guest_fpu);
++
+       copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state);
+       fpregs_mark_activate();
diff --git a/queue-5.5/kvm-x86-reorganize-pvclock_gtod_data-members.patch b/queue-5.5/kvm-x86-reorganize-pvclock_gtod_data-members.patch
new file mode 100644 (file)
index 0000000..c4dc221
--- /dev/null
@@ -0,0 +1,112 @@
+From 917f9475c0a8ab8958db7f22a5d495b9a1d51be6 Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Wed, 22 Jan 2020 14:32:20 +0100
+Subject: KVM: x86: reorganize pvclock_gtod_data members
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit 917f9475c0a8ab8958db7f22a5d495b9a1d51be6 upstream.
+
+We will need a copy of tk->offs_boot in the next patch.  Store it and
+cleanup the struct: instead of storing tk->tkr_xxx.base with the tk->offs_boot
+included, store the raw value in struct pvclock_clock and sum it in
+do_monotonic_raw and do_realtime.   tk->tkr_xxx.xtime_nsec also moves
+to struct pvclock_clock.
+
+While at it, fix a (usually harmless) typo in do_monotonic_raw, which
+was using gtod->clock.shift instead of gtod->raw_clock.shift.
+
+Fixes: 53fafdbb8b21f ("KVM: x86: switch KVMCLOCK base to monotonic raw clock")
+Cc: stable@vger.kernel.org
+Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/x86.c |   29 ++++++++++++-----------------
+ 1 file changed, 12 insertions(+), 17 deletions(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -1580,6 +1580,8 @@ struct pvclock_clock {
+       u64 mask;
+       u32 mult;
+       u32 shift;
++      u64 base_cycles;
++      u64 offset;
+ };
+ struct pvclock_gtod_data {
+@@ -1588,11 +1590,8 @@ struct pvclock_gtod_data {
+       struct pvclock_clock clock; /* extract of a clocksource struct */
+       struct pvclock_clock raw_clock; /* extract of a clocksource struct */
+-      u64             boot_ns_raw;
+-      u64             boot_ns;
+-      u64             nsec_base;
++      ktime_t         offs_boot;
+       u64             wall_time_sec;
+-      u64             monotonic_raw_nsec;
+ };
+ static struct pvclock_gtod_data pvclock_gtod_data;
+@@ -1600,10 +1599,6 @@ static struct pvclock_gtod_data pvclock_
+ static void update_pvclock_gtod(struct timekeeper *tk)
+ {
+       struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
+-      u64 boot_ns, boot_ns_raw;
+-
+-      boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
+-      boot_ns_raw = ktime_to_ns(ktime_add(tk->tkr_raw.base, tk->offs_boot));
+       write_seqcount_begin(&vdata->seq);
+@@ -1613,20 +1608,20 @@ static void update_pvclock_gtod(struct t
+       vdata->clock.mask               = tk->tkr_mono.mask;
+       vdata->clock.mult               = tk->tkr_mono.mult;
+       vdata->clock.shift              = tk->tkr_mono.shift;
++      vdata->clock.base_cycles        = tk->tkr_mono.xtime_nsec;
++      vdata->clock.offset             = tk->tkr_mono.base;
+       vdata->raw_clock.vclock_mode    = tk->tkr_raw.clock->archdata.vclock_mode;
+       vdata->raw_clock.cycle_last     = tk->tkr_raw.cycle_last;
+       vdata->raw_clock.mask           = tk->tkr_raw.mask;
+       vdata->raw_clock.mult           = tk->tkr_raw.mult;
+       vdata->raw_clock.shift          = tk->tkr_raw.shift;
+-
+-      vdata->boot_ns                  = boot_ns;
+-      vdata->nsec_base                = tk->tkr_mono.xtime_nsec;
++      vdata->raw_clock.base_cycles    = tk->tkr_raw.xtime_nsec;
++      vdata->raw_clock.offset         = tk->tkr_raw.base;
+       vdata->wall_time_sec            = tk->xtime_sec;
+-      vdata->boot_ns_raw              = boot_ns_raw;
+-      vdata->monotonic_raw_nsec       = tk->tkr_raw.xtime_nsec;
++      vdata->offs_boot                = tk->offs_boot;
+       write_seqcount_end(&vdata->seq);
+ }
+@@ -2096,10 +2091,10 @@ static int do_monotonic_raw(s64 *t, u64
+       do {
+               seq = read_seqcount_begin(&gtod->seq);
+-              ns = gtod->monotonic_raw_nsec;
++              ns = gtod->raw_clock.base_cycles;
+               ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode);
+-              ns >>= gtod->clock.shift;
+-              ns += gtod->boot_ns_raw;
++              ns >>= gtod->raw_clock.shift;
++              ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot));
+       } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+       *t = ns;
+@@ -2116,7 +2111,7 @@ static int do_realtime(struct timespec64
+       do {
+               seq = read_seqcount_begin(&gtod->seq);
+               ts->tv_sec = gtod->wall_time_sec;
+-              ns = gtod->nsec_base;
++              ns = gtod->clock.base_cycles;
+               ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
+               ns >>= gtod->clock.shift;
+       } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
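
To make the new field layout concrete: after this patch, do_monotonic_raw() scales the cycle delta by mult, adds the pre-scaled xtime_nsec (stored as base_cycles), shifts down by shift, and only then adds the base time plus offs_boot as plain nanoseconds. A hedged standalone sketch follows; field names mirror the patch's struct pvclock_clock, but ktime_t values are flattened to nanoseconds and the numbers are made up.

  #include <stdint.h>
  #include <stdio.h>

  struct pvclock_clock_sketch {
          uint64_t cycle_last;   /* clocksource cycles at the last timekeeper update */
          uint64_t mask;         /* counter wrap-around mask */
          uint32_t mult, shift;  /* cycles -> (ns << shift) scaling */
          uint64_t base_cycles;  /* tk->tkr_raw.xtime_nsec, i.e. accumulated ns << shift */
          int64_t  offset;       /* tk->tkr_raw.base, flattened to ns */
          int64_t  offs_boot;    /* tk->offs_boot, flattened to ns */
  };

  /* Order of operations used by do_monotonic_raw() after the patch. */
  static uint64_t monotonic_raw_ns(const struct pvclock_clock_sketch *c,
                                   uint64_t now_cycles)
  {
          uint64_t ns = c->base_cycles;

          ns += ((now_cycles - c->cycle_last) & c->mask) * c->mult;
          ns >>= c->shift;
          return ns + (uint64_t)(c->offset + c->offs_boot);
  }

  int main(void)
  {
          /* Toy numbers: mult/shift chosen so one cycle equals one nanosecond. */
          struct pvclock_clock_sketch c = {
                  .cycle_last = 1000, .mask = ~0ULL, .mult = 1 << 8, .shift = 8,
                  .base_cycles = 500ULL << 8, .offset = 2000, .offs_boot = 100,
          };

          /* 500 accumulated + 500 delta + 2000 base + 100 boot offset = 3100 ns */
          printf("%llu\n", (unsigned long long)monotonic_raw_ns(&c, 1500));
          return 0;
  }
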
diff --git a/queue-5.5/kvm-x86-revert-kvm-x86-fix-fpu-state-crash-in-kvm-guest.patch b/queue-5.5/kvm-x86-revert-kvm-x86-fix-fpu-state-crash-in-kvm-guest.patch
new file mode 100644 (file)
index 0000000..ea6f7f0
--- /dev/null
@@ -0,0 +1,136 @@
+From 2620fe268e80d667a94553cd37a94ccaa2cb8c83 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+Date: Fri, 17 Jan 2020 11:30:51 -0800
+Subject: KVM: x86: Revert "KVM: X86: Fix fpu state crash in kvm guest"
+
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+
+commit 2620fe268e80d667a94553cd37a94ccaa2cb8c83 upstream.
+
+Reload the current thread's FPU state, which contains the guest's FPU
+state, to the CPU registers if necessary during vcpu_enter_guest().
+TIF_NEED_FPU_LOAD can be set any time control is transferred out of KVM,
+e.g. if I/O is triggered during a KVM call to get_user_pages() or if a
+softirq occurs while KVM is scheduled in.
+
+Moving the handling of TIF_NEED_FPU_LOAD from vcpu_enter_guest() to
+kvm_arch_vcpu_load(), effectively kvm_sched_in(), papered over a bug
+where kvm_put_guest_fpu() failed to account for TIF_NEED_FPU_LOAD.  The
+easiest way to hit the kvm_put_guest_fpu() bug was to run with involuntary
+preemption enabled, thus handling TIF_NEED_FPU_LOAD during kvm_sched_in()
+made the bug go away.  But, removing the handling in vcpu_enter_guest()
+exposed KVM to the rare case of a softirq triggering kernel_fpu_begin()
+between vcpu_load() and vcpu_enter_guest().
+
+Now that kvm_{load,put}_guest_fpu() correctly handle TIF_NEED_FPU_LOAD,
+revert the commit to both restore the vcpu_enter_guest() behavior and
+eliminate the superfluous switch_fpu_return() in kvm_arch_vcpu_load().
+
+Note, leaving the handling in kvm_arch_vcpu_load() isn't wrong per se,
+but it is unnecessary, and most critically, makes it extremely difficult
+to find bugs such as the kvm_put_guest_fpu() issue due to shrinking the
+window where a softirq can corrupt state.
+
+A sample trace triggered by warning if TIF_NEED_FPU_LOAD is set while
+vcpu state is loaded:
+
+ <IRQ>
+  gcmaes_crypt_by_sg.constprop.12+0x26e/0x660
+  ? 0xffffffffc024547d
+  ? __qdisc_run+0x83/0x510
+  ? __dev_queue_xmit+0x45e/0x990
+  ? ip_finish_output2+0x1a8/0x570
+  ? fib4_rule_action+0x61/0x70
+  ? fib4_rule_action+0x70/0x70
+  ? fib_rules_lookup+0x13f/0x1c0
+  ? helper_rfc4106_decrypt+0x82/0xa0
+  ? crypto_aead_decrypt+0x40/0x70
+  ? crypto_aead_decrypt+0x40/0x70
+  ? crypto_aead_decrypt+0x40/0x70
+  ? esp_output_tail+0x8f4/0xa5a [esp4]
+  ? skb_ext_add+0xd3/0x170
+  ? xfrm_input+0x7a6/0x12c0
+  ? xfrm4_rcv_encap+0xae/0xd0
+  ? xfrm4_transport_finish+0x200/0x200
+  ? udp_queue_rcv_one_skb+0x1ba/0x460
+  ? udp_unicast_rcv_skb.isra.63+0x72/0x90
+  ? __udp4_lib_rcv+0x51b/0xb00
+  ? ip_protocol_deliver_rcu+0xd2/0x1c0
+  ? ip_local_deliver_finish+0x44/0x50
+  ? ip_local_deliver+0xe0/0xf0
+  ? ip_protocol_deliver_rcu+0x1c0/0x1c0
+  ? ip_rcv+0xbc/0xd0
+  ? ip_rcv_finish_core.isra.19+0x380/0x380
+  ? __netif_receive_skb_one_core+0x7e/0x90
+  ? netif_receive_skb_internal+0x3d/0xb0
+  ? napi_gro_receive+0xed/0x150
+  ? 0xffffffffc0243c77
+  ? net_rx_action+0x149/0x3b0
+  ? __do_softirq+0xe4/0x2f8
+  ? handle_irq_event_percpu+0x6a/0x80
+  ? irq_exit+0xe6/0xf0
+  ? do_IRQ+0x7f/0xd0
+  ? common_interrupt+0xf/0xf
+  </IRQ>
+  ? irq_entries_start+0x20/0x660
+  ? vmx_get_interrupt_shadow+0x2f0/0x710 [kvm_intel]
+  ? kvm_set_msr_common+0xfc7/0x2380 [kvm]
+  ? recalibrate_cpu_khz+0x10/0x10
+  ? ktime_get+0x3a/0xa0
+  ? kvm_arch_vcpu_ioctl_run+0x107/0x560 [kvm]
+  ? kvm_init+0x6bf/0xd00 [kvm]
+  ? __seccomp_filter+0x7a/0x680
+  ? do_vfs_ioctl+0xa4/0x630
+  ? security_file_ioctl+0x32/0x50
+  ? ksys_ioctl+0x60/0x90
+  ? __x64_sys_ioctl+0x16/0x20
+  ? do_syscall_64+0x5f/0x1a0
+  ? entry_SYSCALL_64_after_hwframe+0x44/0xa9
+---[ end trace 9564a1ccad733a90 ]---
+
+This reverts commit e751732486eb3f159089a64d1901992b1357e7cc.
+
+Fixes: e751732486eb3 ("KVM: X86: Fix fpu state crash in kvm guest")
+Reported-by: Derek Yerger <derek@djy.llc>
+Reported-by: kernel@najdan.com
+Cc: Wanpeng Li <wanpengli@tencent.com>
+Cc: Thomas Lambertz <mail@thomaslambertz.de>
+Cc: Rik van Riel <riel@surriel.com>
+Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Cc: Borislav Petkov <bp@suse.de>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/x86.c |    9 +++------
+ 1 file changed, 3 insertions(+), 6 deletions(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -3496,10 +3496,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu
+       kvm_x86_ops->vcpu_load(vcpu, cpu);
+-      fpregs_assert_state_consistent();
+-      if (test_thread_flag(TIF_NEED_FPU_LOAD))
+-              switch_fpu_return();
+-
+       /* Apply any externally detected TSC adjustments (due to suspend) */
+       if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
+               adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
+@@ -8244,8 +8240,9 @@ static int vcpu_enter_guest(struct kvm_v
+       trace_kvm_entry(vcpu->vcpu_id);
+       guest_enter_irqoff();
+-      /* The preempt notifier should have taken care of the FPU already.  */
+-      WARN_ON_ONCE(test_thread_flag(TIF_NEED_FPU_LOAD));
++      fpregs_assert_state_consistent();
++      if (test_thread_flag(TIF_NEED_FPU_LOAD))
++              switch_fpu_return();
+       if (unlikely(vcpu->arch.switch_db_regs)) {
+               set_debugreg(0, 7);
diff --git a/queue-5.5/kvm-x86-use-cpuid-to-locate-host-page-table-reserved-bits.patch b/queue-5.5/kvm-x86-use-cpuid-to-locate-host-page-table-reserved-bits.patch
new file mode 100644 (file)
index 0000000..408f866
--- /dev/null
@@ -0,0 +1,53 @@
+From 7adacf5eb2d2048045d9fd8fdab861fd9e7e2e96 Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Wed, 4 Dec 2019 15:50:27 +0100
+Subject: KVM: x86: use CPUID to locate host page table reserved bits
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit 7adacf5eb2d2048045d9fd8fdab861fd9e7e2e96 upstream.
+
+The comment in kvm_get_shadow_phys_bits refers to MKTME, but the same is actually
+true of SME and SEV.  Just use CPUID[0x8000_0008].EAX[7:0] unconditionally if
+available; it is simplest and works even if memory is not encrypted.
+
+Cc: stable@vger.kernel.org
+Reported-by: Tom Lendacky <thomas.lendacky@amd.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/mmu/mmu.c |   20 ++++++++++++--------
+ 1 file changed, 12 insertions(+), 8 deletions(-)
+
+--- a/arch/x86/kvm/mmu/mmu.c
++++ b/arch/x86/kvm/mmu/mmu.c
+@@ -538,16 +538,20 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes)
+ static u8 kvm_get_shadow_phys_bits(void)
+ {
+       /*
+-       * boot_cpu_data.x86_phys_bits is reduced when MKTME is detected
+-       * in CPU detection code, but MKTME treats those reduced bits as
+-       * 'keyID' thus they are not reserved bits. Therefore for MKTME
+-       * we should still return physical address bits reported by CPUID.
++       * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
++       * in CPU detection code, but the processor treats those reduced bits as
++       * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
++       * the physical address bits reported by CPUID.
+        */
+-      if (!boot_cpu_has(X86_FEATURE_TME) ||
+-          WARN_ON_ONCE(boot_cpu_data.extended_cpuid_level < 0x80000008))
+-              return boot_cpu_data.x86_phys_bits;
++      if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
++              return cpuid_eax(0x80000008) & 0xff;
+-      return cpuid_eax(0x80000008) & 0xff;
++      /*
++       * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
++       * custom CPUID.  Proceed with whatever the kernel found since these features
++       * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
++       */
++      return boot_cpu_data.x86_phys_bits;
+ }
+ static void kvm_mmu_reset_all_pte_masks(void)
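
What the patch now reads unconditionally is the architectural MAXPHYADDR report: CPUID leaf 0x80000008 returns the physical address width in EAX[7:0] and the linear address width in EAX[15:8]. A small userspace illustration using the GCC/Clang cpuid.h helpers, shown only to make the leaf and field concrete, not KVM code:

  #include <cpuid.h>
  #include <stdio.h>

  int main(void)
  {
          unsigned int eax, ebx, ecx, edx;

          /* __get_cpuid() returns 0 if the requested extended leaf is unsupported. */
          if (__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx)) {
                  printf("physical address bits: %u\n", eax & 0xff);        /* EAX[7:0]  */
                  printf("linear address bits:   %u\n", (eax >> 8) & 0xff); /* EAX[15:8] */
          } else {
                  /* Mirrors the patch's fallback path: leaf 0x80000008 not available. */
                  printf("CPUID leaf 0x80000008 not reported by this CPU\n");
          }
          return 0;
  }
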
diff --git a/queue-5.5/kvm-x86-use-gpa_t-for-cr2-gpa-to-fix-tdp-support-on-32-bit-kvm.patch b/queue-5.5/kvm-x86-use-gpa_t-for-cr2-gpa-to-fix-tdp-support-on-32-bit-kvm.patch
new file mode 100644 (file)
index 0000000..9e2ade1
--- /dev/null
@@ -0,0 +1,650 @@
+From 736c291c9f36b07f8889c61764c28edce20e715d Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+Date: Fri, 6 Dec 2019 15:57:14 -0800
+Subject: KVM: x86: Use gpa_t for cr2/gpa to fix TDP support on 32-bit KVM
+
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+
+commit 736c291c9f36b07f8889c61764c28edce20e715d upstream.
+
+Convert a plethora of parameters and variables in the MMU and page fault
+flows from type gva_t to gpa_t to properly handle TDP on 32-bit KVM.
+
+Thanks to PSE and PAE paging, 32-bit kernels can access 64-bit physical
+addresses.  When TDP is enabled, the fault address is a guest physical
+address and thus can be a 64-bit value, even when both KVM and its guest
+are using 32-bit virtual addressing, e.g. VMX's VMCS.GUEST_PHYSICAL is a
+64-bit field, not a natural width field.
+
+Using a gva_t for the fault address means KVM will incorrectly drop the
+upper 32-bits of the GPA.  Ditto for gva_to_gpa() when it is used to
+translate L2 GPAs to L1 GPAs.
+
+Opportunistically rename variables and parameters to better reflect the
+dual address modes, e.g. use "cr2_or_gpa" for fault addresses and plain
+"addr" instead of "vaddr" when the address may be either a GVA or an L2
+GPA.  Similarly, use "gpa" in the nonpaging_page_fault() flows to avoid
+a confusing "gpa_t gva" declaration; this also sets the stage for a
+future patch to combine nonpaging_page_fault() and tdp_page_fault() with
+minimal churn.
+
+Sprinkle in a few comments to document flows where an address is known
+to be a GVA and thus can be safely truncated to a 32-bit value.  Add
+WARNs in kvm_handle_page_fault() and FNAME(gva_to_gpa_nested)() to help
+document such cases and detect bugs.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/kvm_host.h |    8 ++--
+ arch/x86/kvm/mmu/mmu.c          |   69 +++++++++++++++++++++++-----------------
+ arch/x86/kvm/mmu/paging_tmpl.h  |   25 +++++++++-----
+ arch/x86/kvm/mmutrace.h         |   12 +++---
+ arch/x86/kvm/x86.c              |   40 +++++++++++------------
+ arch/x86/kvm/x86.h              |    2 -
+ include/linux/kvm_host.h        |    6 +--
+ virt/kvm/async_pf.c             |   10 ++---
+ 8 files changed, 94 insertions(+), 78 deletions(-)
+
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -378,12 +378,12 @@ struct kvm_mmu {
+       void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
+       unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
+       u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index);
+-      int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err,
++      int (*page_fault)(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 err,
+                         bool prefault);
+       void (*inject_page_fault)(struct kvm_vcpu *vcpu,
+                                 struct x86_exception *fault);
+-      gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
+-                          struct x86_exception *exception);
++      gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t gva_or_gpa,
++                          u32 access, struct x86_exception *exception);
+       gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
+                              struct x86_exception *exception);
+       int (*sync_page)(struct kvm_vcpu *vcpu,
+@@ -1469,7 +1469,7 @@ void kvm_vcpu_deactivate_apicv(struct kv
+ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
+-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code,
++int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
+                      void *insn, int insn_len);
+ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
+ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
+--- a/arch/x86/kvm/mmu/mmu.c
++++ b/arch/x86/kvm/mmu/mmu.c
+@@ -3532,7 +3532,7 @@ static bool is_access_allowed(u32 fault_
+  * - true: let the vcpu to access on the same address again.
+  * - false: let the real page fault path to fix it.
+  */
+-static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
++static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int level,
+                           u32 error_code)
+ {
+       struct kvm_shadow_walk_iterator iterator;
+@@ -3552,7 +3552,7 @@ static bool fast_page_fault(struct kvm_v
+       do {
+               u64 new_spte;
+-              for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
++              for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte)
+                       if (!is_shadow_present_pte(spte) ||
+                           iterator.level < level)
+                               break;
+@@ -3630,7 +3630,7 @@ static bool fast_page_fault(struct kvm_v
+       } while (true);
+-      trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
++      trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep,
+                             spte, fault_handled);
+       walk_shadow_page_lockless_end(vcpu);
+@@ -3638,10 +3638,11 @@ static bool fast_page_fault(struct kvm_v
+ }
+ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
+-                       gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable);
++                       gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
++                       bool *writable);
+ static int make_mmu_pages_available(struct kvm_vcpu *vcpu);
+-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
++static int nonpaging_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
+                        gfn_t gfn, bool prefault)
+ {
+       int r;
+@@ -3667,16 +3668,16 @@ static int nonpaging_map(struct kvm_vcpu
+               gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+       }
+-      if (fast_page_fault(vcpu, v, level, error_code))
++      if (fast_page_fault(vcpu, gpa, level, error_code))
+               return RET_PF_RETRY;
+       mmu_seq = vcpu->kvm->mmu_notifier_seq;
+       smp_rmb();
+-      if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
++      if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
+               return RET_PF_RETRY;
+-      if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
++      if (handle_abnormal_pfn(vcpu, gpa, gfn, pfn, ACC_ALL, &r))
+               return r;
+       r = RET_PF_RETRY;
+@@ -3687,7 +3688,7 @@ static int nonpaging_map(struct kvm_vcpu
+               goto out_unlock;
+       if (likely(!force_pt_level))
+               transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
+-      r = __direct_map(vcpu, v, write, map_writable, level, pfn,
++      r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
+                        prefault, false);
+ out_unlock:
+       spin_unlock(&vcpu->kvm->mmu_lock);
+@@ -3985,7 +3986,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu
+ }
+ EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
+-static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
++static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr,
+                                 u32 access, struct x86_exception *exception)
+ {
+       if (exception)
+@@ -3993,7 +3994,7 @@ static gpa_t nonpaging_gva_to_gpa(struct
+       return vaddr;
+ }
+-static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
++static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr,
+                                        u32 access,
+                                        struct x86_exception *exception)
+ {
+@@ -4153,13 +4154,14 @@ static void shadow_page_table_clear_floo
+       walk_shadow_page_lockless_end(vcpu);
+ }
+-static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
++static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa,
+                               u32 error_code, bool prefault)
+ {
+-      gfn_t gfn = gva >> PAGE_SHIFT;
++      gfn_t gfn = gpa >> PAGE_SHIFT;
+       int r;
+-      pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
++      /* Note, paging is disabled, ergo gva == gpa. */
++      pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code);
+       if (page_fault_handle_page_track(vcpu, error_code, gfn))
+               return RET_PF_EMULATE;
+@@ -4171,11 +4173,12 @@ static int nonpaging_page_fault(struct k
+       MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
+-      return nonpaging_map(vcpu, gva & PAGE_MASK,
++      return nonpaging_map(vcpu, gpa & PAGE_MASK,
+                            error_code, gfn, prefault);
+ }
+-static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
++static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
++                                 gfn_t gfn)
+ {
+       struct kvm_arch_async_pf arch;
+@@ -4184,11 +4187,13 @@ static int kvm_arch_setup_async_pf(struc
+       arch.direct_map = vcpu->arch.mmu->direct_map;
+       arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu);
+-      return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
++      return kvm_setup_async_pf(vcpu, cr2_or_gpa,
++                                kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
+ }
+ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
+-                       gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
++                       gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
++                       bool *writable)
+ {
+       struct kvm_memory_slot *slot;
+       bool async;
+@@ -4208,12 +4213,12 @@ static bool try_async_pf(struct kvm_vcpu
+               return false; /* *pfn has correct page already */
+       if (!prefault && kvm_can_do_async_pf(vcpu)) {
+-              trace_kvm_try_async_get_page(gva, gfn);
++              trace_kvm_try_async_get_page(cr2_or_gpa, gfn);
+               if (kvm_find_async_pf_gfn(vcpu, gfn)) {
+-                      trace_kvm_async_pf_doublefault(gva, gfn);
++                      trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn);
+                       kvm_make_request(KVM_REQ_APF_HALT, vcpu);
+                       return true;
+-              } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
++              } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn))
+                       return true;
+       }
+@@ -4226,6 +4231,12 @@ int kvm_handle_page_fault(struct kvm_vcp
+ {
+       int r = 1;
++#ifndef CONFIG_X86_64
++      /* A 64-bit CR2 should be impossible on 32-bit KVM. */
++      if (WARN_ON_ONCE(fault_address >> 32))
++              return -EFAULT;
++#endif
++
+       vcpu->arch.l1tf_flush_l1d = true;
+       switch (vcpu->arch.apf.host_apf_reason) {
+       default:
+@@ -4263,7 +4274,7 @@ check_hugepage_cache_consistency(struct
+       return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num);
+ }
+-static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
++static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
+                         bool prefault)
+ {
+       kvm_pfn_t pfn;
+@@ -5520,7 +5531,7 @@ static int make_mmu_pages_available(stru
+       return 0;
+ }
+-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
++int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
+                      void *insn, int insn_len)
+ {
+       int r, emulation_type = 0;
+@@ -5529,18 +5540,18 @@ int kvm_mmu_page_fault(struct kvm_vcpu *
+       /* With shadow page tables, fault_address contains a GVA or nGPA.  */
+       if (vcpu->arch.mmu->direct_map) {
+               vcpu->arch.gpa_available = true;
+-              vcpu->arch.gpa_val = cr2;
++              vcpu->arch.gpa_val = cr2_or_gpa;
+       }
+       r = RET_PF_INVALID;
+       if (unlikely(error_code & PFERR_RSVD_MASK)) {
+-              r = handle_mmio_page_fault(vcpu, cr2, direct);
++              r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
+               if (r == RET_PF_EMULATE)
+                       goto emulate;
+       }
+       if (r == RET_PF_INVALID) {
+-              r = vcpu->arch.mmu->page_fault(vcpu, cr2,
++              r = vcpu->arch.mmu->page_fault(vcpu, cr2_or_gpa,
+                                              lower_32_bits(error_code),
+                                              false);
+               WARN_ON(r == RET_PF_INVALID);
+@@ -5560,7 +5571,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *
+        */
+       if (vcpu->arch.mmu->direct_map &&
+           (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
+-              kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
++              kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
+               return 1;
+       }
+@@ -5575,7 +5586,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *
+        * explicitly shadowing L1's page tables, i.e. unprotecting something
+        * for L1 isn't going to magically fix whatever issue cause L2 to fail.
+        */
+-      if (!mmio_info_in_cache(vcpu, cr2, direct) && !is_guest_mode(vcpu))
++      if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
+               emulation_type = EMULTYPE_ALLOW_RETRY;
+ emulate:
+       /*
+@@ -5590,7 +5601,7 @@ emulate:
+                       return 1;
+       }
+-      return x86_emulate_instruction(vcpu, cr2, emulation_type, insn,
++      return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
+                                      insn_len);
+ }
+ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
+--- a/arch/x86/kvm/mmu/paging_tmpl.h
++++ b/arch/x86/kvm/mmu/paging_tmpl.h
+@@ -291,11 +291,11 @@ static inline unsigned FNAME(gpte_pkeys)
+ }
+ /*
+- * Fetch a guest pte for a guest virtual address
++ * Fetch a guest pte for a guest virtual address, or for an L2's GPA.
+  */
+ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
+                                   struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+-                                  gva_t addr, u32 access)
++                                  gpa_t addr, u32 access)
+ {
+       int ret;
+       pt_element_t pte;
+@@ -496,7 +496,7 @@ error:
+ }
+ static int FNAME(walk_addr)(struct guest_walker *walker,
+-                          struct kvm_vcpu *vcpu, gva_t addr, u32 access)
++                          struct kvm_vcpu *vcpu, gpa_t addr, u32 access)
+ {
+       return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr,
+                                       access);
+@@ -611,7 +611,7 @@ static void FNAME(pte_prefetch)(struct k
+  * If the guest tries to write a write-protected page, we need to
+  * emulate this operation, return 1 to indicate this case.
+  */
+-static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
++static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
+                        struct guest_walker *gw,
+                        int write_fault, int hlevel,
+                        kvm_pfn_t pfn, bool map_writable, bool prefault,
+@@ -765,7 +765,7 @@ FNAME(is_self_change_mapping)(struct kvm
+  *  Returns: 1 if we need to emulate the instruction, 0 otherwise, or
+  *           a negative value on error.
+  */
+-static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
++static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
+                            bool prefault)
+ {
+       int write_fault = error_code & PFERR_WRITE_MASK;
+@@ -945,18 +945,19 @@ static void FNAME(invlpg)(struct kvm_vcp
+       spin_unlock(&vcpu->kvm->mmu_lock);
+ }
+-static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
++/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
++static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t addr, u32 access,
+                              struct x86_exception *exception)
+ {
+       struct guest_walker walker;
+       gpa_t gpa = UNMAPPED_GVA;
+       int r;
+-      r = FNAME(walk_addr)(&walker, vcpu, vaddr, access);
++      r = FNAME(walk_addr)(&walker, vcpu, addr, access);
+       if (r) {
+               gpa = gfn_to_gpa(walker.gfn);
+-              gpa |= vaddr & ~PAGE_MASK;
++              gpa |= addr & ~PAGE_MASK;
+       } else if (exception)
+               *exception = walker.fault;
+@@ -964,7 +965,8 @@ static gpa_t FNAME(gva_to_gpa)(struct kv
+ }
+ #if PTTYPE != PTTYPE_EPT
+-static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
++/* Note, gva_to_gpa_nested() is only used to translate L2 GVAs. */
++static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr,
+                                     u32 access,
+                                     struct x86_exception *exception)
+ {
+@@ -972,6 +974,11 @@ static gpa_t FNAME(gva_to_gpa_nested)(st
+       gpa_t gpa = UNMAPPED_GVA;
+       int r;
++#ifndef CONFIG_X86_64
++      /* A 64-bit GVA should be impossible on 32-bit KVM. */
++      WARN_ON_ONCE(vaddr >> 32);
++#endif
++
+       r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);
+       if (r) {
+--- a/arch/x86/kvm/mmutrace.h
++++ b/arch/x86/kvm/mmutrace.h
+@@ -249,13 +249,13 @@ TRACE_EVENT(
+ TRACE_EVENT(
+       fast_page_fault,
+-      TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
++      TP_PROTO(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code,
+                u64 *sptep, u64 old_spte, bool retry),
+-      TP_ARGS(vcpu, gva, error_code, sptep, old_spte, retry),
++      TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, retry),
+       TP_STRUCT__entry(
+               __field(int, vcpu_id)
+-              __field(gva_t, gva)
++              __field(gpa_t, cr2_or_gpa)
+               __field(u32, error_code)
+               __field(u64 *, sptep)
+               __field(u64, old_spte)
+@@ -265,7 +265,7 @@ TRACE_EVENT(
+       TP_fast_assign(
+               __entry->vcpu_id = vcpu->vcpu_id;
+-              __entry->gva = gva;
++              __entry->cr2_or_gpa = cr2_or_gpa;
+               __entry->error_code = error_code;
+               __entry->sptep = sptep;
+               __entry->old_spte = old_spte;
+@@ -273,9 +273,9 @@ TRACE_EVENT(
+               __entry->retry = retry;
+       ),
+-      TP_printk("vcpu %d gva %lx error_code %s sptep %p old %#llx"
++      TP_printk("vcpu %d gva %llx error_code %s sptep %p old %#llx"
+                 " new %llx spurious %d fixed %d", __entry->vcpu_id,
+-                __entry->gva, __print_flags(__entry->error_code, "|",
++                __entry->cr2_or_gpa, __print_flags(__entry->error_code, "|",
+                 kvm_mmu_trace_pferr_flags), __entry->sptep,
+                 __entry->old_spte, __entry->new_spte,
+                 __spte_satisfied(old_spte), __spte_satisfied(new_spte)
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -6396,11 +6396,11 @@ static int handle_emulation_failure(stru
+       return 1;
+ }
+-static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
++static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+                                 bool write_fault_to_shadow_pgtable,
+                                 int emulation_type)
+ {
+-      gpa_t gpa = cr2;
++      gpa_t gpa = cr2_or_gpa;
+       kvm_pfn_t pfn;
+       if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
+@@ -6414,7 +6414,7 @@ static bool reexecute_instruction(struct
+                * Write permission should be allowed since only
+                * write access need to be emulated.
+                */
+-              gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
++              gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
+               /*
+                * If the mapping is invalid in guest, let cpu retry
+@@ -6471,10 +6471,10 @@ static bool reexecute_instruction(struct
+ }
+ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
+-                            unsigned long cr2,  int emulation_type)
++                            gpa_t cr2_or_gpa,  int emulation_type)
+ {
+       struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+-      unsigned long last_retry_eip, last_retry_addr, gpa = cr2;
++      unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa;
+       last_retry_eip = vcpu->arch.last_retry_eip;
+       last_retry_addr = vcpu->arch.last_retry_addr;
+@@ -6503,14 +6503,14 @@ static bool retry_instruction(struct x86
+       if (x86_page_table_writing_insn(ctxt))
+               return false;
+-      if (ctxt->eip == last_retry_eip && last_retry_addr == cr2)
++      if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa)
+               return false;
+       vcpu->arch.last_retry_eip = ctxt->eip;
+-      vcpu->arch.last_retry_addr = cr2;
++      vcpu->arch.last_retry_addr = cr2_or_gpa;
+       if (!vcpu->arch.mmu->direct_map)
+-              gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
++              gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
+       kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+@@ -6656,11 +6656,8 @@ static bool is_vmware_backdoor_opcode(st
+       return false;
+ }
+-int x86_emulate_instruction(struct kvm_vcpu *vcpu,
+-                          unsigned long cr2,
+-                          int emulation_type,
+-                          void *insn,
+-                          int insn_len)
++int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
++                          int emulation_type, void *insn, int insn_len)
+ {
+       int r;
+       struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+@@ -6706,8 +6703,9 @@ int x86_emulate_instruction(struct kvm_v
+                               kvm_queue_exception(vcpu, UD_VECTOR);
+                               return 1;
+                       }
+-                      if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
+-                                              emulation_type))
++                      if (reexecute_instruction(vcpu, cr2_or_gpa,
++                                                write_fault_to_spt,
++                                                emulation_type))
+                               return 1;
+                       if (ctxt->have_exception) {
+                               /*
+@@ -6741,7 +6739,7 @@ int x86_emulate_instruction(struct kvm_v
+               return 1;
+       }
+-      if (retry_instruction(ctxt, cr2, emulation_type))
++      if (retry_instruction(ctxt, cr2_or_gpa, emulation_type))
+               return 1;
+       /* this is needed for vmware backdoor interface to work since it
+@@ -6753,7 +6751,7 @@ int x86_emulate_instruction(struct kvm_v
+ restart:
+       /* Save the faulting GPA (cr2) in the address field */
+-      ctxt->exception.address = cr2;
++      ctxt->exception.address = cr2_or_gpa;
+       r = x86_emulate_insn(ctxt);
+@@ -6761,7 +6759,7 @@ restart:
+               return 1;
+       if (r == EMULATION_FAILED) {
+-              if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
++              if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
+                                       emulation_type))
+                       return 1;
+@@ -10045,7 +10043,7 @@ void kvm_arch_async_page_ready(struct kv
+             work->arch.cr3 != vcpu->arch.mmu->get_cr3(vcpu))
+               return;
+-      vcpu->arch.mmu->page_fault(vcpu, work->gva, 0, true);
++      vcpu->arch.mmu->page_fault(vcpu, work->cr2_or_gpa, 0, true);
+ }
+ static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
+@@ -10158,7 +10156,7 @@ void kvm_arch_async_page_not_present(str
+ {
+       struct x86_exception fault;
+-      trace_kvm_async_pf_not_present(work->arch.token, work->gva);
++      trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa);
+       kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
+       if (kvm_can_deliver_async_pf(vcpu) &&
+@@ -10193,7 +10191,7 @@ void kvm_arch_async_page_present(struct
+               work->arch.token = ~0; /* broadcast wakeup */
+       else
+               kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
+-      trace_kvm_async_pf_ready(work->arch.token, work->gva);
++      trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
+       if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED &&
+           !apf_get_user(vcpu, &val)) {
+--- a/arch/x86/kvm/x86.h
++++ b/arch/x86/kvm/x86.h
+@@ -289,7 +289,7 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vc
+ bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
+                                         int page_num);
+ bool kvm_vector_hashing_enabled(void);
+-int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
++int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+                           int emulation_type, void *insn, int insn_len);
+ #define KVM_SUPPORTED_XCR0     (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
+--- a/include/linux/kvm_host.h
++++ b/include/linux/kvm_host.h
+@@ -204,7 +204,7 @@ struct kvm_async_pf {
+       struct list_head queue;
+       struct kvm_vcpu *vcpu;
+       struct mm_struct *mm;
+-      gva_t gva;
++      gpa_t cr2_or_gpa;
+       unsigned long addr;
+       struct kvm_arch_async_pf arch;
+       bool   wakeup_all;
+@@ -212,8 +212,8 @@ struct kvm_async_pf {
+ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu);
+ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu);
+-int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
+-                     struct kvm_arch_async_pf *arch);
++int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
++                     unsigned long hva, struct kvm_arch_async_pf *arch);
+ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
+ #endif
+--- a/virt/kvm/async_pf.c
++++ b/virt/kvm/async_pf.c
+@@ -64,7 +64,7 @@ static void async_pf_execute(struct work
+       struct mm_struct *mm = apf->mm;
+       struct kvm_vcpu *vcpu = apf->vcpu;
+       unsigned long addr = apf->addr;
+-      gva_t gva = apf->gva;
++      gpa_t cr2_or_gpa = apf->cr2_or_gpa;
+       int locked = 1;
+       might_sleep();
+@@ -92,7 +92,7 @@ static void async_pf_execute(struct work
+        * this point
+        */
+-      trace_kvm_async_pf_completed(addr, gva);
++      trace_kvm_async_pf_completed(addr, cr2_or_gpa);
+       if (swq_has_sleeper(&vcpu->wq))
+               swake_up_one(&vcpu->wq);
+@@ -165,8 +165,8 @@ void kvm_check_async_pf_completion(struc
+       }
+ }
+-int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
+-                     struct kvm_arch_async_pf *arch)
++int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
++                     unsigned long hva, struct kvm_arch_async_pf *arch)
+ {
+       struct kvm_async_pf *work;
+@@ -185,7 +185,7 @@ int kvm_setup_async_pf(struct kvm_vcpu *
+       work->wakeup_all = false;
+       work->vcpu = vcpu;
+-      work->gva = gva;
++      work->cr2_or_gpa = cr2_or_gpa;
+       work->addr = hva;
+       work->arch = *arch;
+       work->mm = current->mm;
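
The conversions above widen every CR2/page-fault address from unsigned long (gva_t) to gpa_t. As a minimal, self-contained illustration of why that matters, the userspace C sketch below shows how routing a guest physical address through unsigned long silently truncates it on a 32-bit build; the gpa_t typedef and the sample address are stand-ins, not kernel code.

/*
 * Illustrative sketch only: routing a >4 GiB guest physical address
 * through "unsigned long" drops the upper bits on a 32-bit build, which
 * is the bug class the gpa_t conversion above addresses.
 */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t gpa_t;                 /* guest physical addresses are 64-bit */

static gpa_t through_ulong(gpa_t cr2_or_gpa)
{
        unsigned long narrowed = (unsigned long)cr2_or_gpa;  /* 32 bits on i386 */
        return (gpa_t)narrowed;
}

int main(void)
{
        gpa_t gpa = 0x1234567890ULL;    /* above 4 GiB, reachable with PAE + TDP */

        printf("sizeof(unsigned long) = %zu\n", sizeof(unsigned long));
        printf("original gpa     = %#llx\n", (unsigned long long)gpa);
        printf("after round-trip = %#llx\n", (unsigned long long)through_ulong(gpa));
        /* On a 32-bit build the second value loses the upper bits. */
        return 0;
}
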
diff --git a/queue-5.5/kvm-x86-use-raw-clock-values-consistently.patch b/queue-5.5/kvm-x86-use-raw-clock-values-consistently.patch
new file mode 100644 (file)
index 0000000..f8d2aec
--- /dev/null
@@ -0,0 +1,138 @@
+From 8171cd68806bd2fc28ef688e32fb2a3b3deb04e5 Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Wed, 22 Jan 2020 14:36:09 +0100
+Subject: KVM: x86: use raw clock values consistently
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit 8171cd68806bd2fc28ef688e32fb2a3b3deb04e5 upstream.
+
+Commit 53fafdbb8b21f ("KVM: x86: switch KVMCLOCK base to monotonic raw
+clock") changed kvmclock to use tkr_raw instead of tkr_mono.  However,
+the default kvmclock_offset for the VM was still based on the monotonic
+clock and, if the raw clock drifted enough from the monotonic clock,
+this could cause a negative system_time to be written to the guest's
+struct pvclock.  RHEL5 does not like it and (if it boots fast enough to
+observe a negative time value) it hangs.
+
+There is another thing to be careful about: getboottime64 returns the
+host boot time with tkr_mono frequency, and subtracting the tkr_raw-based
+kvmclock value will cause the wallclock to be off if tkr_raw drifts
+from tkr_mono.  To avoid this, compute the wallclock delta from the
+current time instead of being clever and using getboottime64.
+
+Fixes: 53fafdbb8b21f ("KVM: x86: switch KVMCLOCK base to monotonic raw clock")
+Cc: stable@vger.kernel.org
+Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/x86.c |   38 +++++++++++++++++++++++---------------
+ 1 file changed, 23 insertions(+), 15 deletions(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -1625,6 +1625,18 @@ static void update_pvclock_gtod(struct t
+       write_seqcount_end(&vdata->seq);
+ }
++
++static s64 get_kvmclock_base_ns(void)
++{
++      /* Count up from boot time, but with the frequency of the raw clock.  */
++      return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot));
++}
++#else
++static s64 get_kvmclock_base_ns(void)
++{
++      /* Master clock not used, so we can just use CLOCK_BOOTTIME.  */
++      return ktime_get_boottime_ns();
++}
+ #endif
+ void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
+@@ -1638,7 +1650,7 @@ static void kvm_write_wall_clock(struct
+       int version;
+       int r;
+       struct pvclock_wall_clock wc;
+-      struct timespec64 boot;
++      u64 wall_nsec;
+       if (!wall_clock)
+               return;
+@@ -1658,17 +1670,12 @@ static void kvm_write_wall_clock(struct
+       /*
+        * The guest calculates current wall clock time by adding
+        * system time (updated by kvm_guest_time_update below) to the
+-       * wall clock specified here.  guest system time equals host
+-       * system time for us, thus we must fill in host boot time here.
++       * wall clock specified here.  We do the reverse here.
+        */
+-      getboottime64(&boot);
++      wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
+-      if (kvm->arch.kvmclock_offset) {
+-              struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
+-              boot = timespec64_sub(boot, ts);
+-      }
+-      wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
+-      wc.nsec = boot.tv_nsec;
++      wc.nsec = do_div(wall_nsec, 1000000000);
++      wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
+       wc.version = version;
+       kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
+@@ -1916,7 +1923,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu
+       raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
+       offset = kvm_compute_tsc_offset(vcpu, data);
+-      ns = ktime_get_boottime_ns();
++      ns = get_kvmclock_base_ns();
+       elapsed = ns - kvm->arch.last_tsc_nsec;
+       if (vcpu->arch.virtual_tsc_khz) {
+@@ -2254,7 +2261,7 @@ u64 get_kvmclock_ns(struct kvm *kvm)
+       spin_lock(&ka->pvclock_gtod_sync_lock);
+       if (!ka->use_master_clock) {
+               spin_unlock(&ka->pvclock_gtod_sync_lock);
+-              return ktime_get_boottime_ns() + ka->kvmclock_offset;
++              return get_kvmclock_base_ns() + ka->kvmclock_offset;
+       }
+       hv_clock.tsc_timestamp = ka->master_cycle_now;
+@@ -2270,7 +2277,7 @@ u64 get_kvmclock_ns(struct kvm *kvm)
+                                  &hv_clock.tsc_to_system_mul);
+               ret = __pvclock_read_cycles(&hv_clock, rdtsc());
+       } else
+-              ret = ktime_get_boottime_ns() + ka->kvmclock_offset;
++              ret = get_kvmclock_base_ns() + ka->kvmclock_offset;
+       put_cpu();
+@@ -2369,7 +2376,7 @@ static int kvm_guest_time_update(struct
+       }
+       if (!use_master_clock) {
+               host_tsc = rdtsc();
+-              kernel_ns = ktime_get_boottime_ns();
++              kernel_ns = get_kvmclock_base_ns();
+       }
+       tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
+@@ -2409,6 +2416,7 @@ static int kvm_guest_time_update(struct
+       vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
+       vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
+       vcpu->last_guest_tsc = tsc_timestamp;
++      WARN_ON(vcpu->hv_clock.system_time < 0);
+       /* If the host uses TSC clocksource, then it is stable */
+       pvclock_flags = 0;
+@@ -9580,7 +9588,7 @@ int kvm_arch_init_vm(struct kvm *kvm, un
+       mutex_init(&kvm->arch.apic_map_lock);
+       spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
+-      kvm->arch.kvmclock_offset = -ktime_get_boottime_ns();
++      kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
+       pvclock_update_vm_gtod_copy(kvm);
+       kvm->arch.guest_can_read_msr_platform_info = true;
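
The wallclock half of the change above reduces to: take the current real time, subtract the current kvmclock value, and split the result into a 32-bit seconds field plus a nanoseconds remainder. The short userspace sketch below mirrors only that split; the two sampled values are hypothetical stand-ins for ktime_get_real_ns() and get_kvmclock_ns(), and the kernel itself performs the division with do_div().

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t real_ns     = 1581250000123456789ULL;  /* stand-in: ktime_get_real_ns()  */
        uint64_t kvmclock_ns =          42123456789ULL; /* stand-in: get_kvmclock_ns(kvm) */

        uint64_t wall_nsec = real_ns - kvmclock_ns;

        uint32_t nsec = (uint32_t)(wall_nsec % 1000000000ULL); /* do_div() remainder        */
        uint32_t sec  = (uint32_t)(wall_nsec / 1000000000ULL); /* truncates to u32, in 2106 */

        printf("wc.sec = %u, wc.nsec = %u\n", sec, nsec);
        return 0;
}
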
diff --git a/queue-5.5/mm-page_alloc.c-fix-uninitialized-memmaps-on-a-partially-populated-last-section.patch b/queue-5.5/mm-page_alloc.c-fix-uninitialized-memmaps-on-a-partially-populated-last-section.patch
new file mode 100644 (file)
index 0000000..185453a
--- /dev/null
@@ -0,0 +1,134 @@
+From e822969cab48b786b64246aad1a3ba2a774f5d23 Mon Sep 17 00:00:00 2001
+From: David Hildenbrand <david@redhat.com>
+Date: Mon, 3 Feb 2020 17:33:48 -0800
+Subject: mm/page_alloc.c: fix uninitialized memmaps on a partially populated last section
+
+From: David Hildenbrand <david@redhat.com>
+
+commit e822969cab48b786b64246aad1a3ba2a774f5d23 upstream.
+
+Patch series "mm: fix max_pfn not falling on section boundary", v2.
+
+Playing with different memory sizes for an x86-64 guest, I discovered that
+some memmaps (highest section if max_mem does not fall on the section
+boundary) are marked as being valid and online, but contain garbage.  We
+have to properly initialize these memmaps.
+
+Looking at /proc/kpageflags and friends, I found some more issues,
+partially related to this.
+
+This patch (of 3):
+
+If max_pfn is not aligned to a section boundary, we can easily run into
+BUGs.  This can e.g., be triggered on x86-64 under QEMU by specifying a
+memory size that is not a multiple of 128MB (e.g., 4097MB, but also
+4160MB).  I was told that on real HW, we can easily have this scenario
+(esp., one of the main reasons sub-section hotadd of devmem was added).
+
+The issue is, that we have a valid memmap (pfn_valid()) for the whole
+section, and the whole section will be marked "online".
+pfn_to_online_page() will succeed, but the memmap contains garbage.
+
+E.g., doing a "./page-types -r -a 0x144001" when QEMU was started with "-m
+4160M" - (see tools/vm/page-types.c):
+
+[  200.476376] BUG: unable to handle page fault for address: fffffffffffffffe
+[  200.477500] #PF: supervisor read access in kernel mode
+[  200.478334] #PF: error_code(0x0000) - not-present page
+[  200.479076] PGD 59614067 P4D 59614067 PUD 59616067 PMD 0
+[  200.479557] Oops: 0000 [#4] SMP NOPTI
+[  200.479875] CPU: 0 PID: 603 Comm: page-types Tainted: G      D W         5.5.0-rc1-next-20191209 #93
+[  200.480646] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu4
+[  200.481648] RIP: 0010:stable_page_flags+0x4d/0x410
+[  200.482061] Code: f3 ff 41 89 c0 48 b8 00 00 00 00 01 00 00 00 45 84 c0 0f 85 cd 02 00 00 48 8b 53 08 48 8b 2b 48f
+[  200.483644] RSP: 0018:ffffb139401cbe60 EFLAGS: 00010202
+[  200.484091] RAX: fffffffffffffffe RBX: fffffbeec5100040 RCX: 0000000000000000
+[  200.484697] RDX: 0000000000000001 RSI: ffffffff9535c7cd RDI: 0000000000000246
+[  200.485313] RBP: ffffffffffffffff R08: 0000000000000000 R09: 0000000000000000
+[  200.485917] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000144001
+[  200.486523] R13: 00007ffd6ba55f48 R14: 00007ffd6ba55f40 R15: ffffb139401cbf08
+[  200.487130] FS:  00007f68df717580(0000) GS:ffff9ec77fa00000(0000) knlGS:0000000000000000
+[  200.487804] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[  200.488295] CR2: fffffffffffffffe CR3: 0000000135d48000 CR4: 00000000000006f0
+[  200.488897] Call Trace:
+[  200.489115]  kpageflags_read+0xe9/0x140
+[  200.489447]  proc_reg_read+0x3c/0x60
+[  200.489755]  vfs_read+0xc2/0x170
+[  200.490037]  ksys_pread64+0x65/0xa0
+[  200.490352]  do_syscall_64+0x5c/0xa0
+[  200.490665]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
+
+But it can be triggered much easier via "cat /proc/kpageflags > /dev/null"
+after cold/hot plugging a DIMM to such a system:
+
+[root@localhost ~]# cat /proc/kpageflags > /dev/null
+[  111.517275] BUG: unable to handle page fault for address: fffffffffffffffe
+[  111.517907] #PF: supervisor read access in kernel mode
+[  111.518333] #PF: error_code(0x0000) - not-present page
+[  111.518771] PGD a240e067 P4D a240e067 PUD a2410067 PMD 0
+
+This patch fixes that by at least zero-ing out that memmap (so e.g.,
+page_to_pfn() will not crash).  Commit 907ec5fca3dc ("mm: zero remaining
+unavailable struct pages") tried to fix a similar issue, but forgot to
+consider this special case.
+
+After this patch, there are still problems to solve.  E.g., not all of
+these pages falling into a memory hole will actually get initialized later
+and set PageReserved - they are only zeroed out - but at least the
+immediate crashes are gone.  A follow-up patch will take care of this.
+
+Link: http://lkml.kernel.org/r/20191211163201.17179-2-david@redhat.com
+Fixes: f7f99100d8d9 ("mm: stop zeroing memory during allocation in vmemmap")
+Signed-off-by: David Hildenbrand <david@redhat.com>
+Tested-by: Daniel Jordan <daniel.m.jordan@oracle.com>
+Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Cc: Pavel Tatashin <pasha.tatashin@oracle.com>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Steven Sistare <steven.sistare@oracle.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Daniel Jordan <daniel.m.jordan@oracle.com>
+Cc: Bob Picco <bob.picco@oracle.com>
+Cc: Oscar Salvador <osalvador@suse.de>
+Cc: Alexey Dobriyan <adobriyan@gmail.com>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Stephen Rothwell <sfr@canb.auug.org.au>
+Cc: <stable@vger.kernel.org>   [4.15+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/page_alloc.c |   14 ++++++++++++--
+ 1 file changed, 12 insertions(+), 2 deletions(-)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -6921,7 +6921,8 @@ static u64 zero_pfn_range(unsigned long
+  * This function also addresses a similar issue where struct pages are left
+  * uninitialized because the physical address range is not covered by
+  * memblock.memory or memblock.reserved. That could happen when memblock
+- * layout is manually configured via memmap=.
++ * layout is manually configured via memmap=, or when the highest physical
++ * address (max_pfn) does not end on a section boundary.
+  */
+ void __init zero_resv_unavail(void)
+ {
+@@ -6939,7 +6940,16 @@ void __init zero_resv_unavail(void)
+                       pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start));
+               next = end;
+       }
+-      pgcnt += zero_pfn_range(PFN_DOWN(next), max_pfn);
++
++      /*
++       * Early sections always have a fully populated memmap for the whole
++       * section - see pfn_valid(). If the last section has holes at the
++       * end and that section is marked "online", the memmap will be
++       * considered initialized. Make sure that memmap has a well defined
++       * state.
++       */
++      pgcnt += zero_pfn_range(PFN_DOWN(next),
++                              round_up(max_pfn, PAGES_PER_SECTION));
+       /*
+        * Struct pages that do not have backing memory. This could be because
diff --git a/queue-5.5/ocfs2-fix-oops-when-writing-cloned-file.patch b/queue-5.5/ocfs2-fix-oops-when-writing-cloned-file.patch
new file mode 100644 (file)
index 0000000..fde63c4
--- /dev/null
@@ -0,0 +1,139 @@
+From 2d797e9ff95ecbcf0a83d657928ed20579444857 Mon Sep 17 00:00:00 2001
+From: Gang He <GHe@suse.com>
+Date: Mon, 3 Feb 2020 17:33:45 -0800
+Subject: ocfs2: fix oops when writing cloned file
+
+From: Gang He <GHe@suse.com>
+
+commit 2d797e9ff95ecbcf0a83d657928ed20579444857 upstream.
+
+Writing a cloned file triggers a kernel oops and the user-space command
+process is also killed by the system.  The bug can be reproduced stably
+via:
+
+1) create a file under ocfs2 file system directory.
+
+  journalctl -b > aa.txt
+
+2) create a cloned file for this file.
+
+  reflink aa.txt bb.txt
+
+3) write the cloned file with dd command.
+
+  dd if=/dev/zero of=bb.txt bs=512 count=1 conv=notrunc
+
+The dd command is killed by the kernel, then you can see the oops message
+via dmesg command.
+
+[  463.875404] BUG: kernel NULL pointer dereference, address: 0000000000000028
+[  463.875413] #PF: supervisor read access in kernel mode
+[  463.875416] #PF: error_code(0x0000) - not-present page
+[  463.875418] PGD 0 P4D 0
+[  463.875425] Oops: 0000 [#1] SMP PTI
+[  463.875431] CPU: 1 PID: 2291 Comm: dd Tainted: G           OE     5.3.16-2-default
+[  463.875433] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
+[  463.875500] RIP: 0010:ocfs2_refcount_cow+0xa4/0x5d0 [ocfs2]
+[  463.875505] Code: 06 89 6c 24 38 89 eb f6 44 24 3c 02 74 be 49 8b 47 28
+[  463.875508] RSP: 0018:ffffa2cb409dfce8 EFLAGS: 00010202
+[  463.875512] RAX: ffff8b1ebdca8000 RBX: 0000000000000001 RCX: ffff8b1eb73a9df0
+[  463.875515] RDX: 0000000000056a01 RSI: 0000000000000000 RDI: 0000000000000000
+[  463.875517] RBP: 0000000000000001 R08: ffff8b1eb73a9de0 R09: 0000000000000000
+[  463.875520] R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000000
+[  463.875522] R13: ffff8b1eb922f048 R14: 0000000000000000 R15: ffff8b1eb922f048
+[  463.875526] FS:  00007f8f44d15540(0000) GS:ffff8b1ebeb00000(0000) knlGS:0000000000000000
+[  463.875529] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[  463.875532] CR2: 0000000000000028 CR3: 000000003c17a000 CR4: 00000000000006e0
+[  463.875546] Call Trace:
+[  463.875596]  ? ocfs2_inode_lock_full_nested+0x18b/0x960 [ocfs2]
+[  463.875648]  ocfs2_file_write_iter+0xaf8/0xc70 [ocfs2]
+[  463.875672]  new_sync_write+0x12d/0x1d0
+[  463.875688]  vfs_write+0xad/0x1a0
+[  463.875697]  ksys_write+0xa1/0xe0
+[  463.875710]  do_syscall_64+0x60/0x1f0
+[  463.875743]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
+[  463.875758] RIP: 0033:0x7f8f4482ed44
+[  463.875762] Code: 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 80 00 00 00
+[  463.875765] RSP: 002b:00007fff300a79d8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
+[  463.875769] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f8f4482ed44
+[  463.875771] RDX: 0000000000000200 RSI: 000055f771b5c000 RDI: 0000000000000001
+[  463.875774] RBP: 0000000000000200 R08: 00007f8f44af9c78 R09: 0000000000000003
+[  463.875776] R10: 000000000000089f R11: 0000000000000246 R12: 000055f771b5c000
+[  463.875779] R13: 0000000000000200 R14: 0000000000000000 R15: 000055f771b5c000
+
+This regression problem was introduced by commit e74540b28556 ("ocfs2:
+protect extent tree in ocfs2_prepare_inode_for_write()").
+
+Link: http://lkml.kernel.org/r/20200121050153.13290-1-ghe@suse.com
+Fixes: e74540b28556 ("ocfs2: protect extent tree in ocfs2_prepare_inode_for_write()").
+Signed-off-by: Gang He <ghe@suse.com>
+Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
+Cc: Mark Fasheh <mark@fasheh.com>
+Cc: Joel Becker <jlbec@evilplan.org>
+Cc: Junxiao Bi <junxiao.bi@oracle.com>
+Cc: Changwei Ge <gechangwei@live.cn>
+Cc: Jun Piao <piaojun@huawei.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ocfs2/file.c |   14 ++++++--------
+ 1 file changed, 6 insertions(+), 8 deletions(-)
+
+--- a/fs/ocfs2/file.c
++++ b/fs/ocfs2/file.c
+@@ -2101,17 +2101,15 @@ static int ocfs2_is_io_unaligned(struct
+ static int ocfs2_inode_lock_for_extent_tree(struct inode *inode,
+                                           struct buffer_head **di_bh,
+                                           int meta_level,
+-                                          int overwrite_io,
+                                           int write_sem,
+                                           int wait)
+ {
+       int ret = 0;
+       if (wait)
+-              ret = ocfs2_inode_lock(inode, NULL, meta_level);
++              ret = ocfs2_inode_lock(inode, di_bh, meta_level);
+       else
+-              ret = ocfs2_try_inode_lock(inode,
+-                      overwrite_io ? NULL : di_bh, meta_level);
++              ret = ocfs2_try_inode_lock(inode, di_bh, meta_level);
+       if (ret < 0)
+               goto out;
+@@ -2136,6 +2134,7 @@ static int ocfs2_inode_lock_for_extent_t
+ out_unlock:
+       brelse(*di_bh);
++      *di_bh = NULL;
+       ocfs2_inode_unlock(inode, meta_level);
+ out:
+       return ret;
+@@ -2177,7 +2176,6 @@ static int ocfs2_prepare_inode_for_write
+               ret = ocfs2_inode_lock_for_extent_tree(inode,
+                                                      &di_bh,
+                                                      meta_level,
+-                                                     overwrite_io,
+                                                      write_sem,
+                                                      wait);
+               if (ret < 0) {
+@@ -2233,13 +2231,13 @@ static int ocfs2_prepare_inode_for_write
+                                                          &di_bh,
+                                                          meta_level,
+                                                          write_sem);
++                      meta_level = 1;
++                      write_sem = 1;
+                       ret = ocfs2_inode_lock_for_extent_tree(inode,
+                                                              &di_bh,
+                                                              meta_level,
+-                                                             overwrite_io,
+-                                                             1,
++                                                             write_sem,
+                                                              wait);
+-                      write_sem = 1;
+                       if (ret < 0) {
+                               if (ret != -EAGAIN)
+                                       mlog_errno(ret);
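
A detail worth noting in the ocfs2 fix above: the unlock path now does brelse(*di_bh) and then sets *di_bh = NULL, so the caller's retry loop can never touch a stale buffer head. The generic sketch below shows that release-then-clear pattern in plain C; the struct and helper names are illustrative, not the ocfs2 API.

#include <stdio.h>
#include <stdlib.h>

struct buffer_head { int data; };

/* Stand-in for brelse(): drops the reference. */
static void release_buffer(struct buffer_head *bh)
{
        free(bh);
}

/*
 * Cleanup path: drop the buffer AND clear the caller's pointer, mirroring
 * the "brelse(*di_bh); *di_bh = NULL;" hunk above.
 */
static void unlock_cleanup(struct buffer_head **bhp)
{
        release_buffer(*bhp);
        *bhp = NULL;    /* without this, a retry would see a dangling pointer */
}

int main(void)
{
        struct buffer_head *di_bh = malloc(sizeof(*di_bh));

        if (!di_bh)
                return 1;

        unlock_cleanup(&di_bh);
        if (di_bh == NULL)
                printf("cleanup left no dangling buffer for the retry path\n");
        return 0;
}
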
diff --git a/queue-5.5/series b/queue-5.5/series
index f7041a62f0c1b3d92e71a0e4aa778f3f4d4ff368..ea60440d1349356a0bdde563e2e3fe4b7520896b 100644 (file)
@@ -241,3 +241,22 @@ kvm-ppc-book3s-hv-uninit-vcpu-if-vcore-creation-fails.patch
 kvm-ppc-book3s-pr-free-shared-page-if-mmu-initialization-fails.patch
 kvm-svm-pku-not-currently-supported.patch
 kvm-x86-mmu-apply-max-pa-check-for-mmio-sptes-to-32-bit-kvm.patch
+x86-kvm-be-careful-not-to-clear-kvm_vcpu_flush_tlb-bit.patch
+kvm-x86-use-cpuid-to-locate-host-page-table-reserved-bits.patch
+x86-kvm-introduce-kvm_-un-map_gfn.patch
+x86-kvm-make-sure-kvm_vcpu_flush_tlb-flag-is-not-missed.patch
+x86-kvm-cache-gfn-to-pfn-translation.patch
+x86-kvm-clean-up-host-s-steal-time-structure.patch
+kvm-x86-use-gpa_t-for-cr2-gpa-to-fix-tdp-support-on-32-bit-kvm.patch
+kvm-vmx-add-non-canonical-check-on-writes-to-rtit-address-msrs.patch
+kvm-x86-don-t-let-userspace-set-host-reserved-cr4-bits.patch
+kvm-x86-free-wbinvd_dirty_mask-if-vcpu-creation-fails.patch
+kvm-x86-fix-overlap-between-spte_mmio_mask-and-generation.patch
+kvm-x86-handle-tif_need_fpu_load-in-kvm_-load-put-_guest_fpu.patch
+kvm-x86-ensure-guest-s-fpu-state-is-loaded-when-accessing-for-emulation.patch
+kvm-x86-revert-kvm-x86-fix-fpu-state-crash-in-kvm-guest.patch
+kvm-s390-do-not-clobber-registers-during-guest-reset-store-status.patch
+kvm-x86-reorganize-pvclock_gtod_data-members.patch
+kvm-x86-use-raw-clock-values-consistently.patch
+ocfs2-fix-oops-when-writing-cloned-file.patch
+mm-page_alloc.c-fix-uninitialized-memmaps-on-a-partially-populated-last-section.patch
diff --git a/queue-5.5/x86-kvm-be-careful-not-to-clear-kvm_vcpu_flush_tlb-bit.patch b/queue-5.5/x86-kvm-be-careful-not-to-clear-kvm_vcpu_flush_tlb-bit.patch
new file mode 100644 (file)
index 0000000..f5fe643
--- /dev/null
@@ -0,0 +1,39 @@
+From 8c6de56a42e0c657955e12b882a81ef07d1d073e Mon Sep 17 00:00:00 2001
+From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Date: Wed, 30 Oct 2019 19:01:31 +0000
+Subject: x86/kvm: Be careful not to clear KVM_VCPU_FLUSH_TLB bit
+
+From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+
+commit 8c6de56a42e0c657955e12b882a81ef07d1d073e upstream.
+
+kvm_steal_time_set_preempted() may accidentally clear KVM_VCPU_FLUSH_TLB
+bit if it is called more than once while VCPU is preempted.
+
+This is part of CVE-2019-3016.
+
+(This bug was also independently discovered by Jim Mattson
+<jmattson@google.com>)
+
+Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Reviewed-by: Joao Martins <joao.m.martins@oracle.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/x86.c |    3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -3514,6 +3514,9 @@ static void kvm_steal_time_set_preempted
+       if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+               return;
++      if (vcpu->arch.st.steal.preempted)
++              return;
++
+       vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED;
+       kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
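
The guard added above matters because the byte it protects is shared with the guest and holds more than one flag: writing it again with a plain assignment would wipe a KVM_VCPU_FLUSH_TLB request the guest set in the meantime. The sketch below is a simplified model of that hazard (the real check is on KVM's host-local copy, and the flag values are as defined in the x86 kvm_para ABI; everything else is illustrative).

#include <stdint.h>
#include <stdio.h>

#define KVM_VCPU_PREEMPTED (1 << 0)
#define KVM_VCPU_FLUSH_TLB (1 << 1)

static void set_preempted_unconditionally(uint8_t *preempted)
{
        *preempted = KVM_VCPU_PREEMPTED;        /* clobbers any other flag */
}

static void set_preempted_guarded(uint8_t *preempted)
{
        if (*preempted)                         /* already marked: leave the byte alone */
                return;
        *preempted = KVM_VCPU_PREEMPTED;
}

int main(void)
{
        /* Guest requested a TLB flush while the vCPU was already marked preempted. */
        uint8_t shared = KVM_VCPU_PREEMPTED | KVM_VCPU_FLUSH_TLB;

        uint8_t a = shared, b = shared;
        set_preempted_unconditionally(&a);      /* the FLUSH_TLB request is lost  */
        set_preempted_guarded(&b);              /* the FLUSH_TLB request survives */

        printf("unguarded: %#x, guarded: %#x\n", (unsigned)a, (unsigned)b);
        return 0;
}
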
diff --git a/queue-5.5/x86-kvm-cache-gfn-to-pfn-translation.patch b/queue-5.5/x86-kvm-cache-gfn-to-pfn-translation.patch
new file mode 100644 (file)
index 0000000..ebffaa4
--- /dev/null
@@ -0,0 +1,285 @@
+From 917248144db5d7320655dbb41d3af0b8a0f3d589 Mon Sep 17 00:00:00 2001
+From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Date: Thu, 5 Dec 2019 01:30:51 +0000
+Subject: x86/kvm: Cache gfn to pfn translation
+
+From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+
+commit 917248144db5d7320655dbb41d3af0b8a0f3d589 upstream.
+
+__kvm_map_gfn()'s call to gfn_to_pfn_memslot() is
+* relatively expensive
+* in certain cases (such as when done from atomic context) cannot be called
+
+Stashing gfn-to-pfn mapping should help with both cases.
+
+This is part of CVE-2019-3016.
+
+Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Reviewed-by: Joao Martins <joao.m.martins@oracle.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/kvm_host.h |    1 
+ arch/x86/kvm/x86.c              |   10 ++++
+ include/linux/kvm_host.h        |    7 ++
+ include/linux/kvm_types.h       |    9 +++
+ virt/kvm/kvm_main.c             |   98 ++++++++++++++++++++++++++++++++--------
+ 5 files changed, 103 insertions(+), 22 deletions(-)
+
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -689,6 +689,7 @@ struct kvm_vcpu_arch {
+               u64 last_steal;
+               struct gfn_to_hva_cache stime;
+               struct kvm_steal_time steal;
++              struct gfn_to_pfn_cache cache;
+       } st;
+       u64 tsc_offset;
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -9111,6 +9111,9 @@ static void fx_init(struct kvm_vcpu *vcp
+ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
+ {
+       void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask;
++      struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
++
++      kvm_release_pfn(cache->pfn, cache->dirty, cache);
+       kvmclock_reset(vcpu);
+@@ -9784,11 +9787,18 @@ out_free:
+ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
+ {
++      struct kvm_vcpu *vcpu;
++      int i;
++
+       /*
+        * memslots->generation has been incremented.
+        * mmio generation may have reached its maximum value.
+        */
+       kvm_mmu_invalidate_mmio_sptes(kvm, gen);
++
++      /* Force re-initialization of steal_time cache */
++      kvm_for_each_vcpu(i, vcpu, kvm)
++              kvm_vcpu_kick(vcpu);
+ }
+ int kvm_arch_prepare_memory_region(struct kvm *kvm,
+--- a/include/linux/kvm_host.h
++++ b/include/linux/kvm_host.h
+@@ -723,6 +723,7 @@ void kvm_set_pfn_dirty(kvm_pfn_t pfn);
+ void kvm_set_pfn_accessed(kvm_pfn_t pfn);
+ void kvm_get_pfn(kvm_pfn_t pfn);
++void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache);
+ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
+                       int len);
+ int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
+@@ -775,10 +776,12 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_
+ kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn);
+ kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+ int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map);
+-int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map);
++int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
++              struct gfn_to_pfn_cache *cache, bool atomic);
+ struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn);
+ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty);
+-int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty);
++int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map,
++                struct gfn_to_pfn_cache *cache, bool dirty, bool atomic);
+ unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn);
+ unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable);
+ int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset,
+--- a/include/linux/kvm_types.h
++++ b/include/linux/kvm_types.h
+@@ -18,7 +18,7 @@ struct kvm_memslots;
+ enum kvm_mr_change;
+-#include <asm/types.h>
++#include <linux/types.h>
+ /*
+  * Address types:
+@@ -51,4 +51,11 @@ struct gfn_to_hva_cache {
+       struct kvm_memory_slot *memslot;
+ };
++struct gfn_to_pfn_cache {
++      u64 generation;
++      gfn_t gfn;
++      kvm_pfn_t pfn;
++      bool dirty;
++};
++
+ #endif /* __KVM_TYPES_H__ */
+--- a/virt/kvm/kvm_main.c
++++ b/virt/kvm/kvm_main.c
+@@ -1821,27 +1821,72 @@ struct page *gfn_to_page(struct kvm *kvm
+ }
+ EXPORT_SYMBOL_GPL(gfn_to_page);
++void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache)
++{
++      if (pfn == 0)
++              return;
++
++      if (cache)
++              cache->pfn = cache->gfn = 0;
++
++      if (dirty)
++              kvm_release_pfn_dirty(pfn);
++      else
++              kvm_release_pfn_clean(pfn);
++}
++
++static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn,
++                               struct gfn_to_pfn_cache *cache, u64 gen)
++{
++      kvm_release_pfn(cache->pfn, cache->dirty, cache);
++
++      cache->pfn = gfn_to_pfn_memslot(slot, gfn);
++      cache->gfn = gfn;
++      cache->dirty = false;
++      cache->generation = gen;
++}
++
+ static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn,
+-                       struct kvm_host_map *map)
++                       struct kvm_host_map *map,
++                       struct gfn_to_pfn_cache *cache,
++                       bool atomic)
+ {
+       kvm_pfn_t pfn;
+       void *hva = NULL;
+       struct page *page = KVM_UNMAPPED_PAGE;
+       struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn);
++      u64 gen = slots->generation;
+       if (!map)
+               return -EINVAL;
+-      pfn = gfn_to_pfn_memslot(slot, gfn);
++      if (cache) {
++              if (!cache->pfn || cache->gfn != gfn ||
++                      cache->generation != gen) {
++                      if (atomic)
++                              return -EAGAIN;
++                      kvm_cache_gfn_to_pfn(slot, gfn, cache, gen);
++              }
++              pfn = cache->pfn;
++      } else {
++              if (atomic)
++                      return -EAGAIN;
++              pfn = gfn_to_pfn_memslot(slot, gfn);
++      }
+       if (is_error_noslot_pfn(pfn))
+               return -EINVAL;
+       if (pfn_valid(pfn)) {
+               page = pfn_to_page(pfn);
+-              hva = kmap(page);
++              if (atomic)
++                      hva = kmap_atomic(page);
++              else
++                      hva = kmap(page);
+ #ifdef CONFIG_HAS_IOMEM
+-      } else {
++      } else if (!atomic) {
+               hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
++      } else {
++              return -EINVAL;
+ #endif
+       }
+@@ -1856,20 +1901,25 @@ static int __kvm_map_gfn(struct kvm_mems
+       return 0;
+ }
+-int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
++int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
++              struct gfn_to_pfn_cache *cache, bool atomic)
+ {
+-      return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map);
++      return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map,
++                      cache, atomic);
+ }
+ EXPORT_SYMBOL_GPL(kvm_map_gfn);
+ int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
+ {
+-      return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map);
++      return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map,
++              NULL, false);
+ }
+ EXPORT_SYMBOL_GPL(kvm_vcpu_map);
+ static void __kvm_unmap_gfn(struct kvm_memory_slot *memslot,
+-                      struct kvm_host_map *map, bool dirty)
++                      struct kvm_host_map *map,
++                      struct gfn_to_pfn_cache *cache,
++                      bool dirty, bool atomic)
+ {
+       if (!map)
+               return;
+@@ -1877,34 +1927,44 @@ static void __kvm_unmap_gfn(struct kvm_m
+       if (!map->hva)
+               return;
+-      if (map->page != KVM_UNMAPPED_PAGE)
+-              kunmap(map->page);
++      if (map->page != KVM_UNMAPPED_PAGE) {
++              if (atomic)
++                      kunmap_atomic(map->hva);
++              else
++                      kunmap(map->page);
++      }
+ #ifdef CONFIG_HAS_IOMEM
+-      else
++      else if (!atomic)
+               memunmap(map->hva);
++      else
++              WARN_ONCE(1, "Unexpected unmapping in atomic context");
+ #endif
+-      if (dirty) {
++      if (dirty)
+               mark_page_dirty_in_slot(memslot, map->gfn);
+-              kvm_release_pfn_dirty(map->pfn);
+-      } else {
+-              kvm_release_pfn_clean(map->pfn);
+-      }
++
++      if (cache)
++              cache->dirty |= dirty;
++      else
++              kvm_release_pfn(map->pfn, dirty, NULL);
+       map->hva = NULL;
+       map->page = NULL;
+ }
+-int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
++int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map,
++                struct gfn_to_pfn_cache *cache, bool dirty, bool atomic)
+ {
+-      __kvm_unmap_gfn(gfn_to_memslot(vcpu->kvm, map->gfn), map, dirty);
++      __kvm_unmap_gfn(gfn_to_memslot(vcpu->kvm, map->gfn), map,
++                      cache, dirty, atomic);
+       return 0;
+ }
+ EXPORT_SYMBOL_GPL(kvm_unmap_gfn);
+ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
+ {
+-      __kvm_unmap_gfn(kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), map, dirty);
++      __kvm_unmap_gfn(kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), map, NULL,
++                      dirty, false);
+ }
+ EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
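
The core of the caching patch above is a lookup that trusts a stored translation only while both the gfn and the memslot generation still match, and that refuses to refill the cache from atomic context. The self-contained sketch below models just that decision; the types, the slow_translate() helper and the generation value are stand-ins rather than the kernel API.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct pfn_cache {
        uint64_t generation;
        uint64_t gfn;
        uint64_t pfn;
};

static uint64_t slow_translate(uint64_t gfn)    /* stand-in for gfn_to_pfn_memslot() */
{
        return gfn + 0x1000;                    /* arbitrary fake mapping */
}

static int lookup(struct pfn_cache *c, uint64_t gfn, uint64_t gen,
                  bool atomic, uint64_t *pfn)
{
        if (!c->pfn || c->gfn != gfn || c->generation != gen) {
                if (atomic)
                        return -1;              /* refilling may sleep: let the caller retry */
                c->pfn = slow_translate(gfn);
                c->gfn = gfn;
                c->generation = gen;
        }
        *pfn = c->pfn;
        return 0;
}

int main(void)
{
        struct pfn_cache cache = {0};
        uint64_t pfn = 0;

        lookup(&cache, 0x42, 7, false, &pfn);   /* miss: fills the cache            */
        lookup(&cache, 0x42, 7, true,  &pfn);   /* hit: safe even in atomic context */
        printf("pfn = %#llx\n", (unsigned long long)pfn);
        return 0;
}
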
diff --git a/queue-5.5/x86-kvm-clean-up-host-s-steal-time-structure.patch b/queue-5.5/x86-kvm-clean-up-host-s-steal-time-structure.patch
new file mode 100644 (file)
index 0000000..2817d66
--- /dev/null
@@ -0,0 +1,81 @@
+From a6bd811f1209fe1c64c9f6fd578101d6436c6b6e Mon Sep 17 00:00:00 2001
+From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Date: Fri, 6 Dec 2019 15:36:12 +0000
+Subject: x86/KVM: Clean up host's steal time structure
+
+From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+
+commit a6bd811f1209fe1c64c9f6fd578101d6436c6b6e upstream.
+
+Now that we are mapping kvm_steal_time from the guest directly we
+don't need keep a copy of it in kvm_vcpu_arch.st. The same is true
+for the stime field.
+
+This is part of CVE-2019-3016.
+
+Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Reviewed-by: Joao Martins <joao.m.martins@oracle.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/kvm_host.h |    3 +--
+ arch/x86/kvm/x86.c              |   11 +++--------
+ 2 files changed, 4 insertions(+), 10 deletions(-)
+
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -685,10 +685,9 @@ struct kvm_vcpu_arch {
+       bool pvclock_set_guest_stopped_request;
+       struct {
++              u8 preempted;
+               u64 msr_val;
+               u64 last_steal;
+-              struct gfn_to_hva_cache stime;
+-              struct kvm_steal_time steal;
+               struct gfn_to_pfn_cache cache;
+       } st;
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -2611,7 +2611,7 @@ static void record_steal_time(struct kvm
+       if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
+               kvm_vcpu_flush_tlb(vcpu, false);
+-      vcpu->arch.st.steal.preempted = 0;
++      vcpu->arch.st.preempted = 0;
+       if (st->version & 1)
+               st->version += 1;  /* first time write, random junk */
+@@ -2795,11 +2795,6 @@ int kvm_set_msr_common(struct kvm_vcpu *
+               if (data & KVM_STEAL_RESERVED_MASK)
+                       return 1;
+-              if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
+-                                              data & KVM_STEAL_VALID_BITS,
+-                                              sizeof(struct kvm_steal_time)))
+-                      return 1;
+-
+               vcpu->arch.st.msr_val = data;
+               if (!(data & KVM_MSR_ENABLED))
+@@ -3519,7 +3514,7 @@ static void kvm_steal_time_set_preempted
+       if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+               return;
+-      if (vcpu->arch.st.steal.preempted)
++      if (vcpu->arch.st.preempted)
+               return;
+       if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
+@@ -3529,7 +3524,7 @@ static void kvm_steal_time_set_preempted
+       st = map.hva +
+               offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+-      st->preempted = vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED;
++      st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+       kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
+ }
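
The mapping calls visible in the hunks above split the steal-time MSR value into a guest frame number (which page to map) and an offset inside that page (where struct kvm_steal_time sits). A short sketch of that split follows; PAGE_SHIFT, the low-bit mask and the sample MSR value are illustrative stand-ins for the kernel definitions.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1ULL << PAGE_SHIFT)

int main(void)
{
        /* Hypothetical MSR value: guest physical address 0x12345040 with the enable bit set. */
        uint64_t msr_val = 0x12345040ULL | 1ULL;

        uint64_t gfn    = msr_val >> PAGE_SHIFT;                  /* which guest page to map       */
        uint64_t offset = (msr_val & ~0x3fULL) & (PAGE_SIZE - 1); /* where the struct sits in page */

        printf("gfn = %#llx, offset_in_page = %#llx\n",
               (unsigned long long)gfn, (unsigned long long)offset);
        return 0;
}
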
diff --git a/queue-5.5/x86-kvm-introduce-kvm_-un-map_gfn.patch b/queue-5.5/x86-kvm-introduce-kvm_-un-map_gfn.patch
new file mode 100644 (file)
index 0000000..0e6c864
--- /dev/null
@@ -0,0 +1,109 @@
+From 1eff70a9abd46f175defafd29bc17ad456f398a7 Mon Sep 17 00:00:00 2001
+From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Date: Tue, 12 Nov 2019 16:35:06 +0000
+Subject: x86/kvm: Introduce kvm_(un)map_gfn()
+
+From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+
+commit 1eff70a9abd46f175defafd29bc17ad456f398a7 upstream.
+
+kvm_vcpu_(un)map operates on gfns from any current address space.
+In certain cases we want to make sure we are not mapping SMRAM
+and for that we can use kvm_(un)map_gfn() that we are introducing
+in this patch.
+
+This is part of CVE-2019-3016.
+
+Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Reviewed-by: Joao Martins <joao.m.martins@oracle.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/kvm_host.h |    2 ++
+ virt/kvm/kvm_main.c      |   29 ++++++++++++++++++++++++-----
+ 2 files changed, 26 insertions(+), 5 deletions(-)
+
+--- a/include/linux/kvm_host.h
++++ b/include/linux/kvm_host.h
+@@ -775,8 +775,10 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_
+ kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn);
+ kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+ int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map);
++int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map);
+ struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn);
+ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty);
++int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty);
+ unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn);
+ unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable);
+ int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset,
+--- a/virt/kvm/kvm_main.c
++++ b/virt/kvm/kvm_main.c
+@@ -1821,12 +1821,13 @@ struct page *gfn_to_page(struct kvm *kvm
+ }
+ EXPORT_SYMBOL_GPL(gfn_to_page);
+-static int __kvm_map_gfn(struct kvm_memory_slot *slot, gfn_t gfn,
++static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn,
+                        struct kvm_host_map *map)
+ {
+       kvm_pfn_t pfn;
+       void *hva = NULL;
+       struct page *page = KVM_UNMAPPED_PAGE;
++      struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn);
+       if (!map)
+               return -EINVAL;
+@@ -1855,14 +1856,20 @@ static int __kvm_map_gfn(struct kvm_memo
+       return 0;
+ }
++int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
++{
++      return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map);
++}
++EXPORT_SYMBOL_GPL(kvm_map_gfn);
++
+ int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
+ {
+-      return __kvm_map_gfn(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, map);
++      return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map);
+ }
+ EXPORT_SYMBOL_GPL(kvm_vcpu_map);
+-void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map,
+-                  bool dirty)
++static void __kvm_unmap_gfn(struct kvm_memory_slot *memslot,
++                      struct kvm_host_map *map, bool dirty)
+ {
+       if (!map)
+               return;
+@@ -1878,7 +1885,7 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcp
+ #endif
+       if (dirty) {
+-              kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
++              mark_page_dirty_in_slot(memslot, map->gfn);
+               kvm_release_pfn_dirty(map->pfn);
+       } else {
+               kvm_release_pfn_clean(map->pfn);
+@@ -1887,6 +1894,18 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcp
+       map->hva = NULL;
+       map->page = NULL;
+ }
++
++int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
++{
++      __kvm_unmap_gfn(gfn_to_memslot(vcpu->kvm, map->gfn), map, dirty);
++      return 0;
++}
++EXPORT_SYMBOL_GPL(kvm_unmap_gfn);
++
++void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
++{
++      __kvm_unmap_gfn(kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), map, dirty);
++}
+ EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
+ struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
diff --git a/queue-5.5/x86-kvm-make-sure-kvm_vcpu_flush_tlb-flag-is-not-missed.patch b/queue-5.5/x86-kvm-make-sure-kvm_vcpu_flush_tlb-flag-is-not-missed.patch
new file mode 100644 (file)
index 0000000..a3687de
--- /dev/null
@@ -0,0 +1,129 @@
+From b043138246a41064527cf019a3d51d9f015e9796 Mon Sep 17 00:00:00 2001
+From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Date: Thu, 5 Dec 2019 03:45:32 +0000
+Subject: x86/KVM: Make sure KVM_VCPU_FLUSH_TLB flag is not missed
+
+From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+
+commit b043138246a41064527cf019a3d51d9f015e9796 upstream.
+
+There is a potential race in record_steal_time() between setting
+host-local vcpu->arch.st.steal.preempted to zero (i.e. clearing
+KVM_VCPU_PREEMPTED) and propagating this value to the guest with
+kvm_write_guest_cached(). Between those two events the guest may
+still see KVM_VCPU_PREEMPTED in its copy of kvm_steal_time, set
+KVM_VCPU_FLUSH_TLB and assume that hypervisor will do the right
+thing. Which it won't.
+
+Instead of copying, we should map kvm_steal_time and that will
+guarantee atomicity of accesses to @preempted.
+
+This is part of CVE-2019-3016.
+
+Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Reviewed-by: Joao Martins <joao.m.martins@oracle.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/x86.c |   51 ++++++++++++++++++++++++++++++---------------------
+ 1 file changed, 30 insertions(+), 21 deletions(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -2588,45 +2588,47 @@ static void kvm_vcpu_flush_tlb(struct kv
+ static void record_steal_time(struct kvm_vcpu *vcpu)
+ {
++      struct kvm_host_map map;
++      struct kvm_steal_time *st;
++
+       if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+               return;
+-      if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+-              &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
++      /* -EAGAIN is returned in atomic context so we can just return. */
++      if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
++                      &map, &vcpu->arch.st.cache, false))
+               return;
++      st = map.hva +
++              offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
++
+       /*
+        * Doing a TLB flush here, on the guest's behalf, can avoid
+        * expensive IPIs.
+        */
+       trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
+-              vcpu->arch.st.steal.preempted & KVM_VCPU_FLUSH_TLB);
+-      if (xchg(&vcpu->arch.st.steal.preempted, 0) & KVM_VCPU_FLUSH_TLB)
++              st->preempted & KVM_VCPU_FLUSH_TLB);
++      if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
+               kvm_vcpu_flush_tlb(vcpu, false);
+-      if (vcpu->arch.st.steal.version & 1)
+-              vcpu->arch.st.steal.version += 1;  /* first time write, random junk */
++      vcpu->arch.st.steal.preempted = 0;
+-      vcpu->arch.st.steal.version += 1;
++      if (st->version & 1)
++              st->version += 1;  /* first time write, random junk */
+-      kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+-              &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
++      st->version += 1;
+       smp_wmb();
+-      vcpu->arch.st.steal.steal += current->sched_info.run_delay -
++      st->steal += current->sched_info.run_delay -
+               vcpu->arch.st.last_steal;
+       vcpu->arch.st.last_steal = current->sched_info.run_delay;
+-      kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+-              &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
+-
+       smp_wmb();
+-      vcpu->arch.st.steal.version += 1;
++      st->version += 1;
+-      kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+-              &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
++      kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
+ }
+ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+@@ -3511,18 +3513,25 @@ void kvm_arch_vcpu_load(struct kvm_vcpu
+ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
+ {
++      struct kvm_host_map map;
++      struct kvm_steal_time *st;
++
+       if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+               return;
+       if (vcpu->arch.st.steal.preempted)
+               return;
+-      vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED;
++      if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
++                      &vcpu->arch.st.cache, true))
++              return;
++
++      st = map.hva +
++              offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
++
++      st->preempted = vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED;
+-      kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
+-                      &vcpu->arch.st.steal.preempted,
+-                      offsetof(struct kvm_steal_time, preempted),
+-                      sizeof(vcpu->arch.st.steal.preempted));
++      kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
+ }
+ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)