Merge tag 'kvm-x86-fixes-6.7-rcN' of https://github.com/kvm-x86/linux into kvm-master

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a8bebf5c0a0577b81a5e6813317f2668c77e62fc..1a3aaa7dafae44fb2cba1eefc0e4b41f5532d69f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -145,21 +145,21 @@ EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
 EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);
 
 static bool __read_mostly ignore_msrs = 0;
-module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
+module_param(ignore_msrs, bool, 0644);
 
 bool __read_mostly report_ignored_msrs = true;
-module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
+module_param(report_ignored_msrs, bool, 0644);
 EXPORT_SYMBOL_GPL(report_ignored_msrs);
 
 unsigned int min_timer_period_us = 200;
-module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
+module_param(min_timer_period_us, uint, 0644);
 
 static bool __read_mostly kvmclock_periodic_sync = true;
-module_param(kvmclock_periodic_sync, bool, S_IRUGO);
+module_param(kvmclock_periodic_sync, bool, 0444);
 
 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
 static u32 __read_mostly tsc_tolerance_ppm = 250;
-module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
+module_param(tsc_tolerance_ppm, uint, 0644);
 
 /*
  * lapic timer advance (tscdeadline mode only) in nanoseconds.  '-1' enables
@@ -168,13 +168,13 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
  * tuning, i.e. allows privileged userspace to set an exact advancement time.
  */
 static int __read_mostly lapic_timer_advance_ns = -1;
-module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
+module_param(lapic_timer_advance_ns, int, 0644);
 
 static bool __read_mostly vector_hashing = true;
-module_param(vector_hashing, bool, S_IRUGO);
+module_param(vector_hashing, bool, 0444);
 
 bool __read_mostly enable_vmware_backdoor = false;
-module_param(enable_vmware_backdoor, bool, S_IRUGO);
+module_param(enable_vmware_backdoor, bool, 0444);
 EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
 
 /*
@@ -186,7 +186,7 @@ static int __read_mostly force_emulation_prefix;
 module_param(force_emulation_prefix, int, 0644);
 
 int __read_mostly pi_inject_timer = -1;
-module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
+module_param(pi_inject_timer, bint, 0644);
 
 /* Enable/disable PMU virtualization */
 bool __read_mostly enable_pmu = true;
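
The only change in the hunks above is mechanical: the symbolic permission macros passed to module_param() become the equivalent raw octal modes. A minimal userspace sketch to confirm the equivalence; S_IRUGO is a kernel-only convenience macro, so it is re-derived here from the <sys/stat.h> bits:

#include <assert.h>
#include <stdio.h>
#include <sys/stat.h>

#ifndef S_IRUGO
#define S_IRUGO (S_IRUSR | S_IRGRP | S_IROTH)  /* kernel convenience macro */
#endif

int main(void)
{
        /* Read-only parameters: S_IRUGO == 0444. */
        assert(S_IRUGO == 0444);

        /* World-readable, owner-writable parameters: S_IRUGO | S_IWUSR == 0644. */
        assert((S_IRUGO | S_IWUSR) == 0644);

        printf("S_IRUGO=%04o, S_IRUGO|S_IWUSR=%04o\n", S_IRUGO, S_IRUGO | S_IWUSR);
        return 0;
}
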
@@ -962,7 +962,7 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
                kvm_mmu_reset_context(vcpu);
 
        if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
-           kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
+           kvm_mmu_honors_guest_mtrrs(vcpu->kvm) &&
            !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
                kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
 }
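
The `(cr0 ^ old_cr0) & X86_CR0_CD` test fires only when the CD (cache-disable) bit actually toggles, which is why the zap is limited to that case. A standalone sketch of the same XOR bit-change idiom; X86_CR0_CD's architectural position (bit 30) is hard-coded here rather than taken from kernel headers:

#include <stdbool.h>
#include <stdio.h>

#define X86_CR0_CD (1ul << 30)  /* Cache Disable */

/* True only if the CD bit differs between the old and new CR0 values. */
static bool cr0_cd_toggled(unsigned long old_cr0, unsigned long cr0)
{
        return (cr0 ^ old_cr0) & X86_CR0_CD;
}

int main(void)
{
        printf("%d\n", cr0_cd_toggled(0x80000033ul, 0x80000033ul | X86_CR0_CD)); /* 1: CD set    */
        printf("%d\n", cr0_cd_toggled(0x80000033ul, 0x80000033ul));              /* 0: unchanged */
        return 0;
}
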
@@ -2331,14 +2331,9 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs)
        if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
                return;
 
-       /*
-        * The guest calculates current wall clock time by adding
-        * system time (updated by kvm_guest_time_update below) to the
-        * wall clock specified here.  We do the reverse here.
-        */
-       wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
+       wall_nsec = kvm_get_wall_clock_epoch(kvm);
 
-       wc.nsec = do_div(wall_nsec, 1000000000);
+       wc.nsec = do_div(wall_nsec, NSEC_PER_SEC);
        wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
        wc.version = version;
 
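
do_div() is a kernel helper that divides a u64 in place and hands back the remainder, so the rewritten lines split the epoch returned by kvm_get_wall_clock_epoch() into the wc.sec/wc.nsec pair. A userspace sketch of that split using plain division, with an invented nanosecond value:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
        /* Hypothetical epoch offset in nanoseconds since 1970-01-01. */
        uint64_t wall_nsec = 1700000000123456789ULL;

        /* do_div(wall_nsec, NSEC_PER_SEC) returns the remainder and leaves the
         * quotient in wall_nsec; plain '%' and '/' do the same job here. */
        uint32_t nsec = (uint32_t)(wall_nsec % NSEC_PER_SEC);
        uint32_t sec  = (uint32_t)(wall_nsec / NSEC_PER_SEC); /* wraps in 2106 */

        printf("sec=%" PRIu32 " nsec=%" PRIu32 "\n", sec, nsec);
        return 0;
}
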
@@ -2714,8 +2709,9 @@ static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
        kvm_track_tsc_matching(vcpu);
 }
 
-static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
+static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value)
 {
+       u64 data = user_value ? *user_value : 0;
        struct kvm *kvm = vcpu->kvm;
        u64 offset, ns, elapsed;
        unsigned long flags;
@@ -2730,25 +2726,37 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
        if (vcpu->arch.virtual_tsc_khz) {
                if (data == 0) {
                        /*
-                        * detection of vcpu initialization -- need to sync
-                        * with other vCPUs. This particularly helps to keep
-                        * kvm_clock stable after CPU hotplug
+                        * Force synchronization when creating a vCPU, or when
+                        * userspace explicitly writes a zero value.
                         */
                        synchronizing = true;
-               } else {
+               } else if (kvm->arch.user_set_tsc) {
                        u64 tsc_exp = kvm->arch.last_tsc_write +
                                                nsec_to_cycles(vcpu, elapsed);
                        u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
                        /*
-                        * Special case: TSC write with a small delta (1 second)
-                        * of virtual cycle time against real time is
-                        * interpreted as an attempt to synchronize the CPU.
+                        * Here lies UAPI baggage: when a user-initiated TSC write has
+                        * a small delta (1 second) of virtual cycle time against the
+                        * previously set vCPU, we assume that they were intended to be
+                        * in sync and the delta was only due to the racy nature of the
+                        * legacy API.
+                        *
+                        * This trick falls down when restoring a guest which genuinely
+                        * has been running for less time than the 1 second of imprecision
+                        * which we allow for in the legacy API. In this case, the first
+                        * value written by userspace (on any vCPU) should not be subject
+                        * to this 'correction' to make it sync up with values that only
+                        * come from the kernel's default vCPU creation. Make the 1-second
+                        * slop hack only trigger if the user_set_tsc flag is already set.
                         */
                        synchronizing = data < tsc_exp + tsc_hz &&
                                        data + tsc_hz > tsc_exp;
                }
        }
 
+       if (user_value)
+               kvm->arch.user_set_tsc = true;
+
        /*
         * For a reliable TSC, we can match TSC offsets, and for an unstable
         * TSC, we add elapsed time in this computation.  We could let the
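
The "1-second slop" heuristic now applies only to user-initiated writes: a value within one second's worth of cycles of where the previous write would have advanced to by now is treated as an attempt to stay in sync. A standalone sketch of just that window test, assuming a made-up 2 GHz virtual TSC:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Matches the check in kvm_synchronize_tsc():
 *   synchronizing = data < tsc_exp + tsc_hz && data + tsc_hz > tsc_exp;
 * i.e. |data - tsc_exp| < tsc_hz, written without signed arithmetic.
 */
static bool within_one_second(uint64_t data, uint64_t tsc_exp, uint64_t tsc_hz)
{
        return data < tsc_exp + tsc_hz && data + tsc_hz > tsc_exp;
}

int main(void)
{
        uint64_t tsc_hz  = 2000000000ULL;    /* assumed 2 GHz guest TSC           */
        uint64_t tsc_exp = 1000000000000ULL; /* TSC the previous write implies now */

        printf("%d\n", within_one_second(tsc_exp + tsc_hz / 2, tsc_exp, tsc_hz)); /* 1: in sync   */
        printf("%d\n", within_one_second(tsc_exp + 3 * tsc_hz, tsc_exp, tsc_hz)); /* 0: too far off */
        return 0;
}
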
@@ -3232,15 +3240,93 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
        if (vcpu->pv_time.active)
                kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0);
+#ifdef CONFIG_KVM_XEN
        if (vcpu->xen.vcpu_info_cache.active)
                kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
                                        offsetof(struct compat_vcpu_info, time));
        if (vcpu->xen.vcpu_time_info_cache.active)
                kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0);
+#endif
        kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
        return 0;
 }
 
+/*
+ * The pvclock_wall_clock ABI tells the guest the wall clock time at
+ * which it started (i.e. its epoch, when its kvmclock was zero).
+ *
+ * In fact those clocks are subtly different; wall clock frequency is
+ * adjusted by NTP and has leap seconds, while the kvmclock is a
+ * simple function of the TSC without any such adjustment.
+ *
+ * Perhaps the ABI should have exposed CLOCK_TAI and a ratio between
+ * that and kvmclock, but even that would be subject to change over
+ * time.
+ *
+ * Attempt to calculate the epoch at a given moment using the *same*
+ * TSC reading via kvm_get_walltime_and_clockread() to obtain both
+ * wallclock and kvmclock times, and subtracting one from the other.
+ *
+ * Fall back to using their values at slightly different moments by
+ * calling ktime_get_real_ns() and get_kvmclock_ns() separately.
+ */
+uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+       struct pvclock_vcpu_time_info hv_clock;
+       struct kvm_arch *ka = &kvm->arch;
+       unsigned long seq, local_tsc_khz;
+       struct timespec64 ts;
+       uint64_t host_tsc;
+
+       do {
+               seq = read_seqcount_begin(&ka->pvclock_sc);
+
+               local_tsc_khz = 0;
+               if (!ka->use_master_clock)
+                       break;
+
+               /*
+                * The TSC read and the call to get_cpu_tsc_khz() must happen
+                * on the same CPU.
+                */
+               get_cpu();
+
+               local_tsc_khz = get_cpu_tsc_khz();
+
+               if (local_tsc_khz &&
+                   !kvm_get_walltime_and_clockread(&ts, &host_tsc))
+                       local_tsc_khz = 0; /* Fall back to old method */
+
+               put_cpu();
+
+               /*
+                * These values must be snapshotted within the seqcount loop.
+                * After that, it's just mathematics which can happen on any
+                * CPU at any time.
+                */
+               hv_clock.tsc_timestamp = ka->master_cycle_now;
+               hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
+
+       } while (read_seqcount_retry(&ka->pvclock_sc, seq));
+
+       /*
+        * If the conditions were right, and obtaining the wallclock+TSC was
+        * successful, calculate the KVM clock at the corresponding time and
+        * subtract one from the other to get the guest's epoch in nanoseconds
+        * since 1970-01-01.
+        */
+       if (local_tsc_khz) {
+               kvm_get_time_scale(NSEC_PER_SEC, local_tsc_khz * NSEC_PER_USEC,
+                                  &hv_clock.tsc_shift,
+                                  &hv_clock.tsc_to_system_mul);
+               return ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec -
+                       __pvclock_read_cycles(&hv_clock, host_tsc);
+       }
+#endif
+       return ktime_get_real_ns() - get_kvmclock_ns(kvm);
+}
+
 /*
  * kvmclock updates which are isolated to a given vcpu, such as
  * vcpu->cpu migration, should not allow system_timestamp from
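
kvm_get_wall_clock_epoch() pairs a wall-clock reading with a TSC reading taken at the same instant, converts the TSC to kvmclock nanoseconds with the pvclock mul/shift scaling, and subtracts one from the other. A simplified userspace sketch of that arithmetic; pvclock_scale() below imitates the delta scaling done by __pvclock_read_cycles(), and every number is invented:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Scale a TSC delta to nanoseconds the way pvclock does: shift, then multiply
 * by a 32-bit fixed-point fraction and keep the bits above 2^32.  (The kernel
 * uses a dedicated 64x32 multiply helper; __int128 stands in for it here.) */
static uint64_t pvclock_scale(uint64_t delta, uint32_t mul_frac, int shift)
{
        if (shift < 0)
                delta >>= -shift;
        else
                delta <<= shift;
        return (uint64_t)(((unsigned __int128)delta * mul_frac) >> 32);
}

int main(void)
{
        /* Invented snapshot: host_tsc and wall_ns come from the same instant. */
        uint64_t host_tsc      = 5000000000ULL;          /* TSC now                  */
        uint64_t tsc_timestamp = 4000000000ULL;          /* TSC at last master sync  */
        uint64_t system_time   = 500000000ULL;           /* kvmclock ns at that sync */
        uint64_t wall_ns       = 1700000000000000000ULL; /* CLOCK_REALTIME now, ns   */

        /* Pretend 1 GHz TSC, i.e. 1 cycle == 1 ns: 0.5 in fixed point, doubled
         * by shift = 1, so the scale factor works out to exactly 1.0. */
        uint32_t mul_frac = 0x80000000u;
        int shift = 1;

        uint64_t kvmclock_ns = system_time +
                               pvclock_scale(host_tsc - tsc_timestamp, mul_frac, shift);
        uint64_t epoch = wall_ns - kvmclock_ns;

        printf("kvmclock=%" PRIu64 " ns, epoch=%" PRIu64 " ns since 1970\n",
               kvmclock_ns, epoch);
        return 0;
}
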
@@ -3290,9 +3376,6 @@ static void kvmclock_sync_fn(struct work_struct *work)
                                           kvmclock_sync_work);
        struct kvm *kvm = container_of(ka, struct kvm, arch);
 
-       if (!kvmclock_periodic_sync)
-               return;
-
        schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
        schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
                                        KVMCLOCK_SYNC_PERIOD);
@@ -3641,6 +3724,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_AMD64_PATCH_LOADER:
        case MSR_AMD64_BU_CFG2:
        case MSR_AMD64_DC_CFG:
+       case MSR_AMD64_TW_CFG:
        case MSR_F15H_EX_CFG:
                break;
 
@@ -3670,17 +3754,36 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                vcpu->arch.perf_capabilities = data;
                kvm_pmu_refresh(vcpu);
                break;
-       case MSR_IA32_PRED_CMD:
-               if (!msr_info->host_initiated && !guest_has_pred_cmd_msr(vcpu))
-                       return 1;
+       case MSR_IA32_PRED_CMD: {
+               u64 reserved_bits = ~(PRED_CMD_IBPB | PRED_CMD_SBPB);
+
+               if (!msr_info->host_initiated) {
+                       if (!guest_has_pred_cmd_msr(vcpu))
+                               return 1;
+
+                       if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
+                           !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB))
+                               reserved_bits |= PRED_CMD_IBPB;
+
+                       if (!guest_cpuid_has(vcpu, X86_FEATURE_SBPB))
+                               reserved_bits |= PRED_CMD_SBPB;
+               }
+
+               if (!boot_cpu_has(X86_FEATURE_IBPB))
+                       reserved_bits |= PRED_CMD_IBPB;
 
-               if (!boot_cpu_has(X86_FEATURE_IBPB) || (data & ~PRED_CMD_IBPB))
+               if (!boot_cpu_has(X86_FEATURE_SBPB))
+                       reserved_bits |= PRED_CMD_SBPB;
+
+               if (data & reserved_bits)
                        return 1;
+
                if (!data)
                        break;
 
-               wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
+               wrmsrl(MSR_IA32_PRED_CMD, data);
                break;
+       }
        case MSR_IA32_FLUSH_CMD:
                if (!msr_info->host_initiated &&
                    !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D))
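
The rewritten MSR_IA32_PRED_CMD handler builds a mask of reserved bits from the guest's CPUID and the host's capabilities and rejects any write that touches them. A condensed sketch of that policy for the non-host-initiated path, with the architectural bit positions (IBPB = bit 0, SBPB = bit 7) assumed:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PRED_CMD_IBPB (1ull << 0)
#define PRED_CMD_SBPB (1ull << 7)

struct caps {
        bool guest_has_pred_cmd;  /* guest CPUID exposes the MSR at all   */
        bool guest_has_ibpb;      /* SPEC_CTRL or AMD_IBPB in guest CPUID */
        bool guest_has_sbpb;      /* SBPB in guest CPUID                  */
        bool host_has_ibpb;       /* boot CPU supports IBPB               */
        bool host_has_sbpb;       /* boot CPU supports SBPB               */
};

/* Returns true if a guest-initiated write of 'data' should be allowed. */
static bool pred_cmd_write_ok(const struct caps *c, uint64_t data)
{
        uint64_t reserved_bits = ~(PRED_CMD_IBPB | PRED_CMD_SBPB);

        if (!c->guest_has_pred_cmd)
                return false;
        if (!c->guest_has_ibpb || !c->host_has_ibpb)
                reserved_bits |= PRED_CMD_IBPB;
        if (!c->guest_has_sbpb || !c->host_has_sbpb)
                reserved_bits |= PRED_CMD_SBPB;

        return !(data & reserved_bits);
}

int main(void)
{
        struct caps c = {
                .guest_has_pred_cmd = true,
                .guest_has_ibpb = true,
                .host_has_ibpb = true,  /* no SBPB anywhere */
        };

        printf("%d\n", pred_cmd_write_ok(&c, PRED_CMD_IBPB)); /* 1: IBPB allowed  */
        printf("%d\n", pred_cmd_write_ok(&c, PRED_CMD_SBPB)); /* 0: SBPB reserved */
        return 0;
}
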
@@ -3700,13 +3803,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                data &= ~(u64)0x100;    /* ignore ignne emulation enable */
                data &= ~(u64)0x8;      /* ignore TLB cache disable */
 
-               /* Handle McStatusWrEn */
-               if (data == BIT_ULL(18)) {
-                       vcpu->arch.msr_hwcr = data;
-               } else if (data != 0) {
+               /*
+                * Allow McStatusWrEn and TscFreqSel. (Linux guests from v3.2
+                * through at least v6.6 whine if TscFreqSel is clear,
+                * depending on F/M/S.)
+                */
+               if (data & ~(BIT_ULL(18) | BIT_ULL(24))) {
                        kvm_pr_unimpl_wrmsr(vcpu, msr, data);
                        return 1;
                }
+               vcpu->arch.msr_hwcr = data;
                break;
        case MSR_FAM10H_MMIO_CONF_BASE:
                if (data != 0) {
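
After masking off the always-ignored HWCR bits, the handler now accepts any combination of McStatusWrEn (bit 18) and TscFreqSel (bit 24) and rejects everything else. A tiny sketch of that accept/reject mask:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BIT_ULL(n) (1ULL << (n))

/* Accept only McStatusWrEn (bit 18) and TscFreqSel (bit 24), as in the hunk above. */
static bool hwcr_write_ok(uint64_t data)
{
        /* The real handler first clears bits 6, 8 and 3, which are silently ignored. */
        data &= ~(BIT_ULL(6) | BIT_ULL(8) | BIT_ULL(3));

        return !(data & ~(BIT_ULL(18) | BIT_ULL(24)));
}

int main(void)
{
        printf("%d\n", hwcr_write_ok(BIT_ULL(24)));               /* 1: TscFreqSel    */
        printf("%d\n", hwcr_write_ok(BIT_ULL(18) | BIT_ULL(24))); /* 1: both          */
        printf("%d\n", hwcr_write_ok(BIT_ULL(17)));               /* 0: anything else */
        return 0;
}
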
@@ -3777,7 +3883,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                break;
        case MSR_IA32_TSC:
                if (msr_info->host_initiated) {
-                       kvm_synchronize_tsc(vcpu, data);
+                       kvm_synchronize_tsc(vcpu, &data);
                } else {
                        u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
                        adjust_tsc_offset_guest(vcpu, adj);
@@ -4065,6 +4171,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_AMD64_BU_CFG2:
        case MSR_IA32_PERF_CTL:
        case MSR_AMD64_DC_CFG:
+       case MSR_AMD64_TW_CFG:
        case MSR_F15H_EX_CFG:
        /*
         * Intel Sandy Bridge CPUs must support the RAPL (running average power
@@ -5547,6 +5654,7 @@ static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
                tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
                ns = get_kvmclock_base_ns();
 
+               kvm->arch.user_set_tsc = true;
                __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched);
                raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 
@@ -6259,6 +6367,9 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
        struct kvm_vcpu *vcpu;
        unsigned long i;
 
+       if (!kvm_x86_ops.cpu_dirty_log_size)
+               return;
+
        kvm_for_each_vcpu(i, vcpu, kvm)
                kvm_vcpu_kick(vcpu);
 }
@@ -7485,11 +7596,11 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
 }
 EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
 
-static int kvm_can_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,
-                               void *insn, int insn_len)
+static int kvm_check_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,
+                                 void *insn, int insn_len)
 {
-       return static_call(kvm_x86_can_emulate_instruction)(vcpu, emul_type,
-                                                           insn, insn_len);
+       return static_call(kvm_x86_check_emulate_instruction)(vcpu, emul_type,
+                                                             insn, insn_len);
 }
 
 int handle_ud(struct kvm_vcpu *vcpu)
@@ -7499,8 +7610,10 @@ int handle_ud(struct kvm_vcpu *vcpu)
        int emul_type = EMULTYPE_TRAP_UD;
        char sig[5]; /* ud2; .ascii "kvm" */
        struct x86_exception e;
+       int r;
 
-       if (unlikely(!kvm_can_emulate_insn(vcpu, emul_type, NULL, 0)))
+       r = kvm_check_emulate_insn(vcpu, emul_type, NULL, 0);
+       if (r != X86EMUL_CONTINUE)
                return 1;
 
        if (fep_flags &&
@@ -8882,8 +8995,14 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
        struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        bool writeback = true;
 
-       if (unlikely(!kvm_can_emulate_insn(vcpu, emulation_type, insn, insn_len)))
-               return 1;
+       r = kvm_check_emulate_insn(vcpu, emulation_type, insn, insn_len);
+       if (r != X86EMUL_CONTINUE) {
+               if (r == X86EMUL_RETRY_INSTR || r == X86EMUL_PROPAGATE_FAULT)
+                       return 1;
+
+               WARN_ON_ONCE(r != X86EMUL_UNHANDLEABLE);
+               return handle_emulation_failure(vcpu, emulation_type);
+       }
 
        vcpu->arch.l1tf_flush_l1d = true;
 
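
x86_emulate_instruction() now acts on a tri-state result from the vendor hook instead of a bool: retry and propagate-fault mean "re-enter the guest", unhandleable takes the emulation-failure path, and only X86EMUL_CONTINUE proceeds to emulation. A minimal sketch of that dispatch; the enum mirrors the kernel's X86EMUL_* names but the values here are arbitrary:

#include <stdio.h>

enum emul_result {
        X86EMUL_CONTINUE,        /* vendor code says: go ahead and emulate        */
        X86EMUL_UNHANDLEABLE,    /* cannot be handled: report emulation failure   */
        X86EMUL_RETRY_INSTR,     /* re-enter the guest and retry the instruction  */
        X86EMUL_PROPAGATE_FAULT, /* a fault was injected; re-enter the guest      */
};

/* Mirrors the r != X86EMUL_CONTINUE handling added above: 1 re-enters the
 * guest, 0 takes the emulation-failure path, -1 proceeds with emulation. */
static int dispatch(enum emul_result r)
{
        if (r == X86EMUL_CONTINUE)
                return -1;
        if (r == X86EMUL_RETRY_INSTR || r == X86EMUL_PROPAGATE_FAULT)
                return 1;
        return 0; /* X86EMUL_UNHANDLEABLE */
}

int main(void)
{
        printf("%d %d %d\n", dispatch(X86EMUL_CONTINUE),
               dispatch(X86EMUL_RETRY_INSTR), dispatch(X86EMUL_UNHANDLEABLE));
        return 0;
}
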
@@ -10587,16 +10706,16 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                }
                if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
                        record_steal_time(vcpu);
+               if (kvm_check_request(KVM_REQ_PMU, vcpu))
+                       kvm_pmu_handle_event(vcpu);
+               if (kvm_check_request(KVM_REQ_PMI, vcpu))
+                       kvm_pmu_deliver_pmi(vcpu);
 #ifdef CONFIG_KVM_SMM
                if (kvm_check_request(KVM_REQ_SMI, vcpu))
                        process_smi(vcpu);
 #endif
                if (kvm_check_request(KVM_REQ_NMI, vcpu))
                        process_nmi(vcpu);
-               if (kvm_check_request(KVM_REQ_PMU, vcpu))
-                       kvm_pmu_handle_event(vcpu);
-               if (kvm_check_request(KVM_REQ_PMI, vcpu))
-                       kvm_pmu_deliver_pmi(vcpu);
                if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
                        BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
                        if (test_bit(vcpu->arch.pending_ioapic_eoi,
@@ -11532,7 +11651,6 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
 
        *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
        static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0);
-       vcpu->arch.cr0 = sregs->cr0;
 
        *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
        static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4);
@@ -11576,8 +11694,10 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
        if (ret)
                return ret;
 
-       if (mmu_reset_needed)
+       if (mmu_reset_needed) {
                kvm_mmu_reset_context(vcpu);
+               kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+       }
 
        max_bits = KVM_NR_INTERRUPTS;
        pending_vec = find_first_bit(
@@ -11618,8 +11738,10 @@ static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
                mmu_reset_needed = 1;
                vcpu->arch.pdptrs_from_userspace = true;
        }
-       if (mmu_reset_needed)
+       if (mmu_reset_needed) {
                kvm_mmu_reset_context(vcpu);
+               kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+       }
        return 0;
 }
 
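
kvm_make_request() only sets a bit in the vCPU's request word, and vcpu_enter_guest() consumes it with a test-and-clear before the next entry, so the TLB flush added after the MMU reset is deferred rather than immediate. A rough userspace sketch of that set / test-and-clear pattern; the bit number and the C11 atomics are stand-ins, not KVM's implementation:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define KVM_REQ_TLB_FLUSH_GUEST 1  /* placeholder bit number */

struct vcpu {
        atomic_ulong requests;
};

static void make_request(struct vcpu *v, int req)
{
        atomic_fetch_or(&v->requests, 1ul << req);
}

/* Returns true (and clears the bit) if the request was pending. */
static bool check_request(struct vcpu *v, int req)
{
        unsigned long bit = 1ul << req;

        return atomic_fetch_and(&v->requests, ~bit) & bit;
}

int main(void)
{
        struct vcpu v = { .requests = 0 };

        make_request(&v, KVM_REQ_TLB_FLUSH_GUEST);                   /* e.g. from __set_sregs() */
        printf("%d\n", check_request(&v, KVM_REQ_TLB_FLUSH_GUEST));  /* 1: consumed             */
        printf("%d\n", check_request(&v, KVM_REQ_TLB_FLUSH_GUEST));  /* 0: already clear        */
        return 0;
}
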
@@ -11970,7 +12092,7 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
        if (mutex_lock_killable(&vcpu->mutex))
                return;
        vcpu_load(vcpu);
-       kvm_synchronize_tsc(vcpu, 0);
+       kvm_synchronize_tsc(vcpu, NULL);
        vcpu_put(vcpu);
 
        /* poll control enabled by default */
@@ -12326,7 +12448,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
                goto out_uninit_mmu;
 
        INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
-       INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
        atomic_set(&kvm->arch.noncoherent_dma_count, 0);
 
        /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
@@ -13205,15 +13326,30 @@ bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm)
 }
 EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
 
+static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm)
+{
+       /*
+        * Non-coherent DMA assignment and de-assignment will affect
+        * whether KVM honors guest MTRRs and cause changes in memtypes
+        * in TDP.
+        * So, pass %true unconditionally to indicate non-coherent DMA was,
+        * or will be involved, and that zapping SPTEs might be necessary.
+        */
+       if (__kvm_mmu_honors_guest_mtrrs(true))
+               kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL));
+}
+
 void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
 {
-       atomic_inc(&kvm->arch.noncoherent_dma_count);
+       if (atomic_inc_return(&kvm->arch.noncoherent_dma_count) == 1)
+               kvm_noncoherent_dma_assignment_start_or_stop(kvm);
 }
 EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma);
 
 void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
 {
-       atomic_dec(&kvm->arch.noncoherent_dma_count);
+       if (!atomic_dec_return(&kvm->arch.noncoherent_dma_count))
+               kvm_noncoherent_dma_assignment_start_or_stop(kvm);
 }
 EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma);
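
The registration helpers now act only on the 0->1 and 1->0 transitions of noncoherent_dma_count, so the memtype zap runs once when non-coherent DMA assignment starts or stops rather than for every device. A small sketch of that transition-detection pattern; atomic_inc_return()/atomic_dec_return() are kernel helpers, so C11 fetch_add/fetch_sub plus an adjustment stand in for them:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int noncoherent_dma_count;

static void start_or_stop(void)
{
        puts("zap stale memtype mappings");
}

static void register_noncoherent_dma(void)
{
        /* atomic_inc_return() == 1  <=>  this was the 0 -> 1 transition. */
        if (atomic_fetch_add(&noncoherent_dma_count, 1) + 1 == 1)
                start_or_stop();
}

static void unregister_noncoherent_dma(void)
{
        /* !atomic_dec_return()  <=>  this was the 1 -> 0 transition. */
        if (atomic_fetch_sub(&noncoherent_dma_count, 1) - 1 == 0)
                start_or_stop();
}

int main(void)
{
        register_noncoherent_dma();    /* zaps: first device        */
        register_noncoherent_dma();    /* silent: second device     */
        unregister_noncoherent_dma();  /* silent: one still present */
        unregister_noncoherent_dma();  /* zaps: last one removed    */
        return 0;
}
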