Merge tag 'kvm-x86-lam-6.8' of https://github.com/kvm-x86/linux into HEAD
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index c57e181bba21b43f4f15e8d36e906d57e499efb5..d0590b417d3005cf28546ef6c380f3ee1e85d9e1 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -271,15 +271,11 @@ static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu,
 
 static inline bool kvm_available_flush_remote_tlbs_range(void)
 {
+#if IS_ENABLED(CONFIG_HYPERV)
        return kvm_x86_ops.flush_remote_tlbs_range;
-}
-
-int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
-{
-       if (!kvm_x86_ops.flush_remote_tlbs_range)
-               return -EOPNOTSUPP;
-
-       return static_call(kvm_x86_flush_remote_tlbs_range)(kvm, gfn, nr_pages);
+#else
+       return false;
+#endif
 }
 
 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index);
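Only the Hyper-V paths implement flush_remote_tlbs_range, so outside CONFIG_HYPERV the helper now compiles to a constant false and callers fall back to a full remote flush. A minimal caller sketch, using a hypothetical wrapper name (the real fallback lives in common KVM code, not in this file):

/* Hypothetical wrapper, for illustration only: prefer a ranged flush when
 * the backend provides one, otherwise flush everything. */
static void demo_flush_range_or_all(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
{
        if (kvm_available_flush_remote_tlbs_range() &&
            !kvm_arch_flush_remote_tlbs_range(kvm, gfn, nr_pages))
                return;

        kvm_flush_remote_tlbs(kvm);
}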
@@ -795,16 +791,26 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
        return &slot->arch.lpage_info[level - 2][idx];
 }
 
+/*
+ * The most significant bit in disallow_lpage tracks whether or not memory
+ * attributes are mixed, i.e. not identical for all gfns at the current level.
+ * The lower order bits are used to refcount other cases where a hugepage is
+ * disallowed, e.g. if KVM is shadowing a page table at the gfn.
+ */
+#define KVM_LPAGE_MIXED_FLAG   BIT(31)
+
 static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
                                            gfn_t gfn, int count)
 {
        struct kvm_lpage_info *linfo;
-       int i;
+       int old, i;
 
        for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
                linfo = lpage_info_slot(gfn, slot, i);
+
+               old = linfo->disallow_lpage;
                linfo->disallow_lpage += count;
-               WARN_ON_ONCE(linfo->disallow_lpage < 0);
+               WARN_ON_ONCE((old ^ linfo->disallow_lpage) & KVM_LPAGE_MIXED_FLAG);
        }
 }
 
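Because bit 31 of disallow_lpage is now a flag rather than part of the count, the refcount traffic in the loop above must never carry into it, which is exactly what the reworked WARN_ON_ONCE checks. A standalone, hypothetical demo of the layout (not kernel state):

/* disallow_lpage layout: bit 31 is the "attributes are mixed" flag,
 * bits 30:0 refcount the other reasons a hugepage is disallowed, and
 * the two never interfere. */
#define DEMO_MIXED_FLAG        BIT(31)

static void demo_disallow_lpage_layout(void)
{
        unsigned int disallow = 0;

        disallow += 1;                  /* e.g. a shadow page table at the gfn */
        disallow |= DEMO_MIXED_FLAG;    /* attributes become mixed             */
        disallow -= 1;                  /* the shadow page goes away...        */

        /* ...and the flag is unaffected by the refcount traffic. */
        WARN_ON_ONCE(disallow != DEMO_MIXED_FLAG);
}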
@@ -3056,7 +3062,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
  *
  * There are several ways to safely use this helper:
  *
- * - Check mmu_invalidate_retry_hva() after grabbing the mapping level, before
+ * - Check mmu_invalidate_retry_gfn() after grabbing the mapping level, before
  *   consuming it.  In this case, mmu_lock doesn't need to be held during the
  *   lookup, but it does need to be held while checking the MMU notifier.
  *
@@ -3137,9 +3143,9 @@ out:
        return level;
 }
 
-int kvm_mmu_max_mapping_level(struct kvm *kvm,
-                             const struct kvm_memory_slot *slot, gfn_t gfn,
-                             int max_level)
+static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
+                                      const struct kvm_memory_slot *slot,
+                                      gfn_t gfn, int max_level, bool is_private)
 {
        struct kvm_lpage_info *linfo;
        int host_level;
@@ -3151,6 +3157,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
                        break;
        }
 
+       if (is_private)
+               return max_level;
+
        if (max_level == PG_LEVEL_4K)
                return PG_LEVEL_4K;
 
@@ -3158,6 +3167,16 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
        return min(host_level, max_level);
 }
 
+int kvm_mmu_max_mapping_level(struct kvm *kvm,
+                             const struct kvm_memory_slot *slot, gfn_t gfn,
+                             int max_level)
+{
+       bool is_private = kvm_slot_can_be_private(slot) &&
+                         kvm_mem_is_private(kvm, gfn);
+
+       return __kvm_mmu_max_mapping_level(kvm, slot, gfn, max_level, is_private);
+}
+
 void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 {
        struct kvm_memory_slot *slot = fault->slot;
@@ -3178,8 +3197,9 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
         * Enforce the iTLB multihit workaround after capturing the requested
         * level, which will be used to do precise, accurate accounting.
         */
-       fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
-                                                    fault->gfn, fault->max_level);
+       fault->req_level = __kvm_mmu_max_mapping_level(vcpu->kvm, slot,
+                                                      fault->gfn, fault->max_level,
+                                                      fault->is_private);
        if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
                return;
 
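Two entry points now feed __kvm_mmu_max_mapping_level(): the fault path above passes fault->is_private directly, while the kvm_mmu_max_mapping_level() wrapper derives privateness from the slot and the memory attributes for callers that have no fault at hand (e.g. hugepage recovery). A hedged sketch of such a non-fault caller, using a hypothetical helper:

static bool demo_can_map_2m(struct kvm *kvm, const struct kvm_memory_slot *slot,
                            gfn_t gfn)
{
        /* Privateness is looked up from the attributes, not from a fault. */
        return kvm_mmu_max_mapping_level(kvm, slot, gfn,
                                         KVM_MAX_HUGEPAGE_LEVEL) >= PG_LEVEL_2M;
}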
@@ -3739,7 +3759,7 @@ static int mmu_first_shadow_root_alloc(struct kvm *kvm)
            kvm_page_track_write_tracking_enabled(kvm))
                goto out_success;
 
-       for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+       for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
                slots = __kvm_memslots(kvm, i);
                kvm_for_each_memslot(slot, bkt, slots) {
                        /*
@@ -3782,7 +3802,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
        hpa_t root;
 
        root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu);
-       root_gfn = root_pgd >> PAGE_SHIFT;
+       root_gfn = (root_pgd & __PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
 
        if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
                mmu->root.hpa = kvm_mmu_get_dummy_root();
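The guest pgd returned by kvm_mmu_get_guest_pgd() is a raw CR3-style value and can carry non-address control bits, e.g. PCID in bits 11:0 and, with LAM, CR3.LAM_U57/LAM_U48 in bits 61/62, so the base address is masked out before converting to a gfn. A small illustration with a made-up value:

/* Illustration only: bit 61 stands in for a LAM control bit. */
static gfn_t demo_root_gfn(void)
{
        u64 root_pgd = BIT_ULL(61) | 0x12345000ull | 0x001ull;

        /* A bare shift would fold bit 61 into the gfn; masking first
         * yields the intended 0x12345. */
        return (root_pgd & __PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
}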
@@ -4259,6 +4279,55 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
        kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL);
 }
 
+static inline u8 kvm_max_level_for_order(int order)
+{
+       BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G);
+
+       KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) &&
+                       order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) &&
+                       order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K));
+
+       if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G))
+               return PG_LEVEL_1G;
+
+       if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
+               return PG_LEVEL_2M;
+
+       return PG_LEVEL_4K;
+}
+
+static void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
+                                             struct kvm_page_fault *fault)
+{
+       kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT,
+                                     PAGE_SIZE, fault->write, fault->exec,
+                                     fault->is_private);
+}
+
+static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
+                                  struct kvm_page_fault *fault)
+{
+       int max_order, r;
+
+       if (!kvm_slot_can_be_private(fault->slot)) {
+               kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+               return -EFAULT;
+       }
+
+       r = kvm_gmem_get_pfn(vcpu->kvm, fault->slot, fault->gfn, &fault->pfn,
+                            &max_order);
+       if (r) {
+               kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+               return r;
+       }
+
+       fault->max_level = min(kvm_max_level_for_order(max_order),
+                              fault->max_level);
+       fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);
+
+       return RET_PF_CONTINUE;
+}
+
 static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 {
        struct kvm_memory_slot *slot = fault->slot;
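kvm_max_level_for_order() translates the allocation order reported by guest_memfd into a maximum mapping level. A worked example, assuming 4KiB base pages so that KVM_HPAGE_GFN_SHIFT() is 0/9/18 for 4K/2M/1G:

/* Illustration only; none of these warnings should ever fire. */
static void demo_order_to_level(void)
{
        WARN_ON_ONCE(kvm_max_level_for_order(0)  != PG_LEVEL_4K);  /* 4K folio */
        WARN_ON_ONCE(kvm_max_level_for_order(9)  != PG_LEVEL_2M);  /* 2M folio */
        WARN_ON_ONCE(kvm_max_level_for_order(18) != PG_LEVEL_1G);  /* 1G folio */
}

kvm_faultin_pfn_private() clamps fault->max_level with this value, so e.g. an order-0 guest_memfd allocation can never be mapped as a hugepage even if the gfn range would otherwise allow it.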
@@ -4291,6 +4360,14 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
                        return RET_PF_EMULATE;
        }
 
+       if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
+               kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+               return -EFAULT;
+       }
+
+       if (fault->is_private)
+               return kvm_faultin_pfn_private(vcpu, fault);
+
        async = false;
        fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async,
                                          fault->write, &fault->map_writable,
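When the fault's privateness disagrees with the gfn's current attributes, KVM does not convert anything itself: it fills in a KVM_EXIT_MEMORY_FAULT exit via kvm_mmu_prepare_memory_fault_exit() and returns -EFAULT, leaving the conversion policy to userspace. A hedged userspace-side sketch (assumes <linux/kvm.h> and <sys/ioctl.h>; error handling omitted):

void demo_handle_memory_fault(int vm_fd, struct kvm_run *run)
{
        struct kvm_memory_attributes attrs = {
                .address    = run->memory_fault.gpa,
                .size       = run->memory_fault.size,
                .attributes = (run->memory_fault.flags & KVM_MEMORY_EXIT_FLAG_PRIVATE) ?
                              KVM_MEMORY_ATTRIBUTE_PRIVATE : 0,
        };

        /* Convert the range to what the guest actually accessed it as... */
        ioctl(vm_fd, KVM_SET_MEMORY_ATTRIBUTES, &attrs);

        /* ...then re-enter the vCPU with KVM_RUN to retry the access. */
}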
@@ -4366,7 +4443,7 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
                return true;
 
        return fault->slot &&
-              mmu_invalidate_retry_hva(vcpu->kvm, fault->mmu_seq, fault->hva);
+              mmu_invalidate_retry_gfn(vcpu->kvm, fault->mmu_seq, fault->gfn);
 }
 
 static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
@@ -6228,7 +6305,7 @@ static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_e
        if (!kvm_memslots_have_rmaps(kvm))
                return flush;
 
-       for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+       for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
                slots = __kvm_memslots(kvm, i);
 
                kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) {
@@ -6260,7 +6337,9 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 
        write_lock(&kvm->mmu_lock);
 
-       kvm_mmu_invalidate_begin(kvm, 0, -1ul);
+       kvm_mmu_invalidate_begin(kvm);
+
+       kvm_mmu_invalidate_range_add(kvm, gfn_start, gfn_end);
 
        flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
 
@@ -6270,7 +6349,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
        if (flush)
                kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start);
 
-       kvm_mmu_invalidate_end(kvm, 0, -1ul);
+       kvm_mmu_invalidate_end(kvm);
 
        write_unlock(&kvm->mmu_lock);
 }
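kvm_mmu_invalidate_begin()/end() no longer take a range; ranges are registered separately with kvm_mmu_invalidate_range_add(), so a single begin/end window can cover several discontiguous ranges (as the MMU notifier and memory-attribute paths do, one range per affected memslot). A hedged sketch of the pattern with two ranges (hypothetical helper; the actual zapping is elided):

static void demo_invalidate_two_ranges(struct kvm *kvm, gfn_t start_a, gfn_t end_a,
                                       gfn_t start_b, gfn_t end_b)
{
        write_lock(&kvm->mmu_lock);

        kvm_mmu_invalidate_begin(kvm);
        kvm_mmu_invalidate_range_add(kvm, start_a, end_a);
        kvm_mmu_invalidate_range_add(kvm, start_b, end_b);

        /* ... zap SPTEs and flush as needed ... */

        kvm_mmu_invalidate_end(kvm);

        write_unlock(&kvm->mmu_lock);
}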
@@ -6723,7 +6802,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
         * modifier prior to checking for a wrap of the MMIO generation so
         * that a wrap in any address space is detected.
         */
-       gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
+       gen &= ~((u64)kvm_arch_nr_memslot_as_ids(kvm) - 1);
 
        /*
         * The very rare case: if the MMIO generation number has wrapped,
@@ -7176,3 +7255,163 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
        if (kvm->arch.nx_huge_page_recovery_thread)
                kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
 }
+
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
+                                       struct kvm_gfn_range *range)
+{
+       /*
+        * Zap SPTEs even if the slot can't be mapped PRIVATE.  KVM x86 only
+        * supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM
+        * can simply ignore such slots.  But if userspace is making memory
+        * PRIVATE, then KVM must prevent the guest from accessing the memory
+        * as shared.  And if userspace is making memory SHARED and this point
+        * is reached, then at least one page within the range was previously
+        * PRIVATE, i.e. the slot's possible hugepage ranges are changing.
+        * Zapping SPTEs in this case ensures KVM will reassess whether or not
+        * a hugepage can be used for affected ranges.
+        */
+       if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm)))
+               return false;
+
+       return kvm_unmap_gfn_range(kvm, range);
+}
+
+static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+                               int level)
+{
+       return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG;
+}
+
+static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+                                int level)
+{
+       lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG;
+}
+
+static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+                              int level)
+{
+       lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG;
+}
+
+static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot,
+                              gfn_t gfn, int level, unsigned long attrs)
+{
+       const unsigned long start = gfn;
+       const unsigned long end = start + KVM_PAGES_PER_HPAGE(level);
+
+       if (level == PG_LEVEL_2M)
+               return kvm_range_has_memory_attributes(kvm, start, end, attrs);
+
+       for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) {
+               if (hugepage_test_mixed(slot, gfn, level - 1) ||
+                   attrs != kvm_get_memory_attributes(kvm, gfn))
+                       return false;
+       }
+       return true;
+}
+
+bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
+                                        struct kvm_gfn_range *range)
+{
+       unsigned long attrs = range->arg.attributes;
+       struct kvm_memory_slot *slot = range->slot;
+       int level;
+
+       lockdep_assert_held_write(&kvm->mmu_lock);
+       lockdep_assert_held(&kvm->slots_lock);
+
+       /*
+        * Calculate which ranges can be mapped with hugepages even if the slot
+        * can't map memory PRIVATE.  KVM mustn't create a SHARED hugepage over
+        * a range that has PRIVATE GFNs, and conversely converting a range to
+        * SHARED may now allow hugepages.
+        */
+       if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm)))
+               return false;
+
+       /*
+        * The sequence matters here: upper levels consume the result of lower
+        * level's scanning.
+        */
+       for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
+               gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level);
+               gfn_t gfn = gfn_round_for_level(range->start, level);
+
+               /* Process the head page if it straddles the range. */
+               if (gfn != range->start || gfn + nr_pages > range->end) {
+                       /*
+                        * Skip mixed tracking if the aligned gfn isn't covered
+                        * by the memslot, KVM can't use a hugepage due to the
+                        * misaligned address regardless of memory attributes.
+                        */
+                       if (gfn >= slot->base_gfn) {
+                               if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
+                                       hugepage_clear_mixed(slot, gfn, level);
+                               else
+                                       hugepage_set_mixed(slot, gfn, level);
+                       }
+                       gfn += nr_pages;
+               }
+
+               /*
+                * Pages entirely covered by the range are guaranteed to have
+                * only the attributes which were just set.
+                */
+               for ( ; gfn + nr_pages <= range->end; gfn += nr_pages)
+                       hugepage_clear_mixed(slot, gfn, level);
+
+               /*
+                * Process the last tail page if it straddles the range and is
+                * contained by the memslot.  Like the head page, KVM can't
+                * create a hugepage if the slot size is misaligned.
+                */
+               if (gfn < range->end &&
+                   (gfn + nr_pages) <= (slot->base_gfn + slot->npages)) {
+                       if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
+                               hugepage_clear_mixed(slot, gfn, level);
+                       else
+                               hugepage_set_mixed(slot, gfn, level);
+               }
+       }
+       return false;
+}
+
+void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm,
+                                           struct kvm_memory_slot *slot)
+{
+       int level;
+
+       if (!kvm_arch_has_private_mem(kvm))
+               return;
+
+       for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
+               /*
+                * Don't bother tracking mixed attributes for pages that can't
+                * be huge due to alignment, i.e. process only pages that are
+                * entirely contained by the memslot.
+                */
+               gfn_t end = gfn_round_for_level(slot->base_gfn + slot->npages, level);
+               gfn_t start = gfn_round_for_level(slot->base_gfn, level);
+               gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level);
+               gfn_t gfn;
+
+               if (start < slot->base_gfn)
+                       start += nr_pages;
+
+               /*
+                * Unlike setting attributes, every potential hugepage needs to
+                * be manually checked as the attributes may already be mixed.
+                */
+               for (gfn = start; gfn < end; gfn += nr_pages) {
+                       unsigned long attrs = kvm_get_memory_attributes(kvm, gfn);
+
+                       if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
+                               hugepage_clear_mixed(slot, gfn, level);
+                       else
+                               hugepage_set_mixed(slot, gfn, level);
+               }
+       }
+}
+#endif
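An illustrative walkthrough of the head/body/tail handling in kvm_arch_post_set_memory_attributes(), with made-up numbers and assuming the memslot covers the whole aligned region:

/*
 * With 4KiB base pages, KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) == 0x200, so
 * setting attributes on gfns [0x300, 0x900) touches four 2M-level blocks:
 *
 *   head  [0x200, 0x400)  straddles the start -> re-scan via hugepage_has_attrs()
 *   body  [0x400, 0x600)  fully covered       -> hugepage_clear_mixed()
 *   body  [0x600, 0x800)  fully covered       -> hugepage_clear_mixed()
 *   tail  [0x800, 0xa00)  straddles the end   -> re-scan via hugepage_has_attrs()
 *
 * The PG_LEVEL_1G pass then repeats the walk with a 0x40000-gfn stride,
 * consuming the just-updated 2M results through hugepage_has_attrs().
 */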