Merge tag 'kvm-x86-generic-6.8' of https://github.com/kvm-x86/linux into HEAD
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7db96875ac46279886e464d37801254367530891..10bfc88a69f72b6a0e310cca043fb04882e24eb1 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -533,30 +533,43 @@ void kvm_destroy_vcpus(struct kvm *kvm)
 }
 EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);
 
-#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
 {
        return container_of(mn, struct kvm, mmu_notifier);
 }
 
-typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
+typedef bool (*gfn_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
 
-typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
-                            unsigned long end);
+typedef void (*on_lock_fn_t)(struct kvm *kvm);
 
-typedef void (*on_unlock_fn_t)(struct kvm *kvm);
-
-struct kvm_hva_range {
-       unsigned long start;
-       unsigned long end;
+struct kvm_mmu_notifier_range {
+       /*
+        * 64-bit addresses, as KVM notifiers can operate on host virtual
+        * addresses (unsigned long) and guest physical addresses (64-bit).
+        */
+       u64 start;
+       u64 end;
        union kvm_mmu_notifier_arg arg;
-       hva_handler_t handler;
+       gfn_handler_t handler;
        on_lock_fn_t on_lock;
-       on_unlock_fn_t on_unlock;
        bool flush_on_ret;
        bool may_block;
 };
 
+/*
+ * The inner-most helper returns a tuple containing the return value from the
+ * arch- and action-specific handler, plus a flag indicating whether or not at
+ * least one memslot was found, i.e. if the handler found guest memory.
+ *
+ * Note, most notifiers are averse to booleans, so even though KVM tracks the
+ * return from arch code as a bool, outer helpers will cast it to an int. :-(
+ */
+typedef struct kvm_mmu_notifier_return {
+       bool ret;
+       bool found_memslot;
+} kvm_mn_ret_t;
+
 /*
  * Use a dedicated stub instead of NULL to indicate that there is no callback
  * function/handler.  The compiler technically can't guarantee that a real
@@ -578,26 +591,29 @@ static const union kvm_mmu_notifier_arg KVM_MMU_NOTIFIER_NO_ARG;
             node;                                                           \
             node = interval_tree_iter_next(node, start, last))      \
 
-static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
-                                                 const struct kvm_hva_range *range)
+static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm,
+                                                          const struct kvm_mmu_notifier_range *range)
 {
-       bool ret = false, locked = false;
+       struct kvm_mmu_notifier_return r = {
+               .ret = false,
+               .found_memslot = false,
+       };
        struct kvm_gfn_range gfn_range;
        struct kvm_memory_slot *slot;
        struct kvm_memslots *slots;
        int i, idx;
 
        if (WARN_ON_ONCE(range->end <= range->start))
-               return 0;
+               return r;
 
        /* A null handler is allowed if and only if on_lock() is provided. */
        if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
                         IS_KVM_NULL_FN(range->handler)))
-               return 0;
+               return r;
 
        idx = srcu_read_lock(&kvm->srcu);
 
-       for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+       for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
                struct interval_tree_node *node;
 
                slots = __kvm_memslots(kvm, i);
@@ -606,9 +622,9 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
                        unsigned long hva_start, hva_end;
 
                        slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]);
-                       hva_start = max(range->start, slot->userspace_addr);
-                       hva_end = min(range->end, slot->userspace_addr +
-                                                 (slot->npages << PAGE_SHIFT));
+                       hva_start = max_t(unsigned long, range->start, slot->userspace_addr);
+                       hva_end = min_t(unsigned long, range->end,
+                                       slot->userspace_addr + (slot->npages << PAGE_SHIFT));
 
                        /*
                         * To optimize for the likely case where the address
@@ -627,71 +643,66 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
                        gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
                        gfn_range.slot = slot;
 
-                       if (!locked) {
-                               locked = true;
+                       if (!r.found_memslot) {
+                               r.found_memslot = true;
                                KVM_MMU_LOCK(kvm);
                                if (!IS_KVM_NULL_FN(range->on_lock))
-                                       range->on_lock(kvm, range->start, range->end);
+                                       range->on_lock(kvm);
+
                                if (IS_KVM_NULL_FN(range->handler))
                                        break;
                        }
-                       ret |= range->handler(kvm, &gfn_range);
+                       r.ret |= range->handler(kvm, &gfn_range);
                }
        }
 
-       if (range->flush_on_ret && ret)
+       if (range->flush_on_ret && r.ret)
                kvm_flush_remote_tlbs(kvm);
 
-       if (locked) {
+       if (r.found_memslot)
                KVM_MMU_UNLOCK(kvm);
-               if (!IS_KVM_NULL_FN(range->on_unlock))
-                       range->on_unlock(kvm);
-       }
 
        srcu_read_unlock(&kvm->srcu, idx);
 
-       /* The notifiers are averse to booleans. :-( */
-       return (int)ret;
+       return r;
 }
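
For context on the gfn_handler_t contract dispatched above: a handler returns true when it touched at least one mapping, so that the caller knows a remote TLB flush is needed (flush_on_ret).  A minimal, purely illustrative handler is sketched below; example_arch_zap_gfn() is an invented stand-in for arch code, real handlers being functions such as kvm_unmap_gfn_range().

static bool example_arch_zap_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
                                 gfn_t gfn); /* hypothetical arch hook */

static bool example_zap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
        bool flush = false;
        gfn_t gfn;

        /* Walk the gfns of the memslot overlap computed by the caller. */
        for (gfn = range->start; gfn < range->end; gfn++)
                flush |= example_arch_zap_gfn(kvm, range->slot, gfn);

        return flush;
}
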
 
 static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
                                                unsigned long start,
                                                unsigned long end,
                                                union kvm_mmu_notifier_arg arg,
-                                               hva_handler_t handler)
+                                               gfn_handler_t handler)
 {
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
-       const struct kvm_hva_range range = {
+       const struct kvm_mmu_notifier_range range = {
                .start          = start,
                .end            = end,
                .arg            = arg,
                .handler        = handler,
                .on_lock        = (void *)kvm_null_fn,
-               .on_unlock      = (void *)kvm_null_fn,
                .flush_on_ret   = true,
                .may_block      = false,
        };
 
-       return __kvm_handle_hva_range(kvm, &range);
+       return __kvm_handle_hva_range(kvm, &range).ret;
 }
 
 static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
                                                         unsigned long start,
                                                         unsigned long end,
-                                                        hva_handler_t handler)
+                                                        gfn_handler_t handler)
 {
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
-       const struct kvm_hva_range range = {
+       const struct kvm_mmu_notifier_range range = {
                .start          = start,
                .end            = end,
                .handler        = handler,
                .on_lock        = (void *)kvm_null_fn,
-               .on_unlock      = (void *)kvm_null_fn,
                .flush_on_ret   = false,
                .may_block      = false,
        };
 
-       return __kvm_handle_hva_range(kvm, &range);
+       return __kvm_handle_hva_range(kvm, &range).ret;
 }
 
 static bool kvm_change_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
@@ -736,16 +747,29 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
        kvm_handle_hva_range(mn, address, address + 1, arg, kvm_change_spte_gfn);
 }
 
-void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start,
-                             unsigned long end)
+void kvm_mmu_invalidate_begin(struct kvm *kvm)
 {
+       lockdep_assert_held_write(&kvm->mmu_lock);
        /*
         * The count increase must become visible at unlock time as no
         * spte can be established without taking the mmu_lock and
         * count is also read inside the mmu_lock critical section.
         */
        kvm->mmu_invalidate_in_progress++;
+
        if (likely(kvm->mmu_invalidate_in_progress == 1)) {
+               kvm->mmu_invalidate_range_start = INVALID_GPA;
+               kvm->mmu_invalidate_range_end = INVALID_GPA;
+       }
+}
+
+void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
+{
+       lockdep_assert_held_write(&kvm->mmu_lock);
+
+       WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);
+
+       if (likely(kvm->mmu_invalidate_range_start == INVALID_GPA)) {
                kvm->mmu_invalidate_range_start = start;
                kvm->mmu_invalidate_range_end = end;
        } else {
@@ -765,16 +789,21 @@ void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start,
        }
 }
 
+bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+       kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
+       return kvm_unmap_gfn_range(kvm, range);
+}
+
 static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
                                        const struct mmu_notifier_range *range)
 {
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
-       const struct kvm_hva_range hva_range = {
+       const struct kvm_mmu_notifier_range hva_range = {
                .start          = range->start,
                .end            = range->end,
-               .handler        = kvm_unmap_gfn_range,
+               .handler        = kvm_mmu_unmap_gfn_range,
                .on_lock        = kvm_mmu_invalidate_begin,
-               .on_unlock      = kvm_arch_guest_memory_reclaimed,
                .flush_on_ret   = true,
                .may_block      = mmu_notifier_range_blockable(range),
        };
@@ -806,14 +835,21 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
        gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end,
                                          hva_range.may_block);
 
-       __kvm_handle_hva_range(kvm, &hva_range);
+       /*
+        * If one or more memslots were found and thus zapped, notify arch code
+        * that guest memory has been reclaimed.  This needs to be done *after*
+        * dropping mmu_lock, as x86's reclaim path is slooooow.
+        */
+       if (__kvm_handle_hva_range(kvm, &hva_range).found_memslot)
+               kvm_arch_guest_memory_reclaimed(kvm);
 
        return 0;
 }
 
-void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start,
-                           unsigned long end)
+void kvm_mmu_invalidate_end(struct kvm *kvm)
 {
+       lockdep_assert_held_write(&kvm->mmu_lock);
+
        /*
         * This sequence increase will notify the kvm page fault that
         * the page that is going to be mapped in the spte could have
@@ -827,18 +863,24 @@ void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start,
         * in conjunction with the smp_rmb in mmu_invalidate_retry().
         */
        kvm->mmu_invalidate_in_progress--;
+       KVM_BUG_ON(kvm->mmu_invalidate_in_progress < 0, kvm);
+
+       /*
+        * Assert that at least one range was added between start() and end().
+        * Not adding a range isn't fatal, but it is a KVM bug.
+        */
+       WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA);
 }
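
Taken together, the begin()/range_add()/end() split implies the calling convention sketched below.  The wrapper name is invented for illustration; the real composition is kvm_mmu_unmap_gfn_range() plus the on_lock hooks wired up in the notifier paths.

static void example_invalidate(struct kvm *kvm, struct kvm_gfn_range *range)
{
        KVM_MMU_LOCK(kvm);
        kvm_mmu_invalidate_begin(kvm);          /* resets the tracked range */
        kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
        if (kvm_unmap_gfn_range(kvm, range))    /* zap while the range is in flux */
                kvm_flush_remote_tlbs(kvm);
        kvm_mmu_invalidate_end(kvm);            /* WARNs if no range was added */
        KVM_MMU_UNLOCK(kvm);
}
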
 
 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
                                        const struct mmu_notifier_range *range)
 {
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
-       const struct kvm_hva_range hva_range = {
+       const struct kvm_mmu_notifier_range hva_range = {
                .start          = range->start,
                .end            = range->end,
                .handler        = (void *)kvm_null_fn,
                .on_lock        = kvm_mmu_invalidate_end,
-               .on_unlock      = (void *)kvm_null_fn,
                .flush_on_ret   = false,
                .may_block      = mmu_notifier_range_blockable(range),
        };
@@ -857,8 +899,6 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
         */
        if (wake)
                rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
-
-       BUG_ON(kvm->mmu_invalidate_in_progress < 0);
 }
 
 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
@@ -932,14 +972,14 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
        return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
 }
 
-#else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
+#else  /* !CONFIG_KVM_GENERIC_MMU_NOTIFIER */
 
 static int kvm_init_mmu_notifier(struct kvm *kvm)
 {
        return 0;
 }
 
-#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
+#endif /* CONFIG_KVM_GENERIC_MMU_NOTIFIER */
 
 #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
 static int kvm_pm_notifier_call(struct notifier_block *bl,
@@ -985,6 +1025,9 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
 /* This does not remove the slot from struct kvm_memslots data structures */
 static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
 {
+       if (slot->flags & KVM_MEM_GUEST_MEMFD)
+               kvm_gmem_unbind(slot);
+
        kvm_destroy_dirty_bitmap(slot);
 
        kvm_arch_free_memslot(kvm, slot);
@@ -1166,6 +1209,9 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
        spin_lock_init(&kvm->mn_invalidate_lock);
        rcuwait_init(&kvm->mn_memslots_update_rcuwait);
        xa_init(&kvm->vcpu_array);
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+       xa_init(&kvm->mem_attr_array);
+#endif
 
        INIT_LIST_HEAD(&kvm->gpc_list);
        spin_lock_init(&kvm->gpc_lock);
@@ -1190,7 +1236,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
                goto out_err_no_irq_srcu;
 
        refcount_set(&kvm->users_count, 1);
-       for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+       for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
                for (j = 0; j < 2; j++) {
                        slots = &kvm->__memslots[i][j];
 
@@ -1222,7 +1268,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
        if (r)
                goto out_err_no_disable;
 
-#ifdef CONFIG_HAVE_KVM_IRQFD
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
        INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
 #endif
 
@@ -1256,7 +1302,7 @@ out_err:
 out_err_no_debugfs:
        kvm_coalesced_mmio_free(kvm);
 out_no_coalesced_mmio:
-#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
        if (kvm->mmu_notifier.ops)
                mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
 #endif
@@ -1315,7 +1361,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
                kvm->buses[i] = NULL;
        }
        kvm_coalesced_mmio_free(kvm);
-#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
        mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
        /*
         * At this point, pending calls to invalidate_range_start()
@@ -1324,20 +1370,30 @@ static void kvm_destroy_vm(struct kvm *kvm)
         * No threads can be waiting in kvm_swap_active_memslots() as the
         * last reference on KVM has been dropped, but freeing
         * memslots would deadlock without this manual intervention.
+        *
+        * If the count isn't unbalanced, i.e. KVM did NOT unregister its MMU
+        * notifier between a start() and end(), then there shouldn't be any
+        * in-progress invalidations.
         */
        WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
-       kvm->mn_active_invalidate_count = 0;
+       if (kvm->mn_active_invalidate_count)
+               kvm->mn_active_invalidate_count = 0;
+       else
+               WARN_ON(kvm->mmu_invalidate_in_progress);
 #else
        kvm_flush_shadow_all(kvm);
 #endif
        kvm_arch_destroy_vm(kvm);
        kvm_destroy_devices(kvm);
-       for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+       for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
                kvm_free_memslots(kvm, &kvm->__memslots[i][0]);
                kvm_free_memslots(kvm, &kvm->__memslots[i][1]);
        }
        cleanup_srcu_struct(&kvm->irq_srcu);
        cleanup_srcu_struct(&kvm->srcu);
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+       xa_destroy(&kvm->mem_attr_array);
+#endif
        kvm_arch_free_vm(kvm);
        preempt_notifier_dec();
        hardware_disable_all();
@@ -1538,10 +1594,26 @@ static void kvm_replace_memslot(struct kvm *kvm,
        }
 }
 
-static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
+/*
+ * Flags that do not access any of the extra space of struct
+ * kvm_userspace_memory_region2.  KVM_SET_USER_MEMORY_REGION_V1_FLAGS
+ * only allows these.
+ */
+#define KVM_SET_USER_MEMORY_REGION_V1_FLAGS \
+       (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)
+
+static int check_memory_region_flags(struct kvm *kvm,
+                                    const struct kvm_userspace_memory_region2 *mem)
 {
        u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
 
+       if (kvm_arch_has_private_mem(kvm))
+               valid_flags |= KVM_MEM_GUEST_MEMFD;
+
+       /* Dirty logging of private memory is not currently supported. */
+       if (mem->flags & KVM_MEM_GUEST_MEMFD)
+               valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
+
 #ifdef __KVM_HAVE_READONLY_MEM
        valid_flags |= KVM_MEM_READONLY;
 #endif
@@ -1603,7 +1675,7 @@ static void kvm_swap_active_memslots(struct kvm *kvm, int as_id)
         * space 0 will use generations 0, 2, 4, ... while address space 1 will
         * use generations 1, 3, 5, ...
         */
-       gen += KVM_ADDRESS_SPACE_NUM;
+       gen += kvm_arch_nr_memslot_as_ids(kvm);
 
        kvm_arch_memslots_updated(kvm, gen);
 
@@ -1940,7 +2012,7 @@ static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
  * Must be called holding kvm->slots_lock for write.
  */
 int __kvm_set_memory_region(struct kvm *kvm,
-                           const struct kvm_userspace_memory_region *mem)
+                           const struct kvm_userspace_memory_region2 *mem)
 {
        struct kvm_memory_slot *old, *new;
        struct kvm_memslots *slots;
@@ -1950,7 +2022,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
        int as_id, id;
        int r;
 
-       r = check_memory_region_flags(mem);
+       r = check_memory_region_flags(kvm, mem);
        if (r)
                return r;
 
@@ -1969,7 +2041,11 @@ int __kvm_set_memory_region(struct kvm *kvm,
             !access_ok((void __user *)(unsigned long)mem->userspace_addr,
                        mem->memory_size))
                return -EINVAL;
-       if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
+       if (mem->flags & KVM_MEM_GUEST_MEMFD &&
+           (mem->guest_memfd_offset & (PAGE_SIZE - 1) ||
+            mem->guest_memfd_offset + mem->memory_size < mem->guest_memfd_offset))
+               return -EINVAL;
+       if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_MEM_SLOTS_NUM)
                return -EINVAL;
        if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
                return -EINVAL;
@@ -2007,6 +2083,9 @@ int __kvm_set_memory_region(struct kvm *kvm,
                if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
                        return -EINVAL;
        } else { /* Modify an existing slot. */
+               /* Private memslots are immutable; they can only be deleted. */
+               if (mem->flags & KVM_MEM_GUEST_MEMFD)
+                       return -EINVAL;
                if ((mem->userspace_addr != old->userspace_addr) ||
                    (npages != old->npages) ||
                    ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
@@ -2035,16 +2114,29 @@ int __kvm_set_memory_region(struct kvm *kvm,
        new->npages = npages;
        new->flags = mem->flags;
        new->userspace_addr = mem->userspace_addr;
+       if (mem->flags & KVM_MEM_GUEST_MEMFD) {
+               r = kvm_gmem_bind(kvm, new, mem->guest_memfd, mem->guest_memfd_offset);
+               if (r)
+                       goto out;
+       }
 
        r = kvm_set_memslot(kvm, old, new, change);
        if (r)
-               kfree(new);
+               goto out_unbind;
+
+       return 0;
+
+out_unbind:
+       if (mem->flags & KVM_MEM_GUEST_MEMFD)
+               kvm_gmem_unbind(new);
+out:
+       kfree(new);
        return r;
 }
 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
 
 int kvm_set_memory_region(struct kvm *kvm,
-                         const struct kvm_userspace_memory_region *mem)
+                         const struct kvm_userspace_memory_region2 *mem)
 {
        int r;
 
@@ -2056,7 +2148,7 @@ int kvm_set_memory_region(struct kvm *kvm,
 EXPORT_SYMBOL_GPL(kvm_set_memory_region);
 
 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
-                                         struct kvm_userspace_memory_region *mem)
+                                         struct kvm_userspace_memory_region2 *mem)
 {
        if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
                return -EINVAL;
@@ -2089,7 +2181,7 @@ int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
 
        as_id = log->slot >> 16;
        id = (u16)log->slot;
-       if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
+       if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
                return -EINVAL;
 
        slots = __kvm_memslots(kvm, as_id);
@@ -2151,7 +2243,7 @@ static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
 
        as_id = log->slot >> 16;
        id = (u16)log->slot;
-       if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
+       if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
                return -EINVAL;
 
        slots = __kvm_memslots(kvm, as_id);
@@ -2263,7 +2355,7 @@ static int kvm_clear_dirty_log_protect(struct kvm *kvm,
 
        as_id = log->slot >> 16;
        id = (u16)log->slot;
-       if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
+       if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
                return -EINVAL;
 
        if (log->first_page & 63)
@@ -2335,6 +2427,200 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
 }
 #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
 
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+/*
+ * Returns true if _all_ gfns in the range [@start, @end) have attributes
+ * matching @attrs.
+ */
+bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
+                                    unsigned long attrs)
+{
+       XA_STATE(xas, &kvm->mem_attr_array, start);
+       unsigned long index;
+       bool has_attrs;
+       void *entry;
+
+       rcu_read_lock();
+
+       if (!attrs) {
+               has_attrs = !xas_find(&xas, end - 1);
+               goto out;
+       }
+
+       has_attrs = true;
+       for (index = start; index < end; index++) {
+               do {
+                       entry = xas_next(&xas);
+               } while (xas_retry(&xas, entry));
+
+               if (xas.xa_index != index || xa_to_value(entry) != attrs) {
+                       has_attrs = false;
+                       break;
+               }
+       }
+
+out:
+       rcu_read_unlock();
+       return has_attrs;
+}
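
The attributes themselves are stored in kvm->mem_attr_array keyed by gfn, so a single-gfn query is a plain xarray lookup.  As an illustrative sketch (the helper name is invented here):

static unsigned long example_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
{
        return xa_to_value(xa_load(&kvm->mem_attr_array, gfn));
}
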
+
+static u64 kvm_supported_mem_attributes(struct kvm *kvm)
+{
+       if (!kvm || kvm_arch_has_private_mem(kvm))
+               return KVM_MEMORY_ATTRIBUTE_PRIVATE;
+
+       return 0;
+}
+
+static __always_inline void kvm_handle_gfn_range(struct kvm *kvm,
+                                                struct kvm_mmu_notifier_range *range)
+{
+       struct kvm_gfn_range gfn_range;
+       struct kvm_memory_slot *slot;
+       struct kvm_memslots *slots;
+       struct kvm_memslot_iter iter;
+       bool found_memslot = false;
+       bool ret = false;
+       int i;
+
+       gfn_range.arg = range->arg;
+       gfn_range.may_block = range->may_block;
+
+       for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+               slots = __kvm_memslots(kvm, i);
+
+               kvm_for_each_memslot_in_gfn_range(&iter, slots, range->start, range->end) {
+                       slot = iter.slot;
+                       gfn_range.slot = slot;
+
+                       gfn_range.start = max(range->start, slot->base_gfn);
+                       gfn_range.end = min(range->end, slot->base_gfn + slot->npages);
+                       if (gfn_range.start >= gfn_range.end)
+                               continue;
+
+                       if (!found_memslot) {
+                               found_memslot = true;
+                               KVM_MMU_LOCK(kvm);
+                               if (!IS_KVM_NULL_FN(range->on_lock))
+                                       range->on_lock(kvm);
+                       }
+
+                       ret |= range->handler(kvm, &gfn_range);
+               }
+       }
+
+       if (range->flush_on_ret && ret)
+               kvm_flush_remote_tlbs(kvm);
+
+       if (found_memslot)
+               KVM_MMU_UNLOCK(kvm);
+}
+
+static bool kvm_pre_set_memory_attributes(struct kvm *kvm,
+                                         struct kvm_gfn_range *range)
+{
+       /*
+        * Unconditionally add the range to the invalidation set, regardless of
+        * whether or not the arch callback actually needs to zap SPTEs.  E.g.
+        * if KVM supports RWX attributes in the future and the attributes are
+        * going from R=>RW, zapping isn't strictly necessary.  Unconditionally
+        * adding the range allows KVM to require that MMU invalidations add at
+        * least one range between begin() and end(), e.g. allows KVM to detect
+        * bugs where the add() is missed.  Relaxing the rule *might* be safe,
+        * but it's not obvious that allowing new mappings while the attributes
+        * are in flux is desirable or worth the complexity.
+        */
+       kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
+
+       return kvm_arch_pre_set_memory_attributes(kvm, range);
+}
+
+/* Set @attributes for the gfn range [@start, @end). */
+static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
+                                    unsigned long attributes)
+{
+       struct kvm_mmu_notifier_range pre_set_range = {
+               .start = start,
+               .end = end,
+               .handler = kvm_pre_set_memory_attributes,
+               .on_lock = kvm_mmu_invalidate_begin,
+               .flush_on_ret = true,
+               .may_block = true,
+       };
+       struct kvm_mmu_notifier_range post_set_range = {
+               .start = start,
+               .end = end,
+               .arg.attributes = attributes,
+               .handler = kvm_arch_post_set_memory_attributes,
+               .on_lock = kvm_mmu_invalidate_end,
+               .may_block = true,
+       };
+       unsigned long i;
+       void *entry;
+       int r = 0;
+
+       entry = attributes ? xa_mk_value(attributes) : NULL;
+
+       mutex_lock(&kvm->slots_lock);
+
+       /* Nothing to do if the entire range has the desired attributes. */
+       if (kvm_range_has_memory_attributes(kvm, start, end, attributes))
+               goto out_unlock;
+
+       /*
+        * Reserve memory ahead of time to avoid having to deal with failures
+        * partway through setting the new attributes.
+        */
+       for (i = start; i < end; i++) {
+               r = xa_reserve(&kvm->mem_attr_array, i, GFP_KERNEL_ACCOUNT);
+               if (r)
+                       goto out_unlock;
+       }
+
+       kvm_handle_gfn_range(kvm, &pre_set_range);
+
+       for (i = start; i < end; i++) {
+               r = xa_err(xa_store(&kvm->mem_attr_array, i, entry,
+                                   GFP_KERNEL_ACCOUNT));
+               KVM_BUG_ON(r, kvm);
+       }
+
+       kvm_handle_gfn_range(kvm, &post_set_range);
+
+out_unlock:
+       mutex_unlock(&kvm->slots_lock);
+
+       return r;
+}
+static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
+                                          struct kvm_memory_attributes *attrs)
+{
+       gfn_t start, end;
+
+       /* flags is currently not used. */
+       if (attrs->flags)
+               return -EINVAL;
+       if (attrs->attributes & ~kvm_supported_mem_attributes(kvm))
+               return -EINVAL;
+       if (attrs->size == 0 || attrs->address + attrs->size < attrs->address)
+               return -EINVAL;
+       if (!PAGE_ALIGNED(attrs->address) || !PAGE_ALIGNED(attrs->size))
+               return -EINVAL;
+
+       start = attrs->address >> PAGE_SHIFT;
+       end = (attrs->address + attrs->size) >> PAGE_SHIFT;
+
+       /*
+        * xarray tracks data using "unsigned long", and as a result so does
+        * KVM.  For simplicity, generic attributes are supported only on
+        * 64-bit architectures.
+        */
+       BUILD_BUG_ON(sizeof(attrs->attributes) != sizeof(unsigned long));
+
+       return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes);
+}
+#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
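
From userspace, the new ioctl takes a page-aligned gpa range plus the desired attribute set.  A minimal sketch of a VMM converting a range to private memory follows; it assumes <linux/kvm.h> provides the uapi definitions and elides error handling beyond the return value:

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Mark [gpa, gpa + size) as guest-private; both must be page aligned. */
static int set_private(int vm_fd, __u64 gpa, __u64 size)
{
        struct kvm_memory_attributes attrs = {
                .address    = gpa,
                .size       = size,
                .attributes = KVM_MEMORY_ATTRIBUTE_PRIVATE,
                .flags      = 0,        /* no flags are defined yet */
        };

        return ioctl(vm_fd, KVM_SET_MEMORY_ATTRIBUTES, &attrs);
}
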
+
 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
 {
        return __gfn_to_memslot(kvm_memslots(kvm), gfn);
@@ -4527,13 +4813,14 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 {
        switch (arg) {
        case KVM_CAP_USER_MEMORY:
+       case KVM_CAP_USER_MEMORY2:
        case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
        case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
        case KVM_CAP_INTERNAL_ERROR_DATA:
 #ifdef CONFIG_HAVE_KVM_MSI
        case KVM_CAP_SIGNAL_MSI:
 #endif
-#ifdef CONFIG_HAVE_KVM_IRQFD
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
        case KVM_CAP_IRQFD:
 #endif
        case KVM_CAP_IOEVENTFD_ANY_LENGTH:
@@ -4555,9 +4842,11 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
        case KVM_CAP_IRQ_ROUTING:
                return KVM_MAX_IRQ_ROUTES;
 #endif
-#if KVM_ADDRESS_SPACE_NUM > 1
+#if KVM_MAX_NR_ADDRESS_SPACES > 1
        case KVM_CAP_MULTI_ADDRESS_SPACE:
-               return KVM_ADDRESS_SPACE_NUM;
+               if (kvm)
+                       return kvm_arch_nr_memslot_as_ids(kvm);
+               return KVM_MAX_NR_ADDRESS_SPACES;
 #endif
        case KVM_CAP_NR_MEMSLOTS:
                return KVM_USER_MEM_SLOTS;
@@ -4578,7 +4867,16 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 #endif
        case KVM_CAP_BINARY_STATS_FD:
        case KVM_CAP_SYSTEM_EVENT_DATA:
+       case KVM_CAP_DEVICE_CTRL:
                return 1;
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+       case KVM_CAP_MEMORY_ATTRIBUTES:
+               return kvm_supported_mem_attributes(kvm);
+#endif
+#ifdef CONFIG_KVM_PRIVATE_MEM
+       case KVM_CAP_GUEST_MEMFD:
+               return !kvm || kvm_arch_has_private_mem(kvm);
+#endif
        default:
                break;
        }
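
Userspace is expected to probe these capabilities before using the new ioctls.  A sketch, assuming a VM fd and <linux/kvm.h> for the definitions (note that KVM_CAP_MEMORY_ATTRIBUTES returns the mask of supported attributes rather than a plain 0/1):

#include <stdbool.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static bool vm_supports_private_mem(int vm_fd)
{
        if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_USER_MEMORY2) <= 0)
                return false;
        if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_GUEST_MEMFD) <= 0)
                return false;
        return ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_MEMORY_ATTRIBUTES) &
               KVM_MEMORY_ATTRIBUTE_PRIVATE;
}
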
@@ -4657,7 +4955,7 @@ bool kvm_are_all_memslots_empty(struct kvm *kvm)
 
        lockdep_assert_held(&kvm->slots_lock);
 
-       for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+       for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
                if (!kvm_memslots_empty(__kvm_memslots(kvm, i)))
                        return false;
        }
@@ -4783,6 +5081,14 @@ static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
        return fd;
 }
 
+#define SANITY_CHECK_MEM_REGION_FIELD(field)                                   \
+do {                                                                           \
+       BUILD_BUG_ON(offsetof(struct kvm_userspace_memory_region, field) !=             \
+                    offsetof(struct kvm_userspace_memory_region2, field));     \
+       BUILD_BUG_ON(sizeof_field(struct kvm_userspace_memory_region, field) !=         \
+                    sizeof_field(struct kvm_userspace_memory_region2, field)); \
+} while (0)
+
 static long kvm_vm_ioctl(struct file *filp,
                           unsigned int ioctl, unsigned long arg)
 {
@@ -4805,15 +5111,39 @@ static long kvm_vm_ioctl(struct file *filp,
                r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
                break;
        }
+       case KVM_SET_USER_MEMORY_REGION2:
        case KVM_SET_USER_MEMORY_REGION: {
-               struct kvm_userspace_memory_region kvm_userspace_mem;
+               struct kvm_userspace_memory_region2 mem;
+               unsigned long size;
+
+               if (ioctl == KVM_SET_USER_MEMORY_REGION) {
+                       /*
+                        * Fields beyond struct kvm_userspace_memory_region shouldn't be
+                        * accessed, but avoid leaking kernel memory in case of a bug.
+                        */
+                       memset(&mem, 0, sizeof(mem));
+                       size = sizeof(struct kvm_userspace_memory_region);
+               } else {
+                       size = sizeof(struct kvm_userspace_memory_region2);
+               }
+
+               /* Ensure the common parts of the two structs are identical. */
+               SANITY_CHECK_MEM_REGION_FIELD(slot);
+               SANITY_CHECK_MEM_REGION_FIELD(flags);
+               SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr);
+               SANITY_CHECK_MEM_REGION_FIELD(memory_size);
+               SANITY_CHECK_MEM_REGION_FIELD(userspace_addr);
 
                r = -EFAULT;
-               if (copy_from_user(&kvm_userspace_mem, argp,
-                                               sizeof(kvm_userspace_mem)))
+               if (copy_from_user(&mem, argp, size))
                        goto out;
 
-               r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
+               r = -EINVAL;
+               if (ioctl == KVM_SET_USER_MEMORY_REGION &&
+                   (mem.flags & ~KVM_SET_USER_MEMORY_REGION_V1_FLAGS))
+                       goto out;
+
+               r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
                break;
        }
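
For an ordinary (non-guest_memfd) slot the v2 ioctl is a drop-in replacement for the v1 one.  A userspace sketch, with field names taken from the v2 struct handled above and error handling elided:

#include <sys/ioctl.h>
#include <linux/kvm.h>

/*
 * The legacy KVM_SET_USER_MEMORY_REGION continues to work, but only for the
 * v1 flags (dirty logging, read-only); anything involving
 * KVM_MEM_GUEST_MEMFD must use KVM_SET_USER_MEMORY_REGION2.
 */
static int set_region_v2(int vm_fd, __u32 slot, __u64 gpa, __u64 size, void *hva)
{
        struct kvm_userspace_memory_region2 region = {
                .slot            = slot,
                .flags           = 0,
                .guest_phys_addr = gpa,
                .memory_size     = size,
                .userspace_addr  = (__u64)(unsigned long)hva,
        };

        return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION2, &region);
}
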
        case KVM_GET_DIRTY_LOG: {
@@ -4927,9 +5257,8 @@ static long kvm_vm_ioctl(struct file *filp,
                        goto out;
                if (routing.nr) {
                        urouting = argp;
-                       entries = vmemdup_user(urouting->entries,
-                                              array_size(sizeof(*entries),
-                                                         routing.nr));
+                       entries = vmemdup_array_user(urouting->entries,
+                                                    routing.nr, sizeof(*entries));
                        if (IS_ERR(entries)) {
                                r = PTR_ERR(entries);
                                goto out;
@@ -4941,6 +5270,18 @@ static long kvm_vm_ioctl(struct file *filp,
                break;
        }
 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+       case KVM_SET_MEMORY_ATTRIBUTES: {
+               struct kvm_memory_attributes attrs;
+
+               r = -EFAULT;
+               if (copy_from_user(&attrs, argp, sizeof(attrs)))
+                       goto out;
+
+               r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs);
+               break;
+       }
+#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
        case KVM_CREATE_DEVICE: {
                struct kvm_create_device cd;
 
@@ -4968,6 +5309,18 @@ static long kvm_vm_ioctl(struct file *filp,
        case KVM_GET_STATS_FD:
                r = kvm_vm_ioctl_get_stats_fd(kvm);
                break;
+#ifdef CONFIG_KVM_PRIVATE_MEM
+       case KVM_CREATE_GUEST_MEMFD: {
+               struct kvm_create_guest_memfd guest_memfd;
+
+               r = -EFAULT;
+               if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd)))
+                       goto out;
+
+               r = kvm_gmem_create(kvm, &guest_memfd);
+               break;
+       }
+#endif
        default:
                r = kvm_arch_vm_ioctl(filp, ioctl, arg);
        }
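
Tying the guest_memfd pieces together from the userspace side: create the backing fd with KVM_CREATE_GUEST_MEMFD, then bind it to a slot via KVM_SET_USER_MEMORY_REGION2 with KVM_MEM_GUEST_MEMFD.  A sketch, assuming the 6.8 uapi layout of struct kvm_create_guest_memfd (a page-aligned size plus flags) and eliding cleanup of the fd on failure:

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int add_private_slot(int vm_fd, __u32 slot, __u64 gpa, __u64 size, void *hva)
{
        struct kvm_create_guest_memfd gmem = { .size = size };
        struct kvm_userspace_memory_region2 region;
        int gmem_fd;

        gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &gmem);
        if (gmem_fd < 0)
                return gmem_fd;

        region = (struct kvm_userspace_memory_region2) {
                .slot               = slot,
                .flags              = KVM_MEM_GUEST_MEMFD,
                .guest_phys_addr    = gpa,
                .memory_size        = size,
                .userspace_addr     = (__u64)(unsigned long)hva, /* shared mapping */
                .guest_memfd        = gmem_fd,
                .guest_memfd_offset = 0,
        };

        return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION2, &region);
}
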
@@ -5139,11 +5492,6 @@ static long kvm_dev_ioctl(struct file *filp,
                r += PAGE_SIZE;    /* coalesced mmio ring page */
 #endif
                break;
-       case KVM_TRACE_ENABLE:
-       case KVM_TRACE_PAUSE:
-       case KVM_TRACE_DISABLE:
-               r = -EOPNOTSUPP;
-               break;
        default:
                return kvm_arch_dev_ioctl(filp, ioctl, arg);
        }
@@ -6104,6 +6452,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
        if (WARN_ON_ONCE(r))
                goto err_vfio;
 
+       kvm_gmem_init(module);
+
        /*
         * Registration _must_ be the very last thing done, as this exposes
         * /dev/kvm to userspace, i.e. all infrastructure must be setup!