virt/kvm/kvm_main.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Kernel-based Virtual Machine driver for Linux
   4  *
   5  * This module enables machines with Intel VT-x extensions to run virtual
   6  * machines without emulation or binary translation.
   7  *
   8  * Copyright (C) 2006 Qumranet, Inc.
   9  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  10  *
  11  * Authors:
  12  *   Avi Kivity   <avi@qumranet.com>
  13  *   Yaniv Kamay  <yaniv@qumranet.com>
  14  */
  15
  16 #include <kvm/iodev.h>
  17
  18 #include <linux/kvm_host.h>
  19 #include <linux/kvm.h>
  20 #include <linux/module.h>
  21 #include <linux/errno.h>
  22 #include <linux/percpu.h>
  23 #include <linux/mm.h>
  24 #include <linux/miscdevice.h>
  25 #include <linux/vmalloc.h>
  26 #include <linux/reboot.h>
  27 #include <linux/debugfs.h>
  28 #include <linux/highmem.h>
  29 #include <linux/file.h>
  30 #include <linux/syscore_ops.h>
  31 #include <linux/cpu.h>
  32 #include <linux/sched/signal.h>
  33 #include <linux/sched/mm.h>
  34 #include <linux/sched/stat.h>
  35 #include <linux/cpumask.h>
  36 #include <linux/smp.h>
  37 #include <linux/anon_inodes.h>
  38 #include <linux/profile.h>
  39 #include <linux/kvm_para.h>
  40 #include <linux/pagemap.h>
  41 #include <linux/mman.h>
  42 #include <linux/swap.h>
  43 #include <linux/bitops.h>
  44 #include <linux/spinlock.h>
  45 #include <linux/compat.h>
  46 #include <linux/srcu.h>
  47 #include <linux/hugetlb.h>
  48 #include <linux/slab.h>
  49 #include <linux/sort.h>
  50 #include <linux/bsearch.h>
  51 #include <linux/io.h>
  52 #include <linux/lockdep.h>
  53 #include <linux/kthread.h>
  54 #include <linux/suspend.h>
  55
  56 #include <asm/processor.h>
  57 #include <asm/ioctl.h>
  58 #include <linux/uaccess.h>
  59
  60 #include "coalesced_mmio.h"
  61 #include "async_pf.h"
  62 #include "kvm_mm.h"
  63 #include "vfio.h"
  64
  65 #include <trace/events/ipi.h>
  66
  67 #define CREATE_TRACE_POINTS
  68 #include <trace/events/kvm.h>
  69
  70 #include <linux/kvm_dirty_ring.h>
  71
  72
  73 /* Worst case buffer size needed for holding an integer. */
  74 #define ITOA_MAX_LEN 12
  75
  76 MODULE_AUTHOR("Qumranet");
  77 MODULE_LICENSE("GPL");
  78
  79 /* Architectures should define their poll value according to the halt latency */
  80 unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
  81 module_param(halt_poll_ns, uint, 0644);
  82 EXPORT_SYMBOL_GPL(halt_poll_ns);
  83
  84 /* Default doubles per-vcpu halt_poll_ns. */
  85 unsigned int halt_poll_ns_grow = 2;
  86 module_param(halt_poll_ns_grow, uint, 0644);
  87 EXPORT_SYMBOL_GPL(halt_poll_ns_grow);
  88
  89 /* The start value to grow halt_poll_ns from */
  90 unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
  91 module_param(halt_poll_ns_grow_start, uint, 0644);
  92 EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);
  93
  94 /* Default resets per-vcpu halt_poll_ns . */
  95 unsigned int halt_poll_ns_shrink;
  96 module_param(halt_poll_ns_shrink, uint, 0644);
  97 EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
  98
  99 /*
 100  * Ordering of locks:
 101  *
 102  *      kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 103  */
 104
 105 DEFINE_MUTEX(kvm_lock);
 106 LIST_HEAD(vm_list);
 107
 108 static struct kmem_cache *kvm_vcpu_cache;
 109
 110 static __read_mostly struct preempt_ops kvm_preempt_ops;
 111 static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);
 112
 113 struct dentry *kvm_debugfs_dir;
 114 EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
 115
 116 static const struct file_operations stat_fops_per_vm;
 117
 118 static struct file_operations kvm_chardev_ops;
 119
 120 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
 121                            unsigned long arg);
 122 #ifdef CONFIG_KVM_COMPAT
 123 static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
 124                                   unsigned long arg);
 125 #define KVM_COMPAT(c)   .compat_ioctl   = (c)
 126 #else
 127 /*
 128  * For architectures that don't implement a compat infrastructure,
 129  * adopt a double line of defense:
 130  * - Prevent a compat task from opening /dev/kvm
 131  * - If the open has been done by a 64bit task, and the KVM fd
 132  *   passed to a compat task, let the ioctls fail.
 133  */
 134 static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
 135                                 unsigned long arg) { return -EINVAL; }
 136
 137 static int kvm_no_compat_open(struct inode *inode, struct file *file)
 138 {
 139         return is_compat_task() ? -ENODEV : 0;
 140 }
 141 #define KVM_COMPAT(c)   .compat_ioctl   = kvm_no_compat_ioctl,  \
 142                         .open           = kvm_no_compat_open
 143 #endif
 144 static int hardware_enable_all(void);
 145 static void hardware_disable_all(void);
 146
 147 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
 148
 149 #define KVM_EVENT_CREATE_VM 0
 150 #define KVM_EVENT_DESTROY_VM 1
 151 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
 152 static unsigned long long kvm_createvm_count;
 153 static unsigned long long kvm_active_vms;
 154
 155 static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);
 156
 157 __weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
 158 {
 159 }
 160
 161 bool kvm_is_zone_device_page(struct page *page)
 162 {
 163         /*
 164          * The metadata used by is_zone_device_page() to determine whether or
 165          * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
 166          * the device has been pinned, e.g. by get_user_pages().  WARN if the
 167          * page_count() is zero to help detect bad usage of this helper.
 168          */
 169         if (WARN_ON_ONCE(!page_count(page)))
 170                 return false;
 171
 172         return is_zone_device_page(page);
 173 }
 174
 175 /*
 176  * Returns a 'struct page' if the pfn is "valid" and backed by a refcounted
 177  * page, NULL otherwise.  Note, the list of refcounted PG_reserved page types
 178  * is likely incomplete, it has been compiled purely through people wanting to
 179  * back guest with a certain type of memory and encountering issues.
 180  */
 181 struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn)
 182 {
 183         struct page *page;
 184
 185         if (!pfn_valid(pfn))
 186                 return NULL;
 187
 188         page = pfn_to_page(pfn);
 189         if (!PageReserved(page))
 190                 return page;
 191
 192         /* The ZERO_PAGE(s) is marked PG_reserved, but is refcounted. */
 193         if (is_zero_pfn(pfn))
 194                 return page;
 195
 196         /*
 197          * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
 198          * perspective they are "normal" pages, albeit with slightly different
 199          * usage rules.
 200          */
 201         if (kvm_is_zone_device_page(page))
 202                 return page;
 203
 204         return NULL;
 205 }
 206
 207 /*
 208  * Switches to specified vcpu, until a matching vcpu_put()
 209  */
 210 void vcpu_load(struct kvm_vcpu *vcpu)
 211 {
 212         int cpu = get_cpu();
 213
 214         __this_cpu_write(kvm_running_vcpu, vcpu);
 215         preempt_notifier_register(&vcpu->preempt_notifier);
 216         kvm_arch_vcpu_load(vcpu, cpu);
 217         put_cpu();
 218 }
 219 EXPORT_SYMBOL_GPL(vcpu_load);
 220
 221 void vcpu_put(struct kvm_vcpu *vcpu)
 222 {
 223         preempt_disable();
 224         kvm_arch_vcpu_put(vcpu);
 225         preempt_notifier_unregister(&vcpu->preempt_notifier);
 226         __this_cpu_write(kvm_running_vcpu, NULL);
 227         preempt_enable();
 228 }
 229 EXPORT_SYMBOL_GPL(vcpu_put);
 230
 231 /* TODO: merge with kvm_arch_vcpu_should_kick */
 232 static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
 233 {
 234         int mode = kvm_vcpu_exiting_guest_mode(vcpu);
 235
 236         /*
 237          * We need to wait for the VCPU to reenable interrupts and get out of
 238          * READING_SHADOW_PAGE_TABLES mode.
 239          */
 240         if (req & KVM_REQUEST_WAIT)
 241                 return mode != OUTSIDE_GUEST_MODE;
 242
 243         /*
 244          * Need to kick a running VCPU, but otherwise there is nothing to do.
 245          */
 246         return mode == IN_GUEST_MODE;
 247 }
 248
 249 static void ack_kick(void *_completed)
 250 {
 251 }
 252
 253 static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
 254 {
 255         if (cpumask_empty(cpus))
 256                 return false;
 257
 258         smp_call_function_many(cpus, ack_kick, NULL, wait);
 259         return true;
 260 }
 261
 262 static void kvm_make_vcpu_request(struct kvm_vcpu *vcpu, unsigned int req,
 263                                   struct cpumask *tmp, int current_cpu)
 264 {
 265         int cpu;
 266
 267         if (likely(!(req & KVM_REQUEST_NO_ACTION)))
 268                 __kvm_make_request(req, vcpu);
 269
 270         if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
 271                 return;
 272
 273         /*
 274          * Note, the vCPU could get migrated to a different pCPU at any point
 275          * after kvm_request_needs_ipi(), which could result in sending an IPI
 276          * to the previous pCPU.  But, that's OK because the purpose of the IPI
 277          * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is
 278          * satisfied if the vCPU migrates. Entering READING_SHADOW_PAGE_TABLES
 279          * after this point is also OK, as the requirement is only that KVM wait
 280          * for vCPUs that were reading SPTEs _before_ any changes were
 281          * finalized. See kvm_vcpu_kick() for more details on handling requests.
 282          */
 283         if (kvm_request_needs_ipi(vcpu, req)) {
 284                 cpu = READ_ONCE(vcpu->cpu);
 285                 if (cpu != -1 && cpu != current_cpu)
 286                         __cpumask_set_cpu(cpu, tmp);
 287         }
 288 }
 289
 290 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
 291                                  unsigned long *vcpu_bitmap)
 292 {
 293         struct kvm_vcpu *vcpu;
 294         struct cpumask *cpus;
 295         int i, me;
 296         bool called;
 297
 298         me = get_cpu();
 299
 300         cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
 301         cpumask_clear(cpus);
 302
 303         for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
 304                 vcpu = kvm_get_vcpu(kvm, i);
 305                 if (!vcpu)
 306                         continue;
 307                 kvm_make_vcpu_request(vcpu, req, cpus, me);
 308         }
 309
 310         called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
 311         put_cpu();
 312
 313         return called;
 314 }
 315
 316 bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
 317                                       struct kvm_vcpu *except)
 318 {
 319         struct kvm_vcpu *vcpu;
 320         struct cpumask *cpus;
 321         unsigned long i;
 322         bool called;
 323         int me;
 324
 325         me = get_cpu();
 326
 327         cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
 328         cpumask_clear(cpus);
 329
 330         kvm_for_each_vcpu(i, vcpu, kvm) {
 331                 if (vcpu == except)
 332                         continue;
 333                 kvm_make_vcpu_request(vcpu, req, cpus, me);
 334         }
 335
 336         called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
 337         put_cpu();
 338
 339         return called;
 340 }
 341
 342 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
 343 {
 344         return kvm_make_all_cpus_request_except(kvm, req, NULL);
 345 }
 346 EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);
 347
 348 void kvm_flush_remote_tlbs(struct kvm *kvm)
 349 {
 350         ++kvm->stat.generic.remote_tlb_flush_requests;
 351
 352         /*
 353          * We want to publish modifications to the page tables before reading
 354          * mode. Pairs with a memory barrier in arch-specific code.
 355          * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
 356          * and smp_mb in walk_shadow_page_lockless_begin/end.
 357          * - powerpc: smp_mb in kvmppc_prepare_to_enter.
 358          *
 359          * There is already an smp_mb__after_atomic() before
 360          * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
 361          * barrier here.
 362          */
 363         if (!kvm_arch_flush_remote_tlbs(kvm)
 364             || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
 365                 ++kvm->stat.generic.remote_tlb_flush;
 366 }
 367 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
 368
 369 void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
 370 {
 371         if (!kvm_arch_flush_remote_tlbs_range(kvm, gfn, nr_pages))
 372                 return;
 373
 374         /*
 375          * Fall back to a flushing entire TLBs if the architecture range-based
 376          * TLB invalidation is unsupported or can't be performed for whatever
 377          * reason.
 378          */
 379         kvm_flush_remote_tlbs(kvm);
 380 }
 381
 382 void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
 383                                    const struct kvm_memory_slot *memslot)
 384 {
 385         /*
 386          * All current use cases for flushing the TLBs for a specific memslot
 387          * are related to dirty logging, and many do the TLB flush out of
 388          * mmu_lock. The interaction between the various operations on memslot
 389          * must be serialized by slots_locks to ensure the TLB flush from one
 390          * operation is observed by any other operation on the same memslot.
 391          */
 392         lockdep_assert_held(&kvm->slots_lock);
 393         kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
 394 }
 395
 396 static void kvm_flush_shadow_all(struct kvm *kvm)
 397 {
 398         kvm_arch_flush_shadow_all(kvm);
 399         kvm_arch_guest_memory_reclaimed(kvm);
 400 }
 401
 402 #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
 403 static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
 404                                                gfp_t gfp_flags)
 405 {
 406         gfp_flags |= mc->gfp_zero;
 407
 408         if (mc->kmem_cache)
 409                 return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
 410         else
 411                 return (void *)__get_free_page(gfp_flags);
 412 }
 413
 414 int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
 415 {
 416         gfp_t gfp = mc->gfp_custom ? mc->gfp_custom : GFP_KERNEL_ACCOUNT;
 417         void *obj;
 418
 419         if (mc->nobjs >= min)
 420                 return 0;
 421
 422         if (unlikely(!mc->objects)) {
 423                 if (WARN_ON_ONCE(!capacity))
 424                         return -EIO;
 425
 426                 mc->objects = kvmalloc_array(sizeof(void *), capacity, gfp);
 427                 if (!mc->objects)
 428                         return -ENOMEM;
 429
 430                 mc->capacity = capacity;
 431         }
 432
 433         /* It is illegal to request a different capacity across topups. */
 434         if (WARN_ON_ONCE(mc->capacity != capacity))
 435                 return -EIO;
 436
 437         while (mc->nobjs < mc->capacity) {
 438                 obj = mmu_memory_cache_alloc_obj(mc, gfp);
 439                 if (!obj)
 440                         return mc->nobjs >= min ? 0 : -ENOMEM;
 441                 mc->objects[mc->nobjs++] = obj;
 442         }
 443         return 0;
 444 }
 445
 446 int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
 447 {
 448         return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min);
 449 }
 450
 451 int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
 452 {
 453         return mc->nobjs;
 454 }
 455
 456 void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
 457 {
 458         while (mc->nobjs) {
 459                 if (mc->kmem_cache)
 460                         kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
 461                 else
 462                         free_page((unsigned long)mc->objects[--mc->nobjs]);
 463         }
 464
 465         kvfree(mc->objects);
 466
 467         mc->objects = NULL;
 468         mc->capacity = 0;
 469 }
 470
 471 void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
 472 {
 473         void *p;
 474
 475         if (WARN_ON(!mc->nobjs))
 476                 p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
 477         else
 478                 p = mc->objects[--mc->nobjs];
 479         BUG_ON(!p);
 480         return p;
 481 }
 482 #endif
 483
 484 static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 485 {
 486         mutex_init(&vcpu->mutex);
 487         vcpu->cpu = -1;
 488         vcpu->kvm = kvm;
 489         vcpu->vcpu_id = id;
 490         vcpu->pid = NULL;
 491 #ifndef __KVM_HAVE_ARCH_WQP
 492         rcuwait_init(&vcpu->wait);
 493 #endif
 494         kvm_async_pf_vcpu_init(vcpu);
 495
 496         kvm_vcpu_set_in_spin_loop(vcpu, false);
 497         kvm_vcpu_set_dy_eligible(vcpu, false);
 498         vcpu->preempted = false;
 499         vcpu->ready = false;
 500         preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
 501         vcpu->last_used_slot = NULL;
 502
 503         /* Fill the stats id string for the vcpu */
 504         snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
 505                  task_pid_nr(current), id);
 506 }
 507
 508 static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
 509 {
 510         kvm_arch_vcpu_destroy(vcpu);
 511         kvm_dirty_ring_free(&vcpu->dirty_ring);
 512
 513         /*
 514          * No need for rcu_read_lock as VCPU_RUN is the only place that changes
 515          * the vcpu->pid pointer, and at destruction time all file descriptors
 516          * are already gone.
 517          */
 518         put_pid(rcu_dereference_protected(vcpu->pid, 1));
 519
 520         free_page((unsigned long)vcpu->run);
 521         kmem_cache_free(kvm_vcpu_cache, vcpu);
 522 }
 523
 524 void kvm_destroy_vcpus(struct kvm *kvm)
 525 {
 526         unsigned long i;
 527         struct kvm_vcpu *vcpu;
 528
 529         kvm_for_each_vcpu(i, vcpu, kvm) {
 530                 kvm_vcpu_destroy(vcpu);
 531                 xa_erase(&kvm->vcpu_array, i);
 532         }
 533
 534         atomic_set(&kvm->online_vcpus, 0);
 535 }
 536 EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);
 537
 538 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 539 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
 540 {
 541         return container_of(mn, struct kvm, mmu_notifier);
 542 }
 543
 544 typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
 545
 546 typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
 547                              unsigned long end);
 548
 549 typedef void (*on_unlock_fn_t)(struct kvm *kvm);
 550
 551 struct kvm_hva_range {
 552         unsigned long start;
 553         unsigned long end;
 554         union kvm_mmu_notifier_arg arg;
 555         hva_handler_t handler;
 556         on_lock_fn_t on_lock;
 557         on_unlock_fn_t on_unlock;
 558         bool flush_on_ret;
 559         bool may_block;
 560 };
 561
 562 /*
 563  * Use a dedicated stub instead of NULL to indicate that there is no callback
 564  * function/handler.  The compiler technically can't guarantee that a real
 565  * function will have a non-zero address, and so it will generate code to
 566  * check for !NULL, whereas comparing against a stub will be elided at compile
 567  * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
 568  */
 569 static void kvm_null_fn(void)
 570 {
 571
 572 }
 573 #define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
 574
 575 static const union kvm_mmu_notifier_arg KVM_MMU_NOTIFIER_NO_ARG;
 576
 577 /* Iterate over each memslot intersecting [start, last] (inclusive) range */
 578 #define kvm_for_each_memslot_in_hva_range(node, slots, start, last)          \
 579         for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \
 580              node;                                                           \
 581              node = interval_tree_iter_next(node, start, last))      \
 582
 583 static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 584                                                   const struct kvm_hva_range *range)
 585 {
 586         bool ret = false, locked = false;
 587         struct kvm_gfn_range gfn_range;
 588         struct kvm_memory_slot *slot;
 589         struct kvm_memslots *slots;
 590         int i, idx;
 591
 592         if (WARN_ON_ONCE(range->end <= range->start))
 593                 return 0;
 594
 595         /* A null handler is allowed if and only if on_lock() is provided. */
 596         if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
 597                          IS_KVM_NULL_FN(range->handler)))
 598                 return 0;
 599
 600         idx = srcu_read_lock(&kvm->srcu);
 601
 602         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
 603                 struct interval_tree_node *node;
 604
 605                 slots = __kvm_memslots(kvm, i);
 606                 kvm_for_each_memslot_in_hva_range(node, slots,
 607                                                   range->start, range->end - 1) {
 608                         unsigned long hva_start, hva_end;
 609
 610                         slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]);
 611                         hva_start = max(range->start, slot->userspace_addr);
 612                         hva_end = min(range->end, slot->userspace_addr +
 613                                                   (slot->npages << PAGE_SHIFT));
 614
 615                         /*
 616                          * To optimize for the likely case where the address
 617                          * range is covered by zero or one memslots, don't
 618                          * bother making these conditional (to avoid writes on
 619                          * the second or later invocation of the handler).
 620                          */
 621                         gfn_range.arg = range->arg;
 622                         gfn_range.may_block = range->may_block;
 623
 624                         /*
 625                          * {gfn(page) | page intersects with [hva_start, hva_end)} =
 626                          * {gfn_start, gfn_start+1, ..., gfn_end-1}.
 627                          */
 628                         gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
 629                         gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
 630                         gfn_range.slot = slot;
 631
 632                         if (!locked) {
 633                                 locked = true;
 634                                 KVM_MMU_LOCK(kvm);
 635                                 if (!IS_KVM_NULL_FN(range->on_lock))
 636                                         range->on_lock(kvm, range->start, range->end);
 637                                 if (IS_KVM_NULL_FN(range->handler))
 638                                         break;
 639                         }
 640                         ret |= range->handler(kvm, &gfn_range);
 641                 }
 642         }
 643
 644         if (range->flush_on_ret && ret)
 645                 kvm_flush_remote_tlbs(kvm);
 646
 647         if (locked) {
 648                 KVM_MMU_UNLOCK(kvm);
 649                 if (!IS_KVM_NULL_FN(range->on_unlock))
 650                         range->on_unlock(kvm);
 651         }
 652
 653         srcu_read_unlock(&kvm->srcu, idx);
 654
 655         /* The notifiers are averse to booleans. :-( */
 656         return (int)ret;
 657 }
 658
 659 static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
 660                                                 unsigned long start,
 661                                                 unsigned long end,
 662                                                 union kvm_mmu_notifier_arg arg,
 663                                                 hva_handler_t handler)
 664 {
 665         struct kvm *kvm = mmu_notifier_to_kvm(mn);
 666         const struct kvm_hva_range range = {
 667                 .start          = start,
 668                 .end            = end,
 669                 .arg            = arg,
 670                 .handler        = handler,
 671                 .on_lock        = (void *)kvm_null_fn,
 672                 .on_unlock      = (void *)kvm_null_fn,
 673                 .flush_on_ret   = true,
 674                 .may_block      = false,
 675         };
 676
 677         return __kvm_handle_hva_range(kvm, &range);
 678 }
 679
 680 static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
 681                                                          unsigned long start,
 682                                                          unsigned long end,
 683                                                          hva_handler_t handler)
 684 {
 685         struct kvm *kvm = mmu_notifier_to_kvm(mn);
 686         const struct kvm_hva_range range = {
 687                 .start          = start,
 688                 .end            = end,
 689                 .handler        = handler,
 690                 .on_lock        = (void *)kvm_null_fn,
 691                 .on_unlock      = (void *)kvm_null_fn,
 692                 .flush_on_ret   = false,
 693                 .may_block      = false,
 694         };
 695
 696         return __kvm_handle_hva_range(kvm, &range);
 697 }
 698
 699 static bool kvm_change_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 700 {
 701         /*
 702          * Skipping invalid memslots is correct if and only change_pte() is
 703          * surrounded by invalidate_range_{start,end}(), which is currently
 704          * guaranteed by the primary MMU.  If that ever changes, KVM needs to
 705          * unmap the memslot instead of skipping the memslot to ensure that KVM
 706          * doesn't hold references to the old PFN.
 707          */
 708         WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
 709
 710         if (range->slot->flags & KVM_MEMSLOT_INVALID)
 711                 return false;
 712
 713         return kvm_set_spte_gfn(kvm, range);
 714 }
 715
 716 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 717                                         struct mm_struct *mm,
 718                                         unsigned long address,
 719                                         pte_t pte)
 720 {
 721         struct kvm *kvm = mmu_notifier_to_kvm(mn);
 722         const union kvm_mmu_notifier_arg arg = { .pte = pte };
 723
 724         trace_kvm_set_spte_hva(address);
 725
 726         /*
 727          * .change_pte() must be surrounded by .invalidate_range_{start,end}().
 728          * If mmu_invalidate_in_progress is zero, then no in-progress
 729          * invalidations, including this one, found a relevant memslot at
 730          * start(); rechecking memslots here is unnecessary.  Note, a false
 731          * positive (count elevated by a different invalidation) is sub-optimal
 732          * but functionally ok.
 733          */
 734         WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
 735         if (!READ_ONCE(kvm->mmu_invalidate_in_progress))
 736                 return;
 737
 738         kvm_handle_hva_range(mn, address, address + 1, arg, kvm_change_spte_gfn);
 739 }
 740
 741 void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start,
 742                               unsigned long end)
 743 {
 744         /*
 745          * The count increase must become visible at unlock time as no
 746          * spte can be established without taking the mmu_lock and
 747          * count is also read inside the mmu_lock critical section.
 748          */
 749         kvm->mmu_invalidate_in_progress++;
 750         if (likely(kvm->mmu_invalidate_in_progress == 1)) {
 751                 kvm->mmu_invalidate_range_start = start;
 752                 kvm->mmu_invalidate_range_end = end;
 753         } else {
 754                 /*
 755                  * Fully tracking multiple concurrent ranges has diminishing
 756                  * returns. Keep things simple and just find the minimal range
 757                  * which includes the current and new ranges. As there won't be
 758                  * enough information to subtract a range after its invalidate
 759                  * completes, any ranges invalidated concurrently will
 760                  * accumulate and persist until all outstanding invalidates
 761                  * complete.
 762                  */
 763                 kvm->mmu_invalidate_range_start =
 764                         min(kvm->mmu_invalidate_range_start, start);
 765                 kvm->mmu_invalidate_range_end =
 766                         max(kvm->mmu_invalidate_range_end, end);
 767         }
 768 }
 769
 770 static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 771                                         const struct mmu_notifier_range *range)
 772 {
 773         struct kvm *kvm = mmu_notifier_to_kvm(mn);
 774         const struct kvm_hva_range hva_range = {
 775                 .start          = range->start,
 776                 .end            = range->end,
 777                 .handler        = kvm_unmap_gfn_range,
 778                 .on_lock        = kvm_mmu_invalidate_begin,
 779                 .on_unlock      = kvm_arch_guest_memory_reclaimed,
 780                 .flush_on_ret   = true,
 781                 .may_block      = mmu_notifier_range_blockable(range),
 782         };
 783
 784         trace_kvm_unmap_hva_range(range->start, range->end);
 785
 786         /*
 787          * Prevent memslot modification between range_start() and range_end()
 788          * so that conditionally locking provides the same result in both
 789          * functions.  Without that guarantee, the mmu_invalidate_in_progress
 790          * adjustments will be imbalanced.
 791          *
 792          * Pairs with the decrement in range_end().
 793          */
 794         spin_lock(&kvm->mn_invalidate_lock);
 795         kvm->mn_active_invalidate_count++;
 796         spin_unlock(&kvm->mn_invalidate_lock);
 797
 798         /*
 799          * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e.
 800          * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring
 801          * each cache's lock.  There are relatively few caches in existence at
 802          * any given time, and the caches themselves can check for hva overlap,
 803          * i.e. don't need to rely on memslot overlap checks for performance.
 804          * Because this runs without holding mmu_lock, the pfn caches must use
 805          * mn_active_invalidate_count (see above) instead of
 806          * mmu_invalidate_in_progress.
 807          */
 808         gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end,
 809                                           hva_range.may_block);
 810
 811         __kvm_handle_hva_range(kvm, &hva_range);
 812
 813         return 0;
 814 }
 815
 816 void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start,
 817                             unsigned long end)
 818 {
 819         /*
 820          * This sequence increase will notify the kvm page fault that
 821          * the page that is going to be mapped in the spte could have
 822          * been freed.
 823          */
 824         kvm->mmu_invalidate_seq++;
 825         smp_wmb();
 826         /*
 827          * The above sequence increase must be visible before the
 828          * below count decrease, which is ensured by the smp_wmb above
 829          * in conjunction with the smp_rmb in mmu_invalidate_retry().
 830          */
 831         kvm->mmu_invalidate_in_progress--;
 832 }
 833
 834 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 835                                         const struct mmu_notifier_range *range)
 836 {
 837         struct kvm *kvm = mmu_notifier_to_kvm(mn);
 838         const struct kvm_hva_range hva_range = {
 839                 .start          = range->start,
 840                 .end            = range->end,
 841                 .handler        = (void *)kvm_null_fn,
 842                 .on_lock        = kvm_mmu_invalidate_end,
 843                 .on_unlock      = (void *)kvm_null_fn,
 844                 .flush_on_ret   = false,
 845                 .may_block      = mmu_notifier_range_blockable(range),
 846         };
 847         bool wake;
 848
 849         __kvm_handle_hva_range(kvm, &hva_range);
 850
 851         /* Pairs with the increment in range_start(). */
 852         spin_lock(&kvm->mn_invalidate_lock);
 853         wake = (--kvm->mn_active_invalidate_count == 0);
 854         spin_unlock(&kvm->mn_invalidate_lock);
 855
 856         /*
 857          * There can only be one waiter, since the wait happens under
 858          * slots_lock.
 859          */
 860         if (wake)
 861                 rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
 862
 863         BUG_ON(kvm->mmu_invalidate_in_progress < 0);
 864 }
 865
 866 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 867                                               struct mm_struct *mm,
 868                                               unsigned long start,
 869                                               unsigned long end)
 870 {
 871         trace_kvm_age_hva(start, end);
 872
 873         return kvm_handle_hva_range(mn, start, end, KVM_MMU_NOTIFIER_NO_ARG,
 874                                     kvm_age_gfn);
 875 }
 876
 877 static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
 878                                         struct mm_struct *mm,
 879                                         unsigned long start,
 880                                         unsigned long end)
 881 {
 882         trace_kvm_age_hva(start, end);
 883
 884         /*
 885          * Even though we do not flush TLB, this will still adversely
 886          * affect performance on pre-Haswell Intel EPT, where there is
 887          * no EPT Access Bit to clear so that we have to tear down EPT
 888          * tables instead. If we find this unacceptable, we can always
 889          * add a parameter to kvm_age_hva so that it effectively doesn't
 890          * do anything on clear_young.
 891          *
 892          * Also note that currently we never issue secondary TLB flushes
 893          * from clear_young, leaving this job up to the regular system
 894          * cadence. If we find this inaccurate, we might come up with a
 895          * more sophisticated heuristic later.
 896          */
 897         return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
 898 }
 899
 900 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
 901                                        struct mm_struct *mm,
 902                                        unsigned long address)
 903 {
 904         trace_kvm_test_age_hva(address);
 905
 906         return kvm_handle_hva_range_no_flush(mn, address, address + 1,
 907                                              kvm_test_age_gfn);
 908 }
 909
 910 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
 911                                      struct mm_struct *mm)
 912 {
 913         struct kvm *kvm = mmu_notifier_to_kvm(mn);
 914         int idx;
 915
 916         idx = srcu_read_lock(&kvm->srcu);
 917         kvm_flush_shadow_all(kvm);
 918         srcu_read_unlock(&kvm->srcu, idx);
 919 }
 920
 921 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 922         .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
 923         .invalidate_range_end   = kvm_mmu_notifier_invalidate_range_end,
 924         .clear_flush_young      = kvm_mmu_notifier_clear_flush_young,
 925         .clear_young            = kvm_mmu_notifier_clear_young,
 926         .test_young             = kvm_mmu_notifier_test_young,
 927         .change_pte             = kvm_mmu_notifier_change_pte,
 928         .release                = kvm_mmu_notifier_release,
 929 };
 930
 931 static int kvm_init_mmu_notifier(struct kvm *kvm)
 932 {
 933         kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
 934         return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
 935 }
 936
 937 #else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
 938
 939 static int kvm_init_mmu_notifier(struct kvm *kvm)
 940 {
 941         return 0;
 942 }
 943
 944 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
 945
 946 #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
 947 static int kvm_pm_notifier_call(struct notifier_block *bl,
 948                                 unsigned long state,
 949                                 void *unused)
 950 {
 951         struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);
 952
 953         return kvm_arch_pm_notifier(kvm, state);
 954 }
 955
 956 static void kvm_init_pm_notifier(struct kvm *kvm)
 957 {
 958         kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
 959         /* Suspend KVM before we suspend ftrace, RCU, etc. */
 960         kvm->pm_notifier.priority = INT_MAX;
 961         register_pm_notifier(&kvm->pm_notifier);
 962 }
 963
 964 static void kvm_destroy_pm_notifier(struct kvm *kvm)
 965 {
 966         unregister_pm_notifier(&kvm->pm_notifier);
 967 }
 968 #else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
 969 static void kvm_init_pm_notifier(struct kvm *kvm)
 970 {
 971 }
 972
 973 static void kvm_destroy_pm_notifier(struct kvm *kvm)
 974 {
 975 }
 976 #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
 977
 978 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
 979 {
 980         if (!memslot->dirty_bitmap)
 981                 return;
 982
 983         kvfree(memslot->dirty_bitmap);
 984         memslot->dirty_bitmap = NULL;
 985 }
 986
 987 /* This does not remove the slot from struct kvm_memslots data structures */
 988 static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
 989 {
 990         kvm_destroy_dirty_bitmap(slot);
 991
 992         kvm_arch_free_memslot(kvm, slot);
 993
 994         kfree(slot);
 995 }
 996
 997 static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
 998 {
 999         struct hlist_node *idnode;
1000         struct kvm_memory_slot *memslot;
1001         int bkt;
1002
1003         /*
1004          * The same memslot objects live in both active and inactive sets,
1005          * arbitrarily free using index '1' so the second invocation of this
1006          * function isn't operating over a structure with dangling pointers
1007          * (even though this function isn't actually touching them).
1008          */
1009         if (!slots->node_idx)
1010                 return;
1011
1012         hash_for_each_safe(slots->id_hash, bkt, idnode, memslot, id_node[1])
1013                 kvm_free_memslot(kvm, memslot);
1014 }
1015
1016 static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
1017 {
1018         switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
1019         case KVM_STATS_TYPE_INSTANT:
1020                 return 0444;
1021         case KVM_STATS_TYPE_CUMULATIVE:
1022         case KVM_STATS_TYPE_PEAK:
1023         default:
1024                 return 0644;
1025         }
1026 }
1027
1028
1029 static void kvm_destroy_vm_debugfs(struct kvm *kvm)
1030 {
1031         int i;
1032         int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
1033                                       kvm_vcpu_stats_header.num_desc;
1034
1035         if (IS_ERR(kvm->debugfs_dentry))
1036                 return;
1037
1038         debugfs_remove_recursive(kvm->debugfs_dentry);
1039
1040         if (kvm->debugfs_stat_data) {
1041                 for (i = 0; i < kvm_debugfs_num_entries; i++)
1042                         kfree(kvm->debugfs_stat_data[i]);
1043                 kfree(kvm->debugfs_stat_data);
1044         }
1045 }
1046
1047 static int kvm_create_vm_debugfs(struct kvm *kvm, const char *fdname)
1048 {
1049         static DEFINE_MUTEX(kvm_debugfs_lock);
1050         struct dentry *dent;
1051         char dir_name[ITOA_MAX_LEN * 2];
1052         struct kvm_stat_data *stat_data;
1053         const struct _kvm_stats_desc *pdesc;
1054         int i, ret = -ENOMEM;
1055         int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
1056                                       kvm_vcpu_stats_header.num_desc;
1057
1058         if (!debugfs_initialized())
1059                 return 0;
1060
1061         snprintf(dir_name, sizeof(dir_name), "%d-%s", task_pid_nr(current), fdname);
1062         mutex_lock(&kvm_debugfs_lock);
1063         dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
1064         if (dent) {
1065                 pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
1066                 dput(dent);
1067                 mutex_unlock(&kvm_debugfs_lock);
1068                 return 0;
1069         }
1070         dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
1071         mutex_unlock(&kvm_debugfs_lock);
1072         if (IS_ERR(dent))
1073                 return 0;
1074
1075         kvm->debugfs_dentry = dent;
1076         kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
1077                                          sizeof(*kvm->debugfs_stat_data),
1078                                          GFP_KERNEL_ACCOUNT);
1079         if (!kvm->debugfs_stat_data)
1080                 goto out_err;
1081
1082         for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
1083                 pdesc = &kvm_vm_stats_desc[i];
1084                 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
1085                 if (!stat_data)
1086                         goto out_err;
1087
1088                 stat_data->kvm = kvm;
1089                 stat_data->desc = pdesc;
1090                 stat_data->kind = KVM_STAT_VM;
1091                 kvm->debugfs_stat_data[i] = stat_data;
1092                 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
1093                                     kvm->debugfs_dentry, stat_data,
1094                                     &stat_fops_per_vm);
1095         }
1096
1097         for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
1098                 pdesc = &kvm_vcpu_stats_desc[i];
1099                 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
1100                 if (!stat_data)
1101                         goto out_err;
1102
1103                 stat_data->kvm = kvm;
1104                 stat_data->desc = pdesc;
1105                 stat_data->kind = KVM_STAT_VCPU;
1106                 kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
1107                 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
1108                                     kvm->debugfs_dentry, stat_data,
1109                                     &stat_fops_per_vm);
1110         }
1111
1112         ret = kvm_arch_create_vm_debugfs(kvm);
1113         if (ret)
1114                 goto out_err;
1115
1116         return 0;
1117 out_err:
1118         kvm_destroy_vm_debugfs(kvm);
1119         return ret;
1120 }
1121
1122 /*
1123  * Called after the VM is otherwise initialized, but just before adding it to
1124  * the vm_list.
1125  */
1126 int __weak kvm_arch_post_init_vm(struct kvm *kvm)
1127 {
1128         return 0;
1129 }
1130
1131 /*
1132  * Called just after removing the VM from the vm_list, but before doing any
1133  * other destruction.
1134  */
1135 void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
1136 {
1137 }
1138
1139 /*
1140  * Called after per-vm debugfs created.  When called kvm->debugfs_dentry should
1141  * be setup already, so we can create arch-specific debugfs entries under it.
1142  * Cleanup should be automatic done in kvm_destroy_vm_debugfs() recursively, so
1143  * a per-arch destroy interface is not needed.
1144  */
1145 int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
1146 {
1147         return 0;
1148 }
1149
1150 static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
1151 {
1152         struct kvm *kvm = kvm_arch_alloc_vm();
1153         struct kvm_memslots *slots;
1154         int r = -ENOMEM;
1155         int i, j;
1156
1157         if (!kvm)
1158                 return ERR_PTR(-ENOMEM);
1159
1160         /* KVM is pinned via open("/dev/kvm"), the fd passed to this ioctl(). */
1161         __module_get(kvm_chardev_ops.owner);
1162
1163         KVM_MMU_LOCK_INIT(kvm);
1164         mmgrab(current->mm);
1165         kvm->mm = current->mm;
1166         kvm_eventfd_init(kvm);
1167         mutex_init(&kvm->lock);
1168         mutex_init(&kvm->irq_lock);
1169         mutex_init(&kvm->slots_lock);
1170         mutex_init(&kvm->slots_arch_lock);
1171         spin_lock_init(&kvm->mn_invalidate_lock);
1172         rcuwait_init(&kvm->mn_memslots_update_rcuwait);
1173         xa_init(&kvm->vcpu_array);
1174
1175         INIT_LIST_HEAD(&kvm->gpc_list);
1176         spin_lock_init(&kvm->gpc_lock);
1177
1178         INIT_LIST_HEAD(&kvm->devices);
1179         kvm->max_vcpus = KVM_MAX_VCPUS;
1180
1181         BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
1182
1183         /*
1184          * Force subsequent debugfs file creations to fail if the VM directory
1185          * is not created (by kvm_create_vm_debugfs()).
1186          */
1187         kvm->debugfs_dentry = ERR_PTR(-ENOENT);
1188
1189         snprintf(kvm->stats_id, sizeof(kvm->stats_id), "kvm-%d",
1190                  task_pid_nr(current));
1191
1192         if (init_srcu_struct(&kvm->srcu))
1193                 goto out_err_no_srcu;
1194         if (init_srcu_struct(&kvm->irq_srcu))
1195                 goto out_err_no_irq_srcu;
1196
1197         refcount_set(&kvm->users_count, 1);
1198         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1199                 for (j = 0; j < 2; j++) {
1200                         slots = &kvm->__memslots[i][j];
1201
1202                         atomic_long_set(&slots->last_used_slot, (unsigned long)NULL);
1203                         slots->hva_tree = RB_ROOT_CACHED;
1204                         slots->gfn_tree = RB_ROOT;
1205                         hash_init(slots->id_hash);
1206                         slots->node_idx = j;
1207
1208                         /* Generations must be different for each address space. */
1209                         slots->generation = i;
1210                 }
1211
1212                 rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]);
1213         }
1214
1215         for (i = 0; i < KVM_NR_BUSES; i++) {
1216                 rcu_assign_pointer(kvm->buses[i],
1217                         kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
1218                 if (!kvm->buses[i])
1219                         goto out_err_no_arch_destroy_vm;
1220         }
1221
1222         r = kvm_arch_init_vm(kvm, type);
1223         if (r)
1224                 goto out_err_no_arch_destroy_vm;
1225
1226         r = hardware_enable_all();
1227         if (r)
1228                 goto out_err_no_disable;
1229
1230 #ifdef CONFIG_HAVE_KVM_IRQFD
1231         INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
1232 #endif
1233
1234         r = kvm_init_mmu_notifier(kvm);
1235         if (r)
1236                 goto out_err_no_mmu_notifier;
1237
1238         r = kvm_coalesced_mmio_init(kvm);
1239         if (r < 0)
1240                 goto out_no_coalesced_mmio;
1241
1242         r = kvm_create_vm_debugfs(kvm, fdname);
1243         if (r)
1244                 goto out_err_no_debugfs;
1245
1246         r = kvm_arch_post_init_vm(kvm);
1247         if (r)
1248                 goto out_err;
1249
1250         mutex_lock(&kvm_lock);
1251         list_add(&kvm->vm_list, &vm_list);
1252         mutex_unlock(&kvm_lock);
1253
1254         preempt_notifier_inc();
1255         kvm_init_pm_notifier(kvm);
1256
1257         return kvm;
1258
1259 out_err:
1260         kvm_destroy_vm_debugfs(kvm);
1261 out_err_no_debugfs:
1262         kvm_coalesced_mmio_free(kvm);
1263 out_no_coalesced_mmio:
1264 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
1265         if (kvm->mmu_notifier.ops)
1266                 mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
1267 #endif
1268 out_err_no_mmu_notifier:
1269         hardware_disable_all();
1270 out_err_no_disable:
1271         kvm_arch_destroy_vm(kvm);
1272 out_err_no_arch_destroy_vm:
1273         WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
1274         for (i = 0; i < KVM_NR_BUSES; i++)
1275                 kfree(kvm_get_bus(kvm, i));
1276         cleanup_srcu_struct(&kvm->irq_srcu);
1277 out_err_no_irq_srcu:
1278         cleanup_srcu_struct(&kvm->srcu);
1279 out_err_no_srcu:
1280         kvm_arch_free_vm(kvm);
1281         mmdrop(current->mm);
1282         module_put(kvm_chardev_ops.owner);
1283         return ERR_PTR(r);
1284 }
1285
1286 static void kvm_destroy_devices(struct kvm *kvm)
1287 {
1288         struct kvm_device *dev, *tmp;
1289
1290         /*
1291          * We do not need to take the kvm->lock here, because nobody else
1292          * has a reference to the struct kvm at this point and therefore
1293          * cannot access the devices list anyhow.
1294          */
1295         list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
1296                 list_del(&dev->vm_node);
1297                 dev->ops->destroy(dev);
1298         }
1299 }
1300
1301 static void kvm_destroy_vm(struct kvm *kvm)
1302 {
1303         int i;
1304         struct mm_struct *mm = kvm->mm;
1305
1306         kvm_destroy_pm_notifier(kvm);
1307         kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
1308         kvm_destroy_vm_debugfs(kvm);
1309         kvm_arch_sync_events(kvm);
1310         mutex_lock(&kvm_lock);
1311         list_del(&kvm->vm_list);
1312         mutex_unlock(&kvm_lock);
1313         kvm_arch_pre_destroy_vm(kvm);
1314
1315         kvm_free_irq_routing(kvm);
1316         for (i = 0; i < KVM_NR_BUSES; i++) {
1317                 struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
1318
1319                 if (bus)
1320                         kvm_io_bus_destroy(bus);
1321                 kvm->buses[i] = NULL;
1322         }
1323         kvm_coalesced_mmio_free(kvm);
1324 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
1325         mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
1326         /*
1327          * At this point, pending calls to invalidate_range_start()
1328          * have completed but no more MMU notifiers will run, so
1329          * mn_active_invalidate_count may remain unbalanced.
1330          * No threads can be waiting in kvm_swap_active_memslots() as the
1331          * last reference on KVM has been dropped, but freeing
1332          * memslots would deadlock without this manual intervention.
1333          */
1334         WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
1335         kvm->mn_active_invalidate_count = 0;
1336 #else
1337         kvm_flush_shadow_all(kvm);
1338 #endif
1339         kvm_arch_destroy_vm(kvm);
1340         kvm_destroy_devices(kvm);
1341         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1342                 kvm_free_memslots(kvm, &kvm->__memslots[i][0]);
1343                 kvm_free_memslots(kvm, &kvm->__memslots[i][1]);
1344         }
1345         cleanup_srcu_struct(&kvm->irq_srcu);
1346         cleanup_srcu_struct(&kvm->srcu);
1347         kvm_arch_free_vm(kvm);
1348         preempt_notifier_dec();
1349         hardware_disable_all();
1350         mmdrop(mm);
1351         module_put(kvm_chardev_ops.owner);
1352 }
1353
1354 void kvm_get_kvm(struct kvm *kvm)
1355 {
1356         refcount_inc(&kvm->users_count);
1357 }
1358 EXPORT_SYMBOL_GPL(kvm_get_kvm);
1359
1360 /*
1361  * Make sure the vm is not during destruction, which is a safe version of
1362  * kvm_get_kvm().  Return true if kvm referenced successfully, false otherwise.
1363  */
1364 bool kvm_get_kvm_safe(struct kvm *kvm)
1365 {
1366         return refcount_inc_not_zero(&kvm->users_count);
1367 }
1368 EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);
1369
1370 void kvm_put_kvm(struct kvm *kvm)
1371 {
1372         if (refcount_dec_and_test(&kvm->users_count))
1373                 kvm_destroy_vm(kvm);
1374 }
1375 EXPORT_SYMBOL_GPL(kvm_put_kvm);
1376
1377 /*
1378  * Used to put a reference that was taken on behalf of an object associated
1379  * with a user-visible file descriptor, e.g. a vcpu or device, if installation
1380  * of the new file descriptor fails and the reference cannot be transferred to
1381  * its final owner.  In such cases, the caller is still actively using @kvm and
1382  * will fail miserably if the refcount unexpectedly hits zero.
1383  */
1384 void kvm_put_kvm_no_destroy(struct kvm *kvm)
1385 {
1386         WARN_ON(refcount_dec_and_test(&kvm->users_count));
1387 }
1388 EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);
1389
1390 static int kvm_vm_release(struct inode *inode, struct file *filp)
1391 {
1392         struct kvm *kvm = filp->private_data;
1393
1394         kvm_irqfd_release(kvm);
1395
1396         kvm_put_kvm(kvm);
1397         return 0;
1398 }
1399
1400 /*
1401  * Allocation size is twice as large as the actual dirty bitmap size.
1402  * See kvm_vm_ioctl_get_dirty_log() why this is needed.
1403  */
1404 static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
1405 {
1406         unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot);
1407
1408         memslot->dirty_bitmap = __vcalloc(2, dirty_bytes, GFP_KERNEL_ACCOUNT);
1409         if (!memslot->dirty_bitmap)
1410                 return -ENOMEM;
1411
1412         return 0;
1413 }
1414
1415 static struct kvm_memslots *kvm_get_inactive_memslots(struct kvm *kvm, int as_id)
1416 {
1417         struct kvm_memslots *active = __kvm_memslots(kvm, as_id);
1418         int node_idx_inactive = active->node_idx ^ 1;
1419
1420         return &kvm->__memslots[as_id][node_idx_inactive];
1421 }
1422
1423 /*
1424  * Helper to get the address space ID when one of memslot pointers may be NULL.
1425  * This also serves as a sanity that at least one of the pointers is non-NULL,
1426  * and that their address space IDs don't diverge.
1427  */
1428 static int kvm_memslots_get_as_id(struct kvm_memory_slot *a,
1429                                   struct kvm_memory_slot *b)
1430 {
1431         if (WARN_ON_ONCE(!a && !b))
1432                 return 0;
1433
1434         if (!a)
1435                 return b->as_id;
1436         if (!b)
1437                 return a->as_id;
1438
1439         WARN_ON_ONCE(a->as_id != b->as_id);
1440         return a->as_id;
1441 }
1442
1443 static void kvm_insert_gfn_node(struct kvm_memslots *slots,
1444                                 struct kvm_memory_slot *slot)
1445 {
1446         struct rb_root *gfn_tree = &slots->gfn_tree;
1447         struct rb_node **node, *parent;
1448         int idx = slots->node_idx;
1449
1450         parent = NULL;
1451         for (node = &gfn_tree->rb_node; *node; ) {
1452                 struct kvm_memory_slot *tmp;
1453
1454                 tmp = container_of(*node, struct kvm_memory_slot, gfn_node[idx]);
1455                 parent = *node;
1456                 if (slot->base_gfn < tmp->base_gfn)
1457                         node = &(*node)->rb_left;
1458                 else if (slot->base_gfn > tmp->base_gfn)
1459                         node = &(*node)->rb_right;
1460                 else
1461                         BUG();
1462         }
1463
1464         rb_link_node(&slot->gfn_node[idx], parent, node);
1465         rb_insert_color(&slot->gfn_node[idx], gfn_tree);
1466 }
1467
1468 static void kvm_erase_gfn_node(struct kvm_memslots *slots,
1469                                struct kvm_memory_slot *slot)
1470 {
1471         rb_erase(&slot->gfn_node[slots->node_idx], &slots->gfn_tree);
1472 }
1473
1474 static void kvm_replace_gfn_node(struct kvm_memslots *slots,
1475                                  struct kvm_memory_slot *old,
1476                                  struct kvm_memory_slot *new)
1477 {
1478         int idx = slots->node_idx;
1479
1480         WARN_ON_ONCE(old->base_gfn != new->base_gfn);
1481
1482         rb_replace_node(&old->gfn_node[idx], &new->gfn_node[idx],
1483                         &slots->gfn_tree);
1484 }
1485
1486 /*
1487  * Replace @old with @new in the inactive memslots.
1488  *
1489  * With NULL @old this simply adds @new.
1490  * With NULL @new this simply removes @old.
1491  *
1492  * If @new is non-NULL its hva_node[slots_idx] range has to be set
1493  * appropriately.
1494  */
1495 static void kvm_replace_memslot(struct kvm *kvm,
1496                                 struct kvm_memory_slot *old,
1497                                 struct kvm_memory_slot *new)
1498 {
1499         int as_id = kvm_memslots_get_as_id(old, new);
1500         struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
1501         int idx = slots->node_idx;
1502
1503         if (old) {
1504                 hash_del(&old->id_node[idx]);
1505                 interval_tree_remove(&old->hva_node[idx], &slots->hva_tree);
1506
1507                 if ((long)old == atomic_long_read(&slots->last_used_slot))
1508                         atomic_long_set(&slots->last_used_slot, (long)new);
1509
1510                 if (!new) {
1511                         kvm_erase_gfn_node(slots, old);
1512                         return;
1513                 }
1514         }
1515
1516         /*
1517          * Initialize @new's hva range.  Do this even when replacing an @old
1518          * slot, kvm_copy_memslot() deliberately does not touch node data.
1519          */
1520         new->hva_node[idx].start = new->userspace_addr;
1521         new->hva_node[idx].last = new->userspace_addr +
1522                                   (new->npages << PAGE_SHIFT) - 1;
1523
1524         /*
1525          * (Re)Add the new memslot.  There is no O(1) interval_tree_replace(),
1526          * hva_node needs to be swapped with remove+insert even though hva can't
1527          * change when replacing an existing slot.
1528          */
1529         hash_add(slots->id_hash, &new->id_node[idx], new->id);
1530         interval_tree_insert(&new->hva_node[idx], &slots->hva_tree);
1531
1532         /*
1533          * If the memslot gfn is unchanged, rb_replace_node() can be used to
1534          * switch the node in the gfn tree instead of removing the old and
1535          * inserting the new as two separate operations. Replacement is a
1536          * single O(1) operation versus two O(log(n)) operations for
1537          * remove+insert.
1538          */
1539         if (old && old->base_gfn == new->base_gfn) {
1540                 kvm_replace_gfn_node(slots, old, new);
1541         } else {
1542                 if (old)
1543                         kvm_erase_gfn_node(slots, old);
1544                 kvm_insert_gfn_node(slots, new);
1545         }
1546 }
1547
1548 static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
1549 {
1550         u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
1551
1552 #ifdef __KVM_HAVE_READONLY_MEM
1553         valid_flags |= KVM_MEM_READONLY;
1554 #endif
1555
1556         if (mem->flags & ~valid_flags)
1557                 return -EINVAL;
1558
1559         return 0;
1560 }
1561
1562 static void kvm_swap_active_memslots(struct kvm *kvm, int as_id)
1563 {
1564         struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
1565
1566         /* Grab the generation from the activate memslots. */
1567         u64 gen = __kvm_memslots(kvm, as_id)->generation;
1568
1569         WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
1570         slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1571
1572         /*
1573          * Do not store the new memslots while there are invalidations in
1574          * progress, otherwise the locking in invalidate_range_start and
1575          * invalidate_range_end will be unbalanced.
1576          */
1577         spin_lock(&kvm->mn_invalidate_lock);
1578         prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
1579         while (kvm->mn_active_invalidate_count) {
1580                 set_current_state(TASK_UNINTERRUPTIBLE);
1581                 spin_unlock(&kvm->mn_invalidate_lock);
1582                 schedule();
1583                 spin_lock(&kvm->mn_invalidate_lock);
1584         }
1585         finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
1586         rcu_assign_pointer(kvm->memslots[as_id], slots);
1587         spin_unlock(&kvm->mn_invalidate_lock);
1588
1589         /*
1590          * Acquired in kvm_set_memslot. Must be released before synchronize
1591          * SRCU below in order to avoid deadlock with another thread
1592          * acquiring the slots_arch_lock in an srcu critical section.
1593          */
1594         mutex_unlock(&kvm->slots_arch_lock);
1595
1596         synchronize_srcu_expedited(&kvm->srcu);
1597
1598         /*
1599          * Increment the new memslot generation a second time, dropping the
1600          * update in-progress flag and incrementing the generation based on
1601          * the number of address spaces.  This provides a unique and easily
1602          * identifiable generation number while the memslots are in flux.
1603          */
1604         gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1605
1606         /*
1607          * Generations must be unique even across address spaces.  We do not need
1608          * a global counter for that, instead the generation space is evenly split
1609          * across address spaces.  For example, with two address spaces, address
1610          * space 0 will use generations 0, 2, 4, ... while address space 1 will
1611          * use generations 1, 3, 5, ...
1612          */
1613         gen += KVM_ADDRESS_SPACE_NUM;
1614
1615         kvm_arch_memslots_updated(kvm, gen);
1616
1617         slots->generation = gen;
1618 }
1619
1620 static int kvm_prepare_memory_region(struct kvm *kvm,
1621                                      const struct kvm_memory_slot *old,
1622                                      struct kvm_memory_slot *new,
1623                                      enum kvm_mr_change change)
1624 {
1625         int r;
1626
1627         /*
1628          * If dirty logging is disabled, nullify the bitmap; the old bitmap
1629          * will be freed on "commit".  If logging is enabled in both old and
1630          * new, reuse the existing bitmap.  If logging is enabled only in the
1631          * new and KVM isn't using a ring buffer, allocate and initialize a
1632          * new bitmap.
1633          */
1634         if (change != KVM_MR_DELETE) {
1635                 if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
1636                         new->dirty_bitmap = NULL;
1637                 else if (old && old->dirty_bitmap)
1638                         new->dirty_bitmap = old->dirty_bitmap;
1639                 else if (kvm_use_dirty_bitmap(kvm)) {
1640                         r = kvm_alloc_dirty_bitmap(new);
1641                         if (r)
1642                                 return r;
1643
1644                         if (kvm_dirty_log_manual_protect_and_init_set(kvm))
1645                                 bitmap_set(new->dirty_bitmap, 0, new->npages);
1646                 }
1647         }
1648
1649         r = kvm_arch_prepare_memory_region(kvm, old, new, change);
1650
1651         /* Free the bitmap on failure if it was allocated above. */
1652         if (r && new && new->dirty_bitmap && (!old || !old->dirty_bitmap))
1653                 kvm_destroy_dirty_bitmap(new);
1654
1655         return r;
1656 }
1657
1658 static void kvm_commit_memory_region(struct kvm *kvm,
1659                                      struct kvm_memory_slot *old,
1660                                      const struct kvm_memory_slot *new,
1661                                      enum kvm_mr_change change)
1662 {
1663         int old_flags = old ? old->flags : 0;
1664         int new_flags = new ? new->flags : 0;
1665         /*
1666          * Update the total number of memslot pages before calling the arch
1667          * hook so that architectures can consume the result directly.
1668          */
1669         if (change == KVM_MR_DELETE)
1670                 kvm->nr_memslot_pages -= old->npages;
1671         else if (change == KVM_MR_CREATE)
1672                 kvm->nr_memslot_pages += new->npages;
1673
1674         if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) {
1675                 int change = (new_flags & KVM_MEM_LOG_DIRTY_PAGES) ? 1 : -1;
1676                 atomic_set(&kvm->nr_memslots_dirty_logging,
1677                            atomic_read(&kvm->nr_memslots_dirty_logging) + change);
1678         }
1679
1680         kvm_arch_commit_memory_region(kvm, old, new, change);
1681
1682         switch (change) {
1683         case KVM_MR_CREATE:
1684                 /* Nothing more to do. */
1685                 break;
1686         case KVM_MR_DELETE:
1687                 /* Free the old memslot and all its metadata. */
1688                 kvm_free_memslot(kvm, old);
1689                 break;
1690         case KVM_MR_MOVE:
1691         case KVM_MR_FLAGS_ONLY:
1692                 /*
1693                  * Free the dirty bitmap as needed; the below check encompasses
1694                  * both the flags and whether a ring buffer is being used)
1695                  */
1696                 if (old->dirty_bitmap && !new->dirty_bitmap)
1697                         kvm_destroy_dirty_bitmap(old);
1698
1699                 /*
1700                  * The final quirk.  Free the detached, old slot, but only its
1701                  * memory, not any metadata.  Metadata, including arch specific
1702                  * data, may be reused by @new.
1703                  */
1704                 kfree(old);
1705                 break;
1706         default:
1707                 BUG();
1708         }
1709 }
1710
1711 /*
1712  * Activate @new, which must be installed in the inactive slots by the caller,
1713  * by swapping the active slots and then propagating @new to @old once @old is
1714  * unreachable and can be safely modified.
1715  *
1716  * With NULL @old this simply adds @new to @active (while swapping the sets).
1717  * With NULL @new this simply removes @old from @active and frees it
1718  * (while also swapping the sets).
1719  */
1720 static void kvm_activate_memslot(struct kvm *kvm,
1721                                  struct kvm_memory_slot *old,
1722                                  struct kvm_memory_slot *new)
1723 {
1724         int as_id = kvm_memslots_get_as_id(old, new);
1725
1726         kvm_swap_active_memslots(kvm, as_id);
1727
1728         /* Propagate the new memslot to the now inactive memslots. */
1729         kvm_replace_memslot(kvm, old, new);
1730 }
1731
1732 static void kvm_copy_memslot(struct kvm_memory_slot *dest,
1733                              const struct kvm_memory_slot *src)
1734 {
1735         dest->base_gfn = src->base_gfn;
1736         dest->npages = src->npages;
1737         dest->dirty_bitmap = src->dirty_bitmap;
1738         dest->arch = src->arch;
1739         dest->userspace_addr = src->userspace_addr;
1740         dest->flags = src->flags;
1741         dest->id = src->id;
1742         dest->as_id = src->as_id;
1743 }
1744
1745 static void kvm_invalidate_memslot(struct kvm *kvm,
1746                                    struct kvm_memory_slot *old,
1747                                    struct kvm_memory_slot *invalid_slot)
1748 {
1749         /*
1750          * Mark the current slot INVALID.  As with all memslot modifications,
1751          * this must be done on an unreachable slot to avoid modifying the
1752          * current slot in the active tree.
1753          */
1754         kvm_copy_memslot(invalid_slot, old);
1755         invalid_slot->flags |= KVM_MEMSLOT_INVALID;
1756         kvm_replace_memslot(kvm, old, invalid_slot);
1757
1758         /*
1759          * Activate the slot that is now marked INVALID, but don't propagate
1760          * the slot to the now inactive slots. The slot is either going to be
1761          * deleted or recreated as a new slot.
1762          */
1763         kvm_swap_active_memslots(kvm, old->as_id);
1764
1765         /*
1766          * From this point no new shadow pages pointing to a deleted, or moved,
1767          * memslot will be created.  Validation of sp->gfn happens in:
1768          *      - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
1769          *      - kvm_is_visible_gfn (mmu_check_root)
1770          */
1771         kvm_arch_flush_shadow_memslot(kvm, old);
1772         kvm_arch_guest_memory_reclaimed(kvm);
1773
1774         /* Was released by kvm_swap_active_memslots(), reacquire. */
1775         mutex_lock(&kvm->slots_arch_lock);
1776
1777         /*
1778          * Copy the arch-specific field of the newly-installed slot back to the
1779          * old slot as the arch data could have changed between releasing
1780          * slots_arch_lock in kvm_swap_active_memslots() and re-acquiring the lock
1781          * above.  Writers are required to retrieve memslots *after* acquiring
1782          * slots_arch_lock, thus the active slot's data is guaranteed to be fresh.
1783          */
1784         old->arch = invalid_slot->arch;
1785 }
1786
1787 static void kvm_create_memslot(struct kvm *kvm,
1788                                struct kvm_memory_slot *new)
1789 {
1790         /* Add the new memslot to the inactive set and activate. */
1791         kvm_replace_memslot(kvm, NULL, new);
1792         kvm_activate_memslot(kvm, NULL, new);
1793 }
1794
1795 static void kvm_delete_memslot(struct kvm *kvm,
1796                                struct kvm_memory_slot *old,
1797                                struct kvm_memory_slot *invalid_slot)
1798 {
1799         /*
1800          * Remove the old memslot (in the inactive memslots) by passing NULL as
1801          * the "new" slot, and for the invalid version in the active slots.
1802          */
1803         kvm_replace_memslot(kvm, old, NULL);
1804         kvm_activate_memslot(kvm, invalid_slot, NULL);
1805 }
1806
1807 static void kvm_move_memslot(struct kvm *kvm,
1808                              struct kvm_memory_slot *old,
1809                              struct kvm_memory_slot *new,
1810                              struct kvm_memory_slot *invalid_slot)
1811 {
1812         /*
1813          * Replace the old memslot in the inactive slots, and then swap slots
1814          * and replace the current INVALID with the new as well.
1815          */
1816         kvm_replace_memslot(kvm, old, new);
1817         kvm_activate_memslot(kvm, invalid_slot, new);
1818 }
1819
1820 static void kvm_update_flags_memslot(struct kvm *kvm,
1821                                      struct kvm_memory_slot *old,
1822                                      struct kvm_memory_slot *new)
1823 {
1824         /*
1825          * Similar to the MOVE case, but the slot doesn't need to be zapped as
1826          * an intermediate step. Instead, the old memslot is simply replaced
1827          * with a new, updated copy in both memslot sets.
1828          */
1829         kvm_replace_memslot(kvm, old, new);
1830         kvm_activate_memslot(kvm, old, new);
1831 }
1832
1833 static int kvm_set_memslot(struct kvm *kvm,
1834                            struct kvm_memory_slot *old,
1835                            struct kvm_memory_slot *new,
1836                            enum kvm_mr_change change)
1837 {
1838         struct kvm_memory_slot *invalid_slot;
1839         int r;
1840
1841         /*
1842          * Released in kvm_swap_active_memslots().
1843          *
1844          * Must be held from before the current memslots are copied until after
1845          * the new memslots are installed with rcu_assign_pointer, then
1846          * released before the synchronize srcu in kvm_swap_active_memslots().
1847          *
1848          * When modifying memslots outside of the slots_lock, must be held
1849          * before reading the pointer to the current memslots until after all
1850          * changes to those memslots are complete.
1851          *
1852          * These rules ensure that installing new memslots does not lose
1853          * changes made to the previous memslots.
1854          */
1855         mutex_lock(&kvm->slots_arch_lock);
1856
1857         /*
1858          * Invalidate the old slot if it's being deleted or moved.  This is
1859          * done prior to actually deleting/moving the memslot to allow vCPUs to
1860          * continue running by ensuring there are no mappings or shadow pages
1861          * for the memslot when it is deleted/moved.  Without pre-invalidation
1862          * (and without a lock), a window would exist between effecting the
1863          * delete/move and committing the changes in arch code where KVM or a
1864          * guest could access a non-existent memslot.
1865          *
1866          * Modifications are done on a temporary, unreachable slot.  The old
1867          * slot needs to be preserved in case a later step fails and the
1868          * invalidation needs to be reverted.
1869          */
1870         if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1871                 invalid_slot = kzalloc(sizeof(*invalid_slot), GFP_KERNEL_ACCOUNT);
1872                 if (!invalid_slot) {
1873                         mutex_unlock(&kvm->slots_arch_lock);
1874                         return -ENOMEM;
1875                 }
1876                 kvm_invalidate_memslot(kvm, old, invalid_slot);
1877         }
1878
1879         r = kvm_prepare_memory_region(kvm, old, new, change);
1880         if (r) {
1881                 /*
1882                  * For DELETE/MOVE, revert the above INVALID change.  No
1883                  * modifications required since the original slot was preserved
1884                  * in the inactive slots.  Changing the active memslots also
1885                  * release slots_arch_lock.
1886                  */
1887                 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1888                         kvm_activate_memslot(kvm, invalid_slot, old);
1889                         kfree(invalid_slot);
1890                 } else {
1891                         mutex_unlock(&kvm->slots_arch_lock);
1892                 }
1893                 return r;
1894         }
1895
1896         /*
1897          * For DELETE and MOVE, the working slot is now active as the INVALID
1898          * version of the old slot.  MOVE is particularly special as it reuses
1899          * the old slot and returns a copy of the old slot (in working_slot).
1900          * For CREATE, there is no old slot.  For DELETE and FLAGS_ONLY, the
1901          * old slot is detached but otherwise preserved.
1902          */
1903         if (change == KVM_MR_CREATE)
1904                 kvm_create_memslot(kvm, new);
1905         else if (change == KVM_MR_DELETE)
1906                 kvm_delete_memslot(kvm, old, invalid_slot);
1907         else if (change == KVM_MR_MOVE)
1908                 kvm_move_memslot(kvm, old, new, invalid_slot);
1909         else if (change == KVM_MR_FLAGS_ONLY)
1910                 kvm_update_flags_memslot(kvm, old, new);
1911         else
1912                 BUG();
1913
1914         /* Free the temporary INVALID slot used for DELETE and MOVE. */
1915         if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
1916                 kfree(invalid_slot);
1917
1918         /*
1919          * No need to refresh new->arch, changes after dropping slots_arch_lock
1920          * will directly hit the final, active memslot.  Architectures are
1921          * responsible for knowing that new->arch may be stale.
1922          */
1923         kvm_commit_memory_region(kvm, old, new, change);
1924
1925         return 0;
1926 }
1927
1928 static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
1929                                       gfn_t start, gfn_t end)
1930 {
1931         struct kvm_memslot_iter iter;
1932
1933         kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
1934                 if (iter.slot->id != id)
1935                         return true;
1936         }
1937
1938         return false;
1939 }
1940
1941 /*
1942  * Allocate some memory and give it an address in the guest physical address
1943  * space.
1944  *
1945  * Discontiguous memory is allowed, mostly for framebuffers.
1946  *
1947  * Must be called holding kvm->slots_lock for write.
1948  */
1949 int __kvm_set_memory_region(struct kvm *kvm,
1950                             const struct kvm_userspace_memory_region *mem)
1951 {
1952         struct kvm_memory_slot *old, *new;
1953         struct kvm_memslots *slots;
1954         enum kvm_mr_change change;
1955         unsigned long npages;
1956         gfn_t base_gfn;
1957         int as_id, id;
1958         int r;
1959
1960         r = check_memory_region_flags(mem);
1961         if (r)
1962                 return r;
1963
1964         as_id = mem->slot >> 16;
1965         id = (u16)mem->slot;
1966
1967         /* General sanity checks */
1968         if ((mem->memory_size & (PAGE_SIZE - 1)) ||
1969             (mem->memory_size != (unsigned long)mem->memory_size))
1970                 return -EINVAL;
1971         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
1972                 return -EINVAL;
1973         /* We can read the guest memory with __xxx_user() later on. */
1974         if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
1975             (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
1976              !access_ok((void __user *)(unsigned long)mem->userspace_addr,
1977                         mem->memory_size))
1978                 return -EINVAL;
1979         if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
1980                 return -EINVAL;
1981         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1982                 return -EINVAL;
1983         if ((mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES)
1984                 return -EINVAL;
1985
1986         slots = __kvm_memslots(kvm, as_id);
1987
1988         /*
1989          * Note, the old memslot (and the pointer itself!) may be invalidated
1990          * and/or destroyed by kvm_set_memslot().
1991          */
1992         old = id_to_memslot(slots, id);
1993
1994         if (!mem->memory_size) {
1995                 if (!old || !old->npages)
1996                         return -EINVAL;
1997
1998                 if (WARN_ON_ONCE(kvm->nr_memslot_pages < old->npages))
1999                         return -EIO;
2000
2001                 return kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE);
2002         }
2003
2004         base_gfn = (mem->guest_phys_addr >> PAGE_SHIFT);
2005         npages = (mem->memory_size >> PAGE_SHIFT);
2006
2007         if (!old || !old->npages) {
2008                 change = KVM_MR_CREATE;
2009
2010                 /*
2011                  * To simplify KVM internals, the total number of pages across
2012                  * all memslots must fit in an unsigned long.
2013                  */
2014                 if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
2015                         return -EINVAL;
2016         } else { /* Modify an existing slot. */
2017                 if ((mem->userspace_addr != old->userspace_addr) ||
2018                     (npages != old->npages) ||
2019                     ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
2020                         return -EINVAL;
2021
2022                 if (base_gfn != old->base_gfn)
2023                         change = KVM_MR_MOVE;
2024                 else if (mem->flags != old->flags)
2025                         change = KVM_MR_FLAGS_ONLY;
2026                 else /* Nothing to change. */
2027                         return 0;
2028         }
2029
2030         if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) &&
2031             kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages))
2032                 return -EEXIST;
2033
2034         /* Allocate a slot that will persist in the memslot. */
2035         new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
2036         if (!new)
2037                 return -ENOMEM;
2038
2039         new->as_id = as_id;
2040         new->id = id;
2041         new->base_gfn = base_gfn;
2042         new->npages = npages;
2043         new->flags = mem->flags;
2044         new->userspace_addr = mem->userspace_addr;
2045
2046         r = kvm_set_memslot(kvm, old, new, change);
2047         if (r)
2048                 kfree(new);
2049         return r;
2050 }
2051 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
2052
2053 int kvm_set_memory_region(struct kvm *kvm,
2054                           const struct kvm_userspace_memory_region *mem)
2055 {
2056         int r;
2057
2058         mutex_lock(&kvm->slots_lock);
2059         r = __kvm_set_memory_region(kvm, mem);
2060         mutex_unlock(&kvm->slots_lock);
2061         return r;
2062 }
2063 EXPORT_SYMBOL_GPL(kvm_set_memory_region);
2064
2065 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
2066                                           struct kvm_userspace_memory_region *mem)
2067 {
2068         if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
2069                 return -EINVAL;
2070
2071         return kvm_set_memory_region(kvm, mem);
2072 }
2073
2074 #ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
2075 /**
2076  * kvm_get_dirty_log - get a snapshot of dirty pages
2077  * @kvm:        pointer to kvm instance
2078  * @log:        slot id and address to which we copy the log
2079  * @is_dirty:   set to '1' if any dirty pages were found
2080  * @memslot:    set to the associated memslot, always valid on success
2081  */
2082 int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
2083                       int *is_dirty, struct kvm_memory_slot **memslot)
2084 {
2085         struct kvm_memslots *slots;
2086         int i, as_id, id;
2087         unsigned long n;
2088         unsigned long any = 0;
2089
2090         /* Dirty ring tracking may be exclusive to dirty log tracking */
2091         if (!kvm_use_dirty_bitmap(kvm))
2092                 return -ENXIO;
2093
2094         *memslot = NULL;
2095         *is_dirty = 0;
2096
2097         as_id = log->slot >> 16;
2098         id = (u16)log->slot;
2099         if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
2100                 return -EINVAL;
2101
2102         slots = __kvm_memslots(kvm, as_id);
2103         *memslot = id_to_memslot(slots, id);
2104         if (!(*memslot) || !(*memslot)->dirty_bitmap)
2105                 return -ENOENT;
2106
2107         kvm_arch_sync_dirty_log(kvm, *memslot);
2108
2109         n = kvm_dirty_bitmap_bytes(*memslot);
2110
2111         for (i = 0; !any && i < n/sizeof(long); ++i)
2112                 any = (*memslot)->dirty_bitmap[i];
2113
2114         if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
2115                 return -EFAULT;
2116
2117         if (any)
2118                 *is_dirty = 1;
2119         return 0;
2120 }
2121 EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
2122
2123 #else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
2124 /**
2125  * kvm_get_dirty_log_protect - get a snapshot of dirty pages
2126  *      and reenable dirty page tracking for the corresponding pages.
2127  * @kvm:        pointer to kvm instance
2128  * @log:        slot id and address to which we copy the log
2129  *
2130  * We need to keep it in mind that VCPU threads can write to the bitmap
2131  * concurrently. So, to avoid losing track of dirty pages we keep the
2132  * following order:
2133  *
2134  *    1. Take a snapshot of the bit and clear it if needed.
2135  *    2. Write protect the corresponding page.
2136  *    3. Copy the snapshot to the userspace.
2137  *    4. Upon return caller flushes TLB's if needed.
2138  *
2139  * Between 2 and 4, the guest may write to the page using the remaining TLB
2140  * entry.  This is not a problem because the page is reported dirty using
2141  * the snapshot taken before and step 4 ensures that writes done after
2142  * exiting to userspace will be logged for the next call.
2143  *
2144  */
2145 static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
2146 {
2147         struct kvm_memslots *slots;
2148         struct kvm_memory_slot *memslot;
2149         int i, as_id, id;
2150         unsigned long n;
2151         unsigned long *dirty_bitmap;
2152         unsigned long *dirty_bitmap_buffer;
2153         bool flush;
2154
2155         /* Dirty ring tracking may be exclusive to dirty log tracking */
2156         if (!kvm_use_dirty_bitmap(kvm))
2157                 return -ENXIO;
2158
2159         as_id = log->slot >> 16;
2160         id = (u16)log->slot;
2161         if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
2162                 return -EINVAL;
2163
2164         slots = __kvm_memslots(kvm, as_id);
2165         memslot = id_to_memslot(slots, id);
2166         if (!memslot || !memslot->dirty_bitmap)
2167                 return -ENOENT;
2168
2169         dirty_bitmap = memslot->dirty_bitmap;
2170
2171         kvm_arch_sync_dirty_log(kvm, memslot);
2172
2173         n = kvm_dirty_bitmap_bytes(memslot);
2174         flush = false;
2175         if (kvm->manual_dirty_log_protect) {
2176                 /*
2177                  * Unlike kvm_get_dirty_log, we always return false in *flush,
2178                  * because no flush is needed until KVM_CLEAR_DIRTY_LOG.  There
2179                  * is some code duplication between this function and
2180                  * kvm_get_dirty_log, but hopefully all architecture
2181                  * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
2182                  * can be eliminated.
2183                  */
2184                 dirty_bitmap_buffer = dirty_bitmap;
2185         } else {
2186                 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2187                 memset(dirty_bitmap_buffer, 0, n);
2188
2189                 KVM_MMU_LOCK(kvm);
2190                 for (i = 0; i < n / sizeof(long); i++) {
2191                         unsigned long mask;
2192                         gfn_t offset;
2193
2194                         if (!dirty_bitmap[i])
2195                                 continue;
2196
2197                         flush = true;
2198                         mask = xchg(&dirty_bitmap[i], 0);
2199                         dirty_bitmap_buffer[i] = mask;
2200
2201                         offset = i * BITS_PER_LONG;
2202                         kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2203                                                                 offset, mask);
2204                 }
2205                 KVM_MMU_UNLOCK(kvm);
2206         }
2207
2208         if (flush)
2209                 kvm_flush_remote_tlbs_memslot(kvm, memslot);
2210
2211         if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
2212                 return -EFAULT;
2213         return 0;
2214 }
2215
2216
2217 /**
2218  * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
2219  * @kvm: kvm instance
2220  * @log: slot id and address to which we copy the log
2221  *
2222  * Steps 1-4 below provide general overview of dirty page logging. See
2223  * kvm_get_dirty_log_protect() function description for additional details.
2224  *
2225  * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
2226  * always flush the TLB (step 4) even if previous step failed  and the dirty
2227  * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
2228  * does not preclude user space subsequent dirty log read. Flushing TLB ensures
2229  * writes will be marked dirty for next log read.
2230  *
2231  *   1. Take a snapshot of the bit and clear it if needed.
2232  *   2. Write protect the corresponding page.
2233  *   3. Copy the snapshot to the userspace.
2234  *   4. Flush TLB's if needed.
2235  */
2236 static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2237                                       struct kvm_dirty_log *log)
2238 {
2239         int r;
2240
2241         mutex_lock(&kvm->slots_lock);
2242
2243         r = kvm_get_dirty_log_protect(kvm, log);
2244
2245         mutex_unlock(&kvm->slots_lock);
2246         return r;
2247 }
2248
2249 /**
2250  * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
2251  *      and reenable dirty page tracking for the corresponding pages.
2252  * @kvm:        pointer to kvm instance
2253  * @log:        slot id and address from which to fetch the bitmap of dirty pages
2254  */
2255 static int kvm_clear_dirty_log_protect(struct kvm *kvm,
2256                                        struct kvm_clear_dirty_log *log)
2257 {
2258         struct kvm_memslots *slots;
2259         struct kvm_memory_slot *memslot;
2260         int as_id, id;
2261         gfn_t offset;
2262         unsigned long i, n;
2263         unsigned long *dirty_bitmap;
2264         unsigned long *dirty_bitmap_buffer;
2265         bool flush;
2266
2267         /* Dirty ring tracking may be exclusive to dirty log tracking */
2268         if (!kvm_use_dirty_bitmap(kvm))
2269                 return -ENXIO;
2270
2271         as_id = log->slot >> 16;
2272         id = (u16)log->slot;
2273         if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
2274                 return -EINVAL;
2275
2276         if (log->first_page & 63)
2277                 return -EINVAL;
2278
2279         slots = __kvm_memslots(kvm, as_id);
2280         memslot = id_to_memslot(slots, id);
2281         if (!memslot || !memslot->dirty_bitmap)
2282                 return -ENOENT;
2283
2284         dirty_bitmap = memslot->dirty_bitmap;
2285
2286         n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
2287
2288         if (log->first_page > memslot->npages ||
2289             log->num_pages > memslot->npages - log->first_page ||
2290             (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
2291             return -EINVAL;
2292
2293         kvm_arch_sync_dirty_log(kvm, memslot);
2294
2295         flush = false;
2296         dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2297         if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
2298                 return -EFAULT;
2299
2300         KVM_MMU_LOCK(kvm);
2301         for (offset = log->first_page, i = offset / BITS_PER_LONG,
2302                  n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
2303              i++, offset += BITS_PER_LONG) {
2304                 unsigned long mask = *dirty_bitmap_buffer++;
2305                 atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
2306                 if (!mask)
2307                         continue;
2308
2309                 mask &= atomic_long_fetch_andnot(mask, p);
2310
2311                 /*
2312                  * mask contains the bits that really have been cleared.  This
2313                  * never includes any bits beyond the length of the memslot (if
2314                  * the length is not aligned to 64 pages), therefore it is not
2315                  * a problem if userspace sets them in log->dirty_bitmap.
2316                 */
2317                 if (mask) {
2318                         flush = true;
2319                         kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2320                                                                 offset, mask);
2321                 }
2322         }
2323         KVM_MMU_UNLOCK(kvm);
2324
2325         if (flush)
2326                 kvm_flush_remote_tlbs_memslot(kvm, memslot);
2327
2328         return 0;
2329 }
2330
2331 static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
2332                                         struct kvm_clear_dirty_log *log)
2333 {
2334         int r;
2335
2336         mutex_lock(&kvm->slots_lock);
2337
2338         r = kvm_clear_dirty_log_protect(kvm, log);
2339
2340         mutex_unlock(&kvm->slots_lock);
2341         return r;
2342 }
2343 #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
2344
2345 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
2346 {
2347         return __gfn_to_memslot(kvm_memslots(kvm), gfn);
2348 }
2349 EXPORT_SYMBOL_GPL(gfn_to_memslot);
2350
2351 struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
2352 {
2353         struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
2354         u64 gen = slots->generation;
2355         struct kvm_memory_slot *slot;
2356
2357         /*
2358          * This also protects against using a memslot from a different address space,
2359          * since different address spaces have different generation numbers.
2360          */
2361         if (unlikely(gen != vcpu->last_used_slot_gen)) {
2362                 vcpu->last_used_slot = NULL;
2363                 vcpu->last_used_slot_gen = gen;
2364         }
2365
2366         slot = try_get_memslot(vcpu->last_used_slot, gfn);
2367         if (slot)
2368                 return slot;
2369
2370         /*
2371          * Fall back to searching all memslots. We purposely use
2372          * search_memslots() instead of __gfn_to_memslot() to avoid
2373          * thrashing the VM-wide last_used_slot in kvm_memslots.
2374          */
2375         slot = search_memslots(slots, gfn, false);
2376         if (slot) {
2377                 vcpu->last_used_slot = slot;
2378                 return slot;
2379         }
2380
2381         return NULL;
2382 }
2383
2384 bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
2385 {
2386         struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
2387
2388         return kvm_is_visible_memslot(memslot);
2389 }
2390 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
2391
2392 bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2393 {
2394         struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2395
2396         return kvm_is_visible_memslot(memslot);
2397 }
2398 EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);
2399
2400 unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
2401 {
2402         struct vm_area_struct *vma;
2403         unsigned long addr, size;
2404
2405         size = PAGE_SIZE;
2406
2407         addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
2408         if (kvm_is_error_hva(addr))
2409                 return PAGE_SIZE;
2410
2411         mmap_read_lock(current->mm);
2412         vma = find_vma(current->mm, addr);
2413         if (!vma)
2414                 goto out;
2415
2416         size = vma_kernel_pagesize(vma);
2417
2418 out:
2419         mmap_read_unlock(current->mm);
2420
2421         return size;
2422 }
2423
2424 static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
2425 {
2426         return slot->flags & KVM_MEM_READONLY;
2427 }
2428
2429 static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
2430                                        gfn_t *nr_pages, bool write)
2431 {
2432         if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
2433                 return KVM_HVA_ERR_BAD;
2434
2435         if (memslot_is_readonly(slot) && write)
2436                 return KVM_HVA_ERR_RO_BAD;
2437
2438         if (nr_pages)
2439                 *nr_pages = slot->npages - (gfn - slot->base_gfn);
2440
2441         return __gfn_to_hva_memslot(slot, gfn);
2442 }
2443
2444 static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2445                                      gfn_t *nr_pages)
2446 {
2447         return __gfn_to_hva_many(slot, gfn, nr_pages, true);
2448 }
2449
2450 unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
2451                                         gfn_t gfn)
2452 {
2453         return gfn_to_hva_many(slot, gfn, NULL);
2454 }
2455 EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
2456
2457 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
2458 {
2459         return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
2460 }
2461 EXPORT_SYMBOL_GPL(gfn_to_hva);
2462
2463 unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
2464 {
2465         return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
2466 }
2467 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
2468
2469 /*
2470  * Return the hva of a @gfn and the R/W attribute if possible.
2471  *
2472  * @slot: the kvm_memory_slot which contains @gfn
2473  * @gfn: the gfn to be translated
2474  * @writable: used to return the read/write attribute of the @slot if the hva
2475  * is valid and @writable is not NULL
2476  */
2477 unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
2478                                       gfn_t gfn, bool *writable)
2479 {
2480         unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
2481
2482         if (!kvm_is_error_hva(hva) && writable)
2483                 *writable = !memslot_is_readonly(slot);
2484
2485         return hva;
2486 }
2487
2488 unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
2489 {
2490         struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2491
2492         return gfn_to_hva_memslot_prot(slot, gfn, writable);
2493 }
2494
2495 unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
2496 {
2497         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2498
2499         return gfn_to_hva_memslot_prot(slot, gfn, writable);
2500 }
2501
2502 static inline int check_user_page_hwpoison(unsigned long addr)
2503 {
2504         int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
2505
2506         rc = get_user_pages(addr, 1, flags, NULL);
2507         return rc == -EHWPOISON;
2508 }
2509
2510 /*
2511  * The fast path to get the writable pfn which will be stored in @pfn,
2512  * true indicates success, otherwise false is returned.  It's also the
2513  * only part that runs if we can in atomic context.
2514  */
2515 static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
2516                             bool *writable, kvm_pfn_t *pfn)
2517 {
2518         struct page *page[1];
2519
2520         /*
2521          * Fast pin a writable pfn only if it is a write fault request
2522          * or the caller allows to map a writable pfn for a read fault
2523          * request.
2524          */
2525         if (!(write_fault || writable))
2526                 return false;
2527
2528         if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
2529                 *pfn = page_to_pfn(page[0]);
2530
2531                 if (writable)
2532                         *writable = true;
2533                 return true;
2534         }
2535
2536         return false;
2537 }
2538
2539 /*
2540  * The slow path to get the pfn of the specified host virtual address,
2541  * 1 indicates success, -errno is returned if error is detected.
2542  */
2543 static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
2544                            bool interruptible, bool *writable, kvm_pfn_t *pfn)
2545 {
2546         /*
2547          * When a VCPU accesses a page that is not mapped into the secondary
2548          * MMU, we lookup the page using GUP to map it, so the guest VCPU can
2549          * make progress. We always want to honor NUMA hinting faults in that
2550          * case, because GUP usage corresponds to memory accesses from the VCPU.
2551          * Otherwise, we'd not trigger NUMA hinting faults once a page is
2552          * mapped into the secondary MMU and gets accessed by a VCPU.
2553          *
2554          * Note that get_user_page_fast_only() and FOLL_WRITE for now
2555          * implicitly honor NUMA hinting faults and don't need this flag.
2556          */
2557         unsigned int flags = FOLL_HWPOISON | FOLL_HONOR_NUMA_FAULT;
2558         struct page *page;
2559         int npages;
2560
2561         might_sleep();
2562
2563         if (writable)
2564                 *writable = write_fault;
2565
2566         if (write_fault)
2567                 flags |= FOLL_WRITE;
2568         if (async)
2569                 flags |= FOLL_NOWAIT;
2570         if (interruptible)
2571                 flags |= FOLL_INTERRUPTIBLE;
2572
2573         npages = get_user_pages_unlocked(addr, 1, &page, flags);
2574         if (npages != 1)
2575                 return npages;
2576
2577         /* map read fault as writable if possible */
2578         if (unlikely(!write_fault) && writable) {
2579                 struct page *wpage;
2580
2581                 if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
2582                         *writable = true;
2583                         put_page(page);
2584                         page = wpage;
2585                 }
2586         }
2587         *pfn = page_to_pfn(page);
2588         return npages;
2589 }
2590
2591 static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
2592 {
2593         if (unlikely(!(vma->vm_flags & VM_READ)))
2594                 return false;
2595
2596         if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
2597                 return false;
2598
2599         return true;
2600 }
2601
2602 static int kvm_try_get_pfn(kvm_pfn_t pfn)
2603 {
2604         struct page *page = kvm_pfn_to_refcounted_page(pfn);
2605
2606         if (!page)
2607                 return 1;
2608
2609         return get_page_unless_zero(page);
2610 }
2611
2612 static int hva_to_pfn_remapped(struct vm_area_struct *vma,
2613                                unsigned long addr, bool write_fault,
2614                                bool *writable, kvm_pfn_t *p_pfn)
2615 {
2616         kvm_pfn_t pfn;
2617         pte_t *ptep;
2618         pte_t pte;
2619         spinlock_t *ptl;
2620         int r;
2621
2622         r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
2623         if (r) {
2624                 /*
2625                  * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
2626                  * not call the fault handler, so do it here.
2627                  */
2628                 bool unlocked = false;
2629                 r = fixup_user_fault(current->mm, addr,
2630                                      (write_fault ? FAULT_FLAG_WRITE : 0),
2631                                      &unlocked);
2632                 if (unlocked)
2633                         return -EAGAIN;
2634                 if (r)
2635                         return r;
2636
2637                 r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
2638                 if (r)
2639                         return r;
2640         }
2641
2642         pte = ptep_get(ptep);
2643
2644         if (write_fault && !pte_write(pte)) {
2645                 pfn = KVM_PFN_ERR_RO_FAULT;
2646                 goto out;
2647         }
2648
2649         if (writable)
2650                 *writable = pte_write(pte);
2651         pfn = pte_pfn(pte);
2652
2653         /*
2654          * Get a reference here because callers of *hva_to_pfn* and
2655          * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
2656          * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
2657          * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will
2658          * simply do nothing for reserved pfns.
2659          *
2660          * Whoever called remap_pfn_range is also going to call e.g.
2661          * unmap_mapping_range before the underlying pages are freed,
2662          * causing a call to our MMU notifier.
2663          *
2664          * Certain IO or PFNMAP mappings can be backed with valid
2665          * struct pages, but be allocated without refcounting e.g.,
2666          * tail pages of non-compound higher order allocations, which
2667          * would then underflow the refcount when the caller does the
2668          * required put_page. Don't allow those pages here.
2669          */
2670         if (!kvm_try_get_pfn(pfn))
2671                 r = -EFAULT;
2672
2673 out:
2674         pte_unmap_unlock(ptep, ptl);
2675         *p_pfn = pfn;
2676
2677         return r;
2678 }
2679
2680 /*
2681  * Pin guest page in memory and return its pfn.
2682  * @addr: host virtual address which maps memory to the guest
2683  * @atomic: whether this function can sleep
2684  * @interruptible: whether the process can be interrupted by non-fatal signals
2685  * @async: whether this function need to wait IO complete if the
2686  *         host page is not in the memory
2687  * @write_fault: whether we should get a writable host page
2688  * @writable: whether it allows to map a writable host page for !@write_fault
2689  *
2690  * The function will map a writable host page for these two cases:
2691  * 1): @write_fault = true
2692  * 2): @write_fault = false && @writable, @writable will tell the caller
2693  *     whether the mapping is writable.
2694  */
2695 kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
2696                      bool *async, bool write_fault, bool *writable)
2697 {
2698         struct vm_area_struct *vma;
2699         kvm_pfn_t pfn;
2700         int npages, r;
2701
2702         /* we can do it either atomically or asynchronously, not both */
2703         BUG_ON(atomic && async);
2704
2705         if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
2706                 return pfn;
2707
2708         if (atomic)
2709                 return KVM_PFN_ERR_FAULT;
2710
2711         npages = hva_to_pfn_slow(addr, async, write_fault, interruptible,
2712                                  writable, &pfn);
2713         if (npages == 1)
2714                 return pfn;
2715         if (npages == -EINTR)
2716                 return KVM_PFN_ERR_SIGPENDING;
2717
2718         mmap_read_lock(current->mm);
2719         if (npages == -EHWPOISON ||
2720               (!async && check_user_page_hwpoison(addr))) {
2721                 pfn = KVM_PFN_ERR_HWPOISON;
2722                 goto exit;
2723         }
2724
2725 retry:
2726         vma = vma_lookup(current->mm, addr);
2727
2728         if (vma == NULL)
2729                 pfn = KVM_PFN_ERR_FAULT;
2730         else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
2731                 r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn);
2732                 if (r == -EAGAIN)
2733                         goto retry;
2734                 if (r < 0)
2735                         pfn = KVM_PFN_ERR_FAULT;
2736         } else {
2737                 if (async && vma_is_valid(vma, write_fault))
2738                         *async = true;
2739                 pfn = KVM_PFN_ERR_FAULT;
2740         }
2741 exit:
2742         mmap_read_unlock(current->mm);
2743         return pfn;
2744 }
2745
2746 kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
2747                                bool atomic, bool interruptible, bool *async,
2748                                bool write_fault, bool *writable, hva_t *hva)
2749 {
2750         unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
2751
2752         if (hva)
2753                 *hva = addr;
2754
2755         if (addr == KVM_HVA_ERR_RO_BAD) {
2756                 if (writable)
2757                         *writable = false;
2758                 return KVM_PFN_ERR_RO_FAULT;
2759         }
2760
2761         if (kvm_is_error_hva(addr)) {
2762                 if (writable)
2763                         *writable = false;
2764                 return KVM_PFN_NOSLOT;
2765         }
2766
2767         /* Do not map writable pfn in the readonly memslot. */
2768         if (writable && memslot_is_readonly(slot)) {
2769                 *writable = false;
2770                 writable = NULL;
2771         }
2772
2773         return hva_to_pfn(addr, atomic, interruptible, async, write_fault,
2774                           writable);
2775 }
2776 EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
2777
2778 kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
2779                       bool *writable)
2780 {
2781         return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false,
2782                                     NULL, write_fault, writable, NULL);
2783 }
2784 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
2785
2786 kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
2787 {
2788         return __gfn_to_pfn_memslot(slot, gfn, false, false, NULL, true,
2789                                     NULL, NULL);
2790 }
2791 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
2792
2793 kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn)
2794 {
2795         return __gfn_to_pfn_memslot(slot, gfn, true, false, NULL, true,
2796                                     NULL, NULL);
2797 }
2798 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
2799
2800 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
2801 {
2802         return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2803 }
2804 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
2805
2806 kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
2807 {
2808         return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
2809 }
2810 EXPORT_SYMBOL_GPL(gfn_to_pfn);
2811
2812 kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2813 {
2814         return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2815 }
2816 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
2817
2818 int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2819                             struct page **pages, int nr_pages)
2820 {
2821         unsigned long addr;
2822         gfn_t entry = 0;
2823
2824         addr = gfn_to_hva_many(slot, gfn, &entry);
2825         if (kvm_is_error_hva(addr))
2826                 return -1;
2827
2828         if (entry < nr_pages)
2829                 return 0;
2830
2831         return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
2832 }
2833 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
2834
2835 /*
2836  * Do not use this helper unless you are absolutely certain the gfn _must_ be
2837  * backed by 'struct page'.  A valid example is if the backing memslot is
2838  * controlled by KVM.  Note, if the returned page is valid, it's refcount has
2839  * been elevated by gfn_to_pfn().
2840  */
2841 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
2842 {
2843         struct page *page;
2844         kvm_pfn_t pfn;
2845
2846         pfn = gfn_to_pfn(kvm, gfn);
2847
2848         if (is_error_noslot_pfn(pfn))
2849                 return KVM_ERR_PTR_BAD_PAGE;
2850
2851         page = kvm_pfn_to_refcounted_page(pfn);
2852         if (!page)
2853                 return KVM_ERR_PTR_BAD_PAGE;
2854
2855         return page;
2856 }
2857 EXPORT_SYMBOL_GPL(gfn_to_page);
2858
2859 void kvm_release_pfn(kvm_pfn_t pfn, bool dirty)
2860 {
2861         if (dirty)
2862                 kvm_release_pfn_dirty(pfn);
2863         else
2864                 kvm_release_pfn_clean(pfn);
2865 }
2866
2867 int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
2868 {
2869         kvm_pfn_t pfn;
2870         void *hva = NULL;
2871         struct page *page = KVM_UNMAPPED_PAGE;
2872
2873         if (!map)
2874                 return -EINVAL;
2875
2876         pfn = gfn_to_pfn(vcpu->kvm, gfn);
2877         if (is_error_noslot_pfn(pfn))
2878                 return -EINVAL;
2879
2880         if (pfn_valid(pfn)) {
2881                 page = pfn_to_page(pfn);
2882                 hva = kmap(page);
2883 #ifdef CONFIG_HAS_IOMEM
2884         } else {
2885                 hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
2886 #endif
2887         }
2888
2889         if (!hva)
2890                 return -EFAULT;
2891
2892         map->page = page;
2893         map->hva = hva;
2894         map->pfn = pfn;
2895         map->gfn = gfn;
2896
2897         return 0;
2898 }
2899 EXPORT_SYMBOL_GPL(kvm_vcpu_map);
2900
2901 void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
2902 {
2903         if (!map)
2904                 return;
2905
2906         if (!map->hva)
2907                 return;
2908
2909         if (map->page != KVM_UNMAPPED_PAGE)
2910                 kunmap(map->page);
2911 #ifdef CONFIG_HAS_IOMEM
2912         else
2913                 memunmap(map->hva);
2914 #endif
2915
2916         if (dirty)
2917                 kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
2918
2919         kvm_release_pfn(map->pfn, dirty);
2920
2921         map->hva = NULL;
2922         map->page = NULL;
2923 }
2924 EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
2925
2926 static bool kvm_is_ad_tracked_page(struct page *page)
2927 {
2928         /*
2929          * Per page-flags.h, pages tagged PG_reserved "should in general not be
2930          * touched (e.g. set dirty) except by its owner".
2931          */
2932         return !PageReserved(page);
2933 }
2934
2935 static void kvm_set_page_dirty(struct page *page)
2936 {
2937         if (kvm_is_ad_tracked_page(page))
2938                 SetPageDirty(page);
2939 }
2940
2941 static void kvm_set_page_accessed(struct page *page)
2942 {
2943         if (kvm_is_ad_tracked_page(page))
2944                 mark_page_accessed(page);
2945 }
2946
2947 void kvm_release_page_clean(struct page *page)
2948 {
2949         WARN_ON(is_error_page(page));
2950
2951         kvm_set_page_accessed(page);
2952         put_page(page);
2953 }
2954 EXPORT_SYMBOL_GPL(kvm_release_page_clean);
2955
2956 void kvm_release_pfn_clean(kvm_pfn_t pfn)
2957 {
2958         struct page *page;
2959
2960         if (is_error_noslot_pfn(pfn))
2961                 return;
2962
2963         page = kvm_pfn_to_refcounted_page(pfn);
2964         if (!page)
2965                 return;
2966
2967         kvm_release_page_clean(page);
2968 }
2969 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
2970
2971 void kvm_release_page_dirty(struct page *page)
2972 {
2973         WARN_ON(is_error_page(page));
2974
2975         kvm_set_page_dirty(page);
2976         kvm_release_page_clean(page);
2977 }
2978 EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
2979
2980 void kvm_release_pfn_dirty(kvm_pfn_t pfn)
2981 {
2982         struct page *page;
2983
2984         if (is_error_noslot_pfn(pfn))
2985                 return;
2986
2987         page = kvm_pfn_to_refcounted_page(pfn);
2988         if (!page)
2989                 return;
2990
2991         kvm_release_page_dirty(page);
2992 }
2993 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
2994
2995 /*
2996  * Note, checking for an error/noslot pfn is the caller's responsibility when
2997  * directly marking a page dirty/accessed.  Unlike the "release" helpers, the
2998  * "set" helpers are not to be used when the pfn might point at garbage.
2999  */
3000 void kvm_set_pfn_dirty(kvm_pfn_t pfn)
3001 {
3002         if (WARN_ON(is_error_noslot_pfn(pfn)))
3003                 return;
3004
3005         if (pfn_valid(pfn))
3006                 kvm_set_page_dirty(pfn_to_page(pfn));
3007 }
3008 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
3009
3010 void kvm_set_pfn_accessed(kvm_pfn_t pfn)
3011 {
3012         if (WARN_ON(is_error_noslot_pfn(pfn)))
3013                 return;
3014
3015         if (pfn_valid(pfn))
3016                 kvm_set_page_accessed(pfn_to_page(pfn));
3017 }
3018 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
3019
3020 static int next_segment(unsigned long len, int offset)
3021 {
3022         if (len > PAGE_SIZE - offset)
3023                 return PAGE_SIZE - offset;
3024         else
3025                 return len;
3026 }
3027
3028 static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
3029                                  void *data, int offset, int len)
3030 {
3031         int r;
3032         unsigned long addr;
3033
3034         addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
3035         if (kvm_is_error_hva(addr))
3036                 return -EFAULT;
3037         r = __copy_from_user(data, (void __user *)addr + offset, len);
3038         if (r)
3039                 return -EFAULT;
3040         return 0;
3041 }
3042
3043 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
3044                         int len)
3045 {
3046         struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
3047
3048         return __kvm_read_guest_page(slot, gfn, data, offset, len);
3049 }
3050 EXPORT_SYMBOL_GPL(kvm_read_guest_page);
3051
3052 int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
3053                              int offset, int len)
3054 {
3055         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3056
3057         return __kvm_read_guest_page(slot, gfn, data, offset, len);
3058 }
3059 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
3060
3061 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
3062 {
3063         gfn_t gfn = gpa >> PAGE_SHIFT;
3064         int seg;
3065         int offset = offset_in_page(gpa);
3066         int ret;
3067
3068         while ((seg = next_segment(len, offset)) != 0) {
3069                 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
3070                 if (ret < 0)
3071                         return ret;
3072                 offset = 0;
3073                 len -= seg;
3074                 data += seg;
3075                 ++gfn;
3076         }
3077         return 0;
3078 }
3079 EXPORT_SYMBOL_GPL(kvm_read_guest);
3080
3081 int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
3082 {
3083         gfn_t gfn = gpa >> PAGE_SHIFT;
3084         int seg;
3085         int offset = offset_in_page(gpa);
3086         int ret;
3087
3088         while ((seg = next_segment(len, offset)) != 0) {
3089                 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
3090                 if (ret < 0)
3091                         return ret;
3092                 offset = 0;
3093                 len -= seg;
3094                 data += seg;
3095                 ++gfn;
3096         }
3097         return 0;
3098 }
3099 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
3100
3101 static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
3102                                    void *data, int offset, unsigned long len)
3103 {
3104         int r;
3105         unsigned long addr;
3106
3107         addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
3108         if (kvm_is_error_hva(addr))
3109                 return -EFAULT;
3110         pagefault_disable();
3111         r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
3112         pagefault_enable();
3113         if (r)
3114                 return -EFAULT;
3115         return 0;
3116 }
3117
3118 int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
3119                                void *data, unsigned long len)
3120 {
3121         gfn_t gfn = gpa >> PAGE_SHIFT;
3122         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3123         int offset = offset_in_page(gpa);
3124
3125         return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
3126 }
3127 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
3128
3129 static int __kvm_write_guest_page(struct kvm *kvm,
3130                                   struct kvm_memory_slot *memslot, gfn_t gfn,
3131                                   const void *data, int offset, int len)
3132 {
3133         int r;
3134         unsigned long addr;
3135
3136         addr = gfn_to_hva_memslot(memslot, gfn);
3137         if (kvm_is_error_hva(addr))
3138                 return -EFAULT;
3139         r = __copy_to_user((void __user *)addr + offset, data, len);
3140         if (r)
3141                 return -EFAULT;
3142         mark_page_dirty_in_slot(kvm, memslot, gfn);
3143         return 0;
3144 }
3145
3146 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
3147                          const void *data, int offset, int len)
3148 {
3149         struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
3150
3151         return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
3152 }
3153 EXPORT_SYMBOL_GPL(kvm_write_guest_page);
3154
3155 int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
3156                               const void *data, int offset, int len)
3157 {
3158         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3159
3160         return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
3161 }
3162 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
3163
3164 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
3165                     unsigned long len)
3166 {
3167         gfn_t gfn = gpa >> PAGE_SHIFT;
3168         int seg;
3169         int offset = offset_in_page(gpa);
3170         int ret;
3171
3172         while ((seg = next_segment(len, offset)) != 0) {
3173                 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
3174                 if (ret < 0)
3175                         return ret;
3176                 offset = 0;
3177                 len -= seg;
3178                 data += seg;
3179                 ++gfn;
3180         }
3181         return 0;
3182 }
3183 EXPORT_SYMBOL_GPL(kvm_write_guest);
3184
3185 int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
3186                          unsigned long len)
3187 {
3188         gfn_t gfn = gpa >> PAGE_SHIFT;
3189         int seg;
3190         int offset = offset_in_page(gpa);
3191         int ret;
3192
3193         while ((seg = next_segment(len, offset)) != 0) {
3194                 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
3195                 if (ret < 0)
3196                         return ret;
3197                 offset = 0;
3198                 len -= seg;
3199                 data += seg;
3200                 ++gfn;
3201         }
3202         return 0;
3203 }
3204 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
3205
3206 static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
3207                                        struct gfn_to_hva_cache *ghc,
3208                                        gpa_t gpa, unsigned long len)
3209 {
3210         int offset = offset_in_page(gpa);
3211         gfn_t start_gfn = gpa >> PAGE_SHIFT;
3212         gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
3213         gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
3214         gfn_t nr_pages_avail;
3215
3216         /* Update ghc->generation before performing any error checks. */
3217         ghc->generation = slots->generation;
3218
3219         if (start_gfn > end_gfn) {
3220                 ghc->hva = KVM_HVA_ERR_BAD;
3221                 return -EINVAL;
3222         }
3223
3224         /*
3225          * If the requested region crosses two memslots, we still
3226          * verify that the entire region is valid here.
3227          */
3228         for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
3229                 ghc->memslot = __gfn_to_memslot(slots, start_gfn);
3230                 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
3231                                            &nr_pages_avail);
3232                 if (kvm_is_error_hva(ghc->hva))
3233                         return -EFAULT;
3234         }
3235
3236         /* Use the slow path for cross page reads and writes. */
3237         if (nr_pages_needed == 1)
3238                 ghc->hva += offset;
3239         else
3240                 ghc->memslot = NULL;
3241
3242         ghc->gpa = gpa;
3243         ghc->len = len;
3244         return 0;
3245 }
3246
3247 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3248                               gpa_t gpa, unsigned long len)
3249 {
3250         struct kvm_memslots *slots = kvm_memslots(kvm);
3251         return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
3252 }
3253 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
3254
3255 int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3256                                   void *data, unsigned int offset,
3257                                   unsigned long len)
3258 {
3259         struct kvm_memslots *slots = kvm_memslots(kvm);
3260         int r;
3261         gpa_t gpa = ghc->gpa + offset;
3262
3263         if (WARN_ON_ONCE(len + offset > ghc->len))
3264                 return -EINVAL;
3265
3266         if (slots->generation != ghc->generation) {
3267                 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3268                         return -EFAULT;
3269         }
3270
3271         if (kvm_is_error_hva(ghc->hva))
3272                 return -EFAULT;
3273
3274         if (unlikely(!ghc->memslot))
3275                 return kvm_write_guest(kvm, gpa, data, len);
3276
3277         r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
3278         if (r)
3279                 return -EFAULT;
3280         mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT);
3281
3282         return 0;
3283 }
3284 EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
3285
3286 int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3287                            void *data, unsigned long len)
3288 {
3289         return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
3290 }
3291 EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
3292
3293 int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3294                                  void *data, unsigned int offset,
3295                                  unsigned long len)
3296 {
3297         struct kvm_memslots *slots = kvm_memslots(kvm);
3298         int r;
3299         gpa_t gpa = ghc->gpa + offset;
3300
3301         if (WARN_ON_ONCE(len + offset > ghc->len))
3302                 return -EINVAL;
3303
3304         if (slots->generation != ghc->generation) {
3305                 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3306                         return -EFAULT;
3307         }
3308
3309         if (kvm_is_error_hva(ghc->hva))
3310                 return -EFAULT;
3311
3312         if (unlikely(!ghc->memslot))
3313                 return kvm_read_guest(kvm, gpa, data, len);
3314
3315         r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
3316         if (r)
3317                 return -EFAULT;
3318
3319         return 0;
3320 }
3321 EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
3322
3323 int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3324                           void *data, unsigned long len)
3325 {
3326         return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
3327 }
3328 EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
3329
3330 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
3331 {
3332         const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
3333         gfn_t gfn = gpa >> PAGE_SHIFT;
3334         int seg;
3335         int offset = offset_in_page(gpa);
3336         int ret;
3337
3338         while ((seg = next_segment(len, offset)) != 0) {
3339                 ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, len);
3340                 if (ret < 0)
3341                         return ret;
3342                 offset = 0;
3343                 len -= seg;
3344                 ++gfn;
3345         }
3346         return 0;
3347 }
3348 EXPORT_SYMBOL_GPL(kvm_clear_guest);
3349
3350 void mark_page_dirty_in_slot(struct kvm *kvm,
3351                              const struct kvm_memory_slot *memslot,
3352                              gfn_t gfn)
3353 {
3354         struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
3355
3356 #ifdef CONFIG_HAVE_KVM_DIRTY_RING
3357         if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm))
3358                 return;
3359
3360         WARN_ON_ONCE(!vcpu && !kvm_arch_allow_write_without_running_vcpu(kvm));
3361 #endif
3362
3363         if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
3364                 unsigned long rel_gfn = gfn - memslot->base_gfn;
3365                 u32 slot = (memslot->as_id << 16) | memslot->id;
3366
3367                 if (kvm->dirty_ring_size && vcpu)
3368                         kvm_dirty_ring_push(vcpu, slot, rel_gfn);
3369                 else if (memslot->dirty_bitmap)
3370                         set_bit_le(rel_gfn, memslot->dirty_bitmap);
3371         }
3372 }
3373 EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
3374
3375 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
3376 {
3377         struct kvm_memory_slot *memslot;
3378
3379         memslot = gfn_to_memslot(kvm, gfn);
3380         mark_page_dirty_in_slot(kvm, memslot, gfn);
3381 }
3382 EXPORT_SYMBOL_GPL(mark_page_dirty);
3383
3384 void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
3385 {
3386         struct kvm_memory_slot *memslot;
3387
3388         memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3389         mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
3390 }
3391 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
3392
3393 void kvm_sigset_activate(struct kvm_vcpu *vcpu)
3394 {
3395         if (!vcpu->sigset_active)
3396                 return;
3397
3398         /*
3399          * This does a lockless modification of ->real_blocked, which is fine
3400          * because, only current can change ->real_blocked and all readers of
3401          * ->real_blocked don't care as long ->real_blocked is always a subset
3402          * of ->blocked.
3403          */
3404         sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
3405 }
3406
3407 void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
3408 {
3409         if (!vcpu->sigset_active)
3410                 return;
3411
3412         sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
3413         sigemptyset(&current->real_blocked);
3414 }
3415
3416 static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
3417 {
3418         unsigned int old, val, grow, grow_start;
3419
3420         old = val = vcpu->halt_poll_ns;
3421         grow_start = READ_ONCE(halt_poll_ns_grow_start);
3422         grow = READ_ONCE(halt_poll_ns_grow);
3423         if (!grow)
3424                 goto out;
3425
3426         val *= grow;
3427         if (val < grow_start)
3428                 val = grow_start;
3429
3430         vcpu->halt_poll_ns = val;
3431 out:
3432         trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
3433 }
3434
3435 static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
3436 {
3437         unsigned int old, val, shrink, grow_start;
3438
3439         old = val = vcpu->halt_poll_ns;
3440         shrink = READ_ONCE(halt_poll_ns_shrink);
3441         grow_start = READ_ONCE(halt_poll_ns_grow_start);
3442         if (shrink == 0)
3443                 val = 0;
3444         else
3445                 val /= shrink;
3446
3447         if (val < grow_start)
3448                 val = 0;
3449
3450         vcpu->halt_poll_ns = val;
3451         trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
3452 }
3453
3454 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
3455 {
3456         int ret = -EINTR;
3457         int idx = srcu_read_lock(&vcpu->kvm->srcu);
3458
3459         if (kvm_arch_vcpu_runnable(vcpu))
3460                 goto out;
3461         if (kvm_cpu_has_pending_timer(vcpu))
3462                 goto out;
3463         if (signal_pending(current))
3464                 goto out;
3465         if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
3466                 goto out;
3467
3468         ret = 0;
3469 out:
3470         srcu_read_unlock(&vcpu->kvm->srcu, idx);
3471         return ret;
3472 }
3473
3474 /*
3475  * Block the vCPU until the vCPU is runnable, an event arrives, or a signal is
3476  * pending.  This is mostly used when halting a vCPU, but may also be used
3477  * directly for other vCPU non-runnable states, e.g. x86's Wait-For-SIPI.
3478  */
3479 bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
3480 {
3481         struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
3482         bool waited = false;
3483
3484         vcpu->stat.generic.blocking = 1;
3485
3486         preempt_disable();
3487         kvm_arch_vcpu_blocking(vcpu);
3488         prepare_to_rcuwait(wait);
3489         preempt_enable();
3490
3491         for (;;) {
3492                 set_current_state(TASK_INTERRUPTIBLE);
3493
3494                 if (kvm_vcpu_check_block(vcpu) < 0)
3495                         break;
3496
3497                 waited = true;
3498                 schedule();
3499         }
3500
3501         preempt_disable();
3502         finish_rcuwait(wait);
3503         kvm_arch_vcpu_unblocking(vcpu);
3504         preempt_enable();
3505
3506         vcpu->stat.generic.blocking = 0;
3507
3508         return waited;
3509 }
3510
3511 static inline void update_halt_poll_stats(struct kvm_vcpu *vcpu, ktime_t start,
3512                                           ktime_t end, bool success)
3513 {
3514         struct kvm_vcpu_stat_generic *stats = &vcpu->stat.generic;
3515         u64 poll_ns = ktime_to_ns(ktime_sub(end, start));
3516
3517         ++vcpu->stat.generic.halt_attempted_poll;
3518
3519         if (success) {
3520                 ++vcpu->stat.generic.halt_successful_poll;
3521
3522                 if (!vcpu_valid_wakeup(vcpu))
3523                         ++vcpu->stat.generic.halt_poll_invalid;
3524
3525                 stats->halt_poll_success_ns += poll_ns;
3526                 KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_success_hist, poll_ns);
3527         } else {
3528                 stats->halt_poll_fail_ns += poll_ns;
3529                 KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_fail_hist, poll_ns);
3530         }
3531 }
3532
3533 static unsigned int kvm_vcpu_max_halt_poll_ns(struct kvm_vcpu *vcpu)
3534 {
3535         struct kvm *kvm = vcpu->kvm;
3536
3537         if (kvm->override_halt_poll_ns) {
3538                 /*
3539                  * Ensure kvm->max_halt_poll_ns is not read before
3540                  * kvm->override_halt_poll_ns.
3541                  *
3542                  * Pairs with the smp_wmb() when enabling KVM_CAP_HALT_POLL.
3543                  */
3544                 smp_rmb();
3545                 return READ_ONCE(kvm->max_halt_poll_ns);
3546         }
3547
3548         return READ_ONCE(halt_poll_ns);
3549 }
3550
3551 /*
3552  * Emulate a vCPU halt condition, e.g. HLT on x86, WFI on arm, etc...  If halt
3553  * polling is enabled, busy wait for a short time before blocking to avoid the
3554  * expensive block+unblock sequence if a wake event arrives soon after the vCPU
3555  * is halted.
3556  */
3557 void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
3558 {
3559         unsigned int max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
3560         bool halt_poll_allowed = !kvm_arch_no_poll(vcpu);
3561         ktime_t start, cur, poll_end;
3562         bool waited = false;
3563         bool do_halt_poll;
3564         u64 halt_ns;
3565
3566         if (vcpu->halt_poll_ns > max_halt_poll_ns)
3567                 vcpu->halt_poll_ns = max_halt_poll_ns;
3568
3569         do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns;
3570
3571         start = cur = poll_end = ktime_get();
3572         if (do_halt_poll) {
3573                 ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns);
3574
3575                 do {
3576                         if (kvm_vcpu_check_block(vcpu) < 0)
3577                                 goto out;
3578                         cpu_relax();
3579                         poll_end = cur = ktime_get();
3580                 } while (kvm_vcpu_can_poll(cur, stop));
3581         }
3582
3583         waited = kvm_vcpu_block(vcpu);
3584
3585         cur = ktime_get();
3586         if (waited) {
3587                 vcpu->stat.generic.halt_wait_ns +=
3588                         ktime_to_ns(cur) - ktime_to_ns(poll_end);
3589                 KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist,
3590                                 ktime_to_ns(cur) - ktime_to_ns(poll_end));
3591         }
3592 out:
3593         /* The total time the vCPU was "halted", including polling time. */
3594         halt_ns = ktime_to_ns(cur) - ktime_to_ns(start);
3595
3596         /*
3597          * Note, halt-polling is considered successful so long as the vCPU was
3598          * never actually scheduled out, i.e. even if the wake event arrived
3599          * after of the halt-polling loop itself, but before the full wait.
3600          */
3601         if (do_halt_poll)
3602                 update_halt_poll_stats(vcpu, start, poll_end, !waited);
3603
3604         if (halt_poll_allowed) {
3605                 /* Recompute the max halt poll time in case it changed. */
3606                 max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
3607
3608                 if (!vcpu_valid_wakeup(vcpu)) {
3609                         shrink_halt_poll_ns(vcpu);
3610                 } else if (max_halt_poll_ns) {
3611                         if (halt_ns <= vcpu->halt_poll_ns)
3612                                 ;
3613                         /* we had a long block, shrink polling */
3614                         else if (vcpu->halt_poll_ns &&
3615                                  halt_ns > max_halt_poll_ns)
3616                                 shrink_halt_poll_ns(vcpu);
3617                         /* we had a short halt and our poll time is too small */
3618                         else if (vcpu->halt_poll_ns < max_halt_poll_ns &&
3619                                  halt_ns < max_halt_poll_ns)
3620                                 grow_halt_poll_ns(vcpu);
3621                 } else {
3622                         vcpu->halt_poll_ns = 0;
3623                 }
3624         }
3625
3626         trace_kvm_vcpu_wakeup(halt_ns, waited, vcpu_valid_wakeup(vcpu));
3627 }
3628 EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
3629
3630 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
3631 {
3632         if (__kvm_vcpu_wake_up(vcpu)) {
3633                 WRITE_ONCE(vcpu->ready, true);
3634                 ++vcpu->stat.generic.halt_wakeup;
3635                 return true;
3636         }
3637
3638         return false;
3639 }
3640 EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
3641
3642 #ifndef CONFIG_S390
3643 /*
3644  * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
3645  */
3646 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3647 {
3648         int me, cpu;
3649
3650         if (kvm_vcpu_wake_up(vcpu))
3651                 return;
3652
3653         me = get_cpu();
3654         /*
3655          * The only state change done outside the vcpu mutex is IN_GUEST_MODE
3656          * to EXITING_GUEST_MODE.  Therefore the moderately expensive "should
3657          * kick" check does not need atomic operations if kvm_vcpu_kick is used
3658          * within the vCPU thread itself.
3659          */
3660         if (vcpu == __this_cpu_read(kvm_running_vcpu)) {
3661                 if (vcpu->mode == IN_GUEST_MODE)
3662                         WRITE_ONCE(vcpu->mode, EXITING_GUEST_MODE);
3663                 goto out;
3664         }
3665
3666         /*
3667          * Note, the vCPU could get migrated to a different pCPU at any point
3668          * after kvm_arch_vcpu_should_kick(), which could result in sending an
3669          * IPI to the previous pCPU.  But, that's ok because the purpose of the
3670          * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the
3671          * vCPU also requires it to leave IN_GUEST_MODE.
3672          */
3673         if (kvm_arch_vcpu_should_kick(vcpu)) {
3674                 cpu = READ_ONCE(vcpu->cpu);
3675                 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
3676                         smp_send_reschedule(cpu);
3677         }
3678 out:
3679         put_cpu();
3680 }
3681 EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
3682 #endif /* !CONFIG_S390 */
3683
3684 int kvm_vcpu_yield_to(struct kvm_vcpu *target)
3685 {
3686         struct pid *pid;
3687         struct task_struct *task = NULL;
3688         int ret = 0;
3689
3690         rcu_read_lock();
3691         pid = rcu_dereference(target->pid);
3692         if (pid)
3693                 task = get_pid_task(pid, PIDTYPE_PID);
3694         rcu_read_unlock();
3695         if (!task)
3696                 return ret;
3697         ret = yield_to(task, 1);
3698         put_task_struct(task);
3699
3700         return ret;
3701 }
3702 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
3703
3704 /*
3705  * Helper that checks whether a VCPU is eligible for directed yield.
3706  * Most eligible candidate to yield is decided by following heuristics:
3707  *
3708  *  (a) VCPU which has not done pl-exit or cpu relax intercepted recently
3709  *  (preempted lock holder), indicated by @in_spin_loop.
3710  *  Set at the beginning and cleared at the end of interception/PLE handler.
3711  *
3712  *  (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
3713  *  chance last time (mostly it has become eligible now since we have probably
3714  *  yielded to lockholder in last iteration. This is done by toggling
3715  *  @dy_eligible each time a VCPU checked for eligibility.)
3716  *
3717  *  Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
3718  *  to preempted lock-holder could result in wrong VCPU selection and CPU
3719  *  burning. Giving priority for a potential lock-holder increases lock
3720  *  progress.
3721  *
3722  *  Since algorithm is based on heuristics, accessing another VCPU data without
3723  *  locking does not harm. It may result in trying to yield to  same VCPU, fail
3724  *  and continue with next VCPU and so on.
3725  */
3726 static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
3727 {
3728 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
3729         bool eligible;
3730
3731         eligible = !vcpu->spin_loop.in_spin_loop ||
3732                     vcpu->spin_loop.dy_eligible;
3733
3734         if (vcpu->spin_loop.in_spin_loop)
3735                 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
3736
3737         return eligible;
3738 #else
3739         return true;
3740 #endif
3741 }
3742
3743 /*
3744  * Unlike kvm_arch_vcpu_runnable, this function is called outside
3745  * a vcpu_load/vcpu_put pair.  However, for most architectures
3746  * kvm_arch_vcpu_runnable does not require vcpu_load.
3747  */
3748 bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
3749 {
3750         return kvm_arch_vcpu_runnable(vcpu);
3751 }
3752
3753 static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
3754 {
3755         if (kvm_arch_dy_runnable(vcpu))
3756                 return true;
3757
3758 #ifdef CONFIG_KVM_ASYNC_PF
3759         if (!list_empty_careful(&vcpu->async_pf.done))
3760                 return true;
3761 #endif
3762
3763         return false;
3764 }
3765
3766 bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
3767 {
3768         return false;
3769 }
3770
3771 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
3772 {
3773         struct kvm *kvm = me->kvm;
3774         struct kvm_vcpu *vcpu;
3775         int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
3776         unsigned long i;
3777         int yielded = 0;
3778         int try = 3;
3779         int pass;
3780
3781         kvm_vcpu_set_in_spin_loop(me, true);
3782         /*
3783          * We boost the priority of a VCPU that is runnable but not
3784          * currently running, because it got preempted by something
3785          * else and called schedule in __vcpu_run.  Hopefully that
3786          * VCPU is holding the lock that we need and will release it.
3787          * We approximate round-robin by starting at the last boosted VCPU.
3788          */
3789         for (pass = 0; pass < 2 && !yielded && try; pass++) {
3790                 kvm_for_each_vcpu(i, vcpu, kvm) {
3791                         if (!pass && i <= last_boosted_vcpu) {
3792                                 i = last_boosted_vcpu;
3793                                 continue;
3794                         } else if (pass && i > last_boosted_vcpu)
3795                                 break;
3796                         if (!READ_ONCE(vcpu->ready))
3797                                 continue;
3798                         if (vcpu == me)
3799                                 continue;
3800                         if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
3801                                 continue;
3802                         if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
3803                             !kvm_arch_dy_has_pending_interrupt(vcpu) &&
3804                             !kvm_arch_vcpu_in_kernel(vcpu))
3805                                 continue;
3806                         if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
3807                                 continue;
3808
3809                         yielded = kvm_vcpu_yield_to(vcpu);
3810                         if (yielded > 0) {
3811                                 kvm->last_boosted_vcpu = i;
3812                                 break;
3813                         } else if (yielded < 0) {
3814                                 try--;
3815                                 if (!try)
3816                                         break;
3817                         }
3818                 }
3819         }
3820         kvm_vcpu_set_in_spin_loop(me, false);
3821
3822         /* Ensure vcpu is not eligible during next spinloop */
3823         kvm_vcpu_set_dy_eligible(me, false);
3824 }
3825 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
3826
3827 static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
3828 {
3829 #ifdef CONFIG_HAVE_KVM_DIRTY_RING
3830         return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
3831             (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
3832              kvm->dirty_ring_size / PAGE_SIZE);
3833 #else
3834         return false;
3835 #endif
3836 }
3837
3838 static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
3839 {
3840         struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
3841         struct page *page;
3842
3843         if (vmf->pgoff == 0)
3844                 page = virt_to_page(vcpu->run);
3845 #ifdef CONFIG_X86
3846         else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
3847                 page = virt_to_page(vcpu->arch.pio_data);
3848 #endif
3849 #ifdef CONFIG_KVM_MMIO
3850         else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
3851                 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
3852 #endif
3853         else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
3854                 page = kvm_dirty_ring_get_page(
3855                     &vcpu->dirty_ring,
3856                     vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
3857         else
3858                 return kvm_arch_vcpu_fault(vcpu, vmf);
3859         get_page(page);
3860         vmf->page = page;
3861         return 0;
3862 }
3863
3864 static const struct vm_operations_struct kvm_vcpu_vm_ops = {
3865         .fault = kvm_vcpu_fault,
3866 };
3867
3868 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
3869 {
3870         struct kvm_vcpu *vcpu = file->private_data;
3871         unsigned long pages = vma_pages(vma);
3872
3873         if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
3874              kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
3875             ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED)))
3876                 return -EINVAL;
3877
3878         vma->vm_ops = &kvm_vcpu_vm_ops;
3879         return 0;
3880 }
3881
3882 static int kvm_vcpu_release(struct inode *inode, struct file *filp)
3883 {
3884         struct kvm_vcpu *vcpu = filp->private_data;
3885
3886         kvm_put_kvm(vcpu->kvm);
3887         return 0;
3888 }
3889
3890 static const struct file_operations kvm_vcpu_fops = {
3891         .release        = kvm_vcpu_release,
3892         .unlocked_ioctl = kvm_vcpu_ioctl,
3893         .mmap           = kvm_vcpu_mmap,
3894         .llseek         = noop_llseek,
3895         KVM_COMPAT(kvm_vcpu_compat_ioctl),
3896 };
3897
3898 /*
3899  * Allocates an inode for the vcpu.
3900  */
3901 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
3902 {
3903         char name[8 + 1 + ITOA_MAX_LEN + 1];
3904
3905         snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
3906         return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
3907 }
3908
3909 #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
3910 static int vcpu_get_pid(void *data, u64 *val)
3911 {
3912         struct kvm_vcpu *vcpu = data;
3913
3914         rcu_read_lock();
3915         *val = pid_nr(rcu_dereference(vcpu->pid));
3916         rcu_read_unlock();
3917         return 0;
3918 }
3919
3920 DEFINE_SIMPLE_ATTRIBUTE(vcpu_get_pid_fops, vcpu_get_pid, NULL, "%llu\n");
3921
3922 static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
3923 {
3924         struct dentry *debugfs_dentry;
3925         char dir_name[ITOA_MAX_LEN * 2];
3926
3927         if (!debugfs_initialized())
3928                 return;
3929
3930         snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
3931         debugfs_dentry = debugfs_create_dir(dir_name,
3932                                             vcpu->kvm->debugfs_dentry);
3933         debugfs_create_file("pid", 0444, debugfs_dentry, vcpu,
3934                             &vcpu_get_pid_fops);
3935
3936         kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
3937 }
3938 #endif
3939
3940 /*
3941  * Creates some virtual cpus.  Good luck creating more than one.
3942  */
3943 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
3944 {
3945         int r;
3946         struct kvm_vcpu *vcpu;
3947         struct page *page;
3948
3949         if (id >= KVM_MAX_VCPU_IDS)
3950                 return -EINVAL;
3951
3952         mutex_lock(&kvm->lock);
3953         if (kvm->created_vcpus >= kvm->max_vcpus) {
3954                 mutex_unlock(&kvm->lock);
3955                 return -EINVAL;
3956         }
3957
3958         r = kvm_arch_vcpu_precreate(kvm, id);
3959         if (r) {
3960                 mutex_unlock(&kvm->lock);
3961                 return r;
3962         }
3963
3964         kvm->created_vcpus++;
3965         mutex_unlock(&kvm->lock);
3966
3967         vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
3968         if (!vcpu) {
3969                 r = -ENOMEM;
3970                 goto vcpu_decrement;
3971         }
3972
3973         BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
3974         page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
3975         if (!page) {
3976                 r = -ENOMEM;
3977                 goto vcpu_free;
3978         }
3979         vcpu->run = page_address(page);
3980
3981         kvm_vcpu_init(vcpu, kvm, id);
3982
3983         r = kvm_arch_vcpu_create(vcpu);
3984         if (r)
3985                 goto vcpu_free_run_page;
3986
3987         if (kvm->dirty_ring_size) {
3988                 r = kvm_dirty_ring_alloc(&vcpu->dirty_ring,
3989                                          id, kvm->dirty_ring_size);
3990                 if (r)
3991                         goto arch_vcpu_destroy;
3992         }
3993
3994         mutex_lock(&kvm->lock);
3995
3996 #ifdef CONFIG_LOCKDEP
3997         /* Ensure that lockdep knows vcpu->mutex is taken *inside* kvm->lock */
3998         mutex_lock(&vcpu->mutex);
3999         mutex_unlock(&vcpu->mutex);
4000 #endif
4001
4002         if (kvm_get_vcpu_by_id(kvm, id)) {
4003                 r = -EEXIST;
4004                 goto unlock_vcpu_destroy;
4005         }
4006
4007         vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
4008         r = xa_reserve(&kvm->vcpu_array, vcpu->vcpu_idx, GFP_KERNEL_ACCOUNT);
4009         if (r)
4010                 goto unlock_vcpu_destroy;
4011
4012         /* Now it's all set up, let userspace reach it */
4013         kvm_get_kvm(kvm);
4014         r = create_vcpu_fd(vcpu);
4015         if (r < 0)
4016                 goto kvm_put_xa_release;
4017
4018         if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
4019                 r = -EINVAL;
4020                 goto kvm_put_xa_release;
4021         }
4022
4023         /*
4024          * Pairs with smp_rmb() in kvm_get_vcpu.  Store the vcpu
4025          * pointer before kvm->online_vcpu's incremented value.
4026          */
4027         smp_wmb();
4028         atomic_inc(&kvm->online_vcpus);
4029
4030         mutex_unlock(&kvm->lock);
4031         kvm_arch_vcpu_postcreate(vcpu);
4032         kvm_create_vcpu_debugfs(vcpu);
4033         return r;
4034
4035 kvm_put_xa_release:
4036         kvm_put_kvm_no_destroy(kvm);
4037         xa_release(&kvm->vcpu_array, vcpu->vcpu_idx);
4038 unlock_vcpu_destroy:
4039         mutex_unlock(&kvm->lock);
4040         kvm_dirty_ring_free(&vcpu->dirty_ring);
4041 arch_vcpu_destroy:
4042         kvm_arch_vcpu_destroy(vcpu);
4043 vcpu_free_run_page:
4044         free_page((unsigned long)vcpu->run);
4045 vcpu_free:
4046         kmem_cache_free(kvm_vcpu_cache, vcpu);
4047 vcpu_decrement:
4048         mutex_lock(&kvm->lock);
4049         kvm->created_vcpus--;
4050         mutex_unlock(&kvm->lock);
4051         return r;
4052 }
4053
4054 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
4055 {
4056         if (sigset) {
4057                 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
4058                 vcpu->sigset_active = 1;
4059                 vcpu->sigset = *sigset;
4060         } else
4061                 vcpu->sigset_active = 0;
4062         return 0;
4063 }
4064
4065 static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
4066                               size_t size, loff_t *offset)
4067 {
4068         struct kvm_vcpu *vcpu = file->private_data;
4069
4070         return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
4071                         &kvm_vcpu_stats_desc[0], &vcpu->stat,
4072                         sizeof(vcpu->stat), user_buffer, size, offset);
4073 }
4074
4075 static int kvm_vcpu_stats_release(struct inode *inode, struct file *file)
4076 {
4077         struct kvm_vcpu *vcpu = file->private_data;
4078
4079         kvm_put_kvm(vcpu->kvm);
4080         return 0;
4081 }
4082
4083 static const struct file_operations kvm_vcpu_stats_fops = {
4084         .read = kvm_vcpu_stats_read,
4085         .release = kvm_vcpu_stats_release,
4086         .llseek = noop_llseek,
4087 };
4088
4089 static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
4090 {
4091         int fd;
4092         struct file *file;
4093         char name[15 + ITOA_MAX_LEN + 1];
4094
4095         snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);
4096
4097         fd = get_unused_fd_flags(O_CLOEXEC);
4098         if (fd < 0)
4099                 return fd;
4100
4101         file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY);
4102         if (IS_ERR(file)) {
4103                 put_unused_fd(fd);
4104                 return PTR_ERR(file);
4105         }
4106
4107         kvm_get_kvm(vcpu->kvm);
4108
4109         file->f_mode |= FMODE_PREAD;
4110         fd_install(fd, file);
4111
4112         return fd;
4113 }
4114
4115 static long kvm_vcpu_ioctl(struct file *filp,
4116                            unsigned int ioctl, unsigned long arg)
4117 {
4118         struct kvm_vcpu *vcpu = filp->private_data;
4119         void __user *argp = (void __user *)arg;
4120         int r;
4121         struct kvm_fpu *fpu = NULL;
4122         struct kvm_sregs *kvm_sregs = NULL;
4123
4124         if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
4125                 return -EIO;
4126
4127         if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
4128                 return -EINVAL;
4129
4130         /*
4131          * Some architectures have vcpu ioctls that are asynchronous to vcpu
4132          * execution; mutex_lock() would break them.
4133          */
4134         r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
4135         if (r != -ENOIOCTLCMD)
4136                 return r;
4137
4138         if (mutex_lock_killable(&vcpu->mutex))
4139                 return -EINTR;
4140         switch (ioctl) {
4141         case KVM_RUN: {
4142                 struct pid *oldpid;
4143                 r = -EINVAL;
4144                 if (arg)
4145                         goto out;
4146                 oldpid = rcu_access_pointer(vcpu->pid);
4147                 if (unlikely(oldpid != task_pid(current))) {
4148                         /* The thread running this VCPU changed. */
4149                         struct pid *newpid;
4150
4151                         r = kvm_arch_vcpu_run_pid_change(vcpu);
4152                         if (r)
4153                                 break;
4154
4155                         newpid = get_task_pid(current, PIDTYPE_PID);
4156                         rcu_assign_pointer(vcpu->pid, newpid);
4157                         if (oldpid)
4158                                 synchronize_rcu();
4159                         put_pid(oldpid);
4160                 }
4161                 r = kvm_arch_vcpu_ioctl_run(vcpu);
4162                 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
4163                 break;
4164         }
4165         case KVM_GET_REGS: {
4166                 struct kvm_regs *kvm_regs;
4167
4168                 r = -ENOMEM;
4169                 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
4170                 if (!kvm_regs)
4171                         goto out;
4172                 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
4173                 if (r)
4174                         goto out_free1;
4175                 r = -EFAULT;
4176                 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
4177                         goto out_free1;
4178                 r = 0;
4179 out_free1:
4180                 kfree(kvm_regs);
4181                 break;
4182         }
4183         case KVM_SET_REGS: {
4184                 struct kvm_regs *kvm_regs;
4185
4186                 kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
4187                 if (IS_ERR(kvm_regs)) {
4188                         r = PTR_ERR(kvm_regs);
4189                         goto out;
4190                 }
4191                 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
4192                 kfree(kvm_regs);
4193                 break;
4194         }
4195         case KVM_GET_SREGS: {
4196                 kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
4197                                     GFP_KERNEL_ACCOUNT);
4198                 r = -ENOMEM;
4199                 if (!kvm_sregs)
4200                         goto out;
4201                 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
4202                 if (r)
4203                         goto out;
4204                 r = -EFAULT;
4205                 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
4206                         goto out;
4207                 r = 0;
4208                 break;
4209         }
4210         case KVM_SET_SREGS: {
4211                 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
4212                 if (IS_ERR(kvm_sregs)) {
4213                         r = PTR_ERR(kvm_sregs);
4214                         kvm_sregs = NULL;
4215                         goto out;
4216                 }
4217                 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
4218                 break;
4219         }
4220         case KVM_GET_MP_STATE: {
4221                 struct kvm_mp_state mp_state;
4222
4223                 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
4224                 if (r)
4225                         goto out;
4226                 r = -EFAULT;
4227                 if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
4228                         goto out;
4229                 r = 0;
4230                 break;
4231         }
4232         case KVM_SET_MP_STATE: {
4233                 struct kvm_mp_state mp_state;
4234
4235                 r = -EFAULT;
4236                 if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
4237                         goto out;
4238                 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
4239                 break;
4240         }
4241         case KVM_TRANSLATE: {
4242                 struct kvm_translation tr;
4243
4244                 r = -EFAULT;
4245                 if (copy_from_user(&tr, argp, sizeof(tr)))
4246                         goto out;
4247                 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
4248                 if (r)
4249                         goto out;
4250                 r = -EFAULT;
4251                 if (copy_to_user(argp, &tr, sizeof(tr)))
4252                         goto out;
4253                 r = 0;
4254                 break;
4255         }
4256         case KVM_SET_GUEST_DEBUG: {
4257                 struct kvm_guest_debug dbg;
4258
4259                 r = -EFAULT;
4260                 if (copy_from_user(&dbg, argp, sizeof(dbg)))
4261                         goto out;
4262                 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
4263                 break;
4264         }
4265         case KVM_SET_SIGNAL_MASK: {
4266                 struct kvm_signal_mask __user *sigmask_arg = argp;
4267                 struct kvm_signal_mask kvm_sigmask;
4268                 sigset_t sigset, *p;
4269
4270                 p = NULL;
4271                 if (argp) {
4272                         r = -EFAULT;
4273                         if (copy_from_user(&kvm_sigmask, argp,
4274                                            sizeof(kvm_sigmask)))
4275                                 goto out;
4276                         r = -EINVAL;
4277                         if (kvm_sigmask.len != sizeof(sigset))
4278                                 goto out;
4279                         r = -EFAULT;
4280                         if (copy_from_user(&sigset, sigmask_arg->sigset,
4281                                            sizeof(sigset)))
4282                                 goto out;
4283                         p = &sigset;
4284                 }
4285                 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
4286                 break;
4287         }
4288         case KVM_GET_FPU: {
4289                 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
4290                 r = -ENOMEM;
4291                 if (!fpu)
4292                         goto out;
4293                 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
4294                 if (r)
4295                         goto out;
4296                 r = -EFAULT;
4297                 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
4298                         goto out;
4299                 r = 0;
4300                 break;
4301         }
4302         case KVM_SET_FPU: {
4303                 fpu = memdup_user(argp, sizeof(*fpu));
4304                 if (IS_ERR(fpu)) {
4305                         r = PTR_ERR(fpu);
4306                         fpu = NULL;
4307                         goto out;
4308                 }
4309                 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
4310                 break;
4311         }
4312         case KVM_GET_STATS_FD: {
4313                 r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
4314                 break;
4315         }
4316         default:
4317                 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
4318         }
4319 out:
4320         mutex_unlock(&vcpu->mutex);
4321         kfree(fpu);
4322         kfree(kvm_sregs);
4323         return r;
4324 }
4325
4326 #ifdef CONFIG_KVM_COMPAT
4327 static long kvm_vcpu_compat_ioctl(struct file *filp,
4328                                   unsigned int ioctl, unsigned long arg)
4329 {
4330         struct kvm_vcpu *vcpu = filp->private_data;
4331         void __user *argp = compat_ptr(arg);
4332         int r;
4333
4334         if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
4335                 return -EIO;
4336
4337         switch (ioctl) {
4338         case KVM_SET_SIGNAL_MASK: {
4339                 struct kvm_signal_mask __user *sigmask_arg = argp;
4340                 struct kvm_signal_mask kvm_sigmask;
4341                 sigset_t sigset;
4342
4343                 if (argp) {
4344                         r = -EFAULT;
4345                         if (copy_from_user(&kvm_sigmask, argp,
4346                                            sizeof(kvm_sigmask)))
4347                                 goto out;
4348                         r = -EINVAL;
4349                         if (kvm_sigmask.len != sizeof(compat_sigset_t))
4350                                 goto out;
4351                         r = -EFAULT;
4352                         if (get_compat_sigset(&sigset,
4353                                               (compat_sigset_t __user *)sigmask_arg->sigset))
4354                                 goto out;
4355                         r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
4356                 } else
4357                         r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
4358                 break;
4359         }
4360         default:
4361                 r = kvm_vcpu_ioctl(filp, ioctl, arg);
4362         }
4363
4364 out:
4365         return r;
4366 }
4367 #endif
4368
4369 static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
4370 {
4371         struct kvm_device *dev = filp->private_data;
4372
4373         if (dev->ops->mmap)
4374                 return dev->ops->mmap(dev, vma);
4375
4376         return -ENODEV;
4377 }
4378
4379 static int kvm_device_ioctl_attr(struct kvm_device *dev,
4380                                  int (*accessor)(struct kvm_device *dev,
4381                                                  struct kvm_device_attr *attr),
4382                                  unsigned long arg)
4383 {
4384         struct kvm_device_attr attr;
4385
4386         if (!accessor)
4387                 return -EPERM;
4388
4389         if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
4390                 return -EFAULT;
4391
4392         return accessor(dev, &attr);
4393 }
4394
4395 static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
4396                              unsigned long arg)
4397 {
4398         struct kvm_device *dev = filp->private_data;
4399
4400         if (dev->kvm->mm != current->mm || dev->kvm->vm_dead)
4401                 return -EIO;
4402
4403         switch (ioctl) {
4404         case KVM_SET_DEVICE_ATTR:
4405                 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
4406         case KVM_GET_DEVICE_ATTR:
4407                 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
4408         case KVM_HAS_DEVICE_ATTR:
4409                 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
4410         default:
4411                 if (dev->ops->ioctl)
4412                         return dev->ops->ioctl(dev, ioctl, arg);
4413
4414                 return -ENOTTY;
4415         }
4416 }
4417
4418 static int kvm_device_release(struct inode *inode, struct file *filp)
4419 {
4420         struct kvm_device *dev = filp->private_data;
4421         struct kvm *kvm = dev->kvm;
4422
4423         if (dev->ops->release) {
4424                 mutex_lock(&kvm->lock);
4425                 list_del(&dev->vm_node);
4426                 dev->ops->release(dev);
4427                 mutex_unlock(&kvm->lock);
4428         }
4429
4430         kvm_put_kvm(kvm);
4431         return 0;
4432 }
4433
4434 static const struct file_operations kvm_device_fops = {
4435         .unlocked_ioctl = kvm_device_ioctl,
4436         .release = kvm_device_release,
4437         KVM_COMPAT(kvm_device_ioctl),
4438         .mmap = kvm_device_mmap,
4439 };
4440
4441 struct kvm_device *kvm_device_from_filp(struct file *filp)
4442 {
4443         if (filp->f_op != &kvm_device_fops)
4444                 return NULL;
4445
4446         return filp->private_data;
4447 }
4448
4449 static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
4450 #ifdef CONFIG_KVM_MPIC
4451         [KVM_DEV_TYPE_FSL_MPIC_20]      = &kvm_mpic_ops,
4452         [KVM_DEV_TYPE_FSL_MPIC_42]      = &kvm_mpic_ops,
4453 #endif
4454 };
4455
4456 int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
4457 {
4458         if (type >= ARRAY_SIZE(kvm_device_ops_table))
4459                 return -ENOSPC;
4460
4461         if (kvm_device_ops_table[type] != NULL)
4462                 return -EEXIST;
4463
4464         kvm_device_ops_table[type] = ops;
4465         return 0;
4466 }
4467
4468 void kvm_unregister_device_ops(u32 type)
4469 {
4470         if (kvm_device_ops_table[type] != NULL)
4471                 kvm_device_ops_table[type] = NULL;
4472 }
4473
4474 static int kvm_ioctl_create_device(struct kvm *kvm,
4475                                    struct kvm_create_device *cd)
4476 {
4477         const struct kvm_device_ops *ops;
4478         struct kvm_device *dev;
4479         bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
4480         int type;
4481         int ret;
4482
4483         if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
4484                 return -ENODEV;
4485
4486         type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
4487         ops = kvm_device_ops_table[type];
4488         if (ops == NULL)
4489                 return -ENODEV;
4490
4491         if (test)
4492                 return 0;
4493
4494         dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
4495         if (!dev)
4496                 return -ENOMEM;
4497
4498         dev->ops = ops;
4499         dev->kvm = kvm;
4500
4501         mutex_lock(&kvm->lock);
4502         ret = ops->create(dev, type);
4503         if (ret < 0) {
4504                 mutex_unlock(&kvm->lock);
4505                 kfree(dev);
4506                 return ret;
4507         }
4508         list_add(&dev->vm_node, &kvm->devices);
4509         mutex_unlock(&kvm->lock);
4510
4511         if (ops->init)
4512                 ops->init(dev);
4513
4514         kvm_get_kvm(kvm);
4515         ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
4516         if (ret < 0) {
4517                 kvm_put_kvm_no_destroy(kvm);
4518                 mutex_lock(&kvm->lock);
4519                 list_del(&dev->vm_node);
4520                 if (ops->release)
4521                         ops->release(dev);
4522                 mutex_unlock(&kvm->lock);
4523                 if (ops->destroy)
4524                         ops->destroy(dev);
4525                 return ret;
4526         }
4527
4528         cd->fd = ret;
4529         return 0;
4530 }
4531
4532 static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
4533 {
4534         switch (arg) {
4535         case KVM_CAP_USER_MEMORY:
4536         case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
4537         case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
4538         case KVM_CAP_INTERNAL_ERROR_DATA:
4539 #ifdef CONFIG_HAVE_KVM_MSI
4540         case KVM_CAP_SIGNAL_MSI:
4541 #endif
4542 #ifdef CONFIG_HAVE_KVM_IRQFD
4543         case KVM_CAP_IRQFD:
4544 #endif
4545         case KVM_CAP_IOEVENTFD_ANY_LENGTH:
4546         case KVM_CAP_CHECK_EXTENSION_VM:
4547         case KVM_CAP_ENABLE_CAP_VM:
4548         case KVM_CAP_HALT_POLL:
4549                 return 1;
4550 #ifdef CONFIG_KVM_MMIO
4551         case KVM_CAP_COALESCED_MMIO:
4552                 return KVM_COALESCED_MMIO_PAGE_OFFSET;
4553         case KVM_CAP_COALESCED_PIO:
4554                 return 1;
4555 #endif
4556 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4557         case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
4558                 return KVM_DIRTY_LOG_MANUAL_CAPS;
4559 #endif
4560 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4561         case KVM_CAP_IRQ_ROUTING:
4562                 return KVM_MAX_IRQ_ROUTES;
4563 #endif
4564 #if KVM_ADDRESS_SPACE_NUM > 1
4565         case KVM_CAP_MULTI_ADDRESS_SPACE:
4566                 return KVM_ADDRESS_SPACE_NUM;
4567 #endif
4568         case KVM_CAP_NR_MEMSLOTS:
4569                 return KVM_USER_MEM_SLOTS;
4570         case KVM_CAP_DIRTY_LOG_RING:
4571 #ifdef CONFIG_HAVE_KVM_DIRTY_RING_TSO
4572                 return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4573 #else
4574                 return 0;
4575 #endif
4576         case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
4577 #ifdef CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL
4578                 return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4579 #else
4580                 return 0;
4581 #endif
4582 #ifdef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP
4583         case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP:
4584 #endif
4585         case KVM_CAP_BINARY_STATS_FD:
4586         case KVM_CAP_SYSTEM_EVENT_DATA:
4587                 return 1;
4588         default:
4589                 break;
4590         }
4591         return kvm_vm_ioctl_check_extension(kvm, arg);
4592 }
4593
4594 static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
4595 {
4596         int r;
4597
4598         if (!KVM_DIRTY_LOG_PAGE_OFFSET)
4599                 return -EINVAL;
4600
4601         /* the size should be power of 2 */
4602         if (!size || (size & (size - 1)))
4603                 return -EINVAL;
4604
4605         /* Should be bigger to keep the reserved entries, or a page */
4606         if (size < kvm_dirty_ring_get_rsvd_entries() *
4607             sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
4608                 return -EINVAL;
4609
4610         if (size > KVM_DIRTY_RING_MAX_ENTRIES *
4611             sizeof(struct kvm_dirty_gfn))
4612                 return -E2BIG;
4613
4614         /* We only allow it to set once */
4615         if (kvm->dirty_ring_size)
4616                 return -EINVAL;
4617
4618         mutex_lock(&kvm->lock);
4619
4620         if (kvm->created_vcpus) {
4621                 /* We don't allow to change this value after vcpu created */
4622                 r = -EINVAL;
4623         } else {
4624                 kvm->dirty_ring_size = size;
4625                 r = 0;
4626         }
4627
4628         mutex_unlock(&kvm->lock);
4629         return r;
4630 }
4631
4632 static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
4633 {
4634         unsigned long i;
4635         struct kvm_vcpu *vcpu;
4636         int cleared = 0;
4637
4638         if (!kvm->dirty_ring_size)
4639                 return -EINVAL;
4640
4641         mutex_lock(&kvm->slots_lock);
4642
4643         kvm_for_each_vcpu(i, vcpu, kvm)
4644                 cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
4645
4646         mutex_unlock(&kvm->slots_lock);
4647
4648         if (cleared)
4649                 kvm_flush_remote_tlbs(kvm);
4650
4651         return cleared;
4652 }
4653
4654 int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
4655                                                   struct kvm_enable_cap *cap)
4656 {
4657         return -EINVAL;
4658 }
4659
4660 bool kvm_are_all_memslots_empty(struct kvm *kvm)
4661 {
4662         int i;
4663
4664         lockdep_assert_held(&kvm->slots_lock);
4665
4666         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
4667                 if (!kvm_memslots_empty(__kvm_memslots(kvm, i)))
4668                         return false;
4669         }
4670
4671         return true;
4672 }
4673 EXPORT_SYMBOL_GPL(kvm_are_all_memslots_empty);
4674
4675 static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
4676                                            struct kvm_enable_cap *cap)
4677 {
4678         switch (cap->cap) {
4679 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4680         case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
4681                 u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
4682
4683                 if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
4684                         allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
4685
4686                 if (cap->flags || (cap->args[0] & ~allowed_options))
4687                         return -EINVAL;
4688                 kvm->manual_dirty_log_protect = cap->args[0];
4689                 return 0;
4690         }
4691 #endif
4692         case KVM_CAP_HALT_POLL: {
4693                 if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
4694                         return -EINVAL;
4695
4696                 kvm->max_halt_poll_ns = cap->args[0];
4697
4698                 /*
4699                  * Ensure kvm->override_halt_poll_ns does not become visible
4700                  * before kvm->max_halt_poll_ns.
4701                  *
4702                  * Pairs with the smp_rmb() in kvm_vcpu_max_halt_poll_ns().
4703                  */
4704                 smp_wmb();
4705                 kvm->override_halt_poll_ns = true;
4706
4707                 return 0;
4708         }
4709         case KVM_CAP_DIRTY_LOG_RING:
4710         case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
4711                 if (!kvm_vm_ioctl_check_extension_generic(kvm, cap->cap))
4712                         return -EINVAL;
4713
4714                 return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
4715         case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP: {
4716                 int r = -EINVAL;
4717
4718                 if (!IS_ENABLED(CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP) ||
4719                     !kvm->dirty_ring_size || cap->flags)
4720                         return r;
4721
4722                 mutex_lock(&kvm->slots_lock);
4723
4724                 /*
4725                  * For simplicity, allow enabling ring+bitmap if and only if
4726                  * there are no memslots, e.g. to ensure all memslots allocate
4727                  * a bitmap after the capability is enabled.
4728                  */
4729                 if (kvm_are_all_memslots_empty(kvm)) {
4730                         kvm->dirty_ring_with_bitmap = true;
4731                         r = 0;
4732                 }
4733
4734                 mutex_unlock(&kvm->slots_lock);
4735
4736                 return r;
4737         }
4738         default:
4739                 return kvm_vm_ioctl_enable_cap(kvm, cap);
4740         }
4741 }
4742
4743 static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
4744                               size_t size, loff_t *offset)
4745 {
4746         struct kvm *kvm = file->private_data;
4747
4748         return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header,
4749                                 &kvm_vm_stats_desc[0], &kvm->stat,
4750                                 sizeof(kvm->stat), user_buffer, size, offset);
4751 }
4752
4753 static int kvm_vm_stats_release(struct inode *inode, struct file *file)
4754 {
4755         struct kvm *kvm = file->private_data;
4756
4757         kvm_put_kvm(kvm);
4758         return 0;
4759 }
4760
4761 static const struct file_operations kvm_vm_stats_fops = {
4762         .read = kvm_vm_stats_read,
4763         .release = kvm_vm_stats_release,
4764         .llseek = noop_llseek,
4765 };
4766
4767 static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
4768 {
4769         int fd;
4770         struct file *file;
4771
4772         fd = get_unused_fd_flags(O_CLOEXEC);
4773         if (fd < 0)
4774                 return fd;
4775
4776         file = anon_inode_getfile("kvm-vm-stats",
4777                         &kvm_vm_stats_fops, kvm, O_RDONLY);
4778         if (IS_ERR(file)) {
4779                 put_unused_fd(fd);
4780                 return PTR_ERR(file);
4781         }
4782
4783         kvm_get_kvm(kvm);
4784
4785         file->f_mode |= FMODE_PREAD;
4786         fd_install(fd, file);
4787
4788         return fd;
4789 }
4790
4791 static long kvm_vm_ioctl(struct file *filp,
4792                            unsigned int ioctl, unsigned long arg)
4793 {
4794         struct kvm *kvm = filp->private_data;
4795         void __user *argp = (void __user *)arg;
4796         int r;
4797
4798         if (kvm->mm != current->mm || kvm->vm_dead)
4799                 return -EIO;
4800         switch (ioctl) {
4801         case KVM_CREATE_VCPU:
4802                 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
4803                 break;
4804         case KVM_ENABLE_CAP: {
4805                 struct kvm_enable_cap cap;
4806
4807                 r = -EFAULT;
4808                 if (copy_from_user(&cap, argp, sizeof(cap)))
4809                         goto out;
4810                 r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
4811                 break;
4812         }
4813         case KVM_SET_USER_MEMORY_REGION: {
4814                 struct kvm_userspace_memory_region kvm_userspace_mem;
4815
4816                 r = -EFAULT;
4817                 if (copy_from_user(&kvm_userspace_mem, argp,
4818                                                 sizeof(kvm_userspace_mem)))
4819                         goto out;
4820
4821                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
4822                 break;
4823         }
4824         case KVM_GET_DIRTY_LOG: {
4825                 struct kvm_dirty_log log;
4826
4827                 r = -EFAULT;
4828                 if (copy_from_user(&log, argp, sizeof(log)))
4829                         goto out;
4830                 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
4831                 break;
4832         }
4833 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4834         case KVM_CLEAR_DIRTY_LOG: {
4835                 struct kvm_clear_dirty_log log;
4836
4837                 r = -EFAULT;
4838                 if (copy_from_user(&log, argp, sizeof(log)))
4839                         goto out;
4840                 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4841                 break;
4842         }
4843 #endif
4844 #ifdef CONFIG_KVM_MMIO
4845         case KVM_REGISTER_COALESCED_MMIO: {
4846                 struct kvm_coalesced_mmio_zone zone;
4847
4848                 r = -EFAULT;
4849                 if (copy_from_user(&zone, argp, sizeof(zone)))
4850                         goto out;
4851                 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
4852                 break;
4853         }
4854         case KVM_UNREGISTER_COALESCED_MMIO: {
4855                 struct kvm_coalesced_mmio_zone zone;
4856
4857                 r = -EFAULT;
4858                 if (copy_from_user(&zone, argp, sizeof(zone)))
4859                         goto out;
4860                 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
4861                 break;
4862         }
4863 #endif
4864         case KVM_IRQFD: {
4865                 struct kvm_irqfd data;
4866
4867                 r = -EFAULT;
4868                 if (copy_from_user(&data, argp, sizeof(data)))
4869                         goto out;
4870                 r = kvm_irqfd(kvm, &data);
4871                 break;
4872         }
4873         case KVM_IOEVENTFD: {
4874                 struct kvm_ioeventfd data;
4875
4876                 r = -EFAULT;
4877                 if (copy_from_user(&data, argp, sizeof(data)))
4878                         goto out;
4879                 r = kvm_ioeventfd(kvm, &data);
4880                 break;
4881         }
4882 #ifdef CONFIG_HAVE_KVM_MSI
4883         case KVM_SIGNAL_MSI: {
4884                 struct kvm_msi msi;
4885
4886                 r = -EFAULT;
4887                 if (copy_from_user(&msi, argp, sizeof(msi)))
4888                         goto out;
4889                 r = kvm_send_userspace_msi(kvm, &msi);
4890                 break;
4891         }
4892 #endif
4893 #ifdef __KVM_HAVE_IRQ_LINE
4894         case KVM_IRQ_LINE_STATUS:
4895         case KVM_IRQ_LINE: {
4896                 struct kvm_irq_level irq_event;
4897
4898                 r = -EFAULT;
4899                 if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
4900                         goto out;
4901
4902                 r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
4903                                         ioctl == KVM_IRQ_LINE_STATUS);
4904                 if (r)
4905                         goto out;
4906
4907                 r = -EFAULT;
4908                 if (ioctl == KVM_IRQ_LINE_STATUS) {
4909                         if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
4910                                 goto out;
4911                 }
4912
4913                 r = 0;
4914                 break;
4915         }
4916 #endif
4917 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4918         case KVM_SET_GSI_ROUTING: {
4919                 struct kvm_irq_routing routing;
4920                 struct kvm_irq_routing __user *urouting;
4921                 struct kvm_irq_routing_entry *entries = NULL;
4922
4923                 r = -EFAULT;
4924                 if (copy_from_user(&routing, argp, sizeof(routing)))
4925                         goto out;
4926                 r = -EINVAL;
4927                 if (!kvm_arch_can_set_irq_routing(kvm))
4928                         goto out;
4929                 if (routing.nr > KVM_MAX_IRQ_ROUTES)
4930                         goto out;
4931                 if (routing.flags)
4932                         goto out;
4933                 if (routing.nr) {
4934                         urouting = argp;
4935                         entries = vmemdup_user(urouting->entries,
4936                                                array_size(sizeof(*entries),
4937                                                           routing.nr));
4938                         if (IS_ERR(entries)) {
4939                                 r = PTR_ERR(entries);
4940                                 goto out;
4941                         }
4942                 }
4943                 r = kvm_set_irq_routing(kvm, entries, routing.nr,
4944                                         routing.flags);
4945                 kvfree(entries);
4946                 break;
4947         }
4948 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
4949         case KVM_CREATE_DEVICE: {
4950                 struct kvm_create_device cd;
4951
4952                 r = -EFAULT;
4953                 if (copy_from_user(&cd, argp, sizeof(cd)))
4954                         goto out;
4955
4956                 r = kvm_ioctl_create_device(kvm, &cd);
4957                 if (r)
4958                         goto out;
4959
4960                 r = -EFAULT;
4961                 if (copy_to_user(argp, &cd, sizeof(cd)))
4962                         goto out;
4963
4964                 r = 0;
4965                 break;
4966         }
4967         case KVM_CHECK_EXTENSION:
4968                 r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
4969                 break;
4970         case KVM_RESET_DIRTY_RINGS:
4971                 r = kvm_vm_ioctl_reset_dirty_pages(kvm);
4972                 break;
4973         case KVM_GET_STATS_FD:
4974                 r = kvm_vm_ioctl_get_stats_fd(kvm);
4975                 break;
4976         default:
4977                 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
4978         }
4979 out:
4980         return r;
4981 }
4982
4983 #ifdef CONFIG_KVM_COMPAT
4984 struct compat_kvm_dirty_log {
4985         __u32 slot;
4986         __u32 padding1;
4987         union {
4988                 compat_uptr_t dirty_bitmap; /* one bit per page */
4989                 __u64 padding2;
4990         };
4991 };
4992
4993 struct compat_kvm_clear_dirty_log {
4994         __u32 slot;
4995         __u32 num_pages;
4996         __u64 first_page;
4997         union {
4998                 compat_uptr_t dirty_bitmap; /* one bit per page */
4999                 __u64 padding2;
5000         };
5001 };
5002
5003 long __weak kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
5004                                      unsigned long arg)
5005 {
5006         return -ENOTTY;
5007 }
5008
5009 static long kvm_vm_compat_ioctl(struct file *filp,
5010                            unsigned int ioctl, unsigned long arg)
5011 {
5012         struct kvm *kvm = filp->private_data;
5013         int r;
5014
5015         if (kvm->mm != current->mm || kvm->vm_dead)
5016                 return -EIO;
5017
5018         r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg);
5019         if (r != -ENOTTY)
5020                 return r;
5021
5022         switch (ioctl) {
5023 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
5024         case KVM_CLEAR_DIRTY_LOG: {
5025                 struct compat_kvm_clear_dirty_log compat_log;
5026                 struct kvm_clear_dirty_log log;
5027
5028                 if (copy_from_user(&compat_log, (void __user *)arg,
5029                                    sizeof(compat_log)))
5030                         return -EFAULT;
5031                 log.slot         = compat_log.slot;
5032                 log.num_pages    = compat_log.num_pages;
5033                 log.first_page   = compat_log.first_page;
5034                 log.padding2     = compat_log.padding2;
5035                 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
5036
5037                 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
5038                 break;
5039         }
5040 #endif
5041         case KVM_GET_DIRTY_LOG: {
5042                 struct compat_kvm_dirty_log compat_log;
5043                 struct kvm_dirty_log log;
5044
5045                 if (copy_from_user(&compat_log, (void __user *)arg,
5046                                    sizeof(compat_log)))
5047                         return -EFAULT;
5048                 log.slot         = compat_log.slot;
5049                 log.padding1     = compat_log.padding1;
5050                 log.padding2     = compat_log.padding2;
5051                 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
5052
5053                 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
5054                 break;
5055         }
5056         default:
5057                 r = kvm_vm_ioctl(filp, ioctl, arg);
5058         }
5059         return r;
5060 }
5061 #endif
5062
5063 static const struct file_operations kvm_vm_fops = {
5064         .release        = kvm_vm_release,
5065         .unlocked_ioctl = kvm_vm_ioctl,
5066         .llseek         = noop_llseek,
5067         KVM_COMPAT(kvm_vm_compat_ioctl),
5068 };
5069
5070 bool file_is_kvm(struct file *file)
5071 {
5072         return file && file->f_op == &kvm_vm_fops;
5073 }
5074 EXPORT_SYMBOL_GPL(file_is_kvm);
5075
5076 static int kvm_dev_ioctl_create_vm(unsigned long type)
5077 {
5078         char fdname[ITOA_MAX_LEN + 1];
5079         int r, fd;
5080         struct kvm *kvm;
5081         struct file *file;
5082
5083         fd = get_unused_fd_flags(O_CLOEXEC);
5084         if (fd < 0)
5085                 return fd;
5086
5087         snprintf(fdname, sizeof(fdname), "%d", fd);
5088
5089         kvm = kvm_create_vm(type, fdname);
5090         if (IS_ERR(kvm)) {
5091                 r = PTR_ERR(kvm);
5092                 goto put_fd;
5093         }
5094
5095         file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
5096         if (IS_ERR(file)) {
5097                 r = PTR_ERR(file);
5098                 goto put_kvm;
5099         }
5100
5101         /*
5102          * Don't call kvm_put_kvm anymore at this point; file->f_op is
5103          * already set, with ->release() being kvm_vm_release().  In error
5104          * cases it will be called by the final fput(file) and will take
5105          * care of doing kvm_put_kvm(kvm).
5106          */
5107         kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
5108
5109         fd_install(fd, file);
5110         return fd;
5111
5112 put_kvm:
5113         kvm_put_kvm(kvm);
5114 put_fd:
5115         put_unused_fd(fd);
5116         return r;
5117 }
5118
5119 static long kvm_dev_ioctl(struct file *filp,
5120                           unsigned int ioctl, unsigned long arg)
5121 {
5122         int r = -EINVAL;
5123
5124         switch (ioctl) {
5125         case KVM_GET_API_VERSION:
5126                 if (arg)
5127                         goto out;
5128                 r = KVM_API_VERSION;
5129                 break;
5130         case KVM_CREATE_VM:
5131                 r = kvm_dev_ioctl_create_vm(arg);
5132                 break;
5133         case KVM_CHECK_EXTENSION:
5134                 r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
5135                 break;
5136         case KVM_GET_VCPU_MMAP_SIZE:
5137                 if (arg)
5138                         goto out;
5139                 r = PAGE_SIZE;     /* struct kvm_run */
5140 #ifdef CONFIG_X86
5141                 r += PAGE_SIZE;    /* pio data page */
5142 #endif
5143 #ifdef CONFIG_KVM_MMIO
5144                 r += PAGE_SIZE;    /* coalesced mmio ring page */
5145 #endif
5146                 break;
5147         case KVM_TRACE_ENABLE:
5148         case KVM_TRACE_PAUSE:
5149         case KVM_TRACE_DISABLE:
5150                 r = -EOPNOTSUPP;
5151                 break;
5152         default:
5153                 return kvm_arch_dev_ioctl(filp, ioctl, arg);
5154         }
5155 out:
5156         return r;
5157 }
5158
5159 static struct file_operations kvm_chardev_ops = {
5160         .unlocked_ioctl = kvm_dev_ioctl,
5161         .llseek         = noop_llseek,
5162         KVM_COMPAT(kvm_dev_ioctl),
5163 };
5164
5165 static struct miscdevice kvm_dev = {
5166         KVM_MINOR,
5167         "kvm",
5168         &kvm_chardev_ops,
5169 };
5170
5171 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
5172 __visible bool kvm_rebooting;
5173 EXPORT_SYMBOL_GPL(kvm_rebooting);
5174
5175 static DEFINE_PER_CPU(bool, hardware_enabled);
5176 static int kvm_usage_count;
5177
5178 static int __hardware_enable_nolock(void)
5179 {
5180         if (__this_cpu_read(hardware_enabled))
5181                 return 0;
5182
5183         if (kvm_arch_hardware_enable()) {
5184                 pr_info("kvm: enabling virtualization on CPU%d failed\n",
5185                         raw_smp_processor_id());
5186                 return -EIO;
5187         }
5188
5189         __this_cpu_write(hardware_enabled, true);
5190         return 0;
5191 }
5192
5193 static void hardware_enable_nolock(void *failed)
5194 {
5195         if (__hardware_enable_nolock())
5196                 atomic_inc(failed);
5197 }
5198
5199 static int kvm_online_cpu(unsigned int cpu)
5200 {
5201         int ret = 0;
5202
5203         /*
5204          * Abort the CPU online process if hardware virtualization cannot
5205          * be enabled. Otherwise running VMs would encounter unrecoverable
5206          * errors when scheduled to this CPU.
5207          */
5208         mutex_lock(&kvm_lock);
5209         if (kvm_usage_count)
5210                 ret = __hardware_enable_nolock();
5211         mutex_unlock(&kvm_lock);
5212         return ret;
5213 }
5214
5215 static void hardware_disable_nolock(void *junk)
5216 {
5217         /*
5218          * Note, hardware_disable_all_nolock() tells all online CPUs to disable
5219          * hardware, not just CPUs that successfully enabled hardware!
5220          */
5221         if (!__this_cpu_read(hardware_enabled))
5222                 return;
5223
5224         kvm_arch_hardware_disable();
5225
5226         __this_cpu_write(hardware_enabled, false);
5227 }
5228
5229 static int kvm_offline_cpu(unsigned int cpu)
5230 {
5231         mutex_lock(&kvm_lock);
5232         if (kvm_usage_count)
5233                 hardware_disable_nolock(NULL);
5234         mutex_unlock(&kvm_lock);
5235         return 0;
5236 }
5237
5238 static void hardware_disable_all_nolock(void)
5239 {
5240         BUG_ON(!kvm_usage_count);
5241
5242         kvm_usage_count--;
5243         if (!kvm_usage_count)
5244                 on_each_cpu(hardware_disable_nolock, NULL, 1);
5245 }
5246
5247 static void hardware_disable_all(void)
5248 {
5249         cpus_read_lock();
5250         mutex_lock(&kvm_lock);
5251         hardware_disable_all_nolock();
5252         mutex_unlock(&kvm_lock);
5253         cpus_read_unlock();
5254 }
5255
5256 static int hardware_enable_all(void)
5257 {
5258         atomic_t failed = ATOMIC_INIT(0);
5259         int r;
5260
5261         /*
5262          * Do not enable hardware virtualization if the system is going down.
5263          * If userspace initiated a forced reboot, e.g. reboot -f, then it's
5264          * possible for an in-flight KVM_CREATE_VM to trigger hardware enabling
5265          * after kvm_reboot() is called.  Note, this relies on system_state
5266          * being set _before_ kvm_reboot(), which is why KVM uses a syscore ops
5267          * hook instead of registering a dedicated reboot notifier (the latter
5268          * runs before system_state is updated).
5269          */
5270         if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
5271             system_state == SYSTEM_RESTART)
5272                 return -EBUSY;
5273
5274         /*
5275          * When onlining a CPU, cpu_online_mask is set before kvm_online_cpu()
5276          * is called, and so on_each_cpu() between them includes the CPU that
5277          * is being onlined.  As a result, hardware_enable_nolock() may get
5278          * invoked before kvm_online_cpu(), which also enables hardware if the
5279          * usage count is non-zero.  Disable CPU hotplug to avoid attempting to
5280          * enable hardware multiple times.
5281          */
5282         cpus_read_lock();
5283         mutex_lock(&kvm_lock);
5284
5285         r = 0;
5286
5287         kvm_usage_count++;
5288         if (kvm_usage_count == 1) {
5289                 on_each_cpu(hardware_enable_nolock, &failed, 1);
5290
5291                 if (atomic_read(&failed)) {
5292                         hardware_disable_all_nolock();
5293                         r = -EBUSY;
5294                 }
5295         }
5296
5297         mutex_unlock(&kvm_lock);
5298         cpus_read_unlock();
5299
5300         return r;
5301 }
5302
5303 static void kvm_shutdown(void)
5304 {
5305         /*
5306          * Disable hardware virtualization and set kvm_rebooting to indicate
5307          * that KVM has asynchronously disabled hardware virtualization, i.e.
5308          * that relevant errors and exceptions aren't entirely unexpected.
5309          * Some flavors of hardware virtualization need to be disabled before
5310          * transferring control to firmware (to perform shutdown/reboot), e.g.
5311          * on x86, virtualization can block INIT interrupts, which are used by
5312          * firmware to pull APs back under firmware control.  Note, this path
5313          * is used for both shutdown and reboot scenarios, i.e. neither name is
5314          * 100% comprehensive.
5315          */
5316         pr_info("kvm: exiting hardware virtualization\n");
5317         kvm_rebooting = true;
5318         on_each_cpu(hardware_disable_nolock, NULL, 1);
5319 }
5320
5321 static int kvm_suspend(void)
5322 {
5323         /*
5324          * Secondary CPUs and CPU hotplug are disabled across the suspend/resume
5325          * callbacks, i.e. no need to acquire kvm_lock to ensure the usage count
5326          * is stable.  Assert that kvm_lock is not held to ensure the system
5327          * isn't suspended while KVM is enabling hardware.  Hardware enabling
5328          * can be preempted, but the task cannot be frozen until it has dropped
5329          * all locks (userspace tasks are frozen via a fake signal).
5330          */
5331         lockdep_assert_not_held(&kvm_lock);
5332         lockdep_assert_irqs_disabled();
5333
5334         if (kvm_usage_count)
5335                 hardware_disable_nolock(NULL);
5336         return 0;
5337 }
5338
5339 static void kvm_resume(void)
5340 {
5341         lockdep_assert_not_held(&kvm_lock);
5342         lockdep_assert_irqs_disabled();
5343
5344         if (kvm_usage_count)
5345                 WARN_ON_ONCE(__hardware_enable_nolock());
5346 }
5347
5348 static struct syscore_ops kvm_syscore_ops = {
5349         .suspend = kvm_suspend,
5350         .resume = kvm_resume,
5351         .shutdown = kvm_shutdown,
5352 };
5353 #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
5354 static int hardware_enable_all(void)
5355 {
5356         return 0;
5357 }
5358
5359 static void hardware_disable_all(void)
5360 {
5361
5362 }
5363 #endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
5364
5365 static void kvm_iodevice_destructor(struct kvm_io_device *dev)
5366 {
5367         if (dev->ops->destructor)
5368                 dev->ops->destructor(dev);
5369 }
5370
5371 static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
5372 {
5373         int i;
5374
5375         for (i = 0; i < bus->dev_count; i++) {
5376                 struct kvm_io_device *pos = bus->range[i].dev;
5377
5378                 kvm_iodevice_destructor(pos);
5379         }
5380         kfree(bus);
5381 }
5382
5383 static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
5384                                  const struct kvm_io_range *r2)
5385 {
5386         gpa_t addr1 = r1->addr;
5387         gpa_t addr2 = r2->addr;
5388
5389         if (addr1 < addr2)
5390                 return -1;
5391
5392         /* If r2->len == 0, match the exact address.  If r2->len != 0,
5393          * accept any overlapping write.  Any order is acceptable for
5394          * overlapping ranges, because kvm_io_bus_get_first_dev ensures
5395          * we process all of them.
5396          */
5397         if (r2->len) {
5398                 addr1 += r1->len;
5399                 addr2 += r2->len;
5400         }
5401
5402         if (addr1 > addr2)
5403                 return 1;
5404
5405         return 0;
5406 }
5407
5408 static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
5409 {
5410         return kvm_io_bus_cmp(p1, p2);
5411 }
5412
5413 static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
5414                              gpa_t addr, int len)
5415 {
5416         struct kvm_io_range *range, key;
5417         int off;
5418
5419         key = (struct kvm_io_range) {
5420                 .addr = addr,
5421                 .len = len,
5422         };
5423
5424         range = bsearch(&key, bus->range, bus->dev_count,
5425                         sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
5426         if (range == NULL)
5427                 return -ENOENT;
5428
5429         off = range - bus->range;
5430
5431         while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
5432                 off--;
5433
5434         return off;
5435 }
5436
5437 static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
5438                               struct kvm_io_range *range, const void *val)
5439 {
5440         int idx;
5441
5442         idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
5443         if (idx < 0)
5444                 return -EOPNOTSUPP;
5445
5446         while (idx < bus->dev_count &&
5447                 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
5448                 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
5449                                         range->len, val))
5450                         return idx;
5451                 idx++;
5452         }
5453
5454         return -EOPNOTSUPP;
5455 }
5456
5457 /* kvm_io_bus_write - called under kvm->slots_lock */
5458 int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
5459                      int len, const void *val)
5460 {
5461         struct kvm_io_bus *bus;
5462         struct kvm_io_range range;
5463         int r;
5464
5465         range = (struct kvm_io_range) {
5466                 .addr = addr,
5467                 .len = len,
5468         };
5469
5470         bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5471         if (!bus)
5472                 return -ENOMEM;
5473         r = __kvm_io_bus_write(vcpu, bus, &range, val);
5474         return r < 0 ? r : 0;
5475 }
5476 EXPORT_SYMBOL_GPL(kvm_io_bus_write);
5477
5478 /* kvm_io_bus_write_cookie - called under kvm->slots_lock */
5479 int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
5480                             gpa_t addr, int len, const void *val, long cookie)
5481 {
5482         struct kvm_io_bus *bus;
5483         struct kvm_io_range range;
5484
5485         range = (struct kvm_io_range) {
5486                 .addr = addr,
5487                 .len = len,
5488         };
5489
5490         bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5491         if (!bus)
5492                 return -ENOMEM;
5493
5494         /* First try the device referenced by cookie. */
5495         if ((cookie >= 0) && (cookie < bus->dev_count) &&
5496             (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
5497                 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
5498                                         val))
5499                         return cookie;
5500
5501         /*
5502          * cookie contained garbage; fall back to search and return the
5503          * correct cookie value.
5504          */
5505         return __kvm_io_bus_write(vcpu, bus, &range, val);
5506 }
5507
5508 static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
5509                              struct kvm_io_range *range, void *val)
5510 {
5511         int idx;
5512
5513         idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
5514         if (idx < 0)
5515                 return -EOPNOTSUPP;
5516
5517         while (idx < bus->dev_count &&
5518                 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
5519                 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
5520                                        range->len, val))
5521                         return idx;
5522                 idx++;
5523         }
5524
5525         return -EOPNOTSUPP;
5526 }
5527
5528 /* kvm_io_bus_read - called under kvm->slots_lock */
5529 int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
5530                     int len, void *val)
5531 {
5532         struct kvm_io_bus *bus;
5533         struct kvm_io_range range;
5534         int r;
5535
5536         range = (struct kvm_io_range) {
5537                 .addr = addr,
5538                 .len = len,
5539         };
5540
5541         bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5542         if (!bus)
5543                 return -ENOMEM;
5544         r = __kvm_io_bus_read(vcpu, bus, &range, val);
5545         return r < 0 ? r : 0;
5546 }
5547
5548 /* Caller must hold slots_lock. */
5549 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
5550                             int len, struct kvm_io_device *dev)
5551 {
5552         int i;
5553         struct kvm_io_bus *new_bus, *bus;
5554         struct kvm_io_range range;
5555
5556         bus = kvm_get_bus(kvm, bus_idx);
5557         if (!bus)
5558                 return -ENOMEM;
5559
5560         /* exclude ioeventfd which is limited by maximum fd */
5561         if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
5562                 return -ENOSPC;
5563
5564         new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
5565                           GFP_KERNEL_ACCOUNT);
5566         if (!new_bus)
5567                 return -ENOMEM;
5568
5569         range = (struct kvm_io_range) {
5570                 .addr = addr,
5571                 .len = len,
5572                 .dev = dev,
5573         };
5574
5575         for (i = 0; i < bus->dev_count; i++)
5576                 if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
5577                         break;
5578
5579         memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
5580         new_bus->dev_count++;
5581         new_bus->range[i] = range;
5582         memcpy(new_bus->range + i + 1, bus->range + i,
5583                 (bus->dev_count - i) * sizeof(struct kvm_io_range));
5584         rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5585         synchronize_srcu_expedited(&kvm->srcu);
5586         kfree(bus);
5587
5588         return 0;
5589 }
5590
5591 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5592                               struct kvm_io_device *dev)
5593 {
5594         int i;
5595         struct kvm_io_bus *new_bus, *bus;
5596
5597         lockdep_assert_held(&kvm->slots_lock);
5598
5599         bus = kvm_get_bus(kvm, bus_idx);
5600         if (!bus)
5601                 return 0;
5602
5603         for (i = 0; i < bus->dev_count; i++) {
5604                 if (bus->range[i].dev == dev) {
5605                         break;
5606                 }
5607         }
5608
5609         if (i == bus->dev_count)
5610                 return 0;
5611
5612         new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
5613                           GFP_KERNEL_ACCOUNT);
5614         if (new_bus) {
5615                 memcpy(new_bus, bus, struct_size(bus, range, i));
5616                 new_bus->dev_count--;
5617                 memcpy(new_bus->range + i, bus->range + i + 1,
5618                                 flex_array_size(new_bus, range, new_bus->dev_count - i));
5619         }
5620
5621         rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5622         synchronize_srcu_expedited(&kvm->srcu);
5623
5624         /*
5625          * If NULL bus is installed, destroy the old bus, including all the
5626          * attached devices. Otherwise, destroy the caller's device only.
5627          */
5628         if (!new_bus) {
5629                 pr_err("kvm: failed to shrink bus, removing it completely\n");
5630                 kvm_io_bus_destroy(bus);
5631                 return -ENOMEM;
5632         }
5633
5634         kvm_iodevice_destructor(dev);
5635         kfree(bus);
5636         return 0;
5637 }
5638
5639 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5640                                          gpa_t addr)
5641 {
5642         struct kvm_io_bus *bus;
5643         int dev_idx, srcu_idx;
5644         struct kvm_io_device *iodev = NULL;
5645
5646         srcu_idx = srcu_read_lock(&kvm->srcu);
5647
5648         bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
5649         if (!bus)
5650                 goto out_unlock;
5651
5652         dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
5653         if (dev_idx < 0)
5654                 goto out_unlock;
5655
5656         iodev = bus->range[dev_idx].dev;
5657
5658 out_unlock:
5659         srcu_read_unlock(&kvm->srcu, srcu_idx);
5660
5661         return iodev;
5662 }
5663 EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
5664
5665 static int kvm_debugfs_open(struct inode *inode, struct file *file,
5666                            int (*get)(void *, u64 *), int (*set)(void *, u64),
5667                            const char *fmt)
5668 {
5669         int ret;
5670         struct kvm_stat_data *stat_data = inode->i_private;
5671
5672         /*
5673          * The debugfs files are a reference to the kvm struct which
5674         * is still valid when kvm_destroy_vm is called.  kvm_get_kvm_safe
5675         * avoids the race between open and the removal of the debugfs directory.
5676          */
5677         if (!kvm_get_kvm_safe(stat_data->kvm))
5678                 return -ENOENT;
5679
5680         ret = simple_attr_open(inode, file, get,
5681                                kvm_stats_debugfs_mode(stat_data->desc) & 0222
5682                                ? set : NULL, fmt);
5683         if (ret)
5684                 kvm_put_kvm(stat_data->kvm);
5685
5686         return ret;
5687 }
5688
5689 static int kvm_debugfs_release(struct inode *inode, struct file *file)
5690 {
5691         struct kvm_stat_data *stat_data = inode->i_private;
5692
5693         simple_attr_release(inode, file);
5694         kvm_put_kvm(stat_data->kvm);
5695
5696         return 0;
5697 }
5698
5699 static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
5700 {
5701         *val = *(u64 *)((void *)(&kvm->stat) + offset);
5702
5703         return 0;
5704 }
5705
5706 static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
5707 {
5708         *(u64 *)((void *)(&kvm->stat) + offset) = 0;
5709
5710         return 0;
5711 }
5712
5713 static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
5714 {
5715         unsigned long i;
5716         struct kvm_vcpu *vcpu;
5717
5718         *val = 0;
5719
5720         kvm_for_each_vcpu(i, vcpu, kvm)
5721                 *val += *(u64 *)((void *)(&vcpu->stat) + offset);
5722
5723         return 0;
5724 }
5725
5726 static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
5727 {
5728         unsigned long i;
5729         struct kvm_vcpu *vcpu;
5730
5731         kvm_for_each_vcpu(i, vcpu, kvm)
5732                 *(u64 *)((void *)(&vcpu->stat) + offset) = 0;
5733
5734         return 0;
5735 }
5736
5737 static int kvm_stat_data_get(void *data, u64 *val)
5738 {
5739         int r = -EFAULT;
5740         struct kvm_stat_data *stat_data = data;
5741
5742         switch (stat_data->kind) {
5743         case KVM_STAT_VM:
5744                 r = kvm_get_stat_per_vm(stat_data->kvm,
5745                                         stat_data->desc->desc.offset, val);
5746                 break;
5747         case KVM_STAT_VCPU:
5748                 r = kvm_get_stat_per_vcpu(stat_data->kvm,
5749                                           stat_data->desc->desc.offset, val);
5750                 break;
5751         }
5752
5753         return r;
5754 }
5755
5756 static int kvm_stat_data_clear(void *data, u64 val)
5757 {
5758         int r = -EFAULT;
5759         struct kvm_stat_data *stat_data = data;
5760
5761         if (val)
5762                 return -EINVAL;
5763
5764         switch (stat_data->kind) {
5765         case KVM_STAT_VM:
5766                 r = kvm_clear_stat_per_vm(stat_data->kvm,
5767                                           stat_data->desc->desc.offset);
5768                 break;
5769         case KVM_STAT_VCPU:
5770                 r = kvm_clear_stat_per_vcpu(stat_data->kvm,
5771                                             stat_data->desc->desc.offset);
5772                 break;
5773         }
5774
5775         return r;
5776 }
5777
5778 static int kvm_stat_data_open(struct inode *inode, struct file *file)
5779 {
5780         __simple_attr_check_format("%llu\n", 0ull);
5781         return kvm_debugfs_open(inode, file, kvm_stat_data_get,
5782                                 kvm_stat_data_clear, "%llu\n");
5783 }
5784
5785 static const struct file_operations stat_fops_per_vm = {
5786         .owner = THIS_MODULE,
5787         .open = kvm_stat_data_open,
5788         .release = kvm_debugfs_release,
5789         .read = simple_attr_read,
5790         .write = simple_attr_write,
5791         .llseek = no_llseek,
5792 };
5793
5794 static int vm_stat_get(void *_offset, u64 *val)
5795 {
5796         unsigned offset = (long)_offset;
5797         struct kvm *kvm;
5798         u64 tmp_val;
5799
5800         *val = 0;
5801         mutex_lock(&kvm_lock);
5802         list_for_each_entry(kvm, &vm_list, vm_list) {
5803                 kvm_get_stat_per_vm(kvm, offset, &tmp_val);
5804                 *val += tmp_val;
5805         }
5806         mutex_unlock(&kvm_lock);
5807         return 0;
5808 }
5809
5810 static int vm_stat_clear(void *_offset, u64 val)
5811 {
5812         unsigned offset = (long)_offset;
5813         struct kvm *kvm;
5814
5815         if (val)
5816                 return -EINVAL;
5817
5818         mutex_lock(&kvm_lock);
5819         list_for_each_entry(kvm, &vm_list, vm_list) {
5820                 kvm_clear_stat_per_vm(kvm, offset);
5821         }
5822         mutex_unlock(&kvm_lock);
5823
5824         return 0;
5825 }
5826
5827 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
5828 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");
5829
5830 static int vcpu_stat_get(void *_offset, u64 *val)
5831 {
5832         unsigned offset = (long)_offset;
5833         struct kvm *kvm;
5834         u64 tmp_val;
5835
5836         *val = 0;
5837         mutex_lock(&kvm_lock);
5838         list_for_each_entry(kvm, &vm_list, vm_list) {
5839                 kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
5840                 *val += tmp_val;
5841         }
5842         mutex_unlock(&kvm_lock);
5843         return 0;
5844 }
5845
5846 static int vcpu_stat_clear(void *_offset, u64 val)
5847 {
5848         unsigned offset = (long)_offset;
5849         struct kvm *kvm;
5850
5851         if (val)
5852                 return -EINVAL;
5853
5854         mutex_lock(&kvm_lock);
5855         list_for_each_entry(kvm, &vm_list, vm_list) {
5856                 kvm_clear_stat_per_vcpu(kvm, offset);
5857         }
5858         mutex_unlock(&kvm_lock);
5859
5860         return 0;
5861 }
5862
5863 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
5864                         "%llu\n");
5865 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");
5866
5867 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
5868 {
5869         struct kobj_uevent_env *env;
5870         unsigned long long created, active;
5871
5872         if (!kvm_dev.this_device || !kvm)
5873                 return;
5874
5875         mutex_lock(&kvm_lock);
5876         if (type == KVM_EVENT_CREATE_VM) {
5877                 kvm_createvm_count++;
5878                 kvm_active_vms++;
5879         } else if (type == KVM_EVENT_DESTROY_VM) {
5880                 kvm_active_vms--;
5881         }
5882         created = kvm_createvm_count;
5883         active = kvm_active_vms;
5884         mutex_unlock(&kvm_lock);
5885
5886         env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
5887         if (!env)
5888                 return;
5889
5890         add_uevent_var(env, "CREATED=%llu", created);
5891         add_uevent_var(env, "COUNT=%llu", active);
5892
5893         if (type == KVM_EVENT_CREATE_VM) {
5894                 add_uevent_var(env, "EVENT=create");
5895                 kvm->userspace_pid = task_pid_nr(current);
5896         } else if (type == KVM_EVENT_DESTROY_VM) {
5897                 add_uevent_var(env, "EVENT=destroy");
5898         }
5899         add_uevent_var(env, "PID=%d", kvm->userspace_pid);
5900
5901         if (!IS_ERR(kvm->debugfs_dentry)) {
5902                 char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
5903
5904                 if (p) {
5905                         tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
5906                         if (!IS_ERR(tmp))
5907                                 add_uevent_var(env, "STATS_PATH=%s", tmp);
5908                         kfree(p);
5909                 }
5910         }
5911         /* no need for checks, since we are adding at most only 5 keys */
5912         env->envp[env->envp_idx++] = NULL;
5913         kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
5914         kfree(env);
5915 }
5916
5917 static void kvm_init_debug(void)
5918 {
5919         const struct file_operations *fops;
5920         const struct _kvm_stats_desc *pdesc;
5921         int i;
5922
5923         kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
5924
5925         for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
5926                 pdesc = &kvm_vm_stats_desc[i];
5927                 if (kvm_stats_debugfs_mode(pdesc) & 0222)
5928                         fops = &vm_stat_fops;
5929                 else
5930                         fops = &vm_stat_readonly_fops;
5931                 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
5932                                 kvm_debugfs_dir,
5933                                 (void *)(long)pdesc->desc.offset, fops);
5934         }
5935
5936         for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
5937                 pdesc = &kvm_vcpu_stats_desc[i];
5938                 if (kvm_stats_debugfs_mode(pdesc) & 0222)
5939                         fops = &vcpu_stat_fops;
5940                 else
5941                         fops = &vcpu_stat_readonly_fops;
5942                 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
5943                                 kvm_debugfs_dir,
5944                                 (void *)(long)pdesc->desc.offset, fops);
5945         }
5946 }
5947
5948 static inline
5949 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
5950 {
5951         return container_of(pn, struct kvm_vcpu, preempt_notifier);
5952 }
5953
5954 static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
5955 {
5956         struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
5957
5958         WRITE_ONCE(vcpu->preempted, false);
5959         WRITE_ONCE(vcpu->ready, false);
5960
5961         __this_cpu_write(kvm_running_vcpu, vcpu);
5962         kvm_arch_sched_in(vcpu, cpu);
5963         kvm_arch_vcpu_load(vcpu, cpu);
5964 }
5965
5966 static void kvm_sched_out(struct preempt_notifier *pn,
5967                           struct task_struct *next)
5968 {
5969         struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
5970
5971         if (current->on_rq) {
5972                 WRITE_ONCE(vcpu->preempted, true);
5973                 WRITE_ONCE(vcpu->ready, true);
5974         }
5975         kvm_arch_vcpu_put(vcpu);
5976         __this_cpu_write(kvm_running_vcpu, NULL);
5977 }
5978
5979 /**
5980  * kvm_get_running_vcpu - get the vcpu running on the current CPU.
5981  *
5982  * We can disable preemption locally around accessing the per-CPU variable,
5983  * and use the resolved vcpu pointer after enabling preemption again,
5984  * because even if the current thread is migrated to another CPU, reading
5985  * the per-CPU value later will give us the same value as we update the
5986  * per-CPU variable in the preempt notifier handlers.
5987  */
5988 struct kvm_vcpu *kvm_get_running_vcpu(void)
5989 {
5990         struct kvm_vcpu *vcpu;
5991
5992         preempt_disable();
5993         vcpu = __this_cpu_read(kvm_running_vcpu);
5994         preempt_enable();
5995
5996         return vcpu;
5997 }
5998 EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
5999
6000 /**
6001  * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
6002  */
6003 struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
6004 {
6005         return &kvm_running_vcpu;
6006 }
6007
6008 #ifdef CONFIG_GUEST_PERF_EVENTS
6009 static unsigned int kvm_guest_state(void)
6010 {
6011         struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
6012         unsigned int state;
6013
6014         if (!kvm_arch_pmi_in_guest(vcpu))
6015                 return 0;
6016
6017         state = PERF_GUEST_ACTIVE;
6018         if (!kvm_arch_vcpu_in_kernel(vcpu))
6019                 state |= PERF_GUEST_USER;
6020
6021         return state;
6022 }
6023
6024 static unsigned long kvm_guest_get_ip(void)
6025 {
6026         struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
6027
6028         /* Retrieving the IP must be guarded by a call to kvm_guest_state(). */
6029         if (WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu)))
6030                 return 0;
6031
6032         return kvm_arch_vcpu_get_ip(vcpu);
6033 }
6034
6035 static struct perf_guest_info_callbacks kvm_guest_cbs = {
6036         .state                  = kvm_guest_state,
6037         .get_ip                 = kvm_guest_get_ip,
6038         .handle_intel_pt_intr   = NULL,
6039 };
6040
6041 void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void))
6042 {
6043         kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler;
6044         perf_register_guest_info_callbacks(&kvm_guest_cbs);
6045 }
6046 void kvm_unregister_perf_callbacks(void)
6047 {
6048         perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
6049 }
6050 #endif
6051
6052 int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
6053 {
6054         int r;
6055         int cpu;
6056
6057 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
6058         r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
6059                                       kvm_online_cpu, kvm_offline_cpu);
6060         if (r)
6061                 return r;
6062
6063         register_syscore_ops(&kvm_syscore_ops);
6064 #endif
6065
6066         /* A kmem cache lets us meet the alignment requirements of fx_save. */
6067         if (!vcpu_align)
6068                 vcpu_align = __alignof__(struct kvm_vcpu);
6069         kvm_vcpu_cache =
6070                 kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
6071                                            SLAB_ACCOUNT,
6072                                            offsetof(struct kvm_vcpu, arch),
6073                                            offsetofend(struct kvm_vcpu, stats_id)
6074                                            - offsetof(struct kvm_vcpu, arch),
6075                                            NULL);
6076         if (!kvm_vcpu_cache) {
6077                 r = -ENOMEM;
6078                 goto err_vcpu_cache;
6079         }
6080
6081         for_each_possible_cpu(cpu) {
6082                 if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
6083                                             GFP_KERNEL, cpu_to_node(cpu))) {
6084                         r = -ENOMEM;
6085                         goto err_cpu_kick_mask;
6086                 }
6087         }
6088
6089         r = kvm_irqfd_init();
6090         if (r)
6091                 goto err_irqfd;
6092
6093         r = kvm_async_pf_init();
6094         if (r)
6095                 goto err_async_pf;
6096
6097         kvm_chardev_ops.owner = module;
6098
6099         kvm_preempt_ops.sched_in = kvm_sched_in;
6100         kvm_preempt_ops.sched_out = kvm_sched_out;
6101
6102         kvm_init_debug();
6103
6104         r = kvm_vfio_ops_init();
6105         if (WARN_ON_ONCE(r))
6106                 goto err_vfio;
6107
6108         /*
6109          * Registration _must_ be the very last thing done, as this exposes
6110          * /dev/kvm to userspace, i.e. all infrastructure must be setup!
6111          */
6112         r = misc_register(&kvm_dev);
6113         if (r) {
6114                 pr_err("kvm: misc device register failed\n");
6115                 goto err_register;
6116         }
6117
6118         return 0;
6119
6120 err_register:
6121         kvm_vfio_ops_exit();
6122 err_vfio:
6123         kvm_async_pf_deinit();
6124 err_async_pf:
6125         kvm_irqfd_exit();
6126 err_irqfd:
6127 err_cpu_kick_mask:
6128         for_each_possible_cpu(cpu)
6129                 free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
6130         kmem_cache_destroy(kvm_vcpu_cache);
6131 err_vcpu_cache:
6132 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
6133         unregister_syscore_ops(&kvm_syscore_ops);
6134         cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
6135 #endif
6136         return r;
6137 }
6138 EXPORT_SYMBOL_GPL(kvm_init);
6139
6140 void kvm_exit(void)
6141 {
6142         int cpu;
6143
6144         /*
6145          * Note, unregistering /dev/kvm doesn't strictly need to come first,
6146          * fops_get(), a.k.a. try_module_get(), prevents acquiring references
6147          * to KVM while the module is being stopped.
6148          */
6149         misc_deregister(&kvm_dev);
6150
6151         debugfs_remove_recursive(kvm_debugfs_dir);
6152         for_each_possible_cpu(cpu)
6153                 free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
6154         kmem_cache_destroy(kvm_vcpu_cache);
6155         kvm_vfio_ops_exit();
6156         kvm_async_pf_deinit();
6157 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
6158         unregister_syscore_ops(&kvm_syscore_ops);
6159         cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
6160 #endif
6161         kvm_irqfd_exit();
6162 }
6163 EXPORT_SYMBOL_GPL(kvm_exit);
6164
6165 struct kvm_vm_worker_thread_context {
6166         struct kvm *kvm;
6167         struct task_struct *parent;
6168         struct completion init_done;
6169         kvm_vm_thread_fn_t thread_fn;
6170         uintptr_t data;
6171         int err;
6172 };
6173
6174 static int kvm_vm_worker_thread(void *context)
6175 {
6176         /*
6177          * The init_context is allocated on the stack of the parent thread, so
6178          * we have to locally copy anything that is needed beyond initialization
6179          */
6180         struct kvm_vm_worker_thread_context *init_context = context;
6181         struct task_struct *parent;
6182         struct kvm *kvm = init_context->kvm;
6183         kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
6184         uintptr_t data = init_context->data;
6185         int err;
6186
6187         err = kthread_park(current);
6188         /* kthread_park(current) is never supposed to return an error */
6189         WARN_ON(err != 0);
6190         if (err)
6191                 goto init_complete;
6192
6193         err = cgroup_attach_task_all(init_context->parent, current);
6194         if (err) {
6195                 kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
6196                         __func__, err);
6197                 goto init_complete;
6198         }
6199
6200         set_user_nice(current, task_nice(init_context->parent));
6201
6202 init_complete:
6203         init_context->err = err;
6204         complete(&init_context->init_done);
6205         init_context = NULL;
6206
6207         if (err)
6208                 goto out;
6209
6210         /* Wait to be woken up by the spawner before proceeding. */
6211         kthread_parkme();
6212
6213         if (!kthread_should_stop())
6214                 err = thread_fn(kvm, data);
6215
6216 out:
6217         /*
6218          * Move kthread back to its original cgroup to prevent it lingering in
6219          * the cgroup of the VM process, after the latter finishes its
6220          * execution.
6221          *
6222          * kthread_stop() waits on the 'exited' completion condition which is
6223          * set in exit_mm(), via mm_release(), in do_exit(). However, the
6224          * kthread is removed from the cgroup in the cgroup_exit() which is
6225          * called after the exit_mm(). This causes the kthread_stop() to return
6226          * before the kthread actually quits the cgroup.
6227          */
6228         rcu_read_lock();
6229         parent = rcu_dereference(current->real_parent);
6230         get_task_struct(parent);
6231         rcu_read_unlock();
6232         cgroup_attach_task_all(parent, current);
6233         put_task_struct(parent);
6234
6235         return err;
6236 }
6237
6238 int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
6239                                 uintptr_t data, const char *name,
6240                                 struct task_struct **thread_ptr)
6241 {
6242         struct kvm_vm_worker_thread_context init_context = {};
6243         struct task_struct *thread;
6244
6245         *thread_ptr = NULL;
6246         init_context.kvm = kvm;
6247         init_context.parent = current;
6248         init_context.thread_fn = thread_fn;
6249         init_context.data = data;
6250         init_completion(&init_context.init_done);
6251
6252         thread = kthread_run(kvm_vm_worker_thread, &init_context,
6253                              "%s-%d", name, task_pid_nr(current));
6254         if (IS_ERR(thread))
6255                 return PTR_ERR(thread);
6256
6257         /* kthread_run is never supposed to return NULL */
6258         WARN_ON(thread == NULL);
6259
6260         wait_for_completion(&init_context.init_done);
6261
6262         if (!init_context.err)
6263                 *thread_ptr = thread;
6264
6265         return init_context.err;
6266 }