virt/kvm/kvm_main.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Kernel-based Virtual Machine driver for Linux
   4  *
   5  * This module enables machines with Intel VT-x extensions to run virtual
   6  * machines without emulation or binary translation.
   7  *
   8  * Copyright (C) 2006 Qumranet, Inc.
   9  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  10  *
  11  * Authors:
  12  *   Avi Kivity   <avi@qumranet.com>
  13  *   Yaniv Kamay  <yaniv@qumranet.com>
  14  */
  15
  16 #include <kvm/iodev.h>
  17
  18 #include <linux/kvm_host.h>
  19 #include <linux/kvm.h>
  20 #include <linux/module.h>
  21 #include <linux/errno.h>
  22 #include <linux/percpu.h>
  23 #include <linux/mm.h>
  24 #include <linux/miscdevice.h>
  25 #include <linux/vmalloc.h>
  26 #include <linux/reboot.h>
  27 #include <linux/debugfs.h>
  28 #include <linux/highmem.h>
  29 #include <linux/file.h>
  30 #include <linux/syscore_ops.h>
  31 #include <linux/cpu.h>
  32 #include <linux/sched/signal.h>
  33 #include <linux/sched/mm.h>
  34 #include <linux/sched/stat.h>
  35 #include <linux/cpumask.h>
  36 #include <linux/smp.h>
  37 #include <linux/anon_inodes.h>
  38 #include <linux/profile.h>
  39 #include <linux/kvm_para.h>
  40 #include <linux/pagemap.h>
  41 #include <linux/mman.h>
  42 #include <linux/swap.h>
  43 #include <linux/bitops.h>
  44 #include <linux/spinlock.h>
  45 #include <linux/compat.h>
  46 #include <linux/srcu.h>
  47 #include <linux/hugetlb.h>
  48 #include <linux/slab.h>
  49 #include <linux/sort.h>
  50 #include <linux/bsearch.h>
  51 #include <linux/io.h>
  52 #include <linux/lockdep.h>
  53 #include <linux/kthread.h>
  54 #include <linux/suspend.h>
  55
  56 #include <asm/processor.h>
  57 #include <asm/ioctl.h>
  58 #include <linux/uaccess.h>
  59
  60 #include "coalesced_mmio.h"
  61 #include "async_pf.h"
  62 #include "kvm_mm.h"
  63 #include "vfio.h"
  64
  65 #include <trace/events/ipi.h>
  66
  67 #define CREATE_TRACE_POINTS
  68 #include <trace/events/kvm.h>
  69
  70 #include <linux/kvm_dirty_ring.h>
  71
  72
/*
 * Worst case buffer size needed for holding an integer.
 * Twelve bytes cover a sign, ten decimal digits and the terminating NUL,
 * i.e. any value of a 32-bit int.
 */
#define ITOA_MAX_LEN 12

/* Module metadata: author and license tags for this module. */
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
  78
/*
 * Halt-polling tunables.  Each one is registered as a module parameter with
 * mode 0644 and exported (GPL-only) so that other modules can read it.
 */

/* Architectures should define their poll value according to the halt latency */
unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
module_param(halt_poll_ns, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns);

/* Default doubles per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_grow = 2;
module_param(halt_poll_ns_grow, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow);

/* The start value to grow halt_poll_ns from */
unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
module_param(halt_poll_ns_grow_start, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);

/*
 * Default resets per-vcpu halt_poll_ns.  The variable has no initializer, so
 * the default value is zero.
 */
unsigned int halt_poll_ns_shrink;
module_param(halt_poll_ns_shrink, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
  98
/*
 * Ordering of locks:
 *
 *      kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 */

/*
 * Global mutex and global list head.
 * NOTE(review): presumably kvm_lock protects vm_list, the list of all VMs;
 * confirm against the users of both symbols.
 */
DEFINE_MUTEX(kvm_lock);
LIST_HEAD(vm_list);

/* Slab cache that vCPU structures are returned to in kvm_vcpu_destroy(). */
static struct kmem_cache *kvm_vcpu_cache;

/* Preempt-notifier callbacks attached to every vCPU by kvm_vcpu_init(). */
static __read_mostly struct preempt_ops kvm_preempt_ops;
/* Per-CPU pointer written by vcpu_load() and reset to NULL by vcpu_put(). */
static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);

/* Debugfs directory handle, exported (GPL-only) to other modules. */
struct dentry *kvm_debugfs_dir;
EXPORT_SYMBOL_GPL(kvm_debugfs_dir);

/* Tentative definition; the initializer is expected later in this file. */
static const struct file_operations stat_fops_per_vm;

/* Tentative definition; the initializer is expected later in this file. */
static struct file_operations kvm_chardev_ops;
/*
 * Forward declaration of the vCPU ioctl handler; being static, it must be
 * defined in this translation unit.
 */
static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);
#ifdef CONFIG_KVM_COMPAT
static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
				  unsigned long arg);
/*
 * Expands to a designated initializer that installs the compat handler @c.
 * NOTE(review): presumably used inside struct file_operations initializers.
 */
#define KVM_COMPAT(c)	.compat_ioctl	= (c)
#else
/*
 * For architectures that don't implement a compat infrastructure,
 * adopt a double line of defense:
 * - Prevent a compat task from opening /dev/kvm
 * - If the open has been done by a 64bit task, and the KVM fd
 *   passed to a compat task, let the ioctls fail.
 */
/* Second line of defense: every compat ioctl fails with -EINVAL. */
static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
				unsigned long arg) { return -EINVAL; }

/* First line of defense: refuse the open with -ENODEV for compat tasks. */
static int kvm_no_compat_open(struct inode *inode, struct file *file)
{
	return is_compat_task() ? -ENODEV : 0;
}
/*
 * Without compat support the argument @c is ignored; the macro installs the
 * two rejecting handlers above instead.
 */
#define KVM_COMPAT(c)	.compat_ioctl	= kvm_no_compat_ioctl,	\
			.open		= kvm_no_compat_open
#endif
/* Forward declarations of static helpers defined in this translation unit. */
static int hardware_enable_all(void);
static void hardware_disable_all(void);

static void kvm_io_bus_destroy(struct kvm_io_bus *bus);

/*
 * NOTE(review): presumably the values accepted by the 'type' parameter of
 * kvm_uevent_notify_change() -- confirm against its definition.
 */
#define KVM_EVENT_CREATE_VM 0
#define KVM_EVENT_DESTROY_VM 1
static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
/* NOTE(review): names suggest VM creation / live-VM counters -- confirm. */
static unsigned long long kvm_createvm_count;
static unsigned long long kvm_active_vms;

/*
 * Per-CPU scratch cpumask in which the request helpers
 * (kvm_make_vcpus_request_mask(), kvm_make_all_cpus_request_except())
 * collect the CPUs to be kicked.
 */
static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);
 156
 157 __weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
 158 {
 159 }
 160
 161 bool kvm_is_zone_device_page(struct page *page)
 162 {
 163         /*
 164          * The metadata used by is_zone_device_page() to determine whether or
 165          * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
 166          * the device has been pinned, e.g. by get_user_pages().  WARN if the
 167          * page_count() is zero to help detect bad usage of this helper.
 168          */
 169         if (WARN_ON_ONCE(!page_count(page)))
 170                 return false;
 171
 172         return is_zone_device_page(page);
 173 }
 174
 175 /*
 176  * Returns a 'struct page' if the pfn is "valid" and backed by a refcounted
 177  * page, NULL otherwise.  Note, the list of refcounted PG_reserved page types
 178  * is likely incomplete, it has been compiled purely through people wanting to
 179  * back guest with a certain type of memory and encountering issues.
 180  */
 181 struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn)
 182 {
 183         struct page *page;
 184
 185         if (!pfn_valid(pfn))
 186                 return NULL;
 187
 188         page = pfn_to_page(pfn);
 189         if (!PageReserved(page))
 190                 return page;
 191
 192         /* The ZERO_PAGE(s) is marked PG_reserved, but is refcounted. */
 193         if (is_zero_pfn(pfn))
 194                 return page;
 195
 196         /*
 197          * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
 198          * perspective they are "normal" pages, albeit with slightly different
 199          * usage rules.
 200          */
 201         if (kvm_is_zone_device_page(page))
 202                 return page;
 203
 204         return NULL;
 205 }
 206
 207 /*
 208  * Switches to specified vcpu, until a matching vcpu_put()
 209  */
 210 void vcpu_load(struct kvm_vcpu *vcpu)
 211 {
 212         int cpu = get_cpu();
 213
 214         __this_cpu_write(kvm_running_vcpu, vcpu);
 215         preempt_notifier_register(&vcpu->preempt_notifier);
 216         kvm_arch_vcpu_load(vcpu, cpu);
 217         put_cpu();
 218 }
 219 EXPORT_SYMBOL_GPL(vcpu_load);
 220
 221 void vcpu_put(struct kvm_vcpu *vcpu)
 222 {
 223         preempt_disable();
 224         kvm_arch_vcpu_put(vcpu);
 225         preempt_notifier_unregister(&vcpu->preempt_notifier);
 226         __this_cpu_write(kvm_running_vcpu, NULL);
 227         preempt_enable();
 228 }
 229 EXPORT_SYMBOL_GPL(vcpu_put);
 230
 231 /* TODO: merge with kvm_arch_vcpu_should_kick */
 232 static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
 233 {
 234         int mode = kvm_vcpu_exiting_guest_mode(vcpu);
 235
 236         /*
 237          * We need to wait for the VCPU to reenable interrupts and get out of
 238          * READING_SHADOW_PAGE_TABLES mode.
 239          */
 240         if (req & KVM_REQUEST_WAIT)
 241                 return mode != OUTSIDE_GUEST_MODE;
 242
 243         /*
 244          * Need to kick a running VCPU, but otherwise there is nothing to do.
 245          */
 246         return mode == IN_GUEST_MODE;
 247 }
 248
 249 static void ack_kick(void *_completed)
 250 {
 251 }
 252
 253 static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
 254 {
 255         if (cpumask_empty(cpus))
 256                 return false;
 257
 258         smp_call_function_many(cpus, ack_kick, NULL, wait);
 259         return true;
 260 }
 261
 262 static void kvm_make_vcpu_request(struct kvm_vcpu *vcpu, unsigned int req,
 263                                   struct cpumask *tmp, int current_cpu)
 264 {
 265         int cpu;
 266
 267         if (likely(!(req & KVM_REQUEST_NO_ACTION)))
 268                 __kvm_make_request(req, vcpu);
 269
 270         if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
 271                 return;
 272
 273         /*
 274          * Note, the vCPU could get migrated to a different pCPU at any point
 275          * after kvm_request_needs_ipi(), which could result in sending an IPI
 276          * to the previous pCPU.  But, that's OK because the purpose of the IPI
 277          * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is
 278          * satisfied if the vCPU migrates. Entering READING_SHADOW_PAGE_TABLES
 279          * after this point is also OK, as the requirement is only that KVM wait
 280          * for vCPUs that were reading SPTEs _before_ any changes were
 281          * finalized. See kvm_vcpu_kick() for more details on handling requests.
 282          */
 283         if (kvm_request_needs_ipi(vcpu, req)) {
 284                 cpu = READ_ONCE(vcpu->cpu);
 285                 if (cpu != -1 && cpu != current_cpu)
 286                         __cpumask_set_cpu(cpu, tmp);
 287         }
 288 }
 289
 290 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
 291                                  unsigned long *vcpu_bitmap)
 292 {
 293         struct kvm_vcpu *vcpu;
 294         struct cpumask *cpus;
 295         int i, me;
 296         bool called;
 297
 298         me = get_cpu();
 299
 300         cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
 301         cpumask_clear(cpus);
 302
 303         for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
 304                 vcpu = kvm_get_vcpu(kvm, i);
 305                 if (!vcpu)
 306                         continue;
 307                 kvm_make_vcpu_request(vcpu, req, cpus, me);
 308         }
 309
 310         called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
 311         put_cpu();
 312
 313         return called;
 314 }
 315
 316 bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
 317                                       struct kvm_vcpu *except)
 318 {
 319         struct kvm_vcpu *vcpu;
 320         struct cpumask *cpus;
 321         unsigned long i;
 322         bool called;
 323         int me;
 324
 325         me = get_cpu();
 326
 327         cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
 328         cpumask_clear(cpus);
 329
 330         kvm_for_each_vcpu(i, vcpu, kvm) {
 331                 if (vcpu == except)
 332                         continue;
 333                 kvm_make_vcpu_request(vcpu, req, cpus, me);
 334         }
 335
 336         called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
 337         put_cpu();
 338
 339         return called;
 340 }
 341
 342 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
 343 {
 344         return kvm_make_all_cpus_request_except(kvm, req, NULL);
 345 }
 346 EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);
 347
 348 void kvm_flush_remote_tlbs(struct kvm *kvm)
 349 {
 350         ++kvm->stat.generic.remote_tlb_flush_requests;
 351
 352         /*
 353          * We want to publish modifications to the page tables before reading
 354          * mode. Pairs with a memory barrier in arch-specific code.
 355          * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
 356          * and smp_mb in walk_shadow_page_lockless_begin/end.
 357          * - powerpc: smp_mb in kvmppc_prepare_to_enter.
 358          *
 359          * There is already an smp_mb__after_atomic() before
 360          * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
 361          * barrier here.
 362          */
 363         if (!kvm_arch_flush_remote_tlbs(kvm)
 364             || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
 365                 ++kvm->stat.generic.remote_tlb_flush;
 366 }
 367 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
 368
 369 void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
 370 {
 371         if (!kvm_arch_flush_remote_tlbs_range(kvm, gfn, nr_pages))
 372                 return;
 373
 374         /*
 375          * Fall back to a flushing entire TLBs if the architecture range-based
 376          * TLB invalidation is unsupported or can't be performed for whatever
 377          * reason.
 378          */
 379         kvm_flush_remote_tlbs(kvm);
 380 }
 381
 382 void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
 383                                    const struct kvm_memory_slot *memslot)
 384 {
 385         /*
 386          * All current use cases for flushing the TLBs for a specific memslot
 387          * are related to dirty logging, and many do the TLB flush out of
 388          * mmu_lock. The interaction between the various operations on memslot
 389          * must be serialized by slots_locks to ensure the TLB flush from one
 390          * operation is observed by any other operation on the same memslot.
 391          */
 392         lockdep_assert_held(&kvm->slots_lock);
 393         kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
 394 }
 395
 396 static void kvm_flush_shadow_all(struct kvm *kvm)
 397 {
 398         kvm_arch_flush_shadow_all(kvm);
 399         kvm_arch_guest_memory_reclaimed(kvm);
 400 }
 401
 402 #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
 403 static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
 404                                                gfp_t gfp_flags)
 405 {
 406         gfp_flags |= mc->gfp_zero;
 407
 408         if (mc->kmem_cache)
 409                 return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
 410         else
 411                 return (void *)__get_free_page(gfp_flags);
 412 }
 413
 414 int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
 415 {
 416         gfp_t gfp = mc->gfp_custom ? mc->gfp_custom : GFP_KERNEL_ACCOUNT;
 417         void *obj;
 418
 419         if (mc->nobjs >= min)
 420                 return 0;
 421
 422         if (unlikely(!mc->objects)) {
 423                 if (WARN_ON_ONCE(!capacity))
 424                         return -EIO;
 425
 426                 mc->objects = kvmalloc_array(sizeof(void *), capacity, gfp);
 427                 if (!mc->objects)
 428                         return -ENOMEM;
 429
 430                 mc->capacity = capacity;
 431         }
 432
 433         /* It is illegal to request a different capacity across topups. */
 434         if (WARN_ON_ONCE(mc->capacity != capacity))
 435                 return -EIO;
 436
 437         while (mc->nobjs < mc->capacity) {
 438                 obj = mmu_memory_cache_alloc_obj(mc, gfp);
 439                 if (!obj)
 440                         return mc->nobjs >= min ? 0 : -ENOMEM;
 441                 mc->objects[mc->nobjs++] = obj;
 442         }
 443         return 0;
 444 }
 445
 446 int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
 447 {
 448         return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min);
 449 }
 450
 451 int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
 452 {
 453         return mc->nobjs;
 454 }
 455
 456 void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
 457 {
 458         while (mc->nobjs) {
 459                 if (mc->kmem_cache)
 460                         kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
 461                 else
 462                         free_page((unsigned long)mc->objects[--mc->nobjs]);
 463         }
 464
 465         kvfree(mc->objects);
 466
 467         mc->objects = NULL;
 468         mc->capacity = 0;
 469 }
 470
 471 void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
 472 {
 473         void *p;
 474
 475         if (WARN_ON(!mc->nobjs))
 476                 p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
 477         else
 478                 p = mc->objects[--mc->nobjs];
 479         BUG_ON(!p);
 480         return p;
 481 }
 482 #endif
 483
 484 static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 485 {
 486         mutex_init(&vcpu->mutex);
 487         vcpu->cpu = -1;
 488         vcpu->kvm = kvm;
 489         vcpu->vcpu_id = id;
 490         vcpu->pid = NULL;
 491 #ifndef __KVM_HAVE_ARCH_WQP
 492         rcuwait_init(&vcpu->wait);
 493 #endif
 494         kvm_async_pf_vcpu_init(vcpu);
 495
 496         kvm_vcpu_set_in_spin_loop(vcpu, false);
 497         kvm_vcpu_set_dy_eligible(vcpu, false);
 498         vcpu->preempted = false;
 499         vcpu->ready = false;
 500         preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
 501         vcpu->last_used_slot = NULL;
 502
 503         /* Fill the stats id string for the vcpu */
 504         snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
 505                  task_pid_nr(current), id);
 506 }
 507
 508 static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
 509 {
 510         kvm_arch_vcpu_destroy(vcpu);
 511         kvm_dirty_ring_free(&vcpu->dirty_ring);
 512
 513         /*
 514          * No need for rcu_read_lock as VCPU_RUN is the only place that changes
 515          * the vcpu->pid pointer, and at destruction time all file descriptors
 516          * are already gone.
 517          */
 518         put_pid(rcu_dereference_protected(vcpu->pid, 1));
 519
 520         free_page((unsigned long)vcpu->run);
 521         kmem_cache_free(kvm_vcpu_cache, vcpu);
 522 }
 523
 524 void kvm_destroy_vcpus(struct kvm *kvm)
 525 {
 526         unsigned long i;
 527         struct kvm_vcpu *vcpu;
 528
 529         kvm_for_each_vcpu(i, vcpu, kvm) {
 530                 kvm_vcpu_destroy(vcpu);
 531                 xa_erase(&kvm->vcpu_array, i);
 532         }
 533
 534         atomic_set(&kvm->online_vcpus, 0);
 535 }
 536 EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);
 537
 538 #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
 539 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
 540 {
 541         return container_of(mn, struct kvm, mmu_notifier);
 542 }
 543
/*
 * Per-memslot callback invoked by __kvm_handle_hva_range() for each GFN range
 * that intersects the notifier range; the bool results are OR-ed together.
 */
typedef bool (*gfn_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);

/*
 * Optional hooks run by __kvm_handle_hva_range(): on_lock right after
 * mmu_lock is taken, on_unlock right after it is dropped.
 */
typedef void (*on_lock_fn_t)(struct kvm *kvm);
typedef void (*on_unlock_fn_t)(struct kvm *kvm);
 548
/*
 * Description of one notifier event as consumed by __kvm_handle_hva_range():
 * the half-open address range [start, end) plus the callbacks and flags that
 * control how intersecting memslots are processed.
 */
struct kvm_mmu_notifier_range {
	/*
	 * 64-bit addresses, as KVM notifiers can operate on host virtual
	 * addresses (unsigned long) and guest physical addresses (64-bit).
	 */
	u64 start;
	u64 end;
	/* Copied into kvm_gfn_range.arg for the handler. */
	union kvm_mmu_notifier_arg arg;
	/* Per-memslot callback, or the kvm_null_fn stub for "none". */
	gfn_handler_t handler;
	/* Hooks around mmu_lock, or the kvm_null_fn stub for "none". */
	on_lock_fn_t on_lock;
	on_unlock_fn_t on_unlock;
	/* Flush remote TLBs if any handler invocation returned true. */
	bool flush_on_ret;
	/* Copied into kvm_gfn_range.may_block for the handler. */
	bool may_block;
};
 563
 564 /*
 565  * Use a dedicated stub instead of NULL to indicate that there is no callback
 566  * function/handler.  The compiler technically can't guarantee that a real
 567  * function will have a non-zero address, and so it will generate code to
 568  * check for !NULL, whereas comparing against a stub will be elided at compile
 569  * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
 570  */
 571 static void kvm_null_fn(void)
 572 {
 573
 574 }
 575 #define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
 576
/*
 * Zero-initialized notifier argument (static storage, no initializer).
 * NOTE(review): presumably passed by notifier paths that carry no argument --
 * confirm against its users.
 */
static const union kvm_mmu_notifier_arg KVM_MMU_NOTIFIER_NO_ARG;
 578
 579 /* Iterate over each memslot intersecting [start, last] (inclusive) range */
 580 #define kvm_for_each_memslot_in_hva_range(node, slots, start, last)          \
 581         for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \
 582              node;                                                           \
 583              node = interval_tree_iter_next(node, start, last))      \
 584
 585 static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 586                                                   const struct kvm_mmu_notifier_range *range)
 587 {
 588         bool ret = false, locked = false;
 589         struct kvm_gfn_range gfn_range;
 590         struct kvm_memory_slot *slot;
 591         struct kvm_memslots *slots;
 592         int i, idx;
 593
 594         if (WARN_ON_ONCE(range->end <= range->start))
 595                 return 0;
 596
 597         /* A null handler is allowed if and only if on_lock() is provided. */
 598         if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
 599                          IS_KVM_NULL_FN(range->handler)))
 600                 return 0;
 601
 602         idx = srcu_read_lock(&kvm->srcu);
 603
 604         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
 605                 struct interval_tree_node *node;
 606
 607                 slots = __kvm_memslots(kvm, i);
 608                 kvm_for_each_memslot_in_hva_range(node, slots,
 609                                                   range->start, range->end - 1) {
 610                         unsigned long hva_start, hva_end;
 611
 612                         slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]);
 613                         hva_start = max_t(unsigned long, range->start, slot->userspace_addr);
 614                         hva_end = min_t(unsigned long, range->end,
 615                                         slot->userspace_addr + (slot->npages << PAGE_SHIFT));
 616
 617                         /*
 618                          * To optimize for the likely case where the address
 619                          * range is covered by zero or one memslots, don't
 620                          * bother making these conditional (to avoid writes on
 621                          * the second or later invocation of the handler).
 622                          */
 623                         gfn_range.arg = range->arg;
 624                         gfn_range.may_block = range->may_block;
 625
 626                         /*
 627                          * {gfn(page) | page intersects with [hva_start, hva_end)} =
 628                          * {gfn_start, gfn_start+1, ..., gfn_end-1}.
 629                          */
 630                         gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
 631                         gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
 632                         gfn_range.slot = slot;
 633
 634                         if (!locked) {
 635                                 locked = true;
 636                                 KVM_MMU_LOCK(kvm);
 637                                 if (!IS_KVM_NULL_FN(range->on_lock))
 638                                         range->on_lock(kvm);
 639
 640                                 if (IS_KVM_NULL_FN(range->handler))
 641                                         break;
 642                         }
 643                         ret |= range->handler(kvm, &gfn_range);
 644                 }
 645         }
 646
 647         if (range->flush_on_ret && ret)
 648                 kvm_flush_remote_tlbs(kvm);
 649
 650         if (locked) {
 651                 KVM_MMU_UNLOCK(kvm);
 652                 if (!IS_KVM_NULL_FN(range->on_unlock))
 653                         range->on_unlock(kvm);
 654         }
 655
 656         srcu_read_unlock(&kvm->srcu, idx);
 657
 658         /* The notifiers are averse to booleans. :-( */
 659         return (int)ret;
 660 }
 661
 662 static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
 663                                                 unsigned long start,
 664                                                 unsigned long end,
 665                                                 union kvm_mmu_notifier_arg arg,
 666                                                 gfn_handler_t handler)
 667 {
 668         struct kvm *kvm = mmu_notifier_to_kvm(mn);
 669         const struct kvm_mmu_notifier_range range = {
 670                 .start          = start,
 671                 .end            = end,
 672                 .arg            = arg,
 673                 .handler        = handler,
 674                 .on_lock        = (void *)kvm_null_fn,
 675                 .on_unlock      = (void *)kvm_null_fn,
 676                 .flush_on_ret   = true,
 677                 .may_block      = false,
 678         };
 679
 680         return __kvm_handle_hva_range(kvm, &range);
 681 }
 682
 683 static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
 684                                                          unsigned long start,
 685                                                          unsigned long end,
 686                                                          gfn_handler_t handler)
 687 {
 688         struct kvm *kvm = mmu_notifier_to_kvm(mn);
 689         const struct kvm_mmu_notifier_range range = {
 690                 .start          = start,
 691                 .end            = end,
 692                 .handler        = handler,
 693                 .on_lock        = (void *)kvm_null_fn,
 694                 .on_unlock      = (void *)kvm_null_fn,
 695                 .flush_on_ret   = false,
 696                 .may_block      = false,
 697         };
 698
 699         return __kvm_handle_hva_range(kvm, &range);
 700 }
 701
 702 static bool kvm_change_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 703 {
 704         /*
 705          * Skipping invalid memslots is correct if and only change_pte() is
 706          * surrounded by invalidate_range_{start,end}(), which is currently
 707          * guaranteed by the primary MMU.  If that ever changes, KVM needs to
 708          * unmap the memslot instead of skipping the memslot to ensure that KVM
 709          * doesn't hold references to the old PFN.
 710          */
 711         WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
 712
 713         if (range->slot->flags & KVM_MEMSLOT_INVALID)
 714                 return false;
 715
 716         return kvm_set_spte_gfn(kvm, range);
 717 }
 718
 719 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 720                                         struct mm_struct *mm,
 721                                         unsigned long address,
 722                                         pte_t pte)
 723 {
 724         struct kvm *kvm = mmu_notifier_to_kvm(mn);
 725         const union kvm_mmu_notifier_arg arg = { .pte = pte };
 726
 727         trace_kvm_set_spte_hva(address);
 728
 729         /*
 730          * .change_pte() must be surrounded by .invalidate_range_{start,end}().
 731          * If mmu_invalidate_in_progress is zero, then no in-progress
 732          * invalidations, including this one, found a relevant memslot at
 733          * start(); rechecking memslots here is unnecessary.  Note, a false
 734          * positive (count elevated by a different invalidation) is sub-optimal
 735          * but functionally ok.
 736          */
 737         WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
 738         if (!READ_ONCE(kvm->mmu_invalidate_in_progress))
 739                 return;
 740
 741         kvm_handle_hva_range(mn, address, address + 1, arg, kvm_change_spte_gfn);
 742 }
 743
 744 void kvm_mmu_invalidate_begin(struct kvm *kvm)
 745 {
 746         lockdep_assert_held_write(&kvm->mmu_lock);
 747         /*
 748          * The count increase must become visible at unlock time as no
 749          * spte can be established without taking the mmu_lock and
 750          * count is also read inside the mmu_lock critical section.
 751          */
 752         kvm->mmu_invalidate_in_progress++;
 753
 754         if (likely(kvm->mmu_invalidate_in_progress == 1)) {
 755                 kvm->mmu_invalidate_range_start = INVALID_GPA;
 756                 kvm->mmu_invalidate_range_end = INVALID_GPA;
 757         }
 758 }
 759
 760 void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
 761 {
 762         lockdep_assert_held_write(&kvm->mmu_lock);
 763
 764         WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);
 765
 766         if (likely(kvm->mmu_invalidate_range_start == INVALID_GPA)) {
 767                 kvm->mmu_invalidate_range_start = start;
 768                 kvm->mmu_invalidate_range_end = end;
 769         } else {
 770                 /*
 771                  * Fully tracking multiple concurrent ranges has diminishing
 772                  * returns. Keep things simple and just find the minimal range
 773                  * which includes the current and new ranges. As there won't be
 774                  * enough information to subtract a range after its invalidate
 775                  * completes, any ranges invalidated concurrently will
 776                  * accumulate and persist until all outstanding invalidates
 777                  * complete.
 778                  */
 779                 kvm->mmu_invalidate_range_start =
 780                         min(kvm->mmu_invalidate_range_start, start);
 781                 kvm->mmu_invalidate_range_end =
 782                         max(kvm->mmu_invalidate_range_end, end);
 783         }
 784 }
 785
 786 static bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 787 {
 788         kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
 789         return kvm_unmap_gfn_range(kvm, range);
 790 }
 791
 792 static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 793                                         const struct mmu_notifier_range *range)
 794 {
 795         struct kvm *kvm = mmu_notifier_to_kvm(mn);
 796         const struct kvm_mmu_notifier_range hva_range = {
 797                 .start          = range->start,
 798                 .end            = range->end,
 799                 .handler        = kvm_mmu_unmap_gfn_range,
 800                 .on_lock        = kvm_mmu_invalidate_begin,
 801                 .on_unlock      = kvm_arch_guest_memory_reclaimed,
 802                 .flush_on_ret   = true,
 803                 .may_block      = mmu_notifier_range_blockable(range),
 804         };
 805
 806         trace_kvm_unmap_hva_range(range->start, range->end);
 807
 808         /*
 809          * Prevent memslot modification between range_start() and range_end()
 810          * so that conditionally locking provides the same result in both
 811          * functions.  Without that guarantee, the mmu_invalidate_in_progress
 812          * adjustments will be imbalanced.
 813          *
 814          * Pairs with the decrement in range_end().
 815          */
 816         spin_lock(&kvm->mn_invalidate_lock);
 817         kvm->mn_active_invalidate_count++;
 818         spin_unlock(&kvm->mn_invalidate_lock);
 819
 820         /*
 821          * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e.
 822          * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring
 823          * each cache's lock.  There are relatively few caches in existence at
 824          * any given time, and the caches themselves can check for hva overlap,
 825          * i.e. don't need to rely on memslot overlap checks for performance.
 826          * Because this runs without holding mmu_lock, the pfn caches must use
 827          * mn_active_invalidate_count (see above) instead of
 828          * mmu_invalidate_in_progress.
 829          */
 830         gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end,
 831                                           hva_range.may_block);
 832
 833         __kvm_handle_hva_range(kvm, &hva_range);
 834
 835         return 0;
 836 }
 837
 838 void kvm_mmu_invalidate_end(struct kvm *kvm)
 839 {
 840         lockdep_assert_held_write(&kvm->mmu_lock);
 841
 842         /*
 843          * This sequence increase will notify the kvm page fault that
 844          * the page that is going to be mapped in the spte could have
 845          * been freed.
 846          */
 847         kvm->mmu_invalidate_seq++;
 848         smp_wmb();
 849         /*
 850          * The above sequence increase must be visible before the
 851          * below count decrease, which is ensured by the smp_wmb above
 852          * in conjunction with the smp_rmb in mmu_invalidate_retry().
 853          */
 854         kvm->mmu_invalidate_in_progress--;
 855         KVM_BUG_ON(kvm->mmu_invalidate_in_progress < 0, kvm);
 856
 857         /*
 858          * Assert that at least one range was added between start() and end().
 859          * Not adding a range isn't fatal, but it is a KVM bug.
 860          */
 861         WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA);
 862 }
 863
 864 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 865                                         const struct mmu_notifier_range *range)
 866 {
 867         struct kvm *kvm = mmu_notifier_to_kvm(mn);
 868         const struct kvm_mmu_notifier_range hva_range = {
 869                 .start          = range->start,
 870                 .end            = range->end,
 871                 .handler        = (void *)kvm_null_fn,
 872                 .on_lock        = kvm_mmu_invalidate_end,
 873                 .on_unlock      = (void *)kvm_null_fn,
 874                 .flush_on_ret   = false,
 875                 .may_block      = mmu_notifier_range_blockable(range),
 876         };
 877         bool wake;
 878
 879         __kvm_handle_hva_range(kvm, &hva_range);
 880
 881         /* Pairs with the increment in range_start(). */
 882         spin_lock(&kvm->mn_invalidate_lock);
 883         wake = (--kvm->mn_active_invalidate_count == 0);
 884         spin_unlock(&kvm->mn_invalidate_lock);
 885
 886         /*
 887          * There can only be one waiter, since the wait happens under
 888          * slots_lock.
 889          */
 890         if (wake)
 891                 rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
 892 }
 893
 894 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 895                                               struct mm_struct *mm,
 896                                               unsigned long start,
 897                                               unsigned long end)
 898 {
 899         trace_kvm_age_hva(start, end);
 900
 901         return kvm_handle_hva_range(mn, start, end, KVM_MMU_NOTIFIER_NO_ARG,
 902                                     kvm_age_gfn);
 903 }
 904
 905 static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
 906                                         struct mm_struct *mm,
 907                                         unsigned long start,
 908                                         unsigned long end)
 909 {
 910         trace_kvm_age_hva(start, end);
 911
 912         /*
 913          * Even though we do not flush TLB, this will still adversely
 914          * affect performance on pre-Haswell Intel EPT, where there is
 915          * no EPT Access Bit to clear so that we have to tear down EPT
 916          * tables instead. If we find this unacceptable, we can always
 917          * add a parameter to kvm_age_hva so that it effectively doesn't
 918          * do anything on clear_young.
 919          *
 920          * Also note that currently we never issue secondary TLB flushes
 921          * from clear_young, leaving this job up to the regular system
 922          * cadence. If we find this inaccurate, we might come up with a
 923          * more sophisticated heuristic later.
 924          */
 925         return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
 926 }
 927
 928 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
 929                                        struct mm_struct *mm,
 930                                        unsigned long address)
 931 {
 932         trace_kvm_test_age_hva(address);
 933
 934         return kvm_handle_hva_range_no_flush(mn, address, address + 1,
 935                                              kvm_test_age_gfn);
 936 }
 937
 938 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
 939                                      struct mm_struct *mm)
 940 {
 941         struct kvm *kvm = mmu_notifier_to_kvm(mn);
 942         int idx;
 943
 944         idx = srcu_read_lock(&kvm->srcu);
 945         kvm_flush_shadow_all(kvm);
 946         srcu_read_unlock(&kvm->srcu, idx);
 947 }
 948
 949 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 950         .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
 951         .invalidate_range_end   = kvm_mmu_notifier_invalidate_range_end,
 952         .clear_flush_young      = kvm_mmu_notifier_clear_flush_young,
 953         .clear_young            = kvm_mmu_notifier_clear_young,
 954         .test_young             = kvm_mmu_notifier_test_young,
 955         .change_pte             = kvm_mmu_notifier_change_pte,
 956         .release                = kvm_mmu_notifier_release,
 957 };
 958
 959 static int kvm_init_mmu_notifier(struct kvm *kvm)
 960 {
 961         kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
 962         return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
 963 }
 964
 965 #else  /* !CONFIG_KVM_GENERIC_MMU_NOTIFIER */
 966
 967 static int kvm_init_mmu_notifier(struct kvm *kvm)
 968 {
 969         return 0;
 970 }
 971
 972 #endif /* CONFIG_KVM_GENERIC_MMU_NOTIFIER */
 973
 974 #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
 975 static int kvm_pm_notifier_call(struct notifier_block *bl,
 976                                 unsigned long state,
 977                                 void *unused)
 978 {
 979         struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);
 980
 981         return kvm_arch_pm_notifier(kvm, state);
 982 }
 983
 984 static void kvm_init_pm_notifier(struct kvm *kvm)
 985 {
 986         kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
 987         /* Suspend KVM before we suspend ftrace, RCU, etc. */
 988         kvm->pm_notifier.priority = INT_MAX;
 989         register_pm_notifier(&kvm->pm_notifier);
 990 }
 991
 992 static void kvm_destroy_pm_notifier(struct kvm *kvm)
 993 {
 994         unregister_pm_notifier(&kvm->pm_notifier);
 995 }
 996 #else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
 997 static void kvm_init_pm_notifier(struct kvm *kvm)
 998 {
 999 }
1000
/* Stub used when CONFIG_HAVE_KVM_PM_NOTIFIER is not set. */
static void kvm_destroy_pm_notifier(struct kvm *kvm)
{
	/* Nothing was registered, so nothing to unregister. */
}
1004 #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
1005
1006 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
1007 {
1008         if (!memslot->dirty_bitmap)
1009                 return;
1010
1011         kvfree(memslot->dirty_bitmap);
1012         memslot->dirty_bitmap = NULL;
1013 }
1014
/*
 * Release everything owned by @slot (dirty bitmap, arch state) and then the
 * slot itself.  The slot is NOT unlinked from any struct kvm_memslots data
 * structure here.
 */
static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	kvm_destroy_dirty_bitmap(slot);
	kvm_arch_free_memslot(kvm, slot);

	kfree(slot);
}
1024
1025 static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
1026 {
1027         struct hlist_node *idnode;
1028         struct kvm_memory_slot *memslot;
1029         int bkt;
1030
1031         /*
1032          * The same memslot objects live in both active and inactive sets,
1033          * arbitrarily free using index '1' so the second invocation of this
1034          * function isn't operating over a structure with dangling pointers
1035          * (even though this function isn't actually touching them).
1036          */
1037         if (!slots->node_idx)
1038                 return;
1039
1040         hash_for_each_safe(slots->id_hash, bkt, idnode, memslot, id_node[1])
1041                 kvm_free_memslot(kvm, memslot);
1042 }
1043
1044 static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
1045 {
1046         switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
1047         case KVM_STATS_TYPE_INSTANT:
1048                 return 0444;
1049         case KVM_STATS_TYPE_CUMULATIVE:
1050         case KVM_STATS_TYPE_PEAK:
1051         default:
1052                 return 0644;
1053         }
1054 }
1055
1056
1057 static void kvm_destroy_vm_debugfs(struct kvm *kvm)
1058 {
1059         int i;
1060         int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
1061                                       kvm_vcpu_stats_header.num_desc;
1062
1063         if (IS_ERR(kvm->debugfs_dentry))
1064                 return;
1065
1066         debugfs_remove_recursive(kvm->debugfs_dentry);
1067
1068         if (kvm->debugfs_stat_data) {
1069                 for (i = 0; i < kvm_debugfs_num_entries; i++)
1070                         kfree(kvm->debugfs_stat_data[i]);
1071                 kfree(kvm->debugfs_stat_data);
1072         }
1073 }
1074
1075 static int kvm_create_vm_debugfs(struct kvm *kvm, const char *fdname)
1076 {
1077         static DEFINE_MUTEX(kvm_debugfs_lock);
1078         struct dentry *dent;
1079         char dir_name[ITOA_MAX_LEN * 2];
1080         struct kvm_stat_data *stat_data;
1081         const struct _kvm_stats_desc *pdesc;
1082         int i, ret = -ENOMEM;
1083         int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
1084                                       kvm_vcpu_stats_header.num_desc;
1085
1086         if (!debugfs_initialized())
1087                 return 0;
1088
1089         snprintf(dir_name, sizeof(dir_name), "%d-%s", task_pid_nr(current), fdname);
1090         mutex_lock(&kvm_debugfs_lock);
1091         dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
1092         if (dent) {
1093                 pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
1094                 dput(dent);
1095                 mutex_unlock(&kvm_debugfs_lock);
1096                 return 0;
1097         }
1098         dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
1099         mutex_unlock(&kvm_debugfs_lock);
1100         if (IS_ERR(dent))
1101                 return 0;
1102
1103         kvm->debugfs_dentry = dent;
1104         kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
1105                                          sizeof(*kvm->debugfs_stat_data),
1106                                          GFP_KERNEL_ACCOUNT);
1107         if (!kvm->debugfs_stat_data)
1108                 goto out_err;
1109
1110         for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
1111                 pdesc = &kvm_vm_stats_desc[i];
1112                 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
1113                 if (!stat_data)
1114                         goto out_err;
1115
1116                 stat_data->kvm = kvm;
1117                 stat_data->desc = pdesc;
1118                 stat_data->kind = KVM_STAT_VM;
1119                 kvm->debugfs_stat_data[i] = stat_data;
1120                 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
1121                                     kvm->debugfs_dentry, stat_data,
1122                                     &stat_fops_per_vm);
1123         }
1124
1125         for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
1126                 pdesc = &kvm_vcpu_stats_desc[i];
1127                 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
1128                 if (!stat_data)
1129                         goto out_err;
1130
1131                 stat_data->kvm = kvm;
1132                 stat_data->desc = pdesc;
1133                 stat_data->kind = KVM_STAT_VCPU;
1134                 kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
1135                 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
1136                                     kvm->debugfs_dentry, stat_data,
1137                                     &stat_fops_per_vm);
1138         }
1139
1140         ret = kvm_arch_create_vm_debugfs(kvm);
1141         if (ret)
1142                 goto out_err;
1143
1144         return 0;
1145 out_err:
1146         kvm_destroy_vm_debugfs(kvm);
1147         return ret;
1148 }
1149
1150 /*
1151  * Called after the VM is otherwise initialized, but just before adding it to
1152  * the vm_list.
1153  */
1154 int __weak kvm_arch_post_init_vm(struct kvm *kvm)
1155 {
1156         return 0;
1157 }
1158
1159 /*
1160  * Called just after removing the VM from the vm_list, but before doing any
1161  * other destruction.
1162  */
1163 void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
1164 {
1165 }
1166
1167 /*
1168  * Called after per-vm debugfs created.  When called kvm->debugfs_dentry should
1169  * be setup already, so we can create arch-specific debugfs entries under it.
1170  * Cleanup should be automatic done in kvm_destroy_vm_debugfs() recursively, so
1171  * a per-arch destroy interface is not needed.
1172  */
1173 int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
1174 {
1175         return 0;
1176 }
1177
1178 static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
1179 {
1180         struct kvm *kvm = kvm_arch_alloc_vm();
1181         struct kvm_memslots *slots;
1182         int r = -ENOMEM;
1183         int i, j;
1184
1185         if (!kvm)
1186                 return ERR_PTR(-ENOMEM);
1187
1188         /* KVM is pinned via open("/dev/kvm"), the fd passed to this ioctl(). */
1189         __module_get(kvm_chardev_ops.owner);
1190
1191         KVM_MMU_LOCK_INIT(kvm);
1192         mmgrab(current->mm);
1193         kvm->mm = current->mm;
1194         kvm_eventfd_init(kvm);
1195         mutex_init(&kvm->lock);
1196         mutex_init(&kvm->irq_lock);
1197         mutex_init(&kvm->slots_lock);
1198         mutex_init(&kvm->slots_arch_lock);
1199         spin_lock_init(&kvm->mn_invalidate_lock);
1200         rcuwait_init(&kvm->mn_memslots_update_rcuwait);
1201         xa_init(&kvm->vcpu_array);
1202
1203         INIT_LIST_HEAD(&kvm->gpc_list);
1204         spin_lock_init(&kvm->gpc_lock);
1205
1206         INIT_LIST_HEAD(&kvm->devices);
1207         kvm->max_vcpus = KVM_MAX_VCPUS;
1208
1209         BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
1210
1211         /*
1212          * Force subsequent debugfs file creations to fail if the VM directory
1213          * is not created (by kvm_create_vm_debugfs()).
1214          */
1215         kvm->debugfs_dentry = ERR_PTR(-ENOENT);
1216
1217         snprintf(kvm->stats_id, sizeof(kvm->stats_id), "kvm-%d",
1218                  task_pid_nr(current));
1219
1220         if (init_srcu_struct(&kvm->srcu))
1221                 goto out_err_no_srcu;
1222         if (init_srcu_struct(&kvm->irq_srcu))
1223                 goto out_err_no_irq_srcu;
1224
1225         refcount_set(&kvm->users_count, 1);
1226         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1227                 for (j = 0; j < 2; j++) {
1228                         slots = &kvm->__memslots[i][j];
1229
1230                         atomic_long_set(&slots->last_used_slot, (unsigned long)NULL);
1231                         slots->hva_tree = RB_ROOT_CACHED;
1232                         slots->gfn_tree = RB_ROOT;
1233                         hash_init(slots->id_hash);
1234                         slots->node_idx = j;
1235
1236                         /* Generations must be different for each address space. */
1237                         slots->generation = i;
1238                 }
1239
1240                 rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]);
1241         }
1242
1243         for (i = 0; i < KVM_NR_BUSES; i++) {
1244                 rcu_assign_pointer(kvm->buses[i],
1245                         kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
1246                 if (!kvm->buses[i])
1247                         goto out_err_no_arch_destroy_vm;
1248         }
1249
1250         r = kvm_arch_init_vm(kvm, type);
1251         if (r)
1252                 goto out_err_no_arch_destroy_vm;
1253
1254         r = hardware_enable_all();
1255         if (r)
1256                 goto out_err_no_disable;
1257
1258 #ifdef CONFIG_HAVE_KVM_IRQFD
1259         INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
1260 #endif
1261
1262         r = kvm_init_mmu_notifier(kvm);
1263         if (r)
1264                 goto out_err_no_mmu_notifier;
1265
1266         r = kvm_coalesced_mmio_init(kvm);
1267         if (r < 0)
1268                 goto out_no_coalesced_mmio;
1269
1270         r = kvm_create_vm_debugfs(kvm, fdname);
1271         if (r)
1272                 goto out_err_no_debugfs;
1273
1274         r = kvm_arch_post_init_vm(kvm);
1275         if (r)
1276                 goto out_err;
1277
1278         mutex_lock(&kvm_lock);
1279         list_add(&kvm->vm_list, &vm_list);
1280         mutex_unlock(&kvm_lock);
1281
1282         preempt_notifier_inc();
1283         kvm_init_pm_notifier(kvm);
1284
1285         return kvm;
1286
1287 out_err:
1288         kvm_destroy_vm_debugfs(kvm);
1289 out_err_no_debugfs:
1290         kvm_coalesced_mmio_free(kvm);
1291 out_no_coalesced_mmio:
1292 #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
1293         if (kvm->mmu_notifier.ops)
1294                 mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
1295 #endif
1296 out_err_no_mmu_notifier:
1297         hardware_disable_all();
1298 out_err_no_disable:
1299         kvm_arch_destroy_vm(kvm);
1300 out_err_no_arch_destroy_vm:
1301         WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
1302         for (i = 0; i < KVM_NR_BUSES; i++)
1303                 kfree(kvm_get_bus(kvm, i));
1304         cleanup_srcu_struct(&kvm->irq_srcu);
1305 out_err_no_irq_srcu:
1306         cleanup_srcu_struct(&kvm->srcu);
1307 out_err_no_srcu:
1308         kvm_arch_free_vm(kvm);
1309         mmdrop(current->mm);
1310         module_put(kvm_chardev_ops.owner);
1311         return ERR_PTR(r);
1312 }
1313
1314 static void kvm_destroy_devices(struct kvm *kvm)
1315 {
1316         struct kvm_device *dev, *tmp;
1317
1318         /*
1319          * We do not need to take the kvm->lock here, because nobody else
1320          * has a reference to the struct kvm at this point and therefore
1321          * cannot access the devices list anyhow.
1322          */
1323         list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
1324                 list_del(&dev->vm_node);
1325                 dev->ops->destroy(dev);
1326         }
1327 }
1328
1329 static void kvm_destroy_vm(struct kvm *kvm)
1330 {
1331         int i;
1332         struct mm_struct *mm = kvm->mm;
1333
1334         kvm_destroy_pm_notifier(kvm);
1335         kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
1336         kvm_destroy_vm_debugfs(kvm);
1337         kvm_arch_sync_events(kvm);
1338         mutex_lock(&kvm_lock);
1339         list_del(&kvm->vm_list);
1340         mutex_unlock(&kvm_lock);
1341         kvm_arch_pre_destroy_vm(kvm);
1342
1343         kvm_free_irq_routing(kvm);
1344         for (i = 0; i < KVM_NR_BUSES; i++) {
1345                 struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
1346
1347                 if (bus)
1348                         kvm_io_bus_destroy(bus);
1349                 kvm->buses[i] = NULL;
1350         }
1351         kvm_coalesced_mmio_free(kvm);
1352 #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
1353         mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
1354         /*
1355          * At this point, pending calls to invalidate_range_start()
1356          * have completed but no more MMU notifiers will run, so
1357          * mn_active_invalidate_count may remain unbalanced.
1358          * No threads can be waiting in kvm_swap_active_memslots() as the
1359          * last reference on KVM has been dropped, but freeing
1360          * memslots would deadlock without this manual intervention.
1361          *
1362          * If the count isn't unbalanced, i.e. KVM did NOT unregister its MMU
1363          * notifier between a start() and end(), then there shouldn't be any
1364          * in-progress invalidations.
1365          */
1366         WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
1367         if (kvm->mn_active_invalidate_count)
1368                 kvm->mn_active_invalidate_count = 0;
1369         else
1370                 WARN_ON(kvm->mmu_invalidate_in_progress);
1371 #else
1372         kvm_flush_shadow_all(kvm);
1373 #endif
1374         kvm_arch_destroy_vm(kvm);
1375         kvm_destroy_devices(kvm);
1376         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1377                 kvm_free_memslots(kvm, &kvm->__memslots[i][0]);
1378                 kvm_free_memslots(kvm, &kvm->__memslots[i][1]);
1379         }
1380         cleanup_srcu_struct(&kvm->irq_srcu);
1381         cleanup_srcu_struct(&kvm->srcu);
1382         kvm_arch_free_vm(kvm);
1383         preempt_notifier_dec();
1384         hardware_disable_all();
1385         mmdrop(mm);
1386         module_put(kvm_chardev_ops.owner);
1387 }
1388
1389 void kvm_get_kvm(struct kvm *kvm)
1390 {
1391         refcount_inc(&kvm->users_count);
1392 }
1393 EXPORT_SYMBOL_GPL(kvm_get_kvm);
1394
1395 /*
1396  * Make sure the vm is not during destruction, which is a safe version of
1397  * kvm_get_kvm().  Return true if kvm referenced successfully, false otherwise.
1398  */
1399 bool kvm_get_kvm_safe(struct kvm *kvm)
1400 {
1401         return refcount_inc_not_zero(&kvm->users_count);
1402 }
1403 EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);
1404
1405 void kvm_put_kvm(struct kvm *kvm)
1406 {
1407         if (refcount_dec_and_test(&kvm->users_count))
1408                 kvm_destroy_vm(kvm);
1409 }
1410 EXPORT_SYMBOL_GPL(kvm_put_kvm);
1411
1412 /*
1413  * Used to put a reference that was taken on behalf of an object associated
1414  * with a user-visible file descriptor, e.g. a vcpu or device, if installation
1415  * of the new file descriptor fails and the reference cannot be transferred to
1416  * its final owner.  In such cases, the caller is still actively using @kvm and
1417  * will fail miserably if the refcount unexpectedly hits zero.
1418  */
1419 void kvm_put_kvm_no_destroy(struct kvm *kvm)
1420 {
1421         WARN_ON(refcount_dec_and_test(&kvm->users_count));
1422 }
1423 EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);
1424
1425 static int kvm_vm_release(struct inode *inode, struct file *filp)
1426 {
1427         struct kvm *kvm = filp->private_data;
1428
1429         kvm_irqfd_release(kvm);
1430
1431         kvm_put_kvm(kvm);
1432         return 0;
1433 }
1434
1435 /*
1436  * Allocation size is twice as large as the actual dirty bitmap size.
1437  * See kvm_vm_ioctl_get_dirty_log() why this is needed.
1438  */
1439 static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
1440 {
1441         unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot);
1442
1443         memslot->dirty_bitmap = __vcalloc(2, dirty_bytes, GFP_KERNEL_ACCOUNT);
1444         if (!memslot->dirty_bitmap)
1445                 return -ENOMEM;
1446
1447         return 0;
1448 }
1449
1450 static struct kvm_memslots *kvm_get_inactive_memslots(struct kvm *kvm, int as_id)
1451 {
1452         struct kvm_memslots *active = __kvm_memslots(kvm, as_id);
1453         int node_idx_inactive = active->node_idx ^ 1;
1454
1455         return &kvm->__memslots[as_id][node_idx_inactive];
1456 }
1457
1458 /*
1459  * Helper to get the address space ID when one of memslot pointers may be NULL.
1460  * This also serves as a sanity that at least one of the pointers is non-NULL,
1461  * and that their address space IDs don't diverge.
1462  */
1463 static int kvm_memslots_get_as_id(struct kvm_memory_slot *a,
1464                                   struct kvm_memory_slot *b)
1465 {
1466         if (WARN_ON_ONCE(!a && !b))
1467                 return 0;
1468
1469         if (!a)
1470                 return b->as_id;
1471         if (!b)
1472                 return a->as_id;
1473
1474         WARN_ON_ONCE(a->as_id != b->as_id);
1475         return a->as_id;
1476 }
1477
1478 static void kvm_insert_gfn_node(struct kvm_memslots *slots,
1479                                 struct kvm_memory_slot *slot)
1480 {
1481         struct rb_root *gfn_tree = &slots->gfn_tree;
1482         struct rb_node **node, *parent;
1483         int idx = slots->node_idx;
1484
1485         parent = NULL;
1486         for (node = &gfn_tree->rb_node; *node; ) {
1487                 struct kvm_memory_slot *tmp;
1488
1489                 tmp = container_of(*node, struct kvm_memory_slot, gfn_node[idx]);
1490                 parent = *node;
1491                 if (slot->base_gfn < tmp->base_gfn)
1492                         node = &(*node)->rb_left;
1493                 else if (slot->base_gfn > tmp->base_gfn)
1494                         node = &(*node)->rb_right;
1495                 else
1496                         BUG();
1497         }
1498
1499         rb_link_node(&slot->gfn_node[idx], parent, node);
1500         rb_insert_color(&slot->gfn_node[idx], gfn_tree);
1501 }
1502
1503 static void kvm_erase_gfn_node(struct kvm_memslots *slots,
1504                                struct kvm_memory_slot *slot)
1505 {
1506         rb_erase(&slot->gfn_node[slots->node_idx], &slots->gfn_tree);
1507 }
1508
1509 static void kvm_replace_gfn_node(struct kvm_memslots *slots,
1510                                  struct kvm_memory_slot *old,
1511                                  struct kvm_memory_slot *new)
1512 {
1513         int idx = slots->node_idx;
1514
1515         WARN_ON_ONCE(old->base_gfn != new->base_gfn);
1516
1517         rb_replace_node(&old->gfn_node[idx], &new->gfn_node[idx],
1518                         &slots->gfn_tree);
1519 }
1520
1521 /*
1522  * Replace @old with @new in the inactive memslots.
1523  *
1524  * With NULL @old this simply adds @new.
1525  * With NULL @new this simply removes @old.
1526  *
1527  * If @new is non-NULL its hva_node[slots_idx] range has to be set
1528  * appropriately.
1529  */
1530 static void kvm_replace_memslot(struct kvm *kvm,
1531                                 struct kvm_memory_slot *old,
1532                                 struct kvm_memory_slot *new)
1533 {
1534         int as_id = kvm_memslots_get_as_id(old, new);
1535         struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
1536         int idx = slots->node_idx;
1537
1538         if (old) {
1539                 hash_del(&old->id_node[idx]);
1540                 interval_tree_remove(&old->hva_node[idx], &slots->hva_tree);
1541
1542                 if ((long)old == atomic_long_read(&slots->last_used_slot))
1543                         atomic_long_set(&slots->last_used_slot, (long)new);
1544
1545                 if (!new) {
1546                         kvm_erase_gfn_node(slots, old);
1547                         return;
1548                 }
1549         }
1550
1551         /*
1552          * Initialize @new's hva range.  Do this even when replacing an @old
1553          * slot, kvm_copy_memslot() deliberately does not touch node data.
1554          */
1555         new->hva_node[idx].start = new->userspace_addr;
1556         new->hva_node[idx].last = new->userspace_addr +
1557                                   (new->npages << PAGE_SHIFT) - 1;
1558
1559         /*
1560          * (Re)Add the new memslot.  There is no O(1) interval_tree_replace(),
1561          * hva_node needs to be swapped with remove+insert even though hva can't
1562          * change when replacing an existing slot.
1563          */
1564         hash_add(slots->id_hash, &new->id_node[idx], new->id);
1565         interval_tree_insert(&new->hva_node[idx], &slots->hva_tree);
1566
1567         /*
1568          * If the memslot gfn is unchanged, rb_replace_node() can be used to
1569          * switch the node in the gfn tree instead of removing the old and
1570          * inserting the new as two separate operations. Replacement is a
1571          * single O(1) operation versus two O(log(n)) operations for
1572          * remove+insert.
1573          */
1574         if (old && old->base_gfn == new->base_gfn) {
1575                 kvm_replace_gfn_node(slots, old, new);
1576         } else {
1577                 if (old)
1578                         kvm_erase_gfn_node(slots, old);
1579                 kvm_insert_gfn_node(slots, new);
1580         }
1581 }
1582
/*
 * The memory region flags that do not touch any of the extra space of struct
 * kvm_userspace_memory_region2.  Only these flags are allowed under
 * KVM_SET_USER_MEMORY_REGION_V1_FLAGS.
 */
#define KVM_SET_USER_MEMORY_REGION_V1_FLAGS				\
	(KVM_MEM_LOG_DIRTY_PAGES |					\
	 KVM_MEM_READONLY)
1590
1591 static int check_memory_region_flags(const struct kvm_userspace_memory_region2 *mem)
1592 {
1593         u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
1594
1595 #ifdef __KVM_HAVE_READONLY_MEM
1596         valid_flags |= KVM_MEM_READONLY;
1597 #endif
1598
1599         if (mem->flags & ~valid_flags)
1600                 return -EINVAL;
1601
1602         return 0;
1603 }
1604
1605 static void kvm_swap_active_memslots(struct kvm *kvm, int as_id)
1606 {
1607         struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
1608
1609         /* Grab the generation from the activate memslots. */
1610         u64 gen = __kvm_memslots(kvm, as_id)->generation;
1611
1612         WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
1613         slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1614
1615         /*
1616          * Do not store the new memslots while there are invalidations in
1617          * progress, otherwise the locking in invalidate_range_start and
1618          * invalidate_range_end will be unbalanced.
1619          */
1620         spin_lock(&kvm->mn_invalidate_lock);
1621         prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
1622         while (kvm->mn_active_invalidate_count) {
1623                 set_current_state(TASK_UNINTERRUPTIBLE);
1624                 spin_unlock(&kvm->mn_invalidate_lock);
1625                 schedule();
1626                 spin_lock(&kvm->mn_invalidate_lock);
1627         }
1628         finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
1629         rcu_assign_pointer(kvm->memslots[as_id], slots);
1630         spin_unlock(&kvm->mn_invalidate_lock);
1631
1632         /*
1633          * Acquired in kvm_set_memslot. Must be released before synchronize
1634          * SRCU below in order to avoid deadlock with another thread
1635          * acquiring the slots_arch_lock in an srcu critical section.
1636          */
1637         mutex_unlock(&kvm->slots_arch_lock);
1638
1639         synchronize_srcu_expedited(&kvm->srcu);
1640
1641         /*
1642          * Increment the new memslot generation a second time, dropping the
1643          * update in-progress flag and incrementing the generation based on
1644          * the number of address spaces.  This provides a unique and easily
1645          * identifiable generation number while the memslots are in flux.
1646          */
1647         gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1648
1649         /*
1650          * Generations must be unique even across address spaces.  We do not need
1651          * a global counter for that, instead the generation space is evenly split
1652          * across address spaces.  For example, with two address spaces, address
1653          * space 0 will use generations 0, 2, 4, ... while address space 1 will
1654          * use generations 1, 3, 5, ...
1655          */
1656         gen += KVM_ADDRESS_SPACE_NUM;
1657
1658         kvm_arch_memslots_updated(kvm, gen);
1659
1660         slots->generation = gen;
1661 }
1662
1663 static int kvm_prepare_memory_region(struct kvm *kvm,
1664                                      const struct kvm_memory_slot *old,
1665                                      struct kvm_memory_slot *new,
1666                                      enum kvm_mr_change change)
1667 {
1668         int r;
1669
1670         /*
1671          * If dirty logging is disabled, nullify the bitmap; the old bitmap
1672          * will be freed on "commit".  If logging is enabled in both old and
1673          * new, reuse the existing bitmap.  If logging is enabled only in the
1674          * new and KVM isn't using a ring buffer, allocate and initialize a
1675          * new bitmap.
1676          */
1677         if (change != KVM_MR_DELETE) {
1678                 if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
1679                         new->dirty_bitmap = NULL;
1680                 else if (old && old->dirty_bitmap)
1681                         new->dirty_bitmap = old->dirty_bitmap;
1682                 else if (kvm_use_dirty_bitmap(kvm)) {
1683                         r = kvm_alloc_dirty_bitmap(new);
1684                         if (r)
1685                                 return r;
1686
1687                         if (kvm_dirty_log_manual_protect_and_init_set(kvm))
1688                                 bitmap_set(new->dirty_bitmap, 0, new->npages);
1689                 }
1690         }
1691
1692         r = kvm_arch_prepare_memory_region(kvm, old, new, change);
1693
1694         /* Free the bitmap on failure if it was allocated above. */
1695         if (r && new && new->dirty_bitmap && (!old || !old->dirty_bitmap))
1696                 kvm_destroy_dirty_bitmap(new);
1697
1698         return r;
1699 }
1700
1701 static void kvm_commit_memory_region(struct kvm *kvm,
1702                                      struct kvm_memory_slot *old,
1703                                      const struct kvm_memory_slot *new,
1704                                      enum kvm_mr_change change)
1705 {
1706         int old_flags = old ? old->flags : 0;
1707         int new_flags = new ? new->flags : 0;
1708         /*
1709          * Update the total number of memslot pages before calling the arch
1710          * hook so that architectures can consume the result directly.
1711          */
1712         if (change == KVM_MR_DELETE)
1713                 kvm->nr_memslot_pages -= old->npages;
1714         else if (change == KVM_MR_CREATE)
1715                 kvm->nr_memslot_pages += new->npages;
1716
1717         if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) {
1718                 int change = (new_flags & KVM_MEM_LOG_DIRTY_PAGES) ? 1 : -1;
1719                 atomic_set(&kvm->nr_memslots_dirty_logging,
1720                            atomic_read(&kvm->nr_memslots_dirty_logging) + change);
1721         }
1722
1723         kvm_arch_commit_memory_region(kvm, old, new, change);
1724
1725         switch (change) {
1726         case KVM_MR_CREATE:
1727                 /* Nothing more to do. */
1728                 break;
1729         case KVM_MR_DELETE:
1730                 /* Free the old memslot and all its metadata. */
1731                 kvm_free_memslot(kvm, old);
1732                 break;
1733         case KVM_MR_MOVE:
1734         case KVM_MR_FLAGS_ONLY:
1735                 /*
1736                  * Free the dirty bitmap as needed; the below check encompasses
1737                  * both the flags and whether a ring buffer is being used)
1738                  */
1739                 if (old->dirty_bitmap && !new->dirty_bitmap)
1740                         kvm_destroy_dirty_bitmap(old);
1741
1742                 /*
1743                  * The final quirk.  Free the detached, old slot, but only its
1744                  * memory, not any metadata.  Metadata, including arch specific
1745                  * data, may be reused by @new.
1746                  */
1747                 kfree(old);
1748                 break;
1749         default:
1750                 BUG();
1751         }
1752 }
1753
/*
 * Make @new live.  The caller must already have installed @new in the
 * inactive memslot set; this swaps the active and inactive sets and then
 * applies the same @old => @new replacement to the set that just became
 * inactive, which is unreachable at that point and thus safe to modify.
 *
 * A NULL @old means @new is simply added; a NULL @new means @old is simply
 * removed.  The sets are swapped in either case.
 */
static void kvm_activate_memslot(struct kvm *kvm,
                                 struct kvm_memory_slot *old,
                                 struct kvm_memory_slot *new)
{
        kvm_swap_active_memslots(kvm, kvm_memslots_get_as_id(old, new));

        /* Mirror the change into the set that is no longer active. */
        kvm_replace_memslot(kvm, old, new);
}
1774
1775 static void kvm_copy_memslot(struct kvm_memory_slot *dest,
1776                              const struct kvm_memory_slot *src)
1777 {
1778         dest->base_gfn = src->base_gfn;
1779         dest->npages = src->npages;
1780         dest->dirty_bitmap = src->dirty_bitmap;
1781         dest->arch = src->arch;
1782         dest->userspace_addr = src->userspace_addr;
1783         dest->flags = src->flags;
1784         dest->id = src->id;
1785         dest->as_id = src->as_id;
1786 }
1787
/*
 * Make an INVALID copy of @old the active memslot.
 *
 * @invalid_slot is caller-provided storage.  It is filled in from @old,
 * flagged KVM_MEMSLOT_INVALID and installed in place of @old; @old itself is
 * not modified, apart from its arch data being refreshed at the very end.
 *
 * kvm->slots_arch_lock is dropped by kvm_swap_active_memslots() and is
 * re-acquired here before returning.
 */
static void kvm_invalidate_memslot(struct kvm *kvm,
                                   struct kvm_memory_slot *old,
                                   struct kvm_memory_slot *invalid_slot)
{
        /*
         * Mark the current slot INVALID.  As with all memslot modifications,
         * this must be done on an unreachable slot to avoid modifying the
         * current slot in the active tree.
         */
        kvm_copy_memslot(invalid_slot, old);
        invalid_slot->flags |= KVM_MEMSLOT_INVALID;
        kvm_replace_memslot(kvm, old, invalid_slot);

        /*
         * Activate the slot that is now marked INVALID, but don't propagate
         * the slot to the now inactive slots. The slot is either going to be
         * deleted or recreated as a new slot.
         */
        kvm_swap_active_memslots(kvm, old->as_id);

        /*
         * From this point no new shadow pages pointing to a deleted, or moved,
         * memslot will be created.  Validation of sp->gfn happens in:
         *      - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
         *      - kvm_is_visible_gfn (mmu_check_root)
         */
        kvm_arch_flush_shadow_memslot(kvm, old);
        kvm_arch_guest_memory_reclaimed(kvm);

        /* Was released by kvm_swap_active_memslots(), reacquire. */
        mutex_lock(&kvm->slots_arch_lock);

        /*
         * Copy the arch-specific field of the newly-installed slot back to the
         * old slot as the arch data could have changed between releasing
         * slots_arch_lock in kvm_swap_active_memslots() and re-acquiring the lock
         * above.  Writers are required to retrieve memslots *after* acquiring
         * slots_arch_lock, thus the active slot's data is guaranteed to be fresh.
         */
        old->arch = invalid_slot->arch;
}
1829
1830 static void kvm_create_memslot(struct kvm *kvm,
1831                                struct kvm_memory_slot *new)
1832 {
1833         /* Add the new memslot to the inactive set and activate. */
1834         kvm_replace_memslot(kvm, NULL, new);
1835         kvm_activate_memslot(kvm, NULL, new);
1836 }
1837
1838 static void kvm_delete_memslot(struct kvm *kvm,
1839                                struct kvm_memory_slot *old,
1840                                struct kvm_memory_slot *invalid_slot)
1841 {
1842         /*
1843          * Remove the old memslot (in the inactive memslots) by passing NULL as
1844          * the "new" slot, and for the invalid version in the active slots.
1845          */
1846         kvm_replace_memslot(kvm, old, NULL);
1847         kvm_activate_memslot(kvm, invalid_slot, NULL);
1848 }
1849
/*
 * MOVE: @new takes the place of @old in the inactive memslot set, then the
 * sets are swapped and @new also takes the place of @invalid_slot, the
 * current INVALID stand-in for @old.
 */
static void kvm_move_memslot(struct kvm *kvm,
                             struct kvm_memory_slot *old,
                             struct kvm_memory_slot *new,
                             struct kvm_memory_slot *invalid_slot)
{
        kvm_replace_memslot(kvm, old, new);
        kvm_activate_memslot(kvm, invalid_slot, new);
}
1862
/*
 * FLAGS_ONLY: like MOVE, except that the slot doesn't have to be zapped as an
 * intermediate step, so no INVALID stand-in is involved.  @old is simply
 * replaced by the updated copy @new in both memslot sets.
 */
static void kvm_update_flags_memslot(struct kvm *kvm,
                                     struct kvm_memory_slot *old,
                                     struct kvm_memory_slot *new)
{
        kvm_replace_memslot(kvm, old, new);
        kvm_activate_memslot(kvm, old, new);
}
1875
/*
 * Apply one memslot change of type @change, replacing @old with @new.
 * @new is NULL for KVM_MR_DELETE.
 *
 * kvm->slots_arch_lock is taken here and is released again on every return
 * path, either directly or as a side effect of kvm_swap_active_memslots().
 *
 * Returns 0 on success, -ENOMEM if the temporary INVALID slot cannot be
 * allocated, or the error from kvm_prepare_memory_region().  On success the
 * old slot is released by kvm_commit_memory_region() for DELETE, MOVE and
 * FLAGS_ONLY; on failure @old remains the installed memslot and @new is left
 * for the caller to free.
 */
static int kvm_set_memslot(struct kvm *kvm,
                           struct kvm_memory_slot *old,
                           struct kvm_memory_slot *new,
                           enum kvm_mr_change change)
{
        /* Only allocated, used and freed for DELETE and MOVE. */
        struct kvm_memory_slot *invalid_slot;
        int r;

        /*
         * Released in kvm_swap_active_memslots().
         *
         * Must be held from before the current memslots are copied until after
         * the new memslots are installed with rcu_assign_pointer, then
         * released before the synchronize srcu in kvm_swap_active_memslots().
         *
         * When modifying memslots outside of the slots_lock, must be held
         * before reading the pointer to the current memslots until after all
         * changes to those memslots are complete.
         *
         * These rules ensure that installing new memslots does not lose
         * changes made to the previous memslots.
         */
        mutex_lock(&kvm->slots_arch_lock);

        /*
         * Invalidate the old slot if it's being deleted or moved.  This is
         * done prior to actually deleting/moving the memslot to allow vCPUs to
         * continue running by ensuring there are no mappings or shadow pages
         * for the memslot when it is deleted/moved.  Without pre-invalidation
         * (and without a lock), a window would exist between effecting the
         * delete/move and committing the changes in arch code where KVM or a
         * guest could access a non-existent memslot.
         *
         * Modifications are done on a temporary, unreachable slot.  The old
         * slot needs to be preserved in case a later step fails and the
         * invalidation needs to be reverted.
         */
        if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
                invalid_slot = kzalloc(sizeof(*invalid_slot), GFP_KERNEL_ACCOUNT);
                if (!invalid_slot) {
                        mutex_unlock(&kvm->slots_arch_lock);
                        return -ENOMEM;
                }
                kvm_invalidate_memslot(kvm, old, invalid_slot);
        }

        r = kvm_prepare_memory_region(kvm, old, new, change);
        if (r) {
                /*
                 * For DELETE/MOVE, revert the above INVALID change.  No
                 * modifications required since the original slot was preserved
                 * in the inactive slots.  Changing the active memslots also
                 * releases slots_arch_lock.
                 */
                if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
                        kvm_activate_memslot(kvm, invalid_slot, old);
                        kfree(invalid_slot);
                } else {
                        mutex_unlock(&kvm->slots_arch_lock);
                }
                return r;
        }

        /*
         * Install the change.  For DELETE and MOVE, the temporary INVALID
         * copy of the old slot (@invalid_slot) is currently the active one
         * and is what gets removed or replaced in the active set.  For
         * CREATE, there is no old slot.  For FLAGS_ONLY, @old is replaced
         * by @new directly, with no INVALID intermediate step.
         */
        if (change == KVM_MR_CREATE)
                kvm_create_memslot(kvm, new);
        else if (change == KVM_MR_DELETE)
                kvm_delete_memslot(kvm, old, invalid_slot);
        else if (change == KVM_MR_MOVE)
                kvm_move_memslot(kvm, old, new, invalid_slot);
        else if (change == KVM_MR_FLAGS_ONLY)
                kvm_update_flags_memslot(kvm, old, new);
        else
                BUG();

        /* Free the temporary INVALID slot used for DELETE and MOVE. */
        if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
                kfree(invalid_slot);

        /*
         * No need to refresh new->arch, changes after dropping slots_arch_lock
         * will directly hit the final, active memslot.  Architectures are
         * responsible for knowing that new->arch may be stale.
         */
        kvm_commit_memory_region(kvm, old, new, change);

        return 0;
}
1970
1971 static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
1972                                       gfn_t start, gfn_t end)
1973 {
1974         struct kvm_memslot_iter iter;
1975
1976         kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
1977                 if (iter.slot->id != id)
1978                         return true;
1979         }
1980
1981         return false;
1982 }
1983
1984 /*
1985  * Allocate some memory and give it an address in the guest physical address
1986  * space.
1987  *
1988  * Discontiguous memory is allowed, mostly for framebuffers.
1989  *
1990  * Must be called holding kvm->slots_lock for write.
1991  */
1992 int __kvm_set_memory_region(struct kvm *kvm,
1993                             const struct kvm_userspace_memory_region2 *mem)
1994 {
1995         struct kvm_memory_slot *old, *new;
1996         struct kvm_memslots *slots;
1997         enum kvm_mr_change change;
1998         unsigned long npages;
1999         gfn_t base_gfn;
2000         int as_id, id;
2001         int r;
2002
2003         r = check_memory_region_flags(mem);
2004         if (r)
2005                 return r;
2006
2007         as_id = mem->slot >> 16;
2008         id = (u16)mem->slot;
2009
2010         /* General sanity checks */
2011         if ((mem->memory_size & (PAGE_SIZE - 1)) ||
2012             (mem->memory_size != (unsigned long)mem->memory_size))
2013                 return -EINVAL;
2014         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
2015                 return -EINVAL;
2016         /* We can read the guest memory with __xxx_user() later on. */
2017         if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
2018             (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
2019              !access_ok((void __user *)(unsigned long)mem->userspace_addr,
2020                         mem->memory_size))
2021                 return -EINVAL;
2022         if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
2023                 return -EINVAL;
2024         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
2025                 return -EINVAL;
2026         if ((mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES)
2027                 return -EINVAL;
2028
2029         slots = __kvm_memslots(kvm, as_id);
2030
2031         /*
2032          * Note, the old memslot (and the pointer itself!) may be invalidated
2033          * and/or destroyed by kvm_set_memslot().
2034          */
2035         old = id_to_memslot(slots, id);
2036
2037         if (!mem->memory_size) {
2038                 if (!old || !old->npages)
2039                         return -EINVAL;
2040
2041                 if (WARN_ON_ONCE(kvm->nr_memslot_pages < old->npages))
2042                         return -EIO;
2043
2044                 return kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE);
2045         }
2046
2047         base_gfn = (mem->guest_phys_addr >> PAGE_SHIFT);
2048         npages = (mem->memory_size >> PAGE_SHIFT);
2049
2050         if (!old || !old->npages) {
2051                 change = KVM_MR_CREATE;
2052
2053                 /*
2054                  * To simplify KVM internals, the total number of pages across
2055                  * all memslots must fit in an unsigned long.
2056                  */
2057                 if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
2058                         return -EINVAL;
2059         } else { /* Modify an existing slot. */
2060                 if ((mem->userspace_addr != old->userspace_addr) ||
2061                     (npages != old->npages) ||
2062                     ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
2063                         return -EINVAL;
2064
2065                 if (base_gfn != old->base_gfn)
2066                         change = KVM_MR_MOVE;
2067                 else if (mem->flags != old->flags)
2068                         change = KVM_MR_FLAGS_ONLY;
2069                 else /* Nothing to change. */
2070                         return 0;
2071         }
2072
2073         if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) &&
2074             kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages))
2075                 return -EEXIST;
2076
2077         /* Allocate a slot that will persist in the memslot. */
2078         new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
2079         if (!new)
2080                 return -ENOMEM;
2081
2082         new->as_id = as_id;
2083         new->id = id;
2084         new->base_gfn = base_gfn;
2085         new->npages = npages;
2086         new->flags = mem->flags;
2087         new->userspace_addr = mem->userspace_addr;
2088
2089         r = kvm_set_memslot(kvm, old, new, change);
2090         if (r)
2091                 kfree(new);
2092         return r;
2093 }
2094 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
2095
2096 int kvm_set_memory_region(struct kvm *kvm,
2097                           const struct kvm_userspace_memory_region2 *mem)
2098 {
2099         int r;
2100
2101         mutex_lock(&kvm->slots_lock);
2102         r = __kvm_set_memory_region(kvm, mem);
2103         mutex_unlock(&kvm->slots_lock);
2104         return r;
2105 }
2106 EXPORT_SYMBOL_GPL(kvm_set_memory_region);
2107
2108 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
2109                                           struct kvm_userspace_memory_region2 *mem)
2110 {
2111         if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
2112                 return -EINVAL;
2113
2114         return kvm_set_memory_region(kvm, mem);
2115 }
2116
2117 #ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
2118 /**
2119  * kvm_get_dirty_log - get a snapshot of dirty pages
2120  * @kvm:        pointer to kvm instance
2121  * @log:        slot id and address to which we copy the log
2122  * @is_dirty:   set to '1' if any dirty pages were found
2123  * @memslot:    set to the associated memslot, always valid on success
2124  */
2125 int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
2126                       int *is_dirty, struct kvm_memory_slot **memslot)
2127 {
2128         struct kvm_memslots *slots;
2129         int i, as_id, id;
2130         unsigned long n;
2131         unsigned long any = 0;
2132
2133         /* Dirty ring tracking may be exclusive to dirty log tracking */
2134         if (!kvm_use_dirty_bitmap(kvm))
2135                 return -ENXIO;
2136
2137         *memslot = NULL;
2138         *is_dirty = 0;
2139
2140         as_id = log->slot >> 16;
2141         id = (u16)log->slot;
2142         if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
2143                 return -EINVAL;
2144
2145         slots = __kvm_memslots(kvm, as_id);
2146         *memslot = id_to_memslot(slots, id);
2147         if (!(*memslot) || !(*memslot)->dirty_bitmap)
2148                 return -ENOENT;
2149
2150         kvm_arch_sync_dirty_log(kvm, *memslot);
2151
2152         n = kvm_dirty_bitmap_bytes(*memslot);
2153
2154         for (i = 0; !any && i < n/sizeof(long); ++i)
2155                 any = (*memslot)->dirty_bitmap[i];
2156
2157         if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
2158                 return -EFAULT;
2159
2160         if (any)
2161                 *is_dirty = 1;
2162         return 0;
2163 }
2164 EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
2165
2166 #else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
2167 /**
2168  * kvm_get_dirty_log_protect - get a snapshot of dirty pages
2169  *      and reenable dirty page tracking for the corresponding pages.
2170  * @kvm:        pointer to kvm instance
2171  * @log:        slot id and address to which we copy the log
2172  *
2173  * We need to keep it in mind that VCPU threads can write to the bitmap
2174  * concurrently. So, to avoid losing track of dirty pages we keep the
2175  * following order:
2176  *
2177  *    1. Take a snapshot of the bit and clear it if needed.
2178  *    2. Write protect the corresponding page.
2179  *    3. Copy the snapshot to the userspace.
2180  *    4. Upon return caller flushes TLB's if needed.
2181  *
2182  * Between 2 and 4, the guest may write to the page using the remaining TLB
2183  * entry.  This is not a problem because the page is reported dirty using
2184  * the snapshot taken before and step 4 ensures that writes done after
2185  * exiting to userspace will be logged for the next call.
2186  *
2187  */
2188 static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
2189 {
2190         struct kvm_memslots *slots;
2191         struct kvm_memory_slot *memslot;
2192         int i, as_id, id;
2193         unsigned long n;
2194         unsigned long *dirty_bitmap;
2195         unsigned long *dirty_bitmap_buffer;
2196         bool flush;
2197
2198         /* Dirty ring tracking may be exclusive to dirty log tracking */
2199         if (!kvm_use_dirty_bitmap(kvm))
2200                 return -ENXIO;
2201
2202         as_id = log->slot >> 16;
2203         id = (u16)log->slot;
2204         if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
2205                 return -EINVAL;
2206
2207         slots = __kvm_memslots(kvm, as_id);
2208         memslot = id_to_memslot(slots, id);
2209         if (!memslot || !memslot->dirty_bitmap)
2210                 return -ENOENT;
2211
2212         dirty_bitmap = memslot->dirty_bitmap;
2213
2214         kvm_arch_sync_dirty_log(kvm, memslot);
2215
2216         n = kvm_dirty_bitmap_bytes(memslot);
2217         flush = false;
2218         if (kvm->manual_dirty_log_protect) {
2219                 /*
2220                  * Unlike kvm_get_dirty_log, we always return false in *flush,
2221                  * because no flush is needed until KVM_CLEAR_DIRTY_LOG.  There
2222                  * is some code duplication between this function and
2223                  * kvm_get_dirty_log, but hopefully all architecture
2224                  * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
2225                  * can be eliminated.
2226                  */
2227                 dirty_bitmap_buffer = dirty_bitmap;
2228         } else {
2229                 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2230                 memset(dirty_bitmap_buffer, 0, n);
2231
2232                 KVM_MMU_LOCK(kvm);
2233                 for (i = 0; i < n / sizeof(long); i++) {
2234                         unsigned long mask;
2235                         gfn_t offset;
2236
2237                         if (!dirty_bitmap[i])
2238                                 continue;
2239
2240                         flush = true;
2241                         mask = xchg(&dirty_bitmap[i], 0);
2242                         dirty_bitmap_buffer[i] = mask;
2243
2244                         offset = i * BITS_PER_LONG;
2245                         kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2246                                                                 offset, mask);
2247                 }
2248                 KVM_MMU_UNLOCK(kvm);
2249         }
2250
2251         if (flush)
2252                 kvm_flush_remote_tlbs_memslot(kvm, memslot);
2253
2254         if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
2255                 return -EFAULT;
2256         return 0;
2257 }
2258
2259
2260 /**
2261  * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
2262  * @kvm: kvm instance
2263  * @log: slot id and address to which we copy the log
2264  *
2265  * Steps 1-4 below provide general overview of dirty page logging. See
2266  * kvm_get_dirty_log_protect() function description for additional details.
2267  *
2268  * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
2269  * always flush the TLB (step 4) even if previous step failed  and the dirty
2270  * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
2271  * does not preclude user space subsequent dirty log read. Flushing TLB ensures
2272  * writes will be marked dirty for next log read.
2273  *
2274  *   1. Take a snapshot of the bit and clear it if needed.
2275  *   2. Write protect the corresponding page.
2276  *   3. Copy the snapshot to the userspace.
2277  *   4. Flush TLB's if needed.
2278  */
2279 static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2280                                       struct kvm_dirty_log *log)
2281 {
2282         int r;
2283
2284         mutex_lock(&kvm->slots_lock);
2285
2286         r = kvm_get_dirty_log_protect(kvm, log);
2287
2288         mutex_unlock(&kvm->slots_lock);
2289         return r;
2290 }
2291
2292 /**
2293  * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
2294  *      and reenable dirty page tracking for the corresponding pages.
2295  * @kvm:        pointer to kvm instance
2296  * @log:        slot id and address from which to fetch the bitmap of dirty pages
2297  */
2298 static int kvm_clear_dirty_log_protect(struct kvm *kvm,
2299                                        struct kvm_clear_dirty_log *log)
2300 {
2301         struct kvm_memslots *slots;
2302         struct kvm_memory_slot *memslot;
2303         int as_id, id;
2304         gfn_t offset;
2305         unsigned long i, n;
2306         unsigned long *dirty_bitmap;
2307         unsigned long *dirty_bitmap_buffer;
2308         bool flush;
2309
2310         /* Dirty ring tracking may be exclusive to dirty log tracking */
2311         if (!kvm_use_dirty_bitmap(kvm))
2312                 return -ENXIO;
2313
2314         as_id = log->slot >> 16;
2315         id = (u16)log->slot;
2316         if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
2317                 return -EINVAL;
2318
2319         if (log->first_page & 63)
2320                 return -EINVAL;
2321
2322         slots = __kvm_memslots(kvm, as_id);
2323         memslot = id_to_memslot(slots, id);
2324         if (!memslot || !memslot->dirty_bitmap)
2325                 return -ENOENT;
2326
2327         dirty_bitmap = memslot->dirty_bitmap;
2328
2329         n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
2330
2331         if (log->first_page > memslot->npages ||
2332             log->num_pages > memslot->npages - log->first_page ||
2333             (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
2334             return -EINVAL;
2335
2336         kvm_arch_sync_dirty_log(kvm, memslot);
2337
2338         flush = false;
2339         dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2340         if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
2341                 return -EFAULT;
2342
2343         KVM_MMU_LOCK(kvm);
2344         for (offset = log->first_page, i = offset / BITS_PER_LONG,
2345                  n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
2346              i++, offset += BITS_PER_LONG) {
2347                 unsigned long mask = *dirty_bitmap_buffer++;
2348                 atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
2349                 if (!mask)
2350                         continue;
2351
2352                 mask &= atomic_long_fetch_andnot(mask, p);
2353
2354                 /*
2355                  * mask contains the bits that really have been cleared.  This
2356                  * never includes any bits beyond the length of the memslot (if
2357                  * the length is not aligned to 64 pages), therefore it is not
2358                  * a problem if userspace sets them in log->dirty_bitmap.
2359                 */
2360                 if (mask) {
2361                         flush = true;
2362                         kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2363                                                                 offset, mask);
2364                 }
2365         }
2366         KVM_MMU_UNLOCK(kvm);
2367
2368         if (flush)
2369                 kvm_flush_remote_tlbs_memslot(kvm, memslot);
2370
2371         return 0;
2372 }
2373
2374 static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
2375                                         struct kvm_clear_dirty_log *log)
2376 {
2377         int r;
2378
2379         mutex_lock(&kvm->slots_lock);
2380
2381         r = kvm_clear_dirty_log_protect(kvm, log);
2382
2383         mutex_unlock(&kvm->slots_lock);
2384         return r;
2385 }
2386 #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
2387
2388 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
2389 {
2390         return __gfn_to_memslot(kvm_memslots(kvm), gfn);
2391 }
2392 EXPORT_SYMBOL_GPL(gfn_to_memslot);
2393
2394 struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
2395 {
2396         struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
2397         u64 gen = slots->generation;
2398         struct kvm_memory_slot *slot;
2399
2400         /*
2401          * This also protects against using a memslot from a different address space,
2402          * since different address spaces have different generation numbers.
2403          */
2404         if (unlikely(gen != vcpu->last_used_slot_gen)) {
2405                 vcpu->last_used_slot = NULL;
2406                 vcpu->last_used_slot_gen = gen;
2407         }
2408
2409         slot = try_get_memslot(vcpu->last_used_slot, gfn);
2410         if (slot)
2411                 return slot;
2412
2413         /*
2414          * Fall back to searching all memslots. We purposely use
2415          * search_memslots() instead of __gfn_to_memslot() to avoid
2416          * thrashing the VM-wide last_used_slot in kvm_memslots.
2417          */
2418         slot = search_memslots(slots, gfn, false);
2419         if (slot) {
2420                 vcpu->last_used_slot = slot;
2421                 return slot;
2422         }
2423
2424         return NULL;
2425 }
2426
2427 bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
2428 {
2429         struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
2430
2431         return kvm_is_visible_memslot(memslot);
2432 }
2433 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
2434
2435 bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2436 {
2437         struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2438
2439         return kvm_is_visible_memslot(memslot);
2440 }
2441 EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);
2442
2443 unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
2444 {
2445         struct vm_area_struct *vma;
2446         unsigned long addr, size;
2447
2448         size = PAGE_SIZE;
2449
2450         addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
2451         if (kvm_is_error_hva(addr))
2452                 return PAGE_SIZE;
2453
2454         mmap_read_lock(current->mm);
2455         vma = find_vma(current->mm, addr);
2456         if (!vma)
2457                 goto out;
2458
2459         size = vma_kernel_pagesize(vma);
2460
2461 out:
2462         mmap_read_unlock(current->mm);
2463
2464         return size;
2465 }
2466
2467 static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
2468 {
2469         return slot->flags & KVM_MEM_READONLY;
2470 }
2471
2472 static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
2473                                        gfn_t *nr_pages, bool write)
2474 {
2475         if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
2476                 return KVM_HVA_ERR_BAD;
2477
2478         if (memslot_is_readonly(slot) && write)
2479                 return KVM_HVA_ERR_RO_BAD;
2480
2481         if (nr_pages)
2482                 *nr_pages = slot->npages - (gfn - slot->base_gfn);
2483
2484         return __gfn_to_hva_memslot(slot, gfn);
2485 }
2486
2487 static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2488                                      gfn_t *nr_pages)
2489 {
2490         return __gfn_to_hva_many(slot, gfn, nr_pages, true);
2491 }
2492
2493 unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
2494                                         gfn_t gfn)
2495 {
2496         return gfn_to_hva_many(slot, gfn, NULL);
2497 }
2498 EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
2499
2500 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
2501 {
2502         return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
2503 }
2504 EXPORT_SYMBOL_GPL(gfn_to_hva);
2505
2506 unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
2507 {
2508         return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
2509 }
2510 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
2511
2512 /*
2513  * Return the hva of a @gfn and the R/W attribute if possible.
2514  *
2515  * @slot: the kvm_memory_slot which contains @gfn
2516  * @gfn: the gfn to be translated
2517  * @writable: used to return the read/write attribute of the @slot if the hva
2518  * is valid and @writable is not NULL
2519  */
2520 unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
2521                                       gfn_t gfn, bool *writable)
2522 {
2523         unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
2524
2525         if (!kvm_is_error_hva(hva) && writable)
2526                 *writable = !memslot_is_readonly(slot);
2527
2528         return hva;
2529 }
2530
2531 unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
2532 {
2533         struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2534
2535         return gfn_to_hva_memslot_prot(slot, gfn, writable);
2536 }
2537
2538 unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
2539 {
2540         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2541
2542         return gfn_to_hva_memslot_prot(slot, gfn, writable);
2543 }
2544
2545 static inline int check_user_page_hwpoison(unsigned long addr)
2546 {
2547         int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
2548
2549         rc = get_user_pages(addr, 1, flags, NULL);
2550         return rc == -EHWPOISON;
2551 }
2552
2553 /*
2554  * The fast path to get the writable pfn which will be stored in @pfn,
2555  * true indicates success, otherwise false is returned.  It's also the
2556  * only part that runs if we can in atomic context.
2557  */
2558 static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
2559                             bool *writable, kvm_pfn_t *pfn)
2560 {
2561         struct page *page[1];
2562
2563         /*
2564          * Fast pin a writable pfn only if it is a write fault request
2565          * or the caller allows to map a writable pfn for a read fault
2566          * request.
2567          */
2568         if (!(write_fault || writable))
2569                 return false;
2570
2571         if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
2572                 *pfn = page_to_pfn(page[0]);
2573
2574                 if (writable)
2575                         *writable = true;
2576                 return true;
2577         }
2578
2579         return false;
2580 }
2581
2582 /*
2583  * The slow path to get the pfn of the specified host virtual address,
2584  * 1 indicates success, -errno is returned if error is detected.
2585  */
2586 static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
2587                            bool interruptible, bool *writable, kvm_pfn_t *pfn)
2588 {
2589         /*
2590          * When a VCPU accesses a page that is not mapped into the secondary
2591          * MMU, we lookup the page using GUP to map it, so the guest VCPU can
2592          * make progress. We always want to honor NUMA hinting faults in that
2593          * case, because GUP usage corresponds to memory accesses from the VCPU.
2594          * Otherwise, we'd not trigger NUMA hinting faults once a page is
2595          * mapped into the secondary MMU and gets accessed by a VCPU.
2596          *
2597          * Note that get_user_page_fast_only() and FOLL_WRITE for now
2598          * implicitly honor NUMA hinting faults and don't need this flag.
2599          */
2600         unsigned int flags = FOLL_HWPOISON | FOLL_HONOR_NUMA_FAULT;
2601         struct page *page;
2602         int npages;
2603
2604         might_sleep();
2605
2606         if (writable)
2607                 *writable = write_fault;
2608
2609         if (write_fault)
2610                 flags |= FOLL_WRITE;
2611         if (async)
2612                 flags |= FOLL_NOWAIT;
2613         if (interruptible)
2614                 flags |= FOLL_INTERRUPTIBLE;
2615
2616         npages = get_user_pages_unlocked(addr, 1, &page, flags);
2617         if (npages != 1)
2618                 return npages;
2619
2620         /* map read fault as writable if possible */
2621         if (unlikely(!write_fault) && writable) {
2622                 struct page *wpage;
2623
2624                 if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
2625                         *writable = true;
2626                         put_page(page);
2627                         page = wpage;
2628                 }
2629         }
2630         *pfn = page_to_pfn(page);
2631         return npages;
2632 }
2633
2634 static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
2635 {
2636         if (unlikely(!(vma->vm_flags & VM_READ)))
2637                 return false;
2638
2639         if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
2640                 return false;
2641
2642         return true;
2643 }
2644
2645 static int kvm_try_get_pfn(kvm_pfn_t pfn)
2646 {
2647         struct page *page = kvm_pfn_to_refcounted_page(pfn);
2648
2649         if (!page)
2650                 return 1;
2651
2652         return get_page_unless_zero(page);
2653 }
2654
/*
 * Resolve @addr to a pfn by reading its PTE directly; hva_to_pfn() calls
 * this for VM_IO and VM_PFNMAP vmas.
 *
 * @vma: the vma containing @addr
 * @addr: host virtual address to resolve
 * @write_fault: whether a writable mapping is required
 * @writable: if non-NULL, receives pte_write() of the PTE that was read
 * @p_pfn: receives the pfn, or KVM_PFN_ERR_RO_FAULT when @write_fault is set
 *         but the PTE is not writable
 *
 * Returns 0 once a PTE was read (including the read-only case above),
 * -EAGAIN if fixup_user_fault() reported @unlocked, -EFAULT if a reference
 * could not be taken on the pfn, or the error from follow_pte() /
 * fixup_user_fault().  @p_pfn is not written when returning before the PTE
 * was read.
 */
static int hva_to_pfn_remapped(struct vm_area_struct *vma,
                               unsigned long addr, bool write_fault,
                               bool *writable, kvm_pfn_t *p_pfn)
{
        kvm_pfn_t pfn;
        pte_t *ptep;
        pte_t pte;
        spinlock_t *ptl;
        int r;

        r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
        if (r) {
                /*
                 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
                 * not call the fault handler, so do it here.
                 */
                bool unlocked = false;
                r = fixup_user_fault(current->mm, addr,
                                     (write_fault ? FAULT_FLAG_WRITE : 0),
                                     &unlocked);
                /* Let the caller redo its vma lookup and try again. */
                if (unlocked)
                        return -EAGAIN;
                if (r)
                        return r;

                /* The fault was handled; look the PTE up a second time. */
                r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
                if (r)
                        return r;
        }

        /* From here on the PTE is mapped and locked until pte_unmap_unlock(). */
        pte = ptep_get(ptep);

        if (write_fault && !pte_write(pte)) {
                pfn = KVM_PFN_ERR_RO_FAULT;
                goto out;
        }

        if (writable)
                *writable = pte_write(pte);
        pfn = pte_pfn(pte);

        /*
         * Get a reference here because callers of *hva_to_pfn* and
         * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
         * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
         * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will
         * simply do nothing for reserved pfns.
         *
         * Whoever called remap_pfn_range is also going to call e.g.
         * unmap_mapping_range before the underlying pages are freed,
         * causing a call to our MMU notifier.
         *
         * Certain IO or PFNMAP mappings can be backed with valid
         * struct pages, but be allocated without refcounting e.g.,
         * tail pages of non-compound higher order allocations, which
         * would then underflow the refcount when the caller does the
         * required put_page. Don't allow those pages here.
         */
        if (!kvm_try_get_pfn(pfn))
                r = -EFAULT;

out:
        pte_unmap_unlock(ptep, ptl);
        *p_pfn = pfn;

        return r;
}
2722
/*
 * Pin guest page in memory and return its pfn.
 * @addr: host virtual address which maps memory to the guest
 * @atomic: if true, only the non-sleeping fast path is attempted and
 *          KVM_PFN_ERR_FAULT is returned when it fails
 * @interruptible: whether the process can be interrupted by non-fatal signals
 * @async: if non-NULL, the slow path does not wait for I/O when the host
 *         page is not in memory; *@async is set to true when the lookup
 *         failed but the vma is valid for the requested access
 * @write_fault: whether we should get a writable host page
 * @writable: whether it allows to map a writable host page for !@write_fault
 *
 * The function will map a writable host page for these two cases:
 * 1): @write_fault = true
 * 2): @write_fault = false && @writable, @writable will tell the caller
 *     whether the mapping is writable.
 *
 * On failure one of the KVM_PFN_ERR_* values is returned:
 * KVM_PFN_ERR_SIGPENDING when the slow path returned -EINTR,
 * KVM_PFN_ERR_HWPOISON for a hwpoisoned page, KVM_PFN_ERR_RO_FAULT from the
 * VM_IO/VM_PFNMAP path, and KVM_PFN_ERR_FAULT otherwise.
 */
kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
                     bool *async, bool write_fault, bool *writable)
{
        struct vm_area_struct *vma;
        kvm_pfn_t pfn;
        int npages, r;

        /* we can do it either atomically or asynchronously, not both */
        BUG_ON(atomic && async);

        if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
                return pfn;

        /* Everything below may sleep. */
        if (atomic)
                return KVM_PFN_ERR_FAULT;

        npages = hva_to_pfn_slow(addr, async, write_fault, interruptible,
                                 writable, &pfn);
        if (npages == 1)
                return pfn;
        if (npages == -EINTR)
                return KVM_PFN_ERR_SIGPENDING;

        /* GUP failed: inspect the vma under the mmap lock to classify why. */
        mmap_read_lock(current->mm);
        if (npages == -EHWPOISON ||
              (!async && check_user_page_hwpoison(addr))) {
                pfn = KVM_PFN_ERR_HWPOISON;
                goto exit;
        }

retry:
        vma = vma_lookup(current->mm, addr);

        if (vma == NULL)
                pfn = KVM_PFN_ERR_FAULT;
        else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
                r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn);
                /* -EAGAIN: the vma pointer may be stale, look it up again. */
                if (r == -EAGAIN)
                        goto retry;
                if (r < 0)
                        pfn = KVM_PFN_ERR_FAULT;
        } else {
                /* Tell an async caller that the access itself is valid. */
                if (async && vma_is_valid(vma, write_fault))
                        *async = true;
                pfn = KVM_PFN_ERR_FAULT;
        }
exit:
        mmap_read_unlock(current->mm);
        return pfn;
}
2788
2789 kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
2790                                bool atomic, bool interruptible, bool *async,
2791                                bool write_fault, bool *writable, hva_t *hva)
2792 {
2793         unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
2794
2795         if (hva)
2796                 *hva = addr;
2797
2798         if (addr == KVM_HVA_ERR_RO_BAD) {
2799                 if (writable)
2800                         *writable = false;
2801                 return KVM_PFN_ERR_RO_FAULT;
2802         }
2803
2804         if (kvm_is_error_hva(addr)) {
2805                 if (writable)
2806                         *writable = false;
2807                 return KVM_PFN_NOSLOT;
2808         }
2809
2810         /* Do not map writable pfn in the readonly memslot. */
2811         if (writable && memslot_is_readonly(slot)) {
2812                 *writable = false;
2813                 writable = NULL;
2814         }
2815
2816         return hva_to_pfn(addr, atomic, interruptible, async, write_fault,
2817                           writable);
2818 }
2819 EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
2820
2821 kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
2822                       bool *writable)
2823 {
2824         return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false,
2825                                     NULL, write_fault, writable, NULL);
2826 }
2827 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
2828
2829 kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
2830 {
2831         return __gfn_to_pfn_memslot(slot, gfn, false, false, NULL, true,
2832                                     NULL, NULL);
2833 }
2834 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
2835
2836 kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn)
2837 {
2838         return __gfn_to_pfn_memslot(slot, gfn, true, false, NULL, true,
2839                                     NULL, NULL);
2840 }
2841 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
2842
2843 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
2844 {
2845         return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2846 }
2847 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
2848
2849 kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
2850 {
2851         return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
2852 }
2853 EXPORT_SYMBOL_GPL(gfn_to_pfn);
2854
2855 kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2856 {
2857         return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2858 }
2859 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
2860
2861 int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2862                             struct page **pages, int nr_pages)
2863 {
2864         unsigned long addr;
2865         gfn_t entry = 0;
2866
2867         addr = gfn_to_hva_many(slot, gfn, &entry);
2868         if (kvm_is_error_hva(addr))
2869                 return -1;
2870
2871         if (entry < nr_pages)
2872                 return 0;
2873
2874         return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
2875 }
2876 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
2877
2878 /*
2879  * Do not use this helper unless you are absolutely certain the gfn _must_ be
2880  * backed by 'struct page'.  A valid example is if the backing memslot is
2881  * controlled by KVM.  Note, if the returned page is valid, it's refcount has
2882  * been elevated by gfn_to_pfn().
2883  */
2884 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
2885 {
2886         struct page *page;
2887         kvm_pfn_t pfn;
2888
2889         pfn = gfn_to_pfn(kvm, gfn);
2890
2891         if (is_error_noslot_pfn(pfn))
2892                 return KVM_ERR_PTR_BAD_PAGE;
2893
2894         page = kvm_pfn_to_refcounted_page(pfn);
2895         if (!page)
2896                 return KVM_ERR_PTR_BAD_PAGE;
2897
2898         return page;
2899 }
2900 EXPORT_SYMBOL_GPL(gfn_to_page);
2901
2902 void kvm_release_pfn(kvm_pfn_t pfn, bool dirty)
2903 {
2904         if (dirty)
2905                 kvm_release_pfn_dirty(pfn);
2906         else
2907                 kvm_release_pfn_clean(pfn);
2908 }
2909
2910 int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
2911 {
2912         kvm_pfn_t pfn;
2913         void *hva = NULL;
2914         struct page *page = KVM_UNMAPPED_PAGE;
2915
2916         if (!map)
2917                 return -EINVAL;
2918
2919         pfn = gfn_to_pfn(vcpu->kvm, gfn);
2920         if (is_error_noslot_pfn(pfn))
2921                 return -EINVAL;
2922
2923         if (pfn_valid(pfn)) {
2924                 page = pfn_to_page(pfn);
2925                 hva = kmap(page);
2926 #ifdef CONFIG_HAS_IOMEM
2927         } else {
2928                 hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
2929 #endif
2930         }
2931
2932         if (!hva)
2933                 return -EFAULT;
2934
2935         map->page = page;
2936         map->hva = hva;
2937         map->pfn = pfn;
2938         map->gfn = gfn;
2939
2940         return 0;
2941 }
2942 EXPORT_SYMBOL_GPL(kvm_vcpu_map);
2943
2944 void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
2945 {
2946         if (!map)
2947                 return;
2948
2949         if (!map->hva)
2950                 return;
2951
2952         if (map->page != KVM_UNMAPPED_PAGE)
2953                 kunmap(map->page);
2954 #ifdef CONFIG_HAS_IOMEM
2955         else
2956                 memunmap(map->hva);
2957 #endif
2958
2959         if (dirty)
2960                 kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
2961
2962         kvm_release_pfn(map->pfn, dirty);
2963
2964         map->hva = NULL;
2965         map->page = NULL;
2966 }
2967 EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
2968
2969 static bool kvm_is_ad_tracked_page(struct page *page)
2970 {
2971         /*
2972          * Per page-flags.h, pages tagged PG_reserved "should in general not be
2973          * touched (e.g. set dirty) except by its owner".
2974          */
2975         return !PageReserved(page);
2976 }
2977
/* Mark @page dirty unless kvm_is_ad_tracked_page() rules it out. */
static void kvm_set_page_dirty(struct page *page)
{
        if (!kvm_is_ad_tracked_page(page))
                return;

        SetPageDirty(page);
}
2983
/* Mark @page accessed unless kvm_is_ad_tracked_page() rules it out. */
static void kvm_set_page_accessed(struct page *page)
{
        if (!kvm_is_ad_tracked_page(page))
                return;

        mark_page_accessed(page);
}
2989
/*
 * Mark @page accessed and drop one reference to it.  Warns if @page is an
 * error page.
 */
void kvm_release_page_clean(struct page *page)
{
        WARN_ON(is_error_page(page));

        kvm_set_page_accessed(page);

        put_page(page);
}
EXPORT_SYMBOL_GPL(kvm_release_page_clean);
2998
2999 void kvm_release_pfn_clean(kvm_pfn_t pfn)
3000 {
3001         struct page *page;
3002
3003         if (is_error_noslot_pfn(pfn))
3004                 return;
3005
3006         page = kvm_pfn_to_refcounted_page(pfn);
3007         if (!page)
3008                 return;
3009
3010         kvm_release_page_clean(page);
3011 }
3012 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
3013
/*
 * Mark @page dirty, then release it via kvm_release_page_clean().  Warns if
 * @page is an error page.
 */
void kvm_release_page_dirty(struct page *page)
{
        WARN_ON(is_error_page(page));

        kvm_set_page_dirty(page);

        kvm_release_page_clean(page);
}
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
3022
3023 void kvm_release_pfn_dirty(kvm_pfn_t pfn)
3024 {
3025         struct page *page;
3026
3027         if (is_error_noslot_pfn(pfn))
3028                 return;
3029
3030         page = kvm_pfn_to_refcounted_page(pfn);
3031         if (!page)
3032                 return;
3033
3034         kvm_release_page_dirty(page);
3035 }
3036 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
3037
3038 /*
3039  * Note, checking for an error/noslot pfn is the caller's responsibility when
3040  * directly marking a page dirty/accessed.  Unlike the "release" helpers, the
3041  * "set" helpers are not to be used when the pfn might point at garbage.
3042  */
3043 void kvm_set_pfn_dirty(kvm_pfn_t pfn)
3044 {
3045         if (WARN_ON(is_error_noslot_pfn(pfn)))
3046                 return;
3047
3048         if (pfn_valid(pfn))
3049                 kvm_set_page_dirty(pfn_to_page(pfn));
3050 }
3051 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
3052
3053 void kvm_set_pfn_accessed(kvm_pfn_t pfn)
3054 {
3055         if (WARN_ON(is_error_noslot_pfn(pfn)))
3056                 return;
3057
3058         if (pfn_valid(pfn))
3059                 kvm_set_page_accessed(pfn_to_page(pfn));
3060 }
3061 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
3062
/*
 * Return how many of the remaining @len bytes fit in the current page when
 * starting at @offset within it: the smaller of @len and the space left up
 * to the page boundary.
 */
static int next_segment(unsigned long len, int offset)
{
        unsigned long room = PAGE_SIZE - offset;

        return len > room ? room : len;
}
3070
3071 static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
3072                                  void *data, int offset, int len)
3073 {
3074         int r;
3075         unsigned long addr;
3076
3077         addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
3078         if (kvm_is_error_hva(addr))
3079                 return -EFAULT;
3080         r = __copy_from_user(data, (void __user *)addr + offset, len);
3081         if (r)
3082                 return -EFAULT;
3083         return 0;
3084 }
3085
3086 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
3087                         int len)
3088 {
3089         struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
3090
3091         return __kvm_read_guest_page(slot, gfn, data, offset, len);
3092 }
3093 EXPORT_SYMBOL_GPL(kvm_read_guest_page);
3094
3095 int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
3096                              int offset, int len)
3097 {
3098         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3099
3100         return __kvm_read_guest_page(slot, gfn, data, offset, len);
3101 }
3102 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
3103
3104 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
3105 {
3106         gfn_t gfn = gpa >> PAGE_SHIFT;
3107         int seg;
3108         int offset = offset_in_page(gpa);
3109         int ret;
3110
3111         while ((seg = next_segment(len, offset)) != 0) {
3112                 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
3113                 if (ret < 0)
3114                         return ret;
3115                 offset = 0;
3116                 len -= seg;
3117                 data += seg;
3118                 ++gfn;
3119         }
3120         return 0;
3121 }
3122 EXPORT_SYMBOL_GPL(kvm_read_guest);
3123
3124 int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
3125 {
3126         gfn_t gfn = gpa >> PAGE_SHIFT;
3127         int seg;
3128         int offset = offset_in_page(gpa);
3129         int ret;
3130
3131         while ((seg = next_segment(len, offset)) != 0) {
3132                 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
3133                 if (ret < 0)
3134                         return ret;
3135                 offset = 0;
3136                 len -= seg;
3137                 data += seg;
3138                 ++gfn;
3139         }
3140         return 0;
3141 }
3142 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
3143
3144 static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
3145                                    void *data, int offset, unsigned long len)
3146 {
3147         int r;
3148         unsigned long addr;
3149
3150         addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
3151         if (kvm_is_error_hva(addr))
3152                 return -EFAULT;
3153         pagefault_disable();
3154         r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
3155         pagefault_enable();
3156         if (r)
3157                 return -EFAULT;
3158         return 0;
3159 }
3160
3161 int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
3162                                void *data, unsigned long len)
3163 {
3164         gfn_t gfn = gpa >> PAGE_SHIFT;
3165         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3166         int offset = offset_in_page(gpa);
3167
3168         return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
3169 }
3170 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
3171
3172 static int __kvm_write_guest_page(struct kvm *kvm,
3173                                   struct kvm_memory_slot *memslot, gfn_t gfn,
3174                                   const void *data, int offset, int len)
3175 {
3176         int r;
3177         unsigned long addr;
3178
3179         addr = gfn_to_hva_memslot(memslot, gfn);
3180         if (kvm_is_error_hva(addr))
3181                 return -EFAULT;
3182         r = __copy_to_user((void __user *)addr + offset, data, len);
3183         if (r)
3184                 return -EFAULT;
3185         mark_page_dirty_in_slot(kvm, memslot, gfn);
3186         return 0;
3187 }
3188
3189 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
3190                          const void *data, int offset, int len)
3191 {
3192         struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
3193
3194         return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
3195 }
3196 EXPORT_SYMBOL_GPL(kvm_write_guest_page);
3197
3198 int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
3199                               const void *data, int offset, int len)
3200 {
3201         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3202
3203         return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
3204 }
3205 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
3206
3207 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
3208                     unsigned long len)
3209 {
3210         gfn_t gfn = gpa >> PAGE_SHIFT;
3211         int seg;
3212         int offset = offset_in_page(gpa);
3213         int ret;
3214
3215         while ((seg = next_segment(len, offset)) != 0) {
3216                 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
3217                 if (ret < 0)
3218                         return ret;
3219                 offset = 0;
3220                 len -= seg;
3221                 data += seg;
3222                 ++gfn;
3223         }
3224         return 0;
3225 }
3226 EXPORT_SYMBOL_GPL(kvm_write_guest);
3227
3228 int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
3229                          unsigned long len)
3230 {
3231         gfn_t gfn = gpa >> PAGE_SHIFT;
3232         int seg;
3233         int offset = offset_in_page(gpa);
3234         int ret;
3235
3236         while ((seg = next_segment(len, offset)) != 0) {
3237                 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
3238                 if (ret < 0)
3239                         return ret;
3240                 offset = 0;
3241                 len -= seg;
3242                 data += seg;
3243                 ++gfn;
3244         }
3245         return 0;
3246 }
3247 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
3248
/*
 * (Re)initialize @ghc to describe the guest physical range [@gpa, @gpa + @len)
 * against @slots.
 *
 * ghc->generation is always updated.  On success ghc->gpa and ghc->len are
 * recorded; if the range fits in one page, ghc->hva points at @gpa's hva and
 * ghc->memslot at its memslot, otherwise ghc->memslot is NULL so that users
 * take the slow path.
 *
 * Returns 0 on success, -EINVAL if the range wraps (start gfn above end
 * gfn), or -EFAULT if any part of the range has no valid writable hva.
 */
static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
                                       struct gfn_to_hva_cache *ghc,
                                       gpa_t gpa, unsigned long len)
{
        int offset = offset_in_page(gpa);
        gfn_t start_gfn = gpa >> PAGE_SHIFT;
        gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
        gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
        gfn_t nr_pages_avail;

        /* Update ghc->generation before performing any error checks. */
        ghc->generation = slots->generation;

        if (start_gfn > end_gfn) {
                ghc->hva = KVM_HVA_ERR_BAD;
                return -EINVAL;
        }

        /*
         * If the requested region crosses two memslots, we still
         * verify that the entire region is valid here.
         *
         * Each iteration skips ahead by the number of pages remaining in the
         * memslot that was just looked up.
         */
        for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
                ghc->memslot = __gfn_to_memslot(slots, start_gfn);
                ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
                                           &nr_pages_avail);
                if (kvm_is_error_hva(ghc->hva))
                        return -EFAULT;
        }

        /* Use the slow path for cross page reads and writes. */
        if (nr_pages_needed == 1)
                ghc->hva += offset;
        else
                ghc->memslot = NULL;

        ghc->gpa = gpa;
        ghc->len = len;
        return 0;
}
3289
3290 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3291                               gpa_t gpa, unsigned long len)
3292 {
3293         struct kvm_memslots *slots = kvm_memslots(kvm);
3294         return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
3295 }
3296 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
3297
3298 int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3299                                   void *data, unsigned int offset,
3300                                   unsigned long len)
3301 {
3302         struct kvm_memslots *slots = kvm_memslots(kvm);
3303         int r;
3304         gpa_t gpa = ghc->gpa + offset;
3305
3306         if (WARN_ON_ONCE(len + offset > ghc->len))
3307                 return -EINVAL;
3308
3309         if (slots->generation != ghc->generation) {
3310                 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3311                         return -EFAULT;
3312         }
3313
3314         if (kvm_is_error_hva(ghc->hva))
3315                 return -EFAULT;
3316
3317         if (unlikely(!ghc->memslot))
3318                 return kvm_write_guest(kvm, gpa, data, len);
3319
3320         r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
3321         if (r)
3322                 return -EFAULT;
3323         mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT);
3324
3325         return 0;
3326 }
3327 EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
3328
/* kvm_write_guest_offset_cached() starting at the beginning of the cached range. */
int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
                           void *data, unsigned long len)
{
        const unsigned int start = 0;

        return kvm_write_guest_offset_cached(kvm, ghc, data, start, len);
}
EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
3335
3336 int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3337                                  void *data, unsigned int offset,
3338                                  unsigned long len)
3339 {
3340         struct kvm_memslots *slots = kvm_memslots(kvm);
3341         int r;
3342         gpa_t gpa = ghc->gpa + offset;
3343
3344         if (WARN_ON_ONCE(len + offset > ghc->len))
3345                 return -EINVAL;
3346
3347         if (slots->generation != ghc->generation) {
3348                 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3349                         return -EFAULT;
3350         }
3351
3352         if (kvm_is_error_hva(ghc->hva))
3353                 return -EFAULT;
3354
3355         if (unlikely(!ghc->memslot))
3356                 return kvm_read_guest(kvm, gpa, data, len);
3357
3358         r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
3359         if (r)
3360                 return -EFAULT;
3361
3362         return 0;
3363 }
3364 EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
3365
/*
 * Read @len bytes into @data from the start of the guest range cached in
 * @ghc.  This is kvm_read_guest_offset_cached() with an offset of zero; its
 * return value is passed through unchanged.
 */
int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
                          void *data, unsigned long len)
{
        return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
}
EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
3372
3373 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
3374 {
3375         const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
3376         gfn_t gfn = gpa >> PAGE_SHIFT;
3377         int seg;
3378         int offset = offset_in_page(gpa);
3379         int ret;
3380
3381         while ((seg = next_segment(len, offset)) != 0) {
3382                 ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, len);
3383                 if (ret < 0)
3384                         return ret;
3385                 offset = 0;
3386                 len -= seg;
3387                 ++gfn;
3388         }
3389         return 0;
3390 }
3391 EXPORT_SYMBOL_GPL(kvm_clear_guest);
3392
3393 void mark_page_dirty_in_slot(struct kvm *kvm,
3394                              const struct kvm_memory_slot *memslot,
3395                              gfn_t gfn)
3396 {
3397         struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
3398
3399 #ifdef CONFIG_HAVE_KVM_DIRTY_RING
3400         if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm))
3401                 return;
3402
3403         WARN_ON_ONCE(!vcpu && !kvm_arch_allow_write_without_running_vcpu(kvm));
3404 #endif
3405
3406         if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
3407                 unsigned long rel_gfn = gfn - memslot->base_gfn;
3408                 u32 slot = (memslot->as_id << 16) | memslot->id;
3409
3410                 if (kvm->dirty_ring_size && vcpu)
3411                         kvm_dirty_ring_push(vcpu, slot, rel_gfn);
3412                 else if (memslot->dirty_bitmap)
3413                         set_bit_le(rel_gfn, memslot->dirty_bitmap);
3414         }
3415 }
3416 EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
3417
3418 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
3419 {
3420         struct kvm_memory_slot *memslot;
3421
3422         memslot = gfn_to_memslot(kvm, gfn);
3423         mark_page_dirty_in_slot(kvm, memslot, gfn);
3424 }
3425 EXPORT_SYMBOL_GPL(mark_page_dirty);
3426
3427 void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
3428 {
3429         struct kvm_memory_slot *memslot;
3430
3431         memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3432         mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
3433 }
3434 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
3435
3436 void kvm_sigset_activate(struct kvm_vcpu *vcpu)
3437 {
3438         if (!vcpu->sigset_active)
3439                 return;
3440
3441         /*
3442          * This does a lockless modification of ->real_blocked, which is fine
3443          * because, only current can change ->real_blocked and all readers of
3444          * ->real_blocked don't care as long ->real_blocked is always a subset
3445          * of ->blocked.
3446          */
3447         sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
3448 }
3449
3450 void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
3451 {
3452         if (!vcpu->sigset_active)
3453                 return;
3454
3455         sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
3456         sigemptyset(&current->real_blocked);
3457 }
3458
3459 static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
3460 {
3461         unsigned int old, val, grow, grow_start;
3462
3463         old = val = vcpu->halt_poll_ns;
3464         grow_start = READ_ONCE(halt_poll_ns_grow_start);
3465         grow = READ_ONCE(halt_poll_ns_grow);
3466         if (!grow)
3467                 goto out;
3468
3469         val *= grow;
3470         if (val < grow_start)
3471                 val = grow_start;
3472
3473         vcpu->halt_poll_ns = val;
3474 out:
3475         trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
3476 }
3477
3478 static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
3479 {
3480         unsigned int old, val, shrink, grow_start;
3481
3482         old = val = vcpu->halt_poll_ns;
3483         shrink = READ_ONCE(halt_poll_ns_shrink);
3484         grow_start = READ_ONCE(halt_poll_ns_grow_start);
3485         if (shrink == 0)
3486                 val = 0;
3487         else
3488                 val /= shrink;
3489
3490         if (val < grow_start)
3491                 val = 0;
3492
3493         vcpu->halt_poll_ns = val;
3494         trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
3495 }
3496
3497 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
3498 {
3499         int ret = -EINTR;
3500         int idx = srcu_read_lock(&vcpu->kvm->srcu);
3501
3502         if (kvm_arch_vcpu_runnable(vcpu))
3503                 goto out;
3504         if (kvm_cpu_has_pending_timer(vcpu))
3505                 goto out;
3506         if (signal_pending(current))
3507                 goto out;
3508         if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
3509                 goto out;
3510
3511         ret = 0;
3512 out:
3513         srcu_read_unlock(&vcpu->kvm->srcu, idx);
3514         return ret;
3515 }
3516
/*
 * Block the vCPU until the vCPU is runnable, an event arrives, or a signal is
 * pending.  This is mostly used when halting a vCPU, but may also be used
 * directly for other vCPU non-runnable states, e.g. x86's Wait-For-SIPI.
 *
 * Returns true if the task called schedule() at least once, false if the
 * wake condition was already satisfied on the first check.
 */
bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
{
        struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
        bool waited = false;

        /* Advertise in the generic stats that this vCPU is blocking. */
        vcpu->stat.generic.blocking = 1;

        /* The arch hook and the rcuwait setup run with preemption disabled. */
        preempt_disable();
        kvm_arch_vcpu_blocking(vcpu);
        prepare_to_rcuwait(wait);
        preempt_enable();

        for (;;) {
                /*
                 * Set the task state before re-checking the wake conditions,
                 * then sleep only if none of them holds.
                 */
                set_current_state(TASK_INTERRUPTIBLE);

                if (kvm_vcpu_check_block(vcpu) < 0)
                        break;

                waited = true;
                schedule();
        }

        /* Tear down in the reverse order of the setup above. */
        preempt_disable();
        finish_rcuwait(wait);
        kvm_arch_vcpu_unblocking(vcpu);
        preempt_enable();

        vcpu->stat.generic.blocking = 0;

        return waited;
}
3553
3554 static inline void update_halt_poll_stats(struct kvm_vcpu *vcpu, ktime_t start,
3555                                           ktime_t end, bool success)
3556 {
3557         struct kvm_vcpu_stat_generic *stats = &vcpu->stat.generic;
3558         u64 poll_ns = ktime_to_ns(ktime_sub(end, start));
3559
3560         ++vcpu->stat.generic.halt_attempted_poll;
3561
3562         if (success) {
3563                 ++vcpu->stat.generic.halt_successful_poll;
3564
3565                 if (!vcpu_valid_wakeup(vcpu))
3566                         ++vcpu->stat.generic.halt_poll_invalid;
3567
3568                 stats->halt_poll_success_ns += poll_ns;
3569                 KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_success_hist, poll_ns);
3570         } else {
3571                 stats->halt_poll_fail_ns += poll_ns;
3572                 KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_fail_hist, poll_ns);
3573         }
3574 }
3575
3576 static unsigned int kvm_vcpu_max_halt_poll_ns(struct kvm_vcpu *vcpu)
3577 {
3578         struct kvm *kvm = vcpu->kvm;
3579
3580         if (kvm->override_halt_poll_ns) {
3581                 /*
3582                  * Ensure kvm->max_halt_poll_ns is not read before
3583                  * kvm->override_halt_poll_ns.
3584                  *
3585                  * Pairs with the smp_wmb() when enabling KVM_CAP_HALT_POLL.
3586                  */
3587                 smp_rmb();
3588                 return READ_ONCE(kvm->max_halt_poll_ns);
3589         }
3590
3591         return READ_ONCE(halt_poll_ns);
3592 }
3593
/*
 * Emulate a vCPU halt condition, e.g. HLT on x86, WFI on arm, etc...  If halt
 * polling is enabled, busy wait for a short time before blocking to avoid the
 * expensive block+unblock sequence if a wake event arrives soon after the vCPU
 * is halted.
 *
 * After the halt completes, the poll statistics are updated and the vCPU's
 * polling window (vcpu->halt_poll_ns) is grown, shrunk or reset based on how
 * long the halt lasted relative to the current maximum.
 */
void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
{
        unsigned int max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
        bool halt_poll_allowed = !kvm_arch_no_poll(vcpu);
        ktime_t start, cur, poll_end;
        bool waited = false;
        bool do_halt_poll;
        u64 halt_ns;

        /* Clamp the per-vCPU window to the current maximum. */
        if (vcpu->halt_poll_ns > max_halt_poll_ns)
                vcpu->halt_poll_ns = max_halt_poll_ns;

        /* Poll only if the arch allows it and the window is non-zero. */
        do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns;

        start = cur = poll_end = ktime_get();
        if (do_halt_poll) {
                ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns);

                do {
                        /* A wake condition showed up: skip blocking entirely. */
                        if (kvm_vcpu_check_block(vcpu) < 0)
                                goto out;
                        cpu_relax();
                        poll_end = cur = ktime_get();
                } while (kvm_vcpu_can_poll(cur, stop));
        }

        waited = kvm_vcpu_block(vcpu);

        cur = ktime_get();
        if (waited) {
                /* Time spent blocked, measured from the end of polling. */
                vcpu->stat.generic.halt_wait_ns +=
                        ktime_to_ns(cur) - ktime_to_ns(poll_end);
                KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist,
                                ktime_to_ns(cur) - ktime_to_ns(poll_end));
        }
out:
        /* The total time the vCPU was "halted", including polling time. */
        halt_ns = ktime_to_ns(cur) - ktime_to_ns(start);

        /*
         * Note, halt-polling is considered successful so long as the vCPU was
         * never actually scheduled out, i.e. even if the wake event arrived
         * after the end of the halt-polling loop itself, but before the full
         * wait.
         */
        if (do_halt_poll)
                update_halt_poll_stats(vcpu, start, poll_end, !waited);

        if (halt_poll_allowed) {
                /* Recompute the max halt poll time in case it changed. */
                max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);

                if (!vcpu_valid_wakeup(vcpu)) {
                        shrink_halt_poll_ns(vcpu);
                } else if (max_halt_poll_ns) {
                        /* The halt fit inside the window: leave it alone. */
                        if (halt_ns <= vcpu->halt_poll_ns)
                                ;
                        /* we had a long block, shrink polling */
                        else if (vcpu->halt_poll_ns &&
                                 halt_ns > max_halt_poll_ns)
                                shrink_halt_poll_ns(vcpu);
                        /* we had a short halt and our poll time is too small */
                        else if (vcpu->halt_poll_ns < max_halt_poll_ns &&
                                 halt_ns < max_halt_poll_ns)
                                grow_halt_poll_ns(vcpu);
                } else {
                        /* A maximum of zero disables polling for this vCPU. */
                        vcpu->halt_poll_ns = 0;
                }
        }

        trace_kvm_vcpu_wakeup(halt_ns, waited, vcpu_valid_wakeup(vcpu));
}
EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
3672
3673 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
3674 {
3675         if (__kvm_vcpu_wake_up(vcpu)) {
3676                 WRITE_ONCE(vcpu->ready, true);
3677                 ++vcpu->stat.generic.halt_wakeup;
3678                 return true;
3679         }
3680
3681         return false;
3682 }
3683 EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
3684
#ifndef CONFIG_S390
/*
 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
 *
 * A vCPU that kvm_vcpu_wake_up() manages to wake needs nothing further.
 * Otherwise, if the arch says the vCPU should be kicked, a reschedule IPI is
 * sent to the physical CPU it was last recorded on, provided that CPU is a
 * valid, online CPU other than the one executing this function.
 */
void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
{
        int me, cpu;

        if (kvm_vcpu_wake_up(vcpu))
                return;

        /* get_cpu() disables preemption until the put_cpu() below. */
        me = get_cpu();
        /*
         * The only state change done outside the vcpu mutex is IN_GUEST_MODE
         * to EXITING_GUEST_MODE.  Therefore the moderately expensive "should
         * kick" check does not need atomic operations if kvm_vcpu_kick is used
         * within the vCPU thread itself.
         */
        if (vcpu == __this_cpu_read(kvm_running_vcpu)) {
                if (vcpu->mode == IN_GUEST_MODE)
                        WRITE_ONCE(vcpu->mode, EXITING_GUEST_MODE);
                goto out;
        }

        /*
         * Note, the vCPU could get migrated to a different pCPU at any point
         * after kvm_arch_vcpu_should_kick(), which could result in sending an
         * IPI to the previous pCPU.  But, that's ok because the purpose of the
         * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the
         * vCPU also requires it to leave IN_GUEST_MODE.
         */
        if (kvm_arch_vcpu_should_kick(vcpu)) {
                cpu = READ_ONCE(vcpu->cpu);
                /* The unsigned cast also rejects a negative vcpu->cpu. */
                if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
                        smp_send_reschedule(cpu);
        }
out:
        put_cpu();
}
EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
#endif /* !CONFIG_S390 */
3726
3727 int kvm_vcpu_yield_to(struct kvm_vcpu *target)
3728 {
3729         struct pid *pid;
3730         struct task_struct *task = NULL;
3731         int ret = 0;
3732
3733         rcu_read_lock();
3734         pid = rcu_dereference(target->pid);
3735         if (pid)
3736                 task = get_pid_task(pid, PIDTYPE_PID);
3737         rcu_read_unlock();
3738         if (!task)
3739                 return ret;
3740         ret = yield_to(task, 1);
3741         put_task_struct(task);
3742
3743         return ret;
3744 }
3745 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
3746
/*
 * Helper that checks whether a VCPU is eligible for directed yield.
 * Most eligible candidate to yield is decided by following heuristics:
 *
 *  (a) VCPU which has not done pl-exit or cpu relax intercepted recently
 *  (preempted lock holder), indicated by @in_spin_loop.
 *  Set at the beginning and cleared at the end of interception/PLE handler.
 *
 *  (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
 *  chance last time (mostly it has become eligible now since we have probably
 *  yielded to lockholder in last iteration. This is done by toggling
 *  @dy_eligible each time a VCPU checked for eligibility.)
 *
 *  Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
 *  to preempted lock-holder could result in wrong VCPU selection and CPU
 *  burning. Giving priority for a potential lock-holder increases lock
 *  progress.
 *
 *  Since algorithm is based on heuristics, accessing another VCPU data without
 *  locking does not harm. It may result in trying to yield to  same VCPU, fail
 *  and continue with next VCPU and so on.
 *
 *  Always returns true when CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT is not set.
 */
static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
        bool eligible;

        /* Case (a): not spinning, or case (b): spinning but marked eligible. */
        eligible = !vcpu->spin_loop.in_spin_loop ||
                    vcpu->spin_loop.dy_eligible;

        /* Flip dy_eligible so a spinning vCPU alternates between checks. */
        if (vcpu->spin_loop.in_spin_loop)
                kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);

        return eligible;
#else
        return true;
#endif
}
3785
/*
 * Unlike kvm_arch_vcpu_runnable, this function is called outside
 * a vcpu_load/vcpu_put pair.  However, for most architectures
 * kvm_arch_vcpu_runnable does not require vcpu_load.
 *
 * This weak default simply forwards to kvm_arch_vcpu_runnable(); an
 * architecture can supply its own definition to replace it.
 */
bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
{
        return kvm_arch_vcpu_runnable(vcpu);
}
3795
3796 static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
3797 {
3798         if (kvm_arch_dy_runnable(vcpu))
3799                 return true;
3800
3801 #ifdef CONFIG_KVM_ASYNC_PF
3802         if (!list_empty_careful(&vcpu->async_pf.done))
3803                 return true;
3804 #endif
3805
3806         return false;
3807 }
3808
/*
 * Weak default used by the directed-yield logic: reports that the vCPU has no
 * pending interrupt.  An architecture can supply its own definition.
 */
bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
{
        return false;
}
3813
/*
 * Called on behalf of vCPU @me when it is spinning: try to donate its time
 * slice to another vCPU of the same VM via a directed yield.
 *
 * @yield_to_kernel_mode: when true, a preempted candidate is skipped unless
 * it has a pending interrupt or was in kernel mode, as reported by the arch
 * hooks.
 *
 * The scan makes at most two passes over the vCPUs and gives up after a
 * successful yield or after three yield attempts return a negative value.
 */
void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
{
        struct kvm *kvm = me->kvm;
        struct kvm_vcpu *vcpu;
        int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
        unsigned long i;
        int yielded = 0;
        int try = 3;
        int pass;

        kvm_vcpu_set_in_spin_loop(me, true);
        /*
         * We boost the priority of a VCPU that is runnable but not
         * currently running, because it got preempted by something
         * else and called schedule in __vcpu_run.  Hopefully that
         * VCPU is holding the lock that we need and will release it.
         * We approximate round-robin by starting at the last boosted VCPU.
         */
        for (pass = 0; pass < 2 && !yielded && try; pass++) {
                kvm_for_each_vcpu(i, vcpu, kvm) {
                        /*
                         * Pass 0 covers the vCPUs after last_boosted_vcpu
                         * (the index is advanced to skip the earlier ones);
                         * pass 1 covers indices up to and including it.
                         */
                        if (!pass && i <= last_boosted_vcpu) {
                                i = last_boosted_vcpu;
                                continue;
                        } else if (pass && i > last_boosted_vcpu)
                                break;
                        if (!READ_ONCE(vcpu->ready))
                                continue;
                        if (vcpu == me)
                                continue;
                        if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
                                continue;
                        if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
                            !kvm_arch_dy_has_pending_interrupt(vcpu) &&
                            !kvm_arch_vcpu_in_kernel(vcpu))
                                continue;
                        if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
                                continue;

                        yielded = kvm_vcpu_yield_to(vcpu);
                        if (yielded > 0) {
                                /* Remember where the next scan should start. */
                                kvm->last_boosted_vcpu = i;
                                break;
                        } else if (yielded < 0) {
                                /* Failed attempt: consume one of the retries. */
                                try--;
                                if (!try)
                                        break;
                        }
                }
        }
        kvm_vcpu_set_in_spin_loop(me, false);

        /* Ensure vcpu is not eligible during next spinloop */
        kvm_vcpu_set_dy_eligible(me, false);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
3869
3870 static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
3871 {
3872 #ifdef CONFIG_HAVE_KVM_DIRTY_RING
3873         return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
3874             (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
3875              kvm->dirty_ring_size / PAGE_SIZE);
3876 #else
3877         return false;
3878 #endif
3879 }
3880
/*
 * Page fault handler for mappings of a vCPU file descriptor.  The faulting
 * page offset selects which kernel page is handed back:
 *
 *  - offset 0: the vcpu->run page
 *  - KVM_PIO_PAGE_OFFSET (CONFIG_X86 only): vcpu->arch.pio_data
 *  - KVM_COALESCED_MMIO_PAGE_OFFSET (CONFIG_KVM_MMIO only): the VM's
 *    coalesced MMIO ring
 *  - an offset inside the dirty ring window: the matching dirty ring page
 *
 * Any other offset is delegated to kvm_arch_vcpu_fault().  For the cases
 * handled here a reference is taken on the page, it is stored in vmf->page
 * and 0 is returned.
 */
static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
{
        struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
        struct page *page;

        if (vmf->pgoff == 0)
                page = virt_to_page(vcpu->run);
#ifdef CONFIG_X86
        else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
                page = virt_to_page(vcpu->arch.pio_data);
#endif
#ifdef CONFIG_KVM_MMIO
        else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
                page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
#endif
        else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
                /* Index is relative to the start of the dirty ring window. */
                page = kvm_dirty_ring_get_page(
                    &vcpu->dirty_ring,
                    vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
        else
                return kvm_arch_vcpu_fault(vcpu, vmf);
        get_page(page);
        vmf->page = page;
        return 0;
}
3906
/* VMA operations for vCPU fd mappings; pages are supplied on fault. */
static const struct vm_operations_struct kvm_vcpu_vm_ops = {
        .fault = kvm_vcpu_fault,
};
3910
3911 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
3912 {
3913         struct kvm_vcpu *vcpu = file->private_data;
3914         unsigned long pages = vma_pages(vma);
3915
3916         if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
3917              kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
3918             ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED)))
3919                 return -EINVAL;
3920
3921         vma->vm_ops = &kvm_vcpu_vm_ops;
3922         return 0;
3923 }
3924
3925 static int kvm_vcpu_release(struct inode *inode, struct file *filp)
3926 {
3927         struct kvm_vcpu *vcpu = filp->private_data;
3928
3929         kvm_put_kvm(vcpu->kvm);
3930         return 0;
3931 }
3932
/* File operations backing the anonymous inode created for each vCPU. */
static const struct file_operations kvm_vcpu_fops = {
        .release        = kvm_vcpu_release,
        .unlocked_ioctl = kvm_vcpu_ioctl,
        .mmap           = kvm_vcpu_mmap,
        .llseek         = noop_llseek,
        KVM_COMPAT(kvm_vcpu_compat_ioctl),
};
3940
3941 /*
3942  * Allocates an inode for the vcpu.
3943  */
3944 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
3945 {
3946         char name[8 + 1 + ITOA_MAX_LEN + 1];
3947
3948         snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
3949         return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
3950 }
3951
3952 #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
3953 static int vcpu_get_pid(void *data, u64 *val)
3954 {
3955         struct kvm_vcpu *vcpu = data;
3956
3957         rcu_read_lock();
3958         *val = pid_nr(rcu_dereference(vcpu->pid));
3959         rcu_read_unlock();
3960         return 0;
3961 }
3962
3963 DEFINE_SIMPLE_ATTRIBUTE(vcpu_get_pid_fops, vcpu_get_pid, NULL, "%llu\n");
3964
3965 static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
3966 {
3967         struct dentry *debugfs_dentry;
3968         char dir_name[ITOA_MAX_LEN * 2];
3969
3970         if (!debugfs_initialized())
3971                 return;
3972
3973         snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
3974         debugfs_dentry = debugfs_create_dir(dir_name,
3975                                             vcpu->kvm->debugfs_dentry);
3976         debugfs_create_file("pid", 0444, debugfs_dentry, vcpu,
3977                             &vcpu_get_pid_fops);
3978
3979         kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
3980 }
3981 #endif
3982
/*
 * Create a new vCPU with ID @id for @kvm and hand userspace a file descriptor
 * for it.
 *
 * Returns the new file descriptor on success.  On failure a negative errno is
 * returned: -EINVAL if @id is not below KVM_MAX_VCPU_IDS or the VM already
 * has max_vcpus vCPUs, -ENOMEM if an allocation fails, -EEXIST if a vCPU with
 * @id already exists, or the error reported by one of the arch, dirty ring,
 * xarray or fd helpers.
 */
static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
{
        int r;
        struct kvm_vcpu *vcpu;
        struct page *page;

        if (id >= KVM_MAX_VCPU_IDS)
                return -EINVAL;

        mutex_lock(&kvm->lock);
        if (kvm->created_vcpus >= kvm->max_vcpus) {
                mutex_unlock(&kvm->lock);
                return -EINVAL;
        }

        r = kvm_arch_vcpu_precreate(kvm, id);
        if (r) {
                mutex_unlock(&kvm->lock);
                return r;
        }

        /*
         * Account for the new vCPU before dropping the lock; the allocations
         * below run without kvm->lock held.
         */
        kvm->created_vcpus++;
        mutex_unlock(&kvm->lock);

        vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
        if (!vcpu) {
                r = -ENOMEM;
                goto vcpu_decrement;
        }

        /* struct kvm_run must fit in the single page allocated for it. */
        BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
        page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
        if (!page) {
                r = -ENOMEM;
                goto vcpu_free;
        }
        vcpu->run = page_address(page);

        kvm_vcpu_init(vcpu, kvm, id);

        r = kvm_arch_vcpu_create(vcpu);
        if (r)
                goto vcpu_free_run_page;

        /* A dirty ring is only allocated when the VM has one configured. */
        if (kvm->dirty_ring_size) {
                r = kvm_dirty_ring_alloc(&vcpu->dirty_ring,
                                         id, kvm->dirty_ring_size);
                if (r)
                        goto arch_vcpu_destroy;
        }

        mutex_lock(&kvm->lock);

#ifdef CONFIG_LOCKDEP
        /* Ensure that lockdep knows vcpu->mutex is taken *inside* kvm->lock */
        mutex_lock(&vcpu->mutex);
        mutex_unlock(&vcpu->mutex);
#endif

        if (kvm_get_vcpu_by_id(kvm, id)) {
                r = -EEXIST;
                goto unlock_vcpu_destroy;
        }

        /* The index of the new vCPU is the current number of online vCPUs. */
        vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
        r = xa_reserve(&kvm->vcpu_array, vcpu->vcpu_idx, GFP_KERNEL_ACCOUNT);
        if (r)
                goto unlock_vcpu_destroy;

        /* Now it's all set up, let userspace reach it */
        kvm_get_kvm(kvm);
        r = create_vcpu_fd(vcpu);
        if (r < 0)
                goto kvm_put_xa_release;

        /*
         * NOTE(review): if this store fails, the fd returned by
         * create_vcpu_fd() above already exists; confirm whether it needs to
         * be revoked on this error path.
         */
        if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
                r = -EINVAL;
                goto kvm_put_xa_release;
        }

        /*
         * Pairs with smp_rmb() in kvm_get_vcpu.  Store the vcpu
         * pointer before kvm->online_vcpu's incremented value.
         */
        smp_wmb();
        atomic_inc(&kvm->online_vcpus);

        mutex_unlock(&kvm->lock);
        kvm_arch_vcpu_postcreate(vcpu);
        kvm_create_vcpu_debugfs(vcpu);
        return r;

        /*
         * Error unwinding: each label undoes the step that had succeeded
         * before the failure and falls through to the labels below it.
         */
kvm_put_xa_release:
        kvm_put_kvm_no_destroy(kvm);
        xa_release(&kvm->vcpu_array, vcpu->vcpu_idx);
unlock_vcpu_destroy:
        mutex_unlock(&kvm->lock);
        kvm_dirty_ring_free(&vcpu->dirty_ring);
arch_vcpu_destroy:
        kvm_arch_vcpu_destroy(vcpu);
vcpu_free_run_page:
        free_page((unsigned long)vcpu->run);
vcpu_free:
        kmem_cache_free(kvm_vcpu_cache, vcpu);
vcpu_decrement:
        mutex_lock(&kvm->lock);
        kvm->created_vcpus--;
        mutex_unlock(&kvm->lock);
        return r;
}
4096
4097 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
4098 {
4099         if (sigset) {
4100                 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
4101                 vcpu->sigset_active = 1;
4102                 vcpu->sigset = *sigset;
4103         } else
4104                 vcpu->sigset_active = 0;
4105         return 0;
4106 }
4107
4108 static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
4109                               size_t size, loff_t *offset)
4110 {
4111         struct kvm_vcpu *vcpu = file->private_data;
4112
4113         return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
4114                         &kvm_vcpu_stats_desc[0], &vcpu->stat,
4115                         sizeof(vcpu->stat), user_buffer, size, offset);
4116 }
4117
4118 static int kvm_vcpu_stats_release(struct inode *inode, struct file *file)
4119 {
4120         struct kvm_vcpu *vcpu = file->private_data;
4121
4122         kvm_put_kvm(vcpu->kvm);
4123         return 0;
4124 }
4125
/* File operations for the per-vCPU stats file; it has no write handler. */
static const struct file_operations kvm_vcpu_stats_fops = {
        .read = kvm_vcpu_stats_read,
        .release = kvm_vcpu_stats_release,
        .llseek = noop_llseek,
};
4131
4132 static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
4133 {
4134         int fd;
4135         struct file *file;
4136         char name[15 + ITOA_MAX_LEN + 1];
4137
4138         snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);
4139
4140         fd = get_unused_fd_flags(O_CLOEXEC);
4141         if (fd < 0)
4142                 return fd;
4143
4144         file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY);
4145         if (IS_ERR(file)) {
4146                 put_unused_fd(fd);
4147                 return PTR_ERR(file);
4148         }
4149
4150         kvm_get_kvm(vcpu->kvm);
4151
4152         file->f_mode |= FMODE_PREAD;
4153         fd_install(fd, file);
4154
4155         return fd;
4156 }
4157
4158 static long kvm_vcpu_ioctl(struct file *filp,
4159                            unsigned int ioctl, unsigned long arg)
4160 {
4161         struct kvm_vcpu *vcpu = filp->private_data;
4162         void __user *argp = (void __user *)arg;
4163         int r;
4164         struct kvm_fpu *fpu = NULL;
4165         struct kvm_sregs *kvm_sregs = NULL;
4166
4167         if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
4168                 return -EIO;
4169
4170         if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
4171                 return -EINVAL;
4172
4173         /*
4174          * Some architectures have vcpu ioctls that are asynchronous to vcpu
4175          * execution; mutex_lock() would break them.
4176          */
4177         r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
4178         if (r != -ENOIOCTLCMD)
4179                 return r;
4180
4181         if (mutex_lock_killable(&vcpu->mutex))
4182                 return -EINTR;
4183         switch (ioctl) {
4184         case KVM_RUN: {
4185                 struct pid *oldpid;
4186                 r = -EINVAL;
4187                 if (arg)
4188                         goto out;
4189                 oldpid = rcu_access_pointer(vcpu->pid);
4190                 if (unlikely(oldpid != task_pid(current))) {
4191                         /* The thread running this VCPU changed. */
4192                         struct pid *newpid;
4193
4194                         r = kvm_arch_vcpu_run_pid_change(vcpu);
4195                         if (r)
4196                                 break;
4197
4198                         newpid = get_task_pid(current, PIDTYPE_PID);
4199                         rcu_assign_pointer(vcpu->pid, newpid);
4200                         if (oldpid)
4201                                 synchronize_rcu();
4202                         put_pid(oldpid);
4203                 }
4204                 r = kvm_arch_vcpu_ioctl_run(vcpu);
4205                 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
4206                 break;
4207         }
4208         case KVM_GET_REGS: {
4209                 struct kvm_regs *kvm_regs;
4210
4211                 r = -ENOMEM;
4212                 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
4213                 if (!kvm_regs)
4214                         goto out;
4215                 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
4216                 if (r)
4217                         goto out_free1;
4218                 r = -EFAULT;
4219                 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
4220                         goto out_free1;
4221                 r = 0;
4222 out_free1:
4223                 kfree(kvm_regs);
4224                 break;
4225         }
4226         case KVM_SET_REGS: {
4227                 struct kvm_regs *kvm_regs;
4228
4229                 kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
4230                 if (IS_ERR(kvm_regs)) {
4231                         r = PTR_ERR(kvm_regs);
4232                         goto out;
4233                 }
4234                 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
4235                 kfree(kvm_regs);
4236                 break;
4237         }
4238         case KVM_GET_SREGS: {
4239                 kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
4240                                     GFP_KERNEL_ACCOUNT);
4241                 r = -ENOMEM;
4242                 if (!kvm_sregs)
4243                         goto out;
4244                 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
4245                 if (r)
4246                         goto out;
4247                 r = -EFAULT;
4248                 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
4249                         goto out;
4250                 r = 0;
4251                 break;
4252         }
4253         case KVM_SET_SREGS: {
4254                 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
4255                 if (IS_ERR(kvm_sregs)) {
4256                         r = PTR_ERR(kvm_sregs);
4257                         kvm_sregs = NULL;
4258                         goto out;
4259                 }
4260                 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
4261                 break;
4262         }
4263         case KVM_GET_MP_STATE: {
4264                 struct kvm_mp_state mp_state;
4265
4266                 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
4267                 if (r)
4268                         goto out;
4269                 r = -EFAULT;
4270                 if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
4271                         goto out;
4272                 r = 0;
4273                 break;
4274         }
4275         case KVM_SET_MP_STATE: {
4276                 struct kvm_mp_state mp_state;
4277
4278                 r = -EFAULT;
4279                 if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
4280                         goto out;
4281                 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
4282                 break;
4283         }
4284         case KVM_TRANSLATE: {
4285                 struct kvm_translation tr;
4286
4287                 r = -EFAULT;
4288                 if (copy_from_user(&tr, argp, sizeof(tr)))
4289                         goto out;
4290                 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
4291                 if (r)
4292                         goto out;
4293                 r = -EFAULT;
4294                 if (copy_to_user(argp, &tr, sizeof(tr)))
4295                         goto out;
4296                 r = 0;
4297                 break;
4298         }
4299         case KVM_SET_GUEST_DEBUG: {
4300                 struct kvm_guest_debug dbg;
4301
4302                 r = -EFAULT;
4303                 if (copy_from_user(&dbg, argp, sizeof(dbg)))
4304                         goto out;
4305                 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
4306                 break;
4307         }
4308         case KVM_SET_SIGNAL_MASK: {
4309                 struct kvm_signal_mask __user *sigmask_arg = argp;
4310                 struct kvm_signal_mask kvm_sigmask;
4311                 sigset_t sigset, *p;
4312
4313                 p = NULL;
4314                 if (argp) {
4315                         r = -EFAULT;
4316                         if (copy_from_user(&kvm_sigmask, argp,
4317                                            sizeof(kvm_sigmask)))
4318                                 goto out;
4319                         r = -EINVAL;
4320                         if (kvm_sigmask.len != sizeof(sigset))
4321                                 goto out;
4322                         r = -EFAULT;
4323                         if (copy_from_user(&sigset, sigmask_arg->sigset,
4324                                            sizeof(sigset)))
4325                                 goto out;
4326                         p = &sigset;
4327                 }
4328                 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
4329                 break;
4330         }
4331         case KVM_GET_FPU: {
4332                 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
4333                 r = -ENOMEM;
4334                 if (!fpu)
4335                         goto out;
4336                 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
4337                 if (r)
4338                         goto out;
4339                 r = -EFAULT;
4340                 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
4341                         goto out;
4342                 r = 0;
4343                 break;
4344         }
4345         case KVM_SET_FPU: {
4346                 fpu = memdup_user(argp, sizeof(*fpu));
4347                 if (IS_ERR(fpu)) {
4348                         r = PTR_ERR(fpu);
4349                         fpu = NULL;
4350                         goto out;
4351                 }
4352                 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
4353                 break;
4354         }
4355         case KVM_GET_STATS_FD: {
4356                 r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
4357                 break;
4358         }
4359         default:
4360                 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
4361         }
4362 out:
4363         mutex_unlock(&vcpu->mutex);
4364         kfree(fpu);
4365         kfree(kvm_sregs);
4366         return r;
4367 }
4368
4369 #ifdef CONFIG_KVM_COMPAT
4370 static long kvm_vcpu_compat_ioctl(struct file *filp,
4371                                   unsigned int ioctl, unsigned long arg)
4372 {
4373         struct kvm_vcpu *vcpu = filp->private_data;
4374         void __user *argp = compat_ptr(arg);
4375         int r;
4376
4377         if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
4378                 return -EIO;
4379
4380         switch (ioctl) {
4381         case KVM_SET_SIGNAL_MASK: {
4382                 struct kvm_signal_mask __user *sigmask_arg = argp;
4383                 struct kvm_signal_mask kvm_sigmask;
4384                 sigset_t sigset;
4385
4386                 if (argp) {
4387                         r = -EFAULT;
4388                         if (copy_from_user(&kvm_sigmask, argp,
4389                                            sizeof(kvm_sigmask)))
4390                                 goto out;
4391                         r = -EINVAL;
4392                         if (kvm_sigmask.len != sizeof(compat_sigset_t))
4393                                 goto out;
4394                         r = -EFAULT;
4395                         if (get_compat_sigset(&sigset,
4396                                               (compat_sigset_t __user *)sigmask_arg->sigset))
4397                                 goto out;
4398                         r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
4399                 } else
4400                         r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
4401                 break;
4402         }
4403         default:
4404                 r = kvm_vcpu_ioctl(filp, ioctl, arg);
4405         }
4406
4407 out:
4408         return r;
4409 }
4410 #endif
4411
4412 static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
4413 {
4414         struct kvm_device *dev = filp->private_data;
4415
4416         if (dev->ops->mmap)
4417                 return dev->ops->mmap(dev, vma);
4418
4419         return -ENODEV;
4420 }
4421
4422 static int kvm_device_ioctl_attr(struct kvm_device *dev,
4423                                  int (*accessor)(struct kvm_device *dev,
4424                                                  struct kvm_device_attr *attr),
4425                                  unsigned long arg)
4426 {
4427         struct kvm_device_attr attr;
4428
4429         if (!accessor)
4430                 return -EPERM;
4431
4432         if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
4433                 return -EFAULT;
4434
4435         return accessor(dev, &attr);
4436 }
4437
4438 static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
4439                              unsigned long arg)
4440 {
4441         struct kvm_device *dev = filp->private_data;
4442
4443         if (dev->kvm->mm != current->mm || dev->kvm->vm_dead)
4444                 return -EIO;
4445
4446         switch (ioctl) {
4447         case KVM_SET_DEVICE_ATTR:
4448                 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
4449         case KVM_GET_DEVICE_ATTR:
4450                 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
4451         case KVM_HAS_DEVICE_ATTR:
4452                 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
4453         default:
4454                 if (dev->ops->ioctl)
4455                         return dev->ops->ioctl(dev, ioctl, arg);
4456
4457                 return -ENOTTY;
4458         }
4459 }
4460
4461 static int kvm_device_release(struct inode *inode, struct file *filp)
4462 {
4463         struct kvm_device *dev = filp->private_data;
4464         struct kvm *kvm = dev->kvm;
4465
4466         if (dev->ops->release) {
4467                 mutex_lock(&kvm->lock);
4468                 list_del(&dev->vm_node);
4469                 dev->ops->release(dev);
4470                 mutex_unlock(&kvm->lock);
4471         }
4472
4473         kvm_put_kvm(kvm);
4474         return 0;
4475 }
4476
4477 static const struct file_operations kvm_device_fops = {
4478         .unlocked_ioctl = kvm_device_ioctl,
4479         .release = kvm_device_release,
4480         KVM_COMPAT(kvm_device_ioctl),
4481         .mmap = kvm_device_mmap,
4482 };
4483
4484 struct kvm_device *kvm_device_from_filp(struct file *filp)
4485 {
4486         if (filp->f_op != &kvm_device_fops)
4487                 return NULL;
4488
4489         return filp->private_data;
4490 }
4491
4492 static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
4493 #ifdef CONFIG_KVM_MPIC
4494         [KVM_DEV_TYPE_FSL_MPIC_20]      = &kvm_mpic_ops,
4495         [KVM_DEV_TYPE_FSL_MPIC_42]      = &kvm_mpic_ops,
4496 #endif
4497 };
4498
4499 int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
4500 {
4501         if (type >= ARRAY_SIZE(kvm_device_ops_table))
4502                 return -ENOSPC;
4503
4504         if (kvm_device_ops_table[type] != NULL)
4505                 return -EEXIST;
4506
4507         kvm_device_ops_table[type] = ops;
4508         return 0;
4509 }
4510
4511 void kvm_unregister_device_ops(u32 type)
4512 {
4513         if (kvm_device_ops_table[type] != NULL)
4514                 kvm_device_ops_table[type] = NULL;
4515 }
4516
4517 static int kvm_ioctl_create_device(struct kvm *kvm,
4518                                    struct kvm_create_device *cd)
4519 {
4520         const struct kvm_device_ops *ops;
4521         struct kvm_device *dev;
4522         bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
4523         int type;
4524         int ret;
4525
4526         if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
4527                 return -ENODEV;
4528
4529         type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
4530         ops = kvm_device_ops_table[type];
4531         if (ops == NULL)
4532                 return -ENODEV;
4533
4534         if (test)
4535                 return 0;
4536
4537         dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
4538         if (!dev)
4539                 return -ENOMEM;
4540
4541         dev->ops = ops;
4542         dev->kvm = kvm;
4543
4544         mutex_lock(&kvm->lock);
4545         ret = ops->create(dev, type);
4546         if (ret < 0) {
4547                 mutex_unlock(&kvm->lock);
4548                 kfree(dev);
4549                 return ret;
4550         }
4551         list_add(&dev->vm_node, &kvm->devices);
4552         mutex_unlock(&kvm->lock);
4553
4554         if (ops->init)
4555                 ops->init(dev);
4556
4557         kvm_get_kvm(kvm);
4558         ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
4559         if (ret < 0) {
4560                 kvm_put_kvm_no_destroy(kvm);
4561                 mutex_lock(&kvm->lock);
4562                 list_del(&dev->vm_node);
4563                 if (ops->release)
4564                         ops->release(dev);
4565                 mutex_unlock(&kvm->lock);
4566                 if (ops->destroy)
4567                         ops->destroy(dev);
4568                 return ret;
4569         }
4570
4571         cd->fd = ret;
4572         return 0;
4573 }
4574
4575 static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
4576 {
4577         switch (arg) {
4578         case KVM_CAP_USER_MEMORY:
4579         case KVM_CAP_USER_MEMORY2:
4580         case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
4581         case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
4582         case KVM_CAP_INTERNAL_ERROR_DATA:
4583 #ifdef CONFIG_HAVE_KVM_MSI
4584         case KVM_CAP_SIGNAL_MSI:
4585 #endif
4586 #ifdef CONFIG_HAVE_KVM_IRQFD
4587         case KVM_CAP_IRQFD:
4588 #endif
4589         case KVM_CAP_IOEVENTFD_ANY_LENGTH:
4590         case KVM_CAP_CHECK_EXTENSION_VM:
4591         case KVM_CAP_ENABLE_CAP_VM:
4592         case KVM_CAP_HALT_POLL:
4593                 return 1;
4594 #ifdef CONFIG_KVM_MMIO
4595         case KVM_CAP_COALESCED_MMIO:
4596                 return KVM_COALESCED_MMIO_PAGE_OFFSET;
4597         case KVM_CAP_COALESCED_PIO:
4598                 return 1;
4599 #endif
4600 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4601         case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
4602                 return KVM_DIRTY_LOG_MANUAL_CAPS;
4603 #endif
4604 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4605         case KVM_CAP_IRQ_ROUTING:
4606                 return KVM_MAX_IRQ_ROUTES;
4607 #endif
4608 #if KVM_ADDRESS_SPACE_NUM > 1
4609         case KVM_CAP_MULTI_ADDRESS_SPACE:
4610                 return KVM_ADDRESS_SPACE_NUM;
4611 #endif
4612         case KVM_CAP_NR_MEMSLOTS:
4613                 return KVM_USER_MEM_SLOTS;
4614         case KVM_CAP_DIRTY_LOG_RING:
4615 #ifdef CONFIG_HAVE_KVM_DIRTY_RING_TSO
4616                 return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4617 #else
4618                 return 0;
4619 #endif
4620         case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
4621 #ifdef CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL
4622                 return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4623 #else
4624                 return 0;
4625 #endif
4626 #ifdef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP
4627         case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP:
4628 #endif
4629         case KVM_CAP_BINARY_STATS_FD:
4630         case KVM_CAP_SYSTEM_EVENT_DATA:
4631                 return 1;
4632         default:
4633                 break;
4634         }
4635         return kvm_vm_ioctl_check_extension(kvm, arg);
4636 }
4637
4638 static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
4639 {
4640         int r;
4641
4642         if (!KVM_DIRTY_LOG_PAGE_OFFSET)
4643                 return -EINVAL;
4644
4645         /* the size should be power of 2 */
4646         if (!size || (size & (size - 1)))
4647                 return -EINVAL;
4648
4649         /* Should be bigger to keep the reserved entries, or a page */
4650         if (size < kvm_dirty_ring_get_rsvd_entries() *
4651             sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
4652                 return -EINVAL;
4653
4654         if (size > KVM_DIRTY_RING_MAX_ENTRIES *
4655             sizeof(struct kvm_dirty_gfn))
4656                 return -E2BIG;
4657
4658         /* We only allow it to set once */
4659         if (kvm->dirty_ring_size)
4660                 return -EINVAL;
4661
4662         mutex_lock(&kvm->lock);
4663
4664         if (kvm->created_vcpus) {
4665                 /* We don't allow to change this value after vcpu created */
4666                 r = -EINVAL;
4667         } else {
4668                 kvm->dirty_ring_size = size;
4669                 r = 0;
4670         }
4671
4672         mutex_unlock(&kvm->lock);
4673         return r;
4674 }
4675
4676 static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
4677 {
4678         unsigned long i;
4679         struct kvm_vcpu *vcpu;
4680         int cleared = 0;
4681
4682         if (!kvm->dirty_ring_size)
4683                 return -EINVAL;
4684
4685         mutex_lock(&kvm->slots_lock);
4686
4687         kvm_for_each_vcpu(i, vcpu, kvm)
4688                 cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
4689
4690         mutex_unlock(&kvm->slots_lock);
4691
4692         if (cleared)
4693                 kvm_flush_remote_tlbs(kvm);
4694
4695         return cleared;
4696 }
4697
/*
 * kvm_vm_ioctl_enable_cap() - weak default for non-generic VM capabilities.
 * @kvm: VM the capability would be enabled on (unused here).
 * @cap: capability request (unused here).
 *
 * kvm_vm_ioctl_enable_cap_generic() calls this for every capability it does
 * not handle itself.  This weak definition accepts none of them.
 *
 * Return: always -EINVAL.
 */
int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
						  struct kvm_enable_cap *cap)
{
	return -EINVAL;
}
4703
4704 bool kvm_are_all_memslots_empty(struct kvm *kvm)
4705 {
4706         int i;
4707
4708         lockdep_assert_held(&kvm->slots_lock);
4709
4710         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
4711                 if (!kvm_memslots_empty(__kvm_memslots(kvm, i)))
4712                         return false;
4713         }
4714
4715         return true;
4716 }
4717 EXPORT_SYMBOL_GPL(kvm_are_all_memslots_empty);
4718
4719 static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
4720                                            struct kvm_enable_cap *cap)
4721 {
4722         switch (cap->cap) {
4723 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4724         case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
4725                 u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
4726
4727                 if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
4728                         allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
4729
4730                 if (cap->flags || (cap->args[0] & ~allowed_options))
4731                         return -EINVAL;
4732                 kvm->manual_dirty_log_protect = cap->args[0];
4733                 return 0;
4734         }
4735 #endif
4736         case KVM_CAP_HALT_POLL: {
4737                 if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
4738                         return -EINVAL;
4739
4740                 kvm->max_halt_poll_ns = cap->args[0];
4741
4742                 /*
4743                  * Ensure kvm->override_halt_poll_ns does not become visible
4744                  * before kvm->max_halt_poll_ns.
4745                  *
4746                  * Pairs with the smp_rmb() in kvm_vcpu_max_halt_poll_ns().
4747                  */
4748                 smp_wmb();
4749                 kvm->override_halt_poll_ns = true;
4750
4751                 return 0;
4752         }
4753         case KVM_CAP_DIRTY_LOG_RING:
4754         case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
4755                 if (!kvm_vm_ioctl_check_extension_generic(kvm, cap->cap))
4756                         return -EINVAL;
4757
4758                 return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
4759         case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP: {
4760                 int r = -EINVAL;
4761
4762                 if (!IS_ENABLED(CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP) ||
4763                     !kvm->dirty_ring_size || cap->flags)
4764                         return r;
4765
4766                 mutex_lock(&kvm->slots_lock);
4767
4768                 /*
4769                  * For simplicity, allow enabling ring+bitmap if and only if
4770                  * there are no memslots, e.g. to ensure all memslots allocate
4771                  * a bitmap after the capability is enabled.
4772                  */
4773                 if (kvm_are_all_memslots_empty(kvm)) {
4774                         kvm->dirty_ring_with_bitmap = true;
4775                         r = 0;
4776                 }
4777
4778                 mutex_unlock(&kvm->slots_lock);
4779
4780                 return r;
4781         }
4782         default:
4783                 return kvm_vm_ioctl_enable_cap(kvm, cap);
4784         }
4785 }
4786
4787 static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
4788                               size_t size, loff_t *offset)
4789 {
4790         struct kvm *kvm = file->private_data;
4791
4792         return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header,
4793                                 &kvm_vm_stats_desc[0], &kvm->stat,
4794                                 sizeof(kvm->stat), user_buffer, size, offset);
4795 }
4796
4797 static int kvm_vm_stats_release(struct inode *inode, struct file *file)
4798 {
4799         struct kvm *kvm = file->private_data;
4800
4801         kvm_put_kvm(kvm);
4802         return 0;
4803 }
4804
4805 static const struct file_operations kvm_vm_stats_fops = {
4806         .read = kvm_vm_stats_read,
4807         .release = kvm_vm_stats_release,
4808         .llseek = noop_llseek,
4809 };
4810
4811 static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
4812 {
4813         int fd;
4814         struct file *file;
4815
4816         fd = get_unused_fd_flags(O_CLOEXEC);
4817         if (fd < 0)
4818                 return fd;
4819
4820         file = anon_inode_getfile("kvm-vm-stats",
4821                         &kvm_vm_stats_fops, kvm, O_RDONLY);
4822         if (IS_ERR(file)) {
4823                 put_unused_fd(fd);
4824                 return PTR_ERR(file);
4825         }
4826
4827         kvm_get_kvm(kvm);
4828
4829         file->f_mode |= FMODE_PREAD;
4830         fd_install(fd, file);
4831
4832         return fd;
4833 }
4834
/*
 * Break the build unless @field has the same size and the same offset in
 * struct kvm_userspace_memory_region and struct kvm_userspace_memory_region2.
 */
#define SANITY_CHECK_MEM_REGION_FIELD(field)					\
do {										\
	BUILD_BUG_ON(sizeof_field(struct kvm_userspace_memory_region, field) !=	\
		     sizeof_field(struct kvm_userspace_memory_region2, field));	\
	BUILD_BUG_ON(offsetof(struct kvm_userspace_memory_region, field) !=	\
		     offsetof(struct kvm_userspace_memory_region2, field));	\
} while (0)
4842
4843 static long kvm_vm_ioctl(struct file *filp,
4844                            unsigned int ioctl, unsigned long arg)
4845 {
4846         struct kvm *kvm = filp->private_data;
4847         void __user *argp = (void __user *)arg;
4848         int r;
4849
4850         if (kvm->mm != current->mm || kvm->vm_dead)
4851                 return -EIO;
4852         switch (ioctl) {
4853         case KVM_CREATE_VCPU:
4854                 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
4855                 break;
4856         case KVM_ENABLE_CAP: {
4857                 struct kvm_enable_cap cap;
4858
4859                 r = -EFAULT;
4860                 if (copy_from_user(&cap, argp, sizeof(cap)))
4861                         goto out;
4862                 r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
4863                 break;
4864         }
4865         case KVM_SET_USER_MEMORY_REGION2:
4866         case KVM_SET_USER_MEMORY_REGION: {
4867                 struct kvm_userspace_memory_region2 mem;
4868                 unsigned long size;
4869
4870                 if (ioctl == KVM_SET_USER_MEMORY_REGION) {
4871                         /*
4872                          * Fields beyond struct kvm_userspace_memory_region shouldn't be
4873                          * accessed, but avoid leaking kernel memory in case of a bug.
4874                          */
4875                         memset(&mem, 0, sizeof(mem));
4876                         size = sizeof(struct kvm_userspace_memory_region);
4877                 } else {
4878                         size = sizeof(struct kvm_userspace_memory_region2);
4879                 }
4880
4881                 /* Ensure the common parts of the two structs are identical. */
4882                 SANITY_CHECK_MEM_REGION_FIELD(slot);
4883                 SANITY_CHECK_MEM_REGION_FIELD(flags);
4884                 SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr);
4885                 SANITY_CHECK_MEM_REGION_FIELD(memory_size);
4886                 SANITY_CHECK_MEM_REGION_FIELD(userspace_addr);
4887
4888                 r = -EFAULT;
4889                 if (copy_from_user(&mem, argp, size))
4890                         goto out;
4891
4892                 r = -EINVAL;
4893                 if (ioctl == KVM_SET_USER_MEMORY_REGION &&
4894                     (mem.flags & ~KVM_SET_USER_MEMORY_REGION_V1_FLAGS))
4895                         goto out;
4896
4897                 r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
4898                 break;
4899         }
4900         case KVM_GET_DIRTY_LOG: {
4901                 struct kvm_dirty_log log;
4902
4903                 r = -EFAULT;
4904                 if (copy_from_user(&log, argp, sizeof(log)))
4905                         goto out;
4906                 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
4907                 break;
4908         }
4909 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4910         case KVM_CLEAR_DIRTY_LOG: {
4911                 struct kvm_clear_dirty_log log;
4912
4913                 r = -EFAULT;
4914                 if (copy_from_user(&log, argp, sizeof(log)))
4915                         goto out;
4916                 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4917                 break;
4918         }
4919 #endif
4920 #ifdef CONFIG_KVM_MMIO
4921         case KVM_REGISTER_COALESCED_MMIO: {
4922                 struct kvm_coalesced_mmio_zone zone;
4923
4924                 r = -EFAULT;
4925                 if (copy_from_user(&zone, argp, sizeof(zone)))
4926                         goto out;
4927                 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
4928                 break;
4929         }
4930         case KVM_UNREGISTER_COALESCED_MMIO: {
4931                 struct kvm_coalesced_mmio_zone zone;
4932
4933                 r = -EFAULT;
4934                 if (copy_from_user(&zone, argp, sizeof(zone)))
4935                         goto out;
4936                 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
4937                 break;
4938         }
4939 #endif
4940         case KVM_IRQFD: {
4941                 struct kvm_irqfd data;
4942
4943                 r = -EFAULT;
4944                 if (copy_from_user(&data, argp, sizeof(data)))
4945                         goto out;
4946                 r = kvm_irqfd(kvm, &data);
4947                 break;
4948         }
4949         case KVM_IOEVENTFD: {
4950                 struct kvm_ioeventfd data;
4951
4952                 r = -EFAULT;
4953                 if (copy_from_user(&data, argp, sizeof(data)))
4954                         goto out;
4955                 r = kvm_ioeventfd(kvm, &data);
4956                 break;
4957         }
4958 #ifdef CONFIG_HAVE_KVM_MSI
4959         case KVM_SIGNAL_MSI: {
4960                 struct kvm_msi msi;
4961
4962                 r = -EFAULT;
4963                 if (copy_from_user(&msi, argp, sizeof(msi)))
4964                         goto out;
4965                 r = kvm_send_userspace_msi(kvm, &msi);
4966                 break;
4967         }
4968 #endif
4969 #ifdef __KVM_HAVE_IRQ_LINE
4970         case KVM_IRQ_LINE_STATUS:
4971         case KVM_IRQ_LINE: {
4972                 struct kvm_irq_level irq_event;
4973
4974                 r = -EFAULT;
4975                 if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
4976                         goto out;
4977
4978                 r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
4979                                         ioctl == KVM_IRQ_LINE_STATUS);
4980                 if (r)
4981                         goto out;
4982
4983                 r = -EFAULT;
4984                 if (ioctl == KVM_IRQ_LINE_STATUS) {
4985                         if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
4986                                 goto out;
4987                 }
4988
4989                 r = 0;
4990                 break;
4991         }
4992 #endif
4993 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4994         case KVM_SET_GSI_ROUTING: {
4995                 struct kvm_irq_routing routing;
4996                 struct kvm_irq_routing __user *urouting;
4997                 struct kvm_irq_routing_entry *entries = NULL;
4998
4999                 r = -EFAULT;
5000                 if (copy_from_user(&routing, argp, sizeof(routing)))
5001                         goto out;
5002                 r = -EINVAL;
5003                 if (!kvm_arch_can_set_irq_routing(kvm))
5004                         goto out;
5005                 if (routing.nr > KVM_MAX_IRQ_ROUTES)
5006                         goto out;
5007                 if (routing.flags)
5008                         goto out;
5009                 if (routing.nr) {
5010                         urouting = argp;
5011                         entries = vmemdup_user(urouting->entries,
5012                                                array_size(sizeof(*entries),
5013                                                           routing.nr));
5014                         if (IS_ERR(entries)) {
5015                                 r = PTR_ERR(entries);
5016                                 goto out;
5017                         }
5018                 }
5019                 r = kvm_set_irq_routing(kvm, entries, routing.nr,
5020                                         routing.flags);
5021                 kvfree(entries);
5022                 break;
5023         }
5024 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
5025         case KVM_CREATE_DEVICE: {
5026                 struct kvm_create_device cd;
5027
5028                 r = -EFAULT;
5029                 if (copy_from_user(&cd, argp, sizeof(cd)))
5030                         goto out;
5031
5032                 r = kvm_ioctl_create_device(kvm, &cd);
5033                 if (r)
5034                         goto out;
5035
5036                 r = -EFAULT;
5037                 if (copy_to_user(argp, &cd, sizeof(cd)))
5038                         goto out;
5039
5040                 r = 0;
5041                 break;
5042         }
5043         case KVM_CHECK_EXTENSION:
5044                 r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
5045                 break;
5046         case KVM_RESET_DIRTY_RINGS:
5047                 r = kvm_vm_ioctl_reset_dirty_pages(kvm);
5048                 break;
5049         case KVM_GET_STATS_FD:
5050                 r = kvm_vm_ioctl_get_stats_fd(kvm);
5051                 break;
5052         default:
5053                 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
5054         }
5055 out:
5056         return r;
5057 }
5058
5059 #ifdef CONFIG_KVM_COMPAT
/*
 * KVM_GET_DIRTY_LOG argument as laid out by a compat task: the bitmap
 * address is a compat_uptr_t, unioned with a 64-bit pad.  Read from
 * userspace by kvm_vm_compat_ioctl().
 */
struct compat_kvm_dirty_log {
        __u32 slot;
        __u32 padding1;
        union {
                compat_uptr_t dirty_bitmap; /* one bit per page */
                __u64 padding2;
        };
};
5068
/*
 * KVM_CLEAR_DIRTY_LOG argument as laid out by a compat task.
 * kvm_vm_compat_ioctl() copies the fields into a struct kvm_clear_dirty_log
 * and converts dirty_bitmap with compat_ptr().
 */
struct compat_kvm_clear_dirty_log {
        __u32 slot;
        __u32 num_pages;
        __u64 first_page;
        union {
                compat_uptr_t dirty_bitmap; /* one bit per page */
                __u64 padding2;
        };
};
5078
/*
 * Weak default for the arch-specific compat VM ioctl hook.  It handles
 * nothing and always returns -ENOTTY, which kvm_vm_compat_ioctl() treats as
 * "not handled" and follows up with the generic compat processing.
 */
long __weak kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
                                     unsigned long arg)
{
        return -ENOTTY;
}
5084
5085 static long kvm_vm_compat_ioctl(struct file *filp,
5086                            unsigned int ioctl, unsigned long arg)
5087 {
5088         struct kvm *kvm = filp->private_data;
5089         int r;
5090
5091         if (kvm->mm != current->mm || kvm->vm_dead)
5092                 return -EIO;
5093
5094         r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg);
5095         if (r != -ENOTTY)
5096                 return r;
5097
5098         switch (ioctl) {
5099 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
5100         case KVM_CLEAR_DIRTY_LOG: {
5101                 struct compat_kvm_clear_dirty_log compat_log;
5102                 struct kvm_clear_dirty_log log;
5103
5104                 if (copy_from_user(&compat_log, (void __user *)arg,
5105                                    sizeof(compat_log)))
5106                         return -EFAULT;
5107                 log.slot         = compat_log.slot;
5108                 log.num_pages    = compat_log.num_pages;
5109                 log.first_page   = compat_log.first_page;
5110                 log.padding2     = compat_log.padding2;
5111                 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
5112
5113                 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
5114                 break;
5115         }
5116 #endif
5117         case KVM_GET_DIRTY_LOG: {
5118                 struct compat_kvm_dirty_log compat_log;
5119                 struct kvm_dirty_log log;
5120
5121                 if (copy_from_user(&compat_log, (void __user *)arg,
5122                                    sizeof(compat_log)))
5123                         return -EFAULT;
5124                 log.slot         = compat_log.slot;
5125                 log.padding1     = compat_log.padding1;
5126                 log.padding2     = compat_log.padding2;
5127                 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
5128
5129                 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
5130                 break;
5131         }
5132         default:
5133                 r = kvm_vm_ioctl(filp, ioctl, arg);
5134         }
5135         return r;
5136 }
5137 #endif
5138
/*
 * File operations installed on the anonymous "kvm-vm" file created by
 * kvm_dev_ioctl_create_vm().  file_is_kvm() recognises VM files by comparing
 * their f_op pointer against this table.
 */
static const struct file_operations kvm_vm_fops = {
        .release        = kvm_vm_release,
        .unlocked_ioctl = kvm_vm_ioctl,
        .llseek         = noop_llseek,
        KVM_COMPAT(kvm_vm_compat_ioctl),
};
5145
5146 bool file_is_kvm(struct file *file)
5147 {
5148         return file && file->f_op == &kvm_vm_fops;
5149 }
5150 EXPORT_SYMBOL_GPL(file_is_kvm);
5151
5152 static int kvm_dev_ioctl_create_vm(unsigned long type)
5153 {
5154         char fdname[ITOA_MAX_LEN + 1];
5155         int r, fd;
5156         struct kvm *kvm;
5157         struct file *file;
5158
5159         fd = get_unused_fd_flags(O_CLOEXEC);
5160         if (fd < 0)
5161                 return fd;
5162
5163         snprintf(fdname, sizeof(fdname), "%d", fd);
5164
5165         kvm = kvm_create_vm(type, fdname);
5166         if (IS_ERR(kvm)) {
5167                 r = PTR_ERR(kvm);
5168                 goto put_fd;
5169         }
5170
5171         file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
5172         if (IS_ERR(file)) {
5173                 r = PTR_ERR(file);
5174                 goto put_kvm;
5175         }
5176
5177         /*
5178          * Don't call kvm_put_kvm anymore at this point; file->f_op is
5179          * already set, with ->release() being kvm_vm_release().  In error
5180          * cases it will be called by the final fput(file) and will take
5181          * care of doing kvm_put_kvm(kvm).
5182          */
5183         kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
5184
5185         fd_install(fd, file);
5186         return fd;
5187
5188 put_kvm:
5189         kvm_put_kvm(kvm);
5190 put_fd:
5191         put_unused_fd(fd);
5192         return r;
5193 }
5194
5195 static long kvm_dev_ioctl(struct file *filp,
5196                           unsigned int ioctl, unsigned long arg)
5197 {
5198         int r = -EINVAL;
5199
5200         switch (ioctl) {
5201         case KVM_GET_API_VERSION:
5202                 if (arg)
5203                         goto out;
5204                 r = KVM_API_VERSION;
5205                 break;
5206         case KVM_CREATE_VM:
5207                 r = kvm_dev_ioctl_create_vm(arg);
5208                 break;
5209         case KVM_CHECK_EXTENSION:
5210                 r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
5211                 break;
5212         case KVM_GET_VCPU_MMAP_SIZE:
5213                 if (arg)
5214                         goto out;
5215                 r = PAGE_SIZE;     /* struct kvm_run */
5216 #ifdef CONFIG_X86
5217                 r += PAGE_SIZE;    /* pio data page */
5218 #endif
5219 #ifdef CONFIG_KVM_MMIO
5220                 r += PAGE_SIZE;    /* coalesced mmio ring page */
5221 #endif
5222                 break;
5223         case KVM_TRACE_ENABLE:
5224         case KVM_TRACE_PAUSE:
5225         case KVM_TRACE_DISABLE:
5226                 r = -EOPNOTSUPP;
5227                 break;
5228         default:
5229                 return kvm_arch_dev_ioctl(filp, ioctl, arg);
5230         }
5231 out:
5232         return r;
5233 }
5234
/*
 * File operations of the KVM character device (referenced by kvm_dev).
 * kvm_dev_ioctl() is used for the native ioctl entry and is also what is
 * handed to KVM_COMPAT().
 */
static struct file_operations kvm_chardev_ops = {
        .unlocked_ioctl = kvm_dev_ioctl,
        .llseek         = noop_llseek,
        KVM_COMPAT(kvm_dev_ioctl),
};
5240
5241 static struct miscdevice kvm_dev = {
5242         KVM_MINOR,
5243         "kvm",
5244         &kvm_chardev_ops,
5245 };
5246
5247 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
/* Set to true by kvm_shutdown() just before it disables hardware on all CPUs. */
__visible bool kvm_rebooting;
EXPORT_SYMBOL_GPL(kvm_rebooting);

/*
 * Per-CPU flag: set once kvm_arch_hardware_enable() has succeeded on this CPU
 * and cleared again by hardware_disable_nolock().
 */
static DEFINE_PER_CPU(bool, hardware_enabled);
/*
 * Number of outstanding hardware_enable_all() users.  Incremented and
 * decremented with kvm_lock held; hardware is enabled on the 0 -> 1
 * transition and disabled on the 1 -> 0 transition.
 */
static int kvm_usage_count;
5253
5254 static int __hardware_enable_nolock(void)
5255 {
5256         if (__this_cpu_read(hardware_enabled))
5257                 return 0;
5258
5259         if (kvm_arch_hardware_enable()) {
5260                 pr_info("kvm: enabling virtualization on CPU%d failed\n",
5261                         raw_smp_processor_id());
5262                 return -EIO;
5263         }
5264
5265         __this_cpu_write(hardware_enabled, true);
5266         return 0;
5267 }
5268
5269 static void hardware_enable_nolock(void *failed)
5270 {
5271         if (__hardware_enable_nolock())
5272                 atomic_inc(failed);
5273 }
5274
5275 static int kvm_online_cpu(unsigned int cpu)
5276 {
5277         int ret = 0;
5278
5279         /*
5280          * Abort the CPU online process if hardware virtualization cannot
5281          * be enabled. Otherwise running VMs would encounter unrecoverable
5282          * errors when scheduled to this CPU.
5283          */
5284         mutex_lock(&kvm_lock);
5285         if (kvm_usage_count)
5286                 ret = __hardware_enable_nolock();
5287         mutex_unlock(&kvm_lock);
5288         return ret;
5289 }
5290
5291 static void hardware_disable_nolock(void *junk)
5292 {
5293         /*
5294          * Note, hardware_disable_all_nolock() tells all online CPUs to disable
5295          * hardware, not just CPUs that successfully enabled hardware!
5296          */
5297         if (!__this_cpu_read(hardware_enabled))
5298                 return;
5299
5300         kvm_arch_hardware_disable();
5301
5302         __this_cpu_write(hardware_enabled, false);
5303 }
5304
/*
 * CPU offline callback.  With kvm_lock held, disable hardware virtualization
 * on this CPU if KVM currently has users.  Always returns 0.
 */
static int kvm_offline_cpu(unsigned int cpu)
{
        mutex_lock(&kvm_lock);
        if (kvm_usage_count)
                hardware_disable_nolock(NULL);
        mutex_unlock(&kvm_lock);
        return 0;
}
5313
5314 static void hardware_disable_all_nolock(void)
5315 {
5316         BUG_ON(!kvm_usage_count);
5317
5318         kvm_usage_count--;
5319         if (!kvm_usage_count)
5320                 on_each_cpu(hardware_disable_nolock, NULL, 1);
5321 }
5322
/*
 * Locked wrapper around hardware_disable_all_nolock(): takes the CPU hotplug
 * read lock and then kvm_lock, the same order hardware_enable_all() uses.
 */
static void hardware_disable_all(void)
{
        cpus_read_lock();
        mutex_lock(&kvm_lock);
        hardware_disable_all_nolock();
        mutex_unlock(&kvm_lock);
        cpus_read_unlock();
}
5331
5332 static int hardware_enable_all(void)
5333 {
5334         atomic_t failed = ATOMIC_INIT(0);
5335         int r;
5336
5337         /*
5338          * Do not enable hardware virtualization if the system is going down.
5339          * If userspace initiated a forced reboot, e.g. reboot -f, then it's
5340          * possible for an in-flight KVM_CREATE_VM to trigger hardware enabling
5341          * after kvm_reboot() is called.  Note, this relies on system_state
5342          * being set _before_ kvm_reboot(), which is why KVM uses a syscore ops
5343          * hook instead of registering a dedicated reboot notifier (the latter
5344          * runs before system_state is updated).
5345          */
5346         if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
5347             system_state == SYSTEM_RESTART)
5348                 return -EBUSY;
5349
5350         /*
5351          * When onlining a CPU, cpu_online_mask is set before kvm_online_cpu()
5352          * is called, and so on_each_cpu() between them includes the CPU that
5353          * is being onlined.  As a result, hardware_enable_nolock() may get
5354          * invoked before kvm_online_cpu(), which also enables hardware if the
5355          * usage count is non-zero.  Disable CPU hotplug to avoid attempting to
5356          * enable hardware multiple times.
5357          */
5358         cpus_read_lock();
5359         mutex_lock(&kvm_lock);
5360
5361         r = 0;
5362
5363         kvm_usage_count++;
5364         if (kvm_usage_count == 1) {
5365                 on_each_cpu(hardware_enable_nolock, &failed, 1);
5366
5367                 if (atomic_read(&failed)) {
5368                         hardware_disable_all_nolock();
5369                         r = -EBUSY;
5370                 }
5371         }
5372
5373         mutex_unlock(&kvm_lock);
5374         cpus_read_unlock();
5375
5376         return r;
5377 }
5378
/*
 * syscore ->shutdown callback: flag that KVM is going down and disable
 * hardware virtualization on every online CPU.
 */
static void kvm_shutdown(void)
{
        /*
         * Disable hardware virtualization and set kvm_rebooting to indicate
         * that KVM has asynchronously disabled hardware virtualization, i.e.
         * that relevant errors and exceptions aren't entirely unexpected.
         * Some flavors of hardware virtualization need to be disabled before
         * transferring control to firmware (to perform shutdown/reboot), e.g.
         * on x86, virtualization can block INIT interrupts, which are used by
         * firmware to pull APs back under firmware control.  Note, this path
         * is used for both shutdown and reboot scenarios, i.e. neither name is
         * 100% comprehensive.
         */
        pr_info("kvm: exiting hardware virtualization\n");
        /* Set the flag before any CPU has its hardware disabled. */
        kvm_rebooting = true;
        on_each_cpu(hardware_disable_nolock, NULL, 1);
}
5396
/*
 * syscore ->suspend callback: disable hardware virtualization on the current
 * CPU if KVM has users.  Always returns 0.
 */
static int kvm_suspend(void)
{
        /*
         * Secondary CPUs and CPU hotplug are disabled across the suspend/resume
         * callbacks, i.e. no need to acquire kvm_lock to ensure the usage count
         * is stable.  Assert that kvm_lock is not held to ensure the system
         * isn't suspended while KVM is enabling hardware.  Hardware enabling
         * can be preempted, but the task cannot be frozen until it has dropped
         * all locks (userspace tasks are frozen via a fake signal).
         */
        lockdep_assert_not_held(&kvm_lock);
        lockdep_assert_irqs_disabled();

        if (kvm_usage_count)
                hardware_disable_nolock(NULL);
        return 0;
}
5414
/*
 * syscore ->resume callback: re-enable hardware virtualization on the current
 * CPU if KVM has users.  Makes the same lockdep assertions as kvm_suspend();
 * a failure to enable only triggers a one-time warning because the callback
 * cannot return an error.
 */
static void kvm_resume(void)
{
        lockdep_assert_not_held(&kvm_lock);
        lockdep_assert_irqs_disabled();

        if (kvm_usage_count)
                WARN_ON_ONCE(__hardware_enable_nolock());
}
5423
/* Syscore hooks that toggle hardware virtualization across suspend, resume and shutdown. */
static struct syscore_ops kvm_syscore_ops = {
        .suspend = kvm_suspend,
        .resume = kvm_resume,
        .shutdown = kvm_shutdown,
};
5429 #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
/*
 * Stub used when CONFIG_KVM_GENERIC_HARDWARE_ENABLING is not set: there is
 * nothing to enable here, so report success.
 */
static int hardware_enable_all(void)
{
        return 0;
}
5434
/*
 * Stub used when CONFIG_KVM_GENERIC_HARDWARE_ENABLING is not set:
 * intentionally does nothing.
 */
static void hardware_disable_all(void)
{

}
5439 #endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
5440
5441 static void kvm_iodevice_destructor(struct kvm_io_device *dev)
5442 {
5443         if (dev->ops->destructor)
5444                 dev->ops->destructor(dev);
5445 }
5446
5447 static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
5448 {
5449         int i;
5450
5451         for (i = 0; i < bus->dev_count; i++) {
5452                 struct kvm_io_device *pos = bus->range[i].dev;
5453
5454                 kvm_iodevice_destructor(pos);
5455         }
5456         kfree(bus);
5457 }
5458
5459 static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
5460                                  const struct kvm_io_range *r2)
5461 {
5462         gpa_t addr1 = r1->addr;
5463         gpa_t addr2 = r2->addr;
5464
5465         if (addr1 < addr2)
5466                 return -1;
5467
5468         /* If r2->len == 0, match the exact address.  If r2->len != 0,
5469          * accept any overlapping write.  Any order is acceptable for
5470          * overlapping ranges, because kvm_io_bus_get_first_dev ensures
5471          * we process all of them.
5472          */
5473         if (r2->len) {
5474                 addr1 += r1->len;
5475                 addr2 += r2->len;
5476         }
5477
5478         if (addr1 > addr2)
5479                 return 1;
5480
5481         return 0;
5482 }
5483
/*
 * Adapter giving kvm_io_bus_cmp() the void-pointer comparator signature
 * that bsearch() expects.
 */
static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
{
        const struct kvm_io_range *lhs = p1;
        const struct kvm_io_range *rhs = p2;

        return kvm_io_bus_cmp(lhs, rhs);
}
5488
5489 static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
5490                              gpa_t addr, int len)
5491 {
5492         struct kvm_io_range *range, key;
5493         int off;
5494
5495         key = (struct kvm_io_range) {
5496                 .addr = addr,
5497                 .len = len,
5498         };
5499
5500         range = bsearch(&key, bus->range, bus->dev_count,
5501                         sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
5502         if (range == NULL)
5503                 return -ENOENT;
5504
5505         off = range - bus->range;
5506
5507         while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
5508                 off--;
5509
5510         return off;
5511 }
5512
5513 static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
5514                               struct kvm_io_range *range, const void *val)
5515 {
5516         int idx;
5517
5518         idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
5519         if (idx < 0)
5520                 return -EOPNOTSUPP;
5521
5522         while (idx < bus->dev_count &&
5523                 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
5524                 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
5525                                         range->len, val))
5526                         return idx;
5527                 idx++;
5528         }
5529
5530         return -EOPNOTSUPP;
5531 }
5532
5533 /* kvm_io_bus_write - called under kvm->slots_lock */
5534 int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
5535                      int len, const void *val)
5536 {
5537         struct kvm_io_bus *bus;
5538         struct kvm_io_range range;
5539         int r;
5540
5541         range = (struct kvm_io_range) {
5542                 .addr = addr,
5543                 .len = len,
5544         };
5545
5546         bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5547         if (!bus)
5548                 return -ENOMEM;
5549         r = __kvm_io_bus_write(vcpu, bus, &range, val);
5550         return r < 0 ? r : 0;
5551 }
5552 EXPORT_SYMBOL_GPL(kvm_io_bus_write);
5553
5554 /* kvm_io_bus_write_cookie - called under kvm->slots_lock */
5555 int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
5556                             gpa_t addr, int len, const void *val, long cookie)
5557 {
5558         struct kvm_io_bus *bus;
5559         struct kvm_io_range range;
5560
5561         range = (struct kvm_io_range) {
5562                 .addr = addr,
5563                 .len = len,
5564         };
5565
5566         bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5567         if (!bus)
5568                 return -ENOMEM;
5569
5570         /* First try the device referenced by cookie. */
5571         if ((cookie >= 0) && (cookie < bus->dev_count) &&
5572             (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
5573                 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
5574                                         val))
5575                         return cookie;
5576
5577         /*
5578          * cookie contained garbage; fall back to search and return the
5579          * correct cookie value.
5580          */
5581         return __kvm_io_bus_write(vcpu, bus, &range, val);
5582 }
5583
5584 static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
5585                              struct kvm_io_range *range, void *val)
5586 {
5587         int idx;
5588
5589         idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
5590         if (idx < 0)
5591                 return -EOPNOTSUPP;
5592
5593         while (idx < bus->dev_count &&
5594                 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
5595                 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
5596                                        range->len, val))
5597                         return idx;
5598                 idx++;
5599         }
5600
5601         return -EOPNOTSUPP;
5602 }
5603
5604 /* kvm_io_bus_read - called under kvm->slots_lock */
5605 int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
5606                     int len, void *val)
5607 {
5608         struct kvm_io_bus *bus;
5609         struct kvm_io_range range;
5610         int r;
5611
5612         range = (struct kvm_io_range) {
5613                 .addr = addr,
5614                 .len = len,
5615         };
5616
5617         bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5618         if (!bus)
5619                 return -ENOMEM;
5620         r = __kvm_io_bus_read(vcpu, bus, &range, val);
5621         return r < 0 ? r : 0;
5622 }
5623
5624 /* Caller must hold slots_lock. */
5625 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
5626                             int len, struct kvm_io_device *dev)
5627 {
5628         int i;
5629         struct kvm_io_bus *new_bus, *bus;
5630         struct kvm_io_range range;
5631
5632         bus = kvm_get_bus(kvm, bus_idx);
5633         if (!bus)
5634                 return -ENOMEM;
5635
5636         /* exclude ioeventfd which is limited by maximum fd */
5637         if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
5638                 return -ENOSPC;
5639
5640         new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
5641                           GFP_KERNEL_ACCOUNT);
5642         if (!new_bus)
5643                 return -ENOMEM;
5644
5645         range = (struct kvm_io_range) {
5646                 .addr = addr,
5647                 .len = len,
5648                 .dev = dev,
5649         };
5650
5651         for (i = 0; i < bus->dev_count; i++)
5652                 if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
5653                         break;
5654
5655         memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
5656         new_bus->dev_count++;
5657         new_bus->range[i] = range;
5658         memcpy(new_bus->range + i + 1, bus->range + i,
5659                 (bus->dev_count - i) * sizeof(struct kvm_io_range));
5660         rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5661         synchronize_srcu_expedited(&kvm->srcu);
5662         kfree(bus);
5663
5664         return 0;
5665 }
5666
5667 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5668                               struct kvm_io_device *dev)
5669 {
5670         int i;
5671         struct kvm_io_bus *new_bus, *bus;
5672
5673         lockdep_assert_held(&kvm->slots_lock);
5674
5675         bus = kvm_get_bus(kvm, bus_idx);
5676         if (!bus)
5677                 return 0;
5678
5679         for (i = 0; i < bus->dev_count; i++) {
5680                 if (bus->range[i].dev == dev) {
5681                         break;
5682                 }
5683         }
5684
5685         if (i == bus->dev_count)
5686                 return 0;
5687
5688         new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
5689                           GFP_KERNEL_ACCOUNT);
5690         if (new_bus) {
5691                 memcpy(new_bus, bus, struct_size(bus, range, i));
5692                 new_bus->dev_count--;
5693                 memcpy(new_bus->range + i, bus->range + i + 1,
5694                                 flex_array_size(new_bus, range, new_bus->dev_count - i));
5695         }
5696
5697         rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5698         synchronize_srcu_expedited(&kvm->srcu);
5699
5700         /*
5701          * If NULL bus is installed, destroy the old bus, including all the
5702          * attached devices. Otherwise, destroy the caller's device only.
5703          */
5704         if (!new_bus) {
5705                 pr_err("kvm: failed to shrink bus, removing it completely\n");
5706                 kvm_io_bus_destroy(bus);
5707                 return -ENOMEM;
5708         }
5709
5710         kvm_iodevice_destructor(dev);
5711         kfree(bus);
5712         return 0;
5713 }
5714
5715 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5716                                          gpa_t addr)
5717 {
5718         struct kvm_io_bus *bus;
5719         int dev_idx, srcu_idx;
5720         struct kvm_io_device *iodev = NULL;
5721
5722         srcu_idx = srcu_read_lock(&kvm->srcu);
5723
5724         bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
5725         if (!bus)
5726                 goto out_unlock;
5727
5728         dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
5729         if (dev_idx < 0)
5730                 goto out_unlock;
5731
5732         iodev = bus->range[dev_idx].dev;
5733
5734 out_unlock:
5735         srcu_read_unlock(&kvm->srcu, srcu_idx);
5736
5737         return iodev;
5738 }
5739 EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
5740
5741 static int kvm_debugfs_open(struct inode *inode, struct file *file,
5742                            int (*get)(void *, u64 *), int (*set)(void *, u64),
5743                            const char *fmt)
5744 {
5745         int ret;
5746         struct kvm_stat_data *stat_data = inode->i_private;
5747
5748         /*
5749          * The debugfs files are a reference to the kvm struct which
5750         * is still valid when kvm_destroy_vm is called.  kvm_get_kvm_safe
5751         * avoids the race between open and the removal of the debugfs directory.
5752          */
5753         if (!kvm_get_kvm_safe(stat_data->kvm))
5754                 return -ENOENT;
5755
5756         ret = simple_attr_open(inode, file, get,
5757                                kvm_stats_debugfs_mode(stat_data->desc) & 0222
5758                                ? set : NULL, fmt);
5759         if (ret)
5760                 kvm_put_kvm(stat_data->kvm);
5761
5762         return ret;
5763 }
5764
/*
 * Release counterpart of kvm_debugfs_open(): tear down the simple attribute
 * and drop the VM reference taken at open time.  Always returns 0.
 */
static int kvm_debugfs_release(struct inode *inode, struct file *file)
{
        struct kvm_stat_data *stat_data = inode->i_private;

        simple_attr_release(inode, file);
        kvm_put_kvm(stat_data->kvm);

        return 0;
}
5774
5775 static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
5776 {
5777         *val = *(u64 *)((void *)(&kvm->stat) + offset);
5778
5779         return 0;
5780 }
5781
5782 static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
5783 {
5784         *(u64 *)((void *)(&kvm->stat) + offset) = 0;
5785
5786         return 0;
5787 }
5788
5789 static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
5790 {
5791         unsigned long i;
5792         struct kvm_vcpu *vcpu;
5793
5794         *val = 0;
5795
5796         kvm_for_each_vcpu(i, vcpu, kvm)
5797                 *val += *(u64 *)((void *)(&vcpu->stat) + offset);
5798
5799         return 0;
5800 }
5801
5802 static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
5803 {
5804         unsigned long i;
5805         struct kvm_vcpu *vcpu;
5806
5807         kvm_for_each_vcpu(i, vcpu, kvm)
5808                 *(u64 *)((void *)(&vcpu->stat) + offset) = 0;
5809
5810         return 0;
5811 }
5812
5813 static int kvm_stat_data_get(void *data, u64 *val)
5814 {
5815         int r = -EFAULT;
5816         struct kvm_stat_data *stat_data = data;
5817
5818         switch (stat_data->kind) {
5819         case KVM_STAT_VM:
5820                 r = kvm_get_stat_per_vm(stat_data->kvm,
5821                                         stat_data->desc->desc.offset, val);
5822                 break;
5823         case KVM_STAT_VCPU:
5824                 r = kvm_get_stat_per_vcpu(stat_data->kvm,
5825                                           stat_data->desc->desc.offset, val);
5826                 break;
5827         }
5828
5829         return r;
5830 }
5831
5832 static int kvm_stat_data_clear(void *data, u64 val)
5833 {
5834         int r = -EFAULT;
5835         struct kvm_stat_data *stat_data = data;
5836
5837         if (val)
5838                 return -EINVAL;
5839
5840         switch (stat_data->kind) {
5841         case KVM_STAT_VM:
5842                 r = kvm_clear_stat_per_vm(stat_data->kvm,
5843                                           stat_data->desc->desc.offset);
5844                 break;
5845         case KVM_STAT_VCPU:
5846                 r = kvm_clear_stat_per_vcpu(stat_data->kvm,
5847                                             stat_data->desc->desc.offset);
5848                 break;
5849         }
5850
5851         return r;
5852 }
5853
/*
 * ->open for the per-VM debugfs stat files: delegates to kvm_debugfs_open()
 * with the generic stat getter/setter and a "%llu\n" format.
 */
static int kvm_stat_data_open(struct inode *inode, struct file *file)
{
        /* Presumably a build-time check that the format matches a u64 argument. */
        __simple_attr_check_format("%llu\n", 0ull);
        return kvm_debugfs_open(inode, file, kvm_stat_data_get,
                                kvm_stat_data_clear, "%llu\n");
}
5860
/*
 * File operations for the per-VM debugfs stat files.  Open and release go
 * through the KVM wrappers, which hold a VM reference while the file is
 * open; reads and writes use the simple-attribute helpers.
 */
static const struct file_operations stat_fops_per_vm = {
        .owner = THIS_MODULE,
        .open = kvm_stat_data_open,
        .release = kvm_debugfs_release,
        .read = simple_attr_read,
        .write = simple_attr_write,
        .llseek = no_llseek,
};
5869
5870 static int vm_stat_get(void *_offset, u64 *val)
5871 {
5872         unsigned offset = (long)_offset;
5873         struct kvm *kvm;
5874         u64 tmp_val;
5875
5876         *val = 0;
5877         mutex_lock(&kvm_lock);
5878         list_for_each_entry(kvm, &vm_list, vm_list) {
5879                 kvm_get_stat_per_vm(kvm, offset, &tmp_val);
5880                 *val += tmp_val;
5881         }
5882         mutex_unlock(&kvm_lock);
5883         return 0;
5884 }
5885
5886 static int vm_stat_clear(void *_offset, u64 val)
5887 {
5888         unsigned offset = (long)_offset;
5889         struct kvm *kvm;
5890
5891         if (val)
5892                 return -EINVAL;
5893
5894         mutex_lock(&kvm_lock);
5895         list_for_each_entry(kvm, &vm_list, vm_list) {
5896                 kvm_clear_stat_per_vm(kvm, offset);
5897         }
5898         mutex_unlock(&kvm_lock);
5899
5900         return 0;
5901 }
5902
/*
 * Global VM stat attributes: the first can be read and cleared, the second
 * has no setter.  kvm_init_debug() picks one based on the stat's debugfs mode.
 */
DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");
5905
5906 static int vcpu_stat_get(void *_offset, u64 *val)
5907 {
5908         unsigned offset = (long)_offset;
5909         struct kvm *kvm;
5910         u64 tmp_val;
5911
5912         *val = 0;
5913         mutex_lock(&kvm_lock);
5914         list_for_each_entry(kvm, &vm_list, vm_list) {
5915                 kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
5916                 *val += tmp_val;
5917         }
5918         mutex_unlock(&kvm_lock);
5919         return 0;
5920 }
5921
5922 static int vcpu_stat_clear(void *_offset, u64 val)
5923 {
5924         unsigned offset = (long)_offset;
5925         struct kvm *kvm;
5926
5927         if (val)
5928                 return -EINVAL;
5929
5930         mutex_lock(&kvm_lock);
5931         list_for_each_entry(kvm, &vm_list, vm_list) {
5932                 kvm_clear_stat_per_vcpu(kvm, offset);
5933         }
5934         mutex_unlock(&kvm_lock);
5935
5936         return 0;
5937 }
5938
/*
 * Global vCPU stat attributes: the first can be read and cleared, the second
 * has no setter.  kvm_init_debug() picks one based on the stat's debugfs mode.
 */
DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
                        "%llu\n");
DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");
5942
5943 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
5944 {
5945         struct kobj_uevent_env *env;
5946         unsigned long long created, active;
5947
5948         if (!kvm_dev.this_device || !kvm)
5949                 return;
5950
5951         mutex_lock(&kvm_lock);
5952         if (type == KVM_EVENT_CREATE_VM) {
5953                 kvm_createvm_count++;
5954                 kvm_active_vms++;
5955         } else if (type == KVM_EVENT_DESTROY_VM) {
5956                 kvm_active_vms--;
5957         }
5958         created = kvm_createvm_count;
5959         active = kvm_active_vms;
5960         mutex_unlock(&kvm_lock);
5961
5962         env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
5963         if (!env)
5964                 return;
5965
5966         add_uevent_var(env, "CREATED=%llu", created);
5967         add_uevent_var(env, "COUNT=%llu", active);
5968
5969         if (type == KVM_EVENT_CREATE_VM) {
5970                 add_uevent_var(env, "EVENT=create");
5971                 kvm->userspace_pid = task_pid_nr(current);
5972         } else if (type == KVM_EVENT_DESTROY_VM) {
5973                 add_uevent_var(env, "EVENT=destroy");
5974         }
5975         add_uevent_var(env, "PID=%d", kvm->userspace_pid);
5976
5977         if (!IS_ERR(kvm->debugfs_dentry)) {
5978                 char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
5979
5980                 if (p) {
5981                         tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
5982                         if (!IS_ERR(tmp))
5983                                 add_uevent_var(env, "STATS_PATH=%s", tmp);
5984                         kfree(p);
5985                 }
5986         }
5987         /* no need for checks, since we are adding at most only 5 keys */
5988         env->envp[env->envp_idx++] = NULL;
5989         kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
5990         kfree(env);
5991 }
5992
5993 static void kvm_init_debug(void)
5994 {
5995         const struct file_operations *fops;
5996         const struct _kvm_stats_desc *pdesc;
5997         int i;
5998
5999         kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
6000
6001         for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
6002                 pdesc = &kvm_vm_stats_desc[i];
6003                 if (kvm_stats_debugfs_mode(pdesc) & 0222)
6004                         fops = &vm_stat_fops;
6005                 else
6006                         fops = &vm_stat_readonly_fops;
6007                 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
6008                                 kvm_debugfs_dir,
6009                                 (void *)(long)pdesc->desc.offset, fops);
6010         }
6011
6012         for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
6013                 pdesc = &kvm_vcpu_stats_desc[i];
6014                 if (kvm_stats_debugfs_mode(pdesc) & 0222)
6015                         fops = &vcpu_stat_fops;
6016                 else
6017                         fops = &vcpu_stat_readonly_fops;
6018                 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
6019                                 kvm_debugfs_dir,
6020                                 (void *)(long)pdesc->desc.offset, fops);
6021         }
6022 }
6023
/*
 * Map a preempt notifier back to the kvm_vcpu that embeds it as its
 * 'preempt_notifier' member.
 */
static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
	return container_of(pn, struct kvm_vcpu, preempt_notifier);
}
6029
6030 static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
6031 {
6032         struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
6033
6034         WRITE_ONCE(vcpu->preempted, false);
6035         WRITE_ONCE(vcpu->ready, false);
6036
6037         __this_cpu_write(kvm_running_vcpu, vcpu);
6038         kvm_arch_sched_in(vcpu, cpu);
6039         kvm_arch_vcpu_load(vcpu, cpu);
6040 }
6041
6042 static void kvm_sched_out(struct preempt_notifier *pn,
6043                           struct task_struct *next)
6044 {
6045         struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
6046
6047         if (current->on_rq) {
6048                 WRITE_ONCE(vcpu->preempted, true);
6049                 WRITE_ONCE(vcpu->ready, true);
6050         }
6051         kvm_arch_vcpu_put(vcpu);
6052         __this_cpu_write(kvm_running_vcpu, NULL);
6053 }
6054
6055 /**
6056  * kvm_get_running_vcpu - get the vcpu running on the current CPU.
6057  *
6058  * We can disable preemption locally around accessing the per-CPU variable,
6059  * and use the resolved vcpu pointer after enabling preemption again,
6060  * because even if the current thread is migrated to another CPU, reading
6061  * the per-CPU value later will give us the same value as we update the
6062  * per-CPU variable in the preempt notifier handlers.
6063  */
6064 struct kvm_vcpu *kvm_get_running_vcpu(void)
6065 {
6066         struct kvm_vcpu *vcpu;
6067
6068         preempt_disable();
6069         vcpu = __this_cpu_read(kvm_running_vcpu);
6070         preempt_enable();
6071
6072         return vcpu;
6073 }
6074 EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
6075
/**
 * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
 *
 * Return: the address of the per-CPU kvm_running_vcpu variable itself, i.e.
 * a __percpu pointer, not the value stored for any particular CPU.
 */
struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
{
	return &kvm_running_vcpu;
}
6083
6084 #ifdef CONFIG_GUEST_PERF_EVENTS
6085 static unsigned int kvm_guest_state(void)
6086 {
6087         struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
6088         unsigned int state;
6089
6090         if (!kvm_arch_pmi_in_guest(vcpu))
6091                 return 0;
6092
6093         state = PERF_GUEST_ACTIVE;
6094         if (!kvm_arch_vcpu_in_kernel(vcpu))
6095                 state |= PERF_GUEST_USER;
6096
6097         return state;
6098 }
6099
/*
 * perf ->get_ip callback: return the instruction pointer reported by the
 * arch code for the vCPU running on this CPU.
 *
 * Callers are expected to have checked kvm_guest_state() first; if the arch
 * code says we are not handling a guest PMI, warn once and return 0.
 */
static unsigned long kvm_guest_get_ip(void)
{
	struct kvm_vcpu *running = kvm_get_running_vcpu();

	/* Retrieving the IP must be guarded by a call to kvm_guest_state(). */
	if (WARN_ON_ONCE(!kvm_arch_pmi_in_guest(running)))
		return 0;

	return kvm_arch_vcpu_get_ip(running);
}
6110
/*
 * Guest info callbacks handed to perf.  The Intel PT interrupt handler starts
 * out as NULL and is filled in by kvm_register_perf_callbacks().
 */
static struct perf_guest_info_callbacks kvm_guest_cbs = {
	.state			= kvm_guest_state,
	.get_ip			= kvm_guest_get_ip,
	.handle_intel_pt_intr	= NULL,
};
6116
/*
 * Register KVM's guest info callbacks with perf.
 *
 * @pt_intr_handler is stored as the handle_intel_pt_intr callback before the
 * callbacks are registered, so it is in place by the time perf sees them.
 */
void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void))
{
	kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler;
	perf_register_guest_info_callbacks(&kvm_guest_cbs);
}
/* Unregister KVM's guest info callbacks from perf. */
void kvm_unregister_perf_callbacks(void)
{
	perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
}
6126 #endif
6127
6128 int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
6129 {
6130         int r;
6131         int cpu;
6132
6133 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
6134         r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
6135                                       kvm_online_cpu, kvm_offline_cpu);
6136         if (r)
6137                 return r;
6138
6139         register_syscore_ops(&kvm_syscore_ops);
6140 #endif
6141
6142         /* A kmem cache lets us meet the alignment requirements of fx_save. */
6143         if (!vcpu_align)
6144                 vcpu_align = __alignof__(struct kvm_vcpu);
6145         kvm_vcpu_cache =
6146                 kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
6147                                            SLAB_ACCOUNT,
6148                                            offsetof(struct kvm_vcpu, arch),
6149                                            offsetofend(struct kvm_vcpu, stats_id)
6150                                            - offsetof(struct kvm_vcpu, arch),
6151                                            NULL);
6152         if (!kvm_vcpu_cache) {
6153                 r = -ENOMEM;
6154                 goto err_vcpu_cache;
6155         }
6156
6157         for_each_possible_cpu(cpu) {
6158                 if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
6159                                             GFP_KERNEL, cpu_to_node(cpu))) {
6160                         r = -ENOMEM;
6161                         goto err_cpu_kick_mask;
6162                 }
6163         }
6164
6165         r = kvm_irqfd_init();
6166         if (r)
6167                 goto err_irqfd;
6168
6169         r = kvm_async_pf_init();
6170         if (r)
6171                 goto err_async_pf;
6172
6173         kvm_chardev_ops.owner = module;
6174
6175         kvm_preempt_ops.sched_in = kvm_sched_in;
6176         kvm_preempt_ops.sched_out = kvm_sched_out;
6177
6178         kvm_init_debug();
6179
6180         r = kvm_vfio_ops_init();
6181         if (WARN_ON_ONCE(r))
6182                 goto err_vfio;
6183
6184         /*
6185          * Registration _must_ be the very last thing done, as this exposes
6186          * /dev/kvm to userspace, i.e. all infrastructure must be setup!
6187          */
6188         r = misc_register(&kvm_dev);
6189         if (r) {
6190                 pr_err("kvm: misc device register failed\n");
6191                 goto err_register;
6192         }
6193
6194         return 0;
6195
6196 err_register:
6197         kvm_vfio_ops_exit();
6198 err_vfio:
6199         kvm_async_pf_deinit();
6200 err_async_pf:
6201         kvm_irqfd_exit();
6202 err_irqfd:
6203 err_cpu_kick_mask:
6204         for_each_possible_cpu(cpu)
6205                 free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
6206         kmem_cache_destroy(kvm_vcpu_cache);
6207 err_vcpu_cache:
6208 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
6209         unregister_syscore_ops(&kvm_syscore_ops);
6210         cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
6211 #endif
6212         return r;
6213 }
6214 EXPORT_SYMBOL_GPL(kvm_init);
6215
/*
 * kvm_exit - tear down everything kvm_init() set up.
 *
 * Deregisters /dev/kvm, removes the debugfs hierarchy, frees the per-CPU
 * kick masks and the vcpu slab cache, and shuts down the vfio, async page
 * fault, hardware enabling (when configured) and irqfd infrastructure.
 */
void kvm_exit(void)
{
	int cpu;

	/*
	 * Note, unregistering /dev/kvm doesn't strictly need to come first,
	 * fops_get(), a.k.a. try_module_get(), prevents acquiring references
	 * to KVM while the module is being stopped.
	 */
	misc_deregister(&kvm_dev);

	debugfs_remove_recursive(kvm_debugfs_dir);
	for_each_possible_cpu(cpu)
		free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
	kmem_cache_destroy(kvm_vcpu_cache);
	kvm_vfio_ops_exit();
	kvm_async_pf_deinit();
#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
	unregister_syscore_ops(&kvm_syscore_ops);
	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
#endif
	kvm_irqfd_exit();
}
EXPORT_SYMBOL_GPL(kvm_exit);
6240
/*
 * Hand-off data between kvm_vm_create_worker_thread() and the worker it
 * spawns.  The spawner keeps this on its stack, so the worker must not touch
 * it after signalling init_done.
 */
struct kvm_vm_worker_thread_context {
	/* VM passed as the first argument to thread_fn. */
	struct kvm *kvm;
	/* Task whose cgroups and nice value the worker takes over. */
	struct task_struct *parent;
	/* Completed by the worker once its initialization has finished. */
	struct completion init_done;
	/* Function the worker runs after initialization. */
	kvm_vm_thread_fn_t thread_fn;
	/* Opaque value passed as the second argument to thread_fn. */
	uintptr_t data;
	/* Result of the worker's initialization, 0 on success. */
	int err;
};
6249
6250 static int kvm_vm_worker_thread(void *context)
6251 {
6252         /*
6253          * The init_context is allocated on the stack of the parent thread, so
6254          * we have to locally copy anything that is needed beyond initialization
6255          */
6256         struct kvm_vm_worker_thread_context *init_context = context;
6257         struct task_struct *parent;
6258         struct kvm *kvm = init_context->kvm;
6259         kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
6260         uintptr_t data = init_context->data;
6261         int err;
6262
6263         err = kthread_park(current);
6264         /* kthread_park(current) is never supposed to return an error */
6265         WARN_ON(err != 0);
6266         if (err)
6267                 goto init_complete;
6268
6269         err = cgroup_attach_task_all(init_context->parent, current);
6270         if (err) {
6271                 kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
6272                         __func__, err);
6273                 goto init_complete;
6274         }
6275
6276         set_user_nice(current, task_nice(init_context->parent));
6277
6278 init_complete:
6279         init_context->err = err;
6280         complete(&init_context->init_done);
6281         init_context = NULL;
6282
6283         if (err)
6284                 goto out;
6285
6286         /* Wait to be woken up by the spawner before proceeding. */
6287         kthread_parkme();
6288
6289         if (!kthread_should_stop())
6290                 err = thread_fn(kvm, data);
6291
6292 out:
6293         /*
6294          * Move kthread back to its original cgroup to prevent it lingering in
6295          * the cgroup of the VM process, after the latter finishes its
6296          * execution.
6297          *
6298          * kthread_stop() waits on the 'exited' completion condition which is
6299          * set in exit_mm(), via mm_release(), in do_exit(). However, the
6300          * kthread is removed from the cgroup in the cgroup_exit() which is
6301          * called after the exit_mm(). This causes the kthread_stop() to return
6302          * before the kthread actually quits the cgroup.
6303          */
6304         rcu_read_lock();
6305         parent = rcu_dereference(current->real_parent);
6306         get_task_struct(parent);
6307         rcu_read_unlock();
6308         cgroup_attach_task_all(parent, current);
6309         put_task_struct(parent);
6310
6311         return err;
6312 }
6313
6314 int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
6315                                 uintptr_t data, const char *name,
6316                                 struct task_struct **thread_ptr)
6317 {
6318         struct kvm_vm_worker_thread_context init_context = {};
6319         struct task_struct *thread;
6320
6321         *thread_ptr = NULL;
6322         init_context.kvm = kvm;
6323         init_context.parent = current;
6324         init_context.thread_fn = thread_fn;
6325         init_context.data = data;
6326         init_completion(&init_context.init_done);
6327
6328         thread = kthread_run(kvm_vm_worker_thread, &init_context,
6329                              "%s-%d", name, task_pid_nr(current));
6330         if (IS_ERR(thread))
6331                 return PTR_ERR(thread);
6332
6333         /* kthread_run is never supposed to return NULL */
6334         WARN_ON(thread == NULL);
6335
6336         wait_for_completion(&init_context.init_done);
6337
6338         if (!init_context.err)
6339                 *thread_ptr = thread;
6340
6341         return init_context.err;
6342 }