1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Kernel-based Virtual Machine driver for Linux
4 *
5 * This module enables machines with Intel VT-x extensions to run virtual
6 * machines without emulation or binary translation.
7 *
8 * Copyright (C) 2006 Qumranet, Inc.
9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 *
11 * Authors:
12 * Avi Kivity <avi@qumranet.com>
13 * Yaniv Kamay <yaniv@qumranet.com>
14 */
15
16 #include <kvm/iodev.h>
17
18 #include <linux/kvm_host.h>
19 #include <linux/kvm.h>
20 #include <linux/module.h>
21 #include <linux/errno.h>
22 #include <linux/percpu.h>
23 #include <linux/mm.h>
24 #include <linux/miscdevice.h>
25 #include <linux/vmalloc.h>
26 #include <linux/reboot.h>
27 #include <linux/debugfs.h>
28 #include <linux/highmem.h>
29 #include <linux/file.h>
30 #include <linux/syscore_ops.h>
31 #include <linux/cpu.h>
32 #include <linux/sched/signal.h>
33 #include <linux/sched/mm.h>
34 #include <linux/sched/stat.h>
35 #include <linux/cpumask.h>
36 #include <linux/smp.h>
37 #include <linux/anon_inodes.h>
38 #include <linux/profile.h>
39 #include <linux/kvm_para.h>
40 #include <linux/pagemap.h>
41 #include <linux/mman.h>
42 #include <linux/swap.h>
43 #include <linux/bitops.h>
44 #include <linux/spinlock.h>
45 #include <linux/compat.h>
46 #include <linux/srcu.h>
47 #include <linux/hugetlb.h>
48 #include <linux/slab.h>
49 #include <linux/sort.h>
50 #include <linux/bsearch.h>
51 #include <linux/io.h>
52 #include <linux/lockdep.h>
53 #include <linux/kthread.h>
54 #include <linux/suspend.h>
55
56 #include <asm/processor.h>
57 #include <asm/ioctl.h>
58 #include <linux/uaccess.h>
59
60 #include "coalesced_mmio.h"
61 #include "async_pf.h"
62 #include "kvm_mm.h"
63 #include "vfio.h"
64
65 #include <trace/events/ipi.h>
66
67 #define CREATE_TRACE_POINTS
68 #include <trace/events/kvm.h>
69
70 #include <linux/kvm_dirty_ring.h>
71
72
73 /* Worst case buffer size needed for holding an integer. */
74 #define ITOA_MAX_LEN 12
75
76 MODULE_AUTHOR("Qumranet");
77 MODULE_LICENSE("GPL");
78
79 /* Architectures should define their poll value according to the halt latency */
80 unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
81 module_param(halt_poll_ns, uint, 0644);
82 EXPORT_SYMBOL_GPL(halt_poll_ns);
83
84 /* Default doubles per-vcpu halt_poll_ns. */
85 unsigned int halt_poll_ns_grow = 2;
86 module_param(halt_poll_ns_grow, uint, 0644);
87 EXPORT_SYMBOL_GPL(halt_poll_ns_grow);
88
89 /* The start value to grow halt_poll_ns from */
90 unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
91 module_param(halt_poll_ns_grow_start, uint, 0644);
92 EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);
93
94 /* Default resets per-vcpu halt_poll_ns. */
95 unsigned int halt_poll_ns_shrink;
96 module_param(halt_poll_ns_shrink, uint, 0644);
97 EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
98
99 /*
100 * Ordering of locks:
101 *
102 * kvm->lock --> kvm->slots_lock --> kvm->irq_lock
103 */
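/*
 * Illustrative sketch (hypothetical helper, not upstream KVM code): when more
 * than one of the locks above is needed, they must be acquired in the
 * documented order to avoid ABBA deadlocks.
 */
static void __maybe_unused example_lock_ordering(struct kvm *kvm)
{
	mutex_lock(&kvm->lock);
	mutex_lock(&kvm->slots_lock);
	mutex_lock(&kvm->irq_lock);

	/* ... work that needs all three locks ... */

	mutex_unlock(&kvm->irq_lock);
	mutex_unlock(&kvm->slots_lock);
	mutex_unlock(&kvm->lock);
}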
104
105 DEFINE_MUTEX(kvm_lock);
106 LIST_HEAD(vm_list);
107
108 static struct kmem_cache *kvm_vcpu_cache;
109
110 static __read_mostly struct preempt_ops kvm_preempt_ops;
111 static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);
112
113 struct dentry *kvm_debugfs_dir;
114 EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
115
116 static const struct file_operations stat_fops_per_vm;
117
118 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
119 unsigned long arg);
120 #ifdef CONFIG_KVM_COMPAT
121 static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
122 unsigned long arg);
123 #define KVM_COMPAT(c) .compat_ioctl = (c)
124 #else
125 /*
126 * For architectures that don't implement a compat infrastructure,
127 * adopt a double line of defense:
128 * - Prevent a compat task from opening /dev/kvm
129 * - If the open has been done by a 64bit task, and the KVM fd
130 * passed to a compat task, let the ioctls fail.
131 */
132 static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
133 unsigned long arg) { return -EINVAL; }
134
135 static int kvm_no_compat_open(struct inode *inode, struct file *file)
136 {
137 return is_compat_task() ? -ENODEV : 0;
138 }
139 #define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \
140 .open = kvm_no_compat_open
141 #endif
142 static int hardware_enable_all(void);
143 static void hardware_disable_all(void);
144
145 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
146
147 #define KVM_EVENT_CREATE_VM 0
148 #define KVM_EVENT_DESTROY_VM 1
149 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
150 static unsigned long long kvm_createvm_count;
151 static unsigned long long kvm_active_vms;
152
153 static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);
154
155 __weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
156 {
157 }
158
159 bool kvm_is_zone_device_page(struct page *page)
160 {
161 /*
162 * The metadata used by is_zone_device_page() to determine whether or
163 * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
164 * the device has been pinned, e.g. by get_user_pages(). WARN if the
165 * page_count() is zero to help detect bad usage of this helper.
166 */
167 if (WARN_ON_ONCE(!page_count(page)))
168 return false;
169
170 return is_zone_device_page(page);
171 }
172
173 /*
174 * Returns a 'struct page' if the pfn is "valid" and backed by a refcounted
175 * page, NULL otherwise. Note, the list of refcounted PG_reserved page types
176  * is likely incomplete; it has been compiled purely through people wanting
177  * to back guests with a certain type of memory and encountering issues.
178 */
179 struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn)
180 {
181 struct page *page;
182
183 if (!pfn_valid(pfn))
184 return NULL;
185
186 page = pfn_to_page(pfn);
187 if (!PageReserved(page))
188 return page;
189
190 /* The ZERO_PAGE(s) is marked PG_reserved, but is refcounted. */
191 if (is_zero_pfn(pfn))
192 return page;
193
194 /*
195 * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
196 * perspective they are "normal" pages, albeit with slightly different
197 * usage rules.
198 */
199 if (kvm_is_zone_device_page(page))
200 return page;
201
202 return NULL;
203 }
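/*
 * Illustrative sketch (hypothetical helper, not upstream KVM code): a caller
 * that wants to pin the page backing @pfn can combine
 * kvm_pfn_to_refcounted_page() with get_page_unless_zero() so that a
 * reference is taken only when the pfn is backed by a refcounted page.
 */
static struct page *__maybe_unused example_try_get_pfn_page(kvm_pfn_t pfn)
{
	struct page *page = kvm_pfn_to_refcounted_page(pfn);

	if (page && get_page_unless_zero(page))
		return page;

	return NULL;
}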
204
205 /*
206 * Switches to specified vcpu, until a matching vcpu_put()
207 */
208 void vcpu_load(struct kvm_vcpu *vcpu)
209 {
210 int cpu = get_cpu();
211
212 __this_cpu_write(kvm_running_vcpu, vcpu);
213 preempt_notifier_register(&vcpu->preempt_notifier);
214 kvm_arch_vcpu_load(vcpu, cpu);
215 put_cpu();
216 }
217 EXPORT_SYMBOL_GPL(vcpu_load);
218
219 void vcpu_put(struct kvm_vcpu *vcpu)
220 {
221 preempt_disable();
222 kvm_arch_vcpu_put(vcpu);
223 preempt_notifier_unregister(&vcpu->preempt_notifier);
224 __this_cpu_write(kvm_running_vcpu, NULL);
225 preempt_enable();
226 }
227 EXPORT_SYMBOL_GPL(vcpu_put);
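/*
 * Illustrative sketch (hypothetical helper, not upstream KVM code):
 * vcpu_load() and vcpu_put() are used as a bracketing pair, e.g. around the
 * bulk of a vcpu ioctl, so that kvm_get_running_vcpu() and the preempt
 * notifiers see a consistently loaded vCPU.
 */
static void __maybe_unused example_with_vcpu_loaded(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);

	/* ... operate on the loaded vCPU ... */

	vcpu_put(vcpu);
}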
228
229 /* TODO: merge with kvm_arch_vcpu_should_kick */
230 static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
231 {
232 int mode = kvm_vcpu_exiting_guest_mode(vcpu);
233
234 /*
235 * We need to wait for the VCPU to reenable interrupts and get out of
236 * READING_SHADOW_PAGE_TABLES mode.
237 */
238 if (req & KVM_REQUEST_WAIT)
239 return mode != OUTSIDE_GUEST_MODE;
240
241 /*
242 * Need to kick a running VCPU, but otherwise there is nothing to do.
243 */
244 return mode == IN_GUEST_MODE;
245 }
246
247 static void ack_kick(void *_completed)
248 {
249 }
250
251 static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
252 {
253 if (cpumask_empty(cpus))
254 return false;
255
256 smp_call_function_many(cpus, ack_kick, NULL, wait);
257 return true;
258 }
259
260 static void kvm_make_vcpu_request(struct kvm_vcpu *vcpu, unsigned int req,
261 struct cpumask *tmp, int current_cpu)
262 {
263 int cpu;
264
265 if (likely(!(req & KVM_REQUEST_NO_ACTION)))
266 __kvm_make_request(req, vcpu);
267
268 if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
269 return;
270
271 /*
272 * Note, the vCPU could get migrated to a different pCPU at any point
273 * after kvm_request_needs_ipi(), which could result in sending an IPI
274 * to the previous pCPU. But, that's OK because the purpose of the IPI
275 * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is
276 * satisfied if the vCPU migrates. Entering READING_SHADOW_PAGE_TABLES
277 * after this point is also OK, as the requirement is only that KVM wait
278 * for vCPUs that were reading SPTEs _before_ any changes were
279 * finalized. See kvm_vcpu_kick() for more details on handling requests.
280 */
281 if (kvm_request_needs_ipi(vcpu, req)) {
282 cpu = READ_ONCE(vcpu->cpu);
283 if (cpu != -1 && cpu != current_cpu)
284 __cpumask_set_cpu(cpu, tmp);
285 }
286 }
287
288 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
289 unsigned long *vcpu_bitmap)
290 {
291 struct kvm_vcpu *vcpu;
292 struct cpumask *cpus;
293 int i, me;
294 bool called;
295
296 me = get_cpu();
297
298 cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
299 cpumask_clear(cpus);
300
301 for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
302 vcpu = kvm_get_vcpu(kvm, i);
303 if (!vcpu)
304 continue;
305 kvm_make_vcpu_request(vcpu, req, cpus, me);
306 }
307
308 called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
309 put_cpu();
310
311 return called;
312 }
313
314 bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
315 struct kvm_vcpu *except)
316 {
317 struct kvm_vcpu *vcpu;
318 struct cpumask *cpus;
319 unsigned long i;
320 bool called;
321 int me;
322
323 me = get_cpu();
324
325 cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
326 cpumask_clear(cpus);
327
328 kvm_for_each_vcpu(i, vcpu, kvm) {
329 if (vcpu == except)
330 continue;
331 kvm_make_vcpu_request(vcpu, req, cpus, me);
332 }
333
334 called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
335 put_cpu();
336
337 return called;
338 }
339
340 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
341 {
342 return kvm_make_all_cpus_request_except(kvm, req, NULL);
343 }
344 EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);
345
346 void kvm_flush_remote_tlbs(struct kvm *kvm)
347 {
348 ++kvm->stat.generic.remote_tlb_flush_requests;
349
350 /*
351 * We want to publish modifications to the page tables before reading
352 * mode. Pairs with a memory barrier in arch-specific code.
353 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
354 * and smp_mb in walk_shadow_page_lockless_begin/end.
355 * - powerpc: smp_mb in kvmppc_prepare_to_enter.
356 *
357 * There is already an smp_mb__after_atomic() before
358 * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
359 * barrier here.
360 */
361 if (!kvm_arch_flush_remote_tlbs(kvm)
362 || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
363 ++kvm->stat.generic.remote_tlb_flush;
364 }
365 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
366
367 void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
368 {
369 if (!kvm_arch_flush_remote_tlbs_range(kvm, gfn, nr_pages))
370 return;
371
372 /*
373  * Fall back to flushing all TLBs if the architecture's range-based
374 * TLB invalidation is unsupported or can't be performed for whatever
375 * reason.
376 */
377 kvm_flush_remote_tlbs(kvm);
378 }
379
380 void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
381 const struct kvm_memory_slot *memslot)
382 {
383 /*
384 * All current use cases for flushing the TLBs for a specific memslot
385 * are related to dirty logging, and many do the TLB flush out of
386  * mmu_lock. The interaction between the various operations on a memslot
387  * must be serialized by slots_lock to ensure the TLB flush from one
388 * operation is observed by any other operation on the same memslot.
389 */
390 lockdep_assert_held(&kvm->slots_lock);
391 kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
392 }
393
394 static void kvm_flush_shadow_all(struct kvm *kvm)
395 {
396 kvm_arch_flush_shadow_all(kvm);
397 kvm_arch_guest_memory_reclaimed(kvm);
398 }
399
400 #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
401 static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
402 gfp_t gfp_flags)
403 {
404 gfp_flags |= mc->gfp_zero;
405
406 if (mc->kmem_cache)
407 return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
408 else
409 return (void *)__get_free_page(gfp_flags);
410 }
411
412 int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
413 {
414 gfp_t gfp = mc->gfp_custom ? mc->gfp_custom : GFP_KERNEL_ACCOUNT;
415 void *obj;
416
417 if (mc->nobjs >= min)
418 return 0;
419
420 if (unlikely(!mc->objects)) {
421 if (WARN_ON_ONCE(!capacity))
422 return -EIO;
423
424 mc->objects = kvmalloc_array(sizeof(void *), capacity, gfp);
425 if (!mc->objects)
426 return -ENOMEM;
427
428 mc->capacity = capacity;
429 }
430
431 /* It is illegal to request a different capacity across topups. */
432 if (WARN_ON_ONCE(mc->capacity != capacity))
433 return -EIO;
434
435 while (mc->nobjs < mc->capacity) {
436 obj = mmu_memory_cache_alloc_obj(mc, gfp);
437 if (!obj)
438 return mc->nobjs >= min ? 0 : -ENOMEM;
439 mc->objects[mc->nobjs++] = obj;
440 }
441 return 0;
442 }
443
444 int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
445 {
446 return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min);
447 }
448
449 int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
450 {
451 return mc->nobjs;
452 }
453
454 void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
455 {
456 while (mc->nobjs) {
457 if (mc->kmem_cache)
458 kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
459 else
460 free_page((unsigned long)mc->objects[--mc->nobjs]);
461 }
462
463 kvfree(mc->objects);
464
465 mc->objects = NULL;
466 mc->capacity = 0;
467 }
468
469 void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
470 {
471 void *p;
472
473 if (WARN_ON(!mc->nobjs))
474 p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
475 else
476 p = mc->objects[--mc->nobjs];
477 BUG_ON(!p);
478 return p;
479 }
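/*
 * Illustrative sketch (hypothetical helper, not upstream KVM code): the
 * intended pattern is to top up the cache in a sleepable context and then
 * consume objects from it while holding mmu_lock, where allocations must
 * not sleep.
 */
static int __maybe_unused example_topup_then_alloc(struct kvm *kvm,
						   struct kvm_mmu_memory_cache *mc)
{
	void *obj;
	int r;

	/* May sleep; must be called before taking mmu_lock. */
	r = kvm_mmu_topup_memory_cache(mc, 1);
	if (r)
		return r;

	KVM_MMU_LOCK(kvm);
	obj = kvm_mmu_memory_cache_alloc(mc);
	/* ... install @obj, e.g. as a page table page, under the lock ... */
	KVM_MMU_UNLOCK(kvm);

	return obj ? 0 : -ENOMEM;
}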
480 #endif
481
482 static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
483 {
484 mutex_init(&vcpu->mutex);
485 vcpu->cpu = -1;
486 vcpu->kvm = kvm;
487 vcpu->vcpu_id = id;
488 vcpu->pid = NULL;
489 #ifndef __KVM_HAVE_ARCH_WQP
490 rcuwait_init(&vcpu->wait);
491 #endif
492 kvm_async_pf_vcpu_init(vcpu);
493
494 kvm_vcpu_set_in_spin_loop(vcpu, false);
495 kvm_vcpu_set_dy_eligible(vcpu, false);
496 vcpu->preempted = false;
497 vcpu->ready = false;
498 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
499 vcpu->last_used_slot = NULL;
500
501 /* Fill the stats id string for the vcpu */
502 snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
503 task_pid_nr(current), id);
504 }
505
506 static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
507 {
508 kvm_arch_vcpu_destroy(vcpu);
509 kvm_dirty_ring_free(&vcpu->dirty_ring);
510
511 /*
512 * No need for rcu_read_lock as VCPU_RUN is the only place that changes
513 * the vcpu->pid pointer, and at destruction time all file descriptors
514 * are already gone.
515 */
516 put_pid(rcu_dereference_protected(vcpu->pid, 1));
517
518 free_page((unsigned long)vcpu->run);
519 kmem_cache_free(kvm_vcpu_cache, vcpu);
520 }
521
522 void kvm_destroy_vcpus(struct kvm *kvm)
523 {
524 unsigned long i;
525 struct kvm_vcpu *vcpu;
526
527 kvm_for_each_vcpu(i, vcpu, kvm) {
528 kvm_vcpu_destroy(vcpu);
529 xa_erase(&kvm->vcpu_array, i);
530 }
531
532 atomic_set(&kvm->online_vcpus, 0);
533 }
534 EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);
535
536 #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
537 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
538 {
539 return container_of(mn, struct kvm, mmu_notifier);
540 }
541
542 typedef bool (*gfn_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
543
544 typedef void (*on_lock_fn_t)(struct kvm *kvm);
545
546 struct kvm_mmu_notifier_range {
547 /*
548 * 64-bit addresses, as KVM notifiers can operate on host virtual
549 * addresses (unsigned long) and guest physical addresses (64-bit).
550 */
551 u64 start;
552 u64 end;
553 union kvm_mmu_notifier_arg arg;
554 gfn_handler_t handler;
555 on_lock_fn_t on_lock;
556 bool flush_on_ret;
557 bool may_block;
558 };
559
560 /*
561 * The inner-most helper returns a tuple containing the return value from the
562 * arch- and action-specific handler, plus a flag indicating whether or not at
563 * least one memslot was found, i.e. if the handler found guest memory.
564 *
565 * Note, most notifiers are averse to booleans, so even though KVM tracks the
566 * return from arch code as a bool, outer helpers will cast it to an int. :-(
567 */
568 typedef struct kvm_mmu_notifier_return {
569 bool ret;
570 bool found_memslot;
571 } kvm_mn_ret_t;
572
573 /*
574 * Use a dedicated stub instead of NULL to indicate that there is no callback
575 * function/handler. The compiler technically can't guarantee that a real
576 * function will have a non-zero address, and so it will generate code to
577 * check for !NULL, whereas comparing against a stub will be elided at compile
578 * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
579 */
580 static void kvm_null_fn(void)
581 {
582
583 }
584 #define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
585
586 static const union kvm_mmu_notifier_arg KVM_MMU_NOTIFIER_NO_ARG;
587
588 /* Iterate over each memslot intersecting [start, last] (inclusive) range */
589 #define kvm_for_each_memslot_in_hva_range(node, slots, start, last) \
590 for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \
591 node; \
592 node = interval_tree_iter_next(node, start, last)) \
593
594 static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm,
595 const struct kvm_mmu_notifier_range *range)
596 {
597 struct kvm_mmu_notifier_return r = {
598 .ret = false,
599 .found_memslot = false,
600 };
601 struct kvm_gfn_range gfn_range;
602 struct kvm_memory_slot *slot;
603 struct kvm_memslots *slots;
604 int i, idx;
605
606 if (WARN_ON_ONCE(range->end <= range->start))
607 return r;
608
609 /* A null handler is allowed if and only if on_lock() is provided. */
610 if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
611 IS_KVM_NULL_FN(range->handler)))
612 return r;
613
614 idx = srcu_read_lock(&kvm->srcu);
615
616 for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
617 struct interval_tree_node *node;
618
619 slots = __kvm_memslots(kvm, i);
620 kvm_for_each_memslot_in_hva_range(node, slots,
621 range->start, range->end - 1) {
622 unsigned long hva_start, hva_end;
623
624 slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]);
625 hva_start = max_t(unsigned long, range->start, slot->userspace_addr);
626 hva_end = min_t(unsigned long, range->end,
627 slot->userspace_addr + (slot->npages << PAGE_SHIFT));
628
629 /*
630 * To optimize for the likely case where the address
631 * range is covered by zero or one memslots, don't
632 * bother making these conditional (to avoid writes on
633 * the second or later invocation of the handler).
634 */
635 gfn_range.arg = range->arg;
636 gfn_range.may_block = range->may_block;
637
638 /*
639 * {gfn(page) | page intersects with [hva_start, hva_end)} =
640 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
641 */
642 gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
643 gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
644 gfn_range.slot = slot;
645
646 if (!r.found_memslot) {
647 r.found_memslot = true;
648 KVM_MMU_LOCK(kvm);
649 if (!IS_KVM_NULL_FN(range->on_lock))
650 range->on_lock(kvm);
651
652 if (IS_KVM_NULL_FN(range->handler))
653 break;
654 }
655 r.ret |= range->handler(kvm, &gfn_range);
656 }
657 }
658
659 if (range->flush_on_ret && r.ret)
660 kvm_flush_remote_tlbs(kvm);
661
662 if (r.found_memslot)
663 KVM_MMU_UNLOCK(kvm);
664
665 srcu_read_unlock(&kvm->srcu, idx);
666
667 return r;
668 }
669
670 static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
671 unsigned long start,
672 unsigned long end,
673 union kvm_mmu_notifier_arg arg,
674 gfn_handler_t handler)
675 {
676 struct kvm *kvm = mmu_notifier_to_kvm(mn);
677 const struct kvm_mmu_notifier_range range = {
678 .start = start,
679 .end = end,
680 .arg = arg,
681 .handler = handler,
682 .on_lock = (void *)kvm_null_fn,
683 .flush_on_ret = true,
684 .may_block = false,
685 };
686
687 return __kvm_handle_hva_range(kvm, &range).ret;
688 }
689
690 static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
691 unsigned long start,
692 unsigned long end,
693 gfn_handler_t handler)
694 {
695 struct kvm *kvm = mmu_notifier_to_kvm(mn);
696 const struct kvm_mmu_notifier_range range = {
697 .start = start,
698 .end = end,
699 .handler = handler,
700 .on_lock = (void *)kvm_null_fn,
701 .flush_on_ret = false,
702 .may_block = false,
703 };
704
705 return __kvm_handle_hva_range(kvm, &range).ret;
706 }
707
708 static bool kvm_change_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
709 {
710 /*
711  * Skipping invalid memslots is correct if and only if change_pte() is
712 * surrounded by invalidate_range_{start,end}(), which is currently
713 * guaranteed by the primary MMU. If that ever changes, KVM needs to
714 * unmap the memslot instead of skipping the memslot to ensure that KVM
715 * doesn't hold references to the old PFN.
716 */
717 WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
718
719 if (range->slot->flags & KVM_MEMSLOT_INVALID)
720 return false;
721
722 return kvm_set_spte_gfn(kvm, range);
723 }
724
725 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
726 struct mm_struct *mm,
727 unsigned long address,
728 pte_t pte)
729 {
730 struct kvm *kvm = mmu_notifier_to_kvm(mn);
731 const union kvm_mmu_notifier_arg arg = { .pte = pte };
732
733 trace_kvm_set_spte_hva(address);
734
735 /*
736 * .change_pte() must be surrounded by .invalidate_range_{start,end}().
737 * If mmu_invalidate_in_progress is zero, then no in-progress
738 * invalidations, including this one, found a relevant memslot at
739 * start(); rechecking memslots here is unnecessary. Note, a false
740 * positive (count elevated by a different invalidation) is sub-optimal
741 * but functionally ok.
742 */
743 WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
744 if (!READ_ONCE(kvm->mmu_invalidate_in_progress))
745 return;
746
747 kvm_handle_hva_range(mn, address, address + 1, arg, kvm_change_spte_gfn);
748 }
749
750 void kvm_mmu_invalidate_begin(struct kvm *kvm)
751 {
752 lockdep_assert_held_write(&kvm->mmu_lock);
753 /*
754  * The count increase must become visible at unlock time as no
755  * SPTE can be established without taking mmu_lock, and the
756  * count is also read inside the mmu_lock critical section.
757 */
758 kvm->mmu_invalidate_in_progress++;
759
760 if (likely(kvm->mmu_invalidate_in_progress == 1)) {
761 kvm->mmu_invalidate_range_start = INVALID_GPA;
762 kvm->mmu_invalidate_range_end = INVALID_GPA;
763 }
764 }
765
766 void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
767 {
768 lockdep_assert_held_write(&kvm->mmu_lock);
769
770 WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);
771
772 if (likely(kvm->mmu_invalidate_range_start == INVALID_GPA)) {
773 kvm->mmu_invalidate_range_start = start;
774 kvm->mmu_invalidate_range_end = end;
775 } else {
776 /*
777 * Fully tracking multiple concurrent ranges has diminishing
778 * returns. Keep things simple and just find the minimal range
779 * which includes the current and new ranges. As there won't be
780 * enough information to subtract a range after its invalidate
781 * completes, any ranges invalidated concurrently will
782 * accumulate and persist until all outstanding invalidates
783 * complete.
784 */
785 kvm->mmu_invalidate_range_start =
786 min(kvm->mmu_invalidate_range_start, start);
787 kvm->mmu_invalidate_range_end =
788 max(kvm->mmu_invalidate_range_end, end);
789 }
790 }
791
792 bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
793 {
794 kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
795 return kvm_unmap_gfn_range(kvm, range);
796 }
797
798 static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
799 const struct mmu_notifier_range *range)
800 {
801 struct kvm *kvm = mmu_notifier_to_kvm(mn);
802 const struct kvm_mmu_notifier_range hva_range = {
803 .start = range->start,
804 .end = range->end,
805 .handler = kvm_mmu_unmap_gfn_range,
806 .on_lock = kvm_mmu_invalidate_begin,
807 .flush_on_ret = true,
808 .may_block = mmu_notifier_range_blockable(range),
809 };
810
811 trace_kvm_unmap_hva_range(range->start, range->end);
812
813 /*
814 * Prevent memslot modification between range_start() and range_end()
815  * so that conditional locking provides the same result in both
816 * functions. Without that guarantee, the mmu_invalidate_in_progress
817 * adjustments will be imbalanced.
818 *
819 * Pairs with the decrement in range_end().
820 */
821 spin_lock(&kvm->mn_invalidate_lock);
822 kvm->mn_active_invalidate_count++;
823 spin_unlock(&kvm->mn_invalidate_lock);
824
825 /*
826 * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e.
827 * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring
828 * each cache's lock. There are relatively few caches in existence at
829 * any given time, and the caches themselves can check for hva overlap,
830 * i.e. don't need to rely on memslot overlap checks for performance.
831 * Because this runs without holding mmu_lock, the pfn caches must use
832 * mn_active_invalidate_count (see above) instead of
833 * mmu_invalidate_in_progress.
834 */
835 gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end,
836 hva_range.may_block);
837
838 /*
839 * If one or more memslots were found and thus zapped, notify arch code
840 * that guest memory has been reclaimed. This needs to be done *after*
841 * dropping mmu_lock, as x86's reclaim path is slooooow.
842 */
843 if (__kvm_handle_hva_range(kvm, &hva_range).found_memslot)
844 kvm_arch_guest_memory_reclaimed(kvm);
845
846 return 0;
847 }
848
849 void kvm_mmu_invalidate_end(struct kvm *kvm)
850 {
851 lockdep_assert_held_write(&kvm->mmu_lock);
852
853 /*
854  * This sequence increase will notify the KVM page fault handler
855  * that the page that is going to be mapped in the SPTE could
856  * have been freed.
857 */
858 kvm->mmu_invalidate_seq++;
859 smp_wmb();
860 /*
861 * The above sequence increase must be visible before the
862 * below count decrease, which is ensured by the smp_wmb above
863 * in conjunction with the smp_rmb in mmu_invalidate_retry().
864 */
865 kvm->mmu_invalidate_in_progress--;
866 KVM_BUG_ON(kvm->mmu_invalidate_in_progress < 0, kvm);
867
868 /*
869 * Assert that at least one range was added between start() and end().
870 * Not adding a range isn't fatal, but it is a KVM bug.
871 */
872 WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA);
873 }
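/*
 * Illustrative sketch (hypothetical helper, not upstream KVM code): the
 * begin()/range_add()/end() helpers bracket an invalidation and must be
 * called with mmu_lock held for write. The MMU notifier path above gets
 * this for free via on_lock(); a direct user would look roughly like this.
 */
static void __maybe_unused example_invalidate_gfn_range(struct kvm *kvm,
							 gfn_t start, gfn_t end)
{
	KVM_MMU_LOCK(kvm);
	kvm_mmu_invalidate_begin(kvm);
	kvm_mmu_invalidate_range_add(kvm, start, end);
	/* ... zap the SPTEs covering [start, end) ... */
	KVM_MMU_UNLOCK(kvm);

	/* ... the caller completes whatever made the range stale ... */

	KVM_MMU_LOCK(kvm);
	kvm_mmu_invalidate_end(kvm);
	KVM_MMU_UNLOCK(kvm);
}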
874
875 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
876 const struct mmu_notifier_range *range)
877 {
878 struct kvm *kvm = mmu_notifier_to_kvm(mn);
879 const struct kvm_mmu_notifier_range hva_range = {
880 .start = range->start,
881 .end = range->end,
882 .handler = (void *)kvm_null_fn,
883 .on_lock = kvm_mmu_invalidate_end,
884 .flush_on_ret = false,
885 .may_block = mmu_notifier_range_blockable(range),
886 };
887 bool wake;
888
889 __kvm_handle_hva_range(kvm, &hva_range);
890
891 /* Pairs with the increment in range_start(). */
892 spin_lock(&kvm->mn_invalidate_lock);
893 wake = (--kvm->mn_active_invalidate_count == 0);
894 spin_unlock(&kvm->mn_invalidate_lock);
895
896 /*
897 * There can only be one waiter, since the wait happens under
898 * slots_lock.
899 */
900 if (wake)
901 rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
902 }
903
904 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
905 struct mm_struct *mm,
906 unsigned long start,
907 unsigned long end)
908 {
909 trace_kvm_age_hva(start, end);
910
911 return kvm_handle_hva_range(mn, start, end, KVM_MMU_NOTIFIER_NO_ARG,
912 kvm_age_gfn);
913 }
914
915 static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
916 struct mm_struct *mm,
917 unsigned long start,
918 unsigned long end)
919 {
920 trace_kvm_age_hva(start, end);
921
922 /*
923 * Even though we do not flush TLB, this will still adversely
924 * affect performance on pre-Haswell Intel EPT, where there is
925 * no EPT Access Bit to clear so that we have to tear down EPT
926 * tables instead. If we find this unacceptable, we can always
927 * add a parameter to kvm_age_hva so that it effectively doesn't
928 * do anything on clear_young.
929 *
930 * Also note that currently we never issue secondary TLB flushes
931 * from clear_young, leaving this job up to the regular system
932 * cadence. If we find this inaccurate, we might come up with a
933 * more sophisticated heuristic later.
934 */
935 return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
936 }
937
938 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
939 struct mm_struct *mm,
940 unsigned long address)
941 {
942 trace_kvm_test_age_hva(address);
943
944 return kvm_handle_hva_range_no_flush(mn, address, address + 1,
945 kvm_test_age_gfn);
946 }
947
948 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
949 struct mm_struct *mm)
950 {
951 struct kvm *kvm = mmu_notifier_to_kvm(mn);
952 int idx;
953
954 idx = srcu_read_lock(&kvm->srcu);
955 kvm_flush_shadow_all(kvm);
956 srcu_read_unlock(&kvm->srcu, idx);
957 }
958
959 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
960 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
961 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
962 .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
963 .clear_young = kvm_mmu_notifier_clear_young,
964 .test_young = kvm_mmu_notifier_test_young,
965 .change_pte = kvm_mmu_notifier_change_pte,
966 .release = kvm_mmu_notifier_release,
967 };
968
969 static int kvm_init_mmu_notifier(struct kvm *kvm)
970 {
971 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
972 return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
973 }
974
975 #else /* !CONFIG_KVM_GENERIC_MMU_NOTIFIER */
976
977 static int kvm_init_mmu_notifier(struct kvm *kvm)
978 {
979 return 0;
980 }
981
982 #endif /* CONFIG_KVM_GENERIC_MMU_NOTIFIER */
983
984 #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
985 static int kvm_pm_notifier_call(struct notifier_block *bl,
986 unsigned long state,
987 void *unused)
988 {
989 struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);
990
991 return kvm_arch_pm_notifier(kvm, state);
992 }
993
994 static void kvm_init_pm_notifier(struct kvm *kvm)
995 {
996 kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
997 /* Suspend KVM before we suspend ftrace, RCU, etc. */
998 kvm->pm_notifier.priority = INT_MAX;
999 register_pm_notifier(&kvm->pm_notifier);
1000 }
1001
1002 static void kvm_destroy_pm_notifier(struct kvm *kvm)
1003 {
1004 unregister_pm_notifier(&kvm->pm_notifier);
1005 }
1006 #else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
1007 static void kvm_init_pm_notifier(struct kvm *kvm)
1008 {
1009 }
1010
1011 static void kvm_destroy_pm_notifier(struct kvm *kvm)
1012 {
1013 }
1014 #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
1015
1016 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
1017 {
1018 if (!memslot->dirty_bitmap)
1019 return;
1020
1021 kvfree(memslot->dirty_bitmap);
1022 memslot->dirty_bitmap = NULL;
1023 }
1024
1025 /* This does not remove the slot from struct kvm_memslots data structures */
1026 static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
1027 {
1028 if (slot->flags & KVM_MEM_GUEST_MEMFD)
1029 kvm_gmem_unbind(slot);
1030
1031 kvm_destroy_dirty_bitmap(slot);
1032
1033 kvm_arch_free_memslot(kvm, slot);
1034
1035 kfree(slot);
1036 }
1037
1038 static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
1039 {
1040 struct hlist_node *idnode;
1041 struct kvm_memory_slot *memslot;
1042 int bkt;
1043
1044 /*
1045  * The same memslot objects live in both active and inactive sets;
1046  * arbitrarily free using index '1' so that the second invocation of this
1047 * function isn't operating over a structure with dangling pointers
1048 * (even though this function isn't actually touching them).
1049 */
1050 if (!slots->node_idx)
1051 return;
1052
1053 hash_for_each_safe(slots->id_hash, bkt, idnode, memslot, id_node[1])
1054 kvm_free_memslot(kvm, memslot);
1055 }
1056
1057 static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
1058 {
1059 switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
1060 case KVM_STATS_TYPE_INSTANT:
1061 return 0444;
1062 case KVM_STATS_TYPE_CUMULATIVE:
1063 case KVM_STATS_TYPE_PEAK:
1064 default:
1065 return 0644;
1066 }
1067 }
1068
1069
1070 static void kvm_destroy_vm_debugfs(struct kvm *kvm)
1071 {
1072 int i;
1073 int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
1074 kvm_vcpu_stats_header.num_desc;
1075
1076 if (IS_ERR(kvm->debugfs_dentry))
1077 return;
1078
1079 debugfs_remove_recursive(kvm->debugfs_dentry);
1080
1081 if (kvm->debugfs_stat_data) {
1082 for (i = 0; i < kvm_debugfs_num_entries; i++)
1083 kfree(kvm->debugfs_stat_data[i]);
1084 kfree(kvm->debugfs_stat_data);
1085 }
1086 }
1087
1088 static int kvm_create_vm_debugfs(struct kvm *kvm, const char *fdname)
1089 {
1090 static DEFINE_MUTEX(kvm_debugfs_lock);
1091 struct dentry *dent;
1092 char dir_name[ITOA_MAX_LEN * 2];
1093 struct kvm_stat_data *stat_data;
1094 const struct _kvm_stats_desc *pdesc;
1095 int i, ret = -ENOMEM;
1096 int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
1097 kvm_vcpu_stats_header.num_desc;
1098
1099 if (!debugfs_initialized())
1100 return 0;
1101
1102 snprintf(dir_name, sizeof(dir_name), "%d-%s", task_pid_nr(current), fdname);
1103 mutex_lock(&kvm_debugfs_lock);
1104 dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
1105 if (dent) {
1106 pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
1107 dput(dent);
1108 mutex_unlock(&kvm_debugfs_lock);
1109 return 0;
1110 }
1111 dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
1112 mutex_unlock(&kvm_debugfs_lock);
1113 if (IS_ERR(dent))
1114 return 0;
1115
1116 kvm->debugfs_dentry = dent;
1117 kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
1118 sizeof(*kvm->debugfs_stat_data),
1119 GFP_KERNEL_ACCOUNT);
1120 if (!kvm->debugfs_stat_data)
1121 goto out_err;
1122
1123 for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
1124 pdesc = &kvm_vm_stats_desc[i];
1125 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
1126 if (!stat_data)
1127 goto out_err;
1128
1129 stat_data->kvm = kvm;
1130 stat_data->desc = pdesc;
1131 stat_data->kind = KVM_STAT_VM;
1132 kvm->debugfs_stat_data[i] = stat_data;
1133 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
1134 kvm->debugfs_dentry, stat_data,
1135 &stat_fops_per_vm);
1136 }
1137
1138 for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
1139 pdesc = &kvm_vcpu_stats_desc[i];
1140 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
1141 if (!stat_data)
1142 goto out_err;
1143
1144 stat_data->kvm = kvm;
1145 stat_data->desc = pdesc;
1146 stat_data->kind = KVM_STAT_VCPU;
1147 kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
1148 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
1149 kvm->debugfs_dentry, stat_data,
1150 &stat_fops_per_vm);
1151 }
1152
1153 ret = kvm_arch_create_vm_debugfs(kvm);
1154 if (ret)
1155 goto out_err;
1156
1157 return 0;
1158 out_err:
1159 kvm_destroy_vm_debugfs(kvm);
1160 return ret;
1161 }
1162
1163 /*
1164 * Called after the VM is otherwise initialized, but just before adding it to
1165 * the vm_list.
1166 */
1167 int __weak kvm_arch_post_init_vm(struct kvm *kvm)
1168 {
1169 return 0;
1170 }
1171
1172 /*
1173 * Called just after removing the VM from the vm_list, but before doing any
1174 * other destruction.
1175 */
1176 void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
1177 {
1178 }
1179
1180 /*
1181  * Called after the per-VM debugfs is created. When called, kvm->debugfs_dentry
1182  * should already be set up, so arch-specific debugfs entries can be created
1183  * under it. Cleanup is done automatically and recursively in
1184  * kvm_destroy_vm_debugfs(), so a per-arch destroy interface is not needed.
1185 */
1186 int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
1187 {
1188 return 0;
1189 }
1190
1191 static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
1192 {
1193 struct kvm *kvm = kvm_arch_alloc_vm();
1194 struct kvm_memslots *slots;
1195 int r = -ENOMEM;
1196 int i, j;
1197
1198 if (!kvm)
1199 return ERR_PTR(-ENOMEM);
1200
1201 KVM_MMU_LOCK_INIT(kvm);
1202 mmgrab(current->mm);
1203 kvm->mm = current->mm;
1204 kvm_eventfd_init(kvm);
1205 mutex_init(&kvm->lock);
1206 mutex_init(&kvm->irq_lock);
1207 mutex_init(&kvm->slots_lock);
1208 mutex_init(&kvm->slots_arch_lock);
1209 spin_lock_init(&kvm->mn_invalidate_lock);
1210 rcuwait_init(&kvm->mn_memslots_update_rcuwait);
1211 xa_init(&kvm->vcpu_array);
1212 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
1213 xa_init(&kvm->mem_attr_array);
1214 #endif
1215
1216 INIT_LIST_HEAD(&kvm->gpc_list);
1217 spin_lock_init(&kvm->gpc_lock);
1218
1219 INIT_LIST_HEAD(&kvm->devices);
1220 kvm->max_vcpus = KVM_MAX_VCPUS;
1221
1222 BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
1223
1224 /*
1225 * Force subsequent debugfs file creations to fail if the VM directory
1226 * is not created (by kvm_create_vm_debugfs()).
1227 */
1228 kvm->debugfs_dentry = ERR_PTR(-ENOENT);
1229
1230 snprintf(kvm->stats_id, sizeof(kvm->stats_id), "kvm-%d",
1231 task_pid_nr(current));
1232
1233 if (init_srcu_struct(&kvm->srcu))
1234 goto out_err_no_srcu;
1235 if (init_srcu_struct(&kvm->irq_srcu))
1236 goto out_err_no_irq_srcu;
1237
1238 refcount_set(&kvm->users_count, 1);
1239 for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
1240 for (j = 0; j < 2; j++) {
1241 slots = &kvm->__memslots[i][j];
1242
1243 atomic_long_set(&slots->last_used_slot, (unsigned long)NULL);
1244 slots->hva_tree = RB_ROOT_CACHED;
1245 slots->gfn_tree = RB_ROOT;
1246 hash_init(slots->id_hash);
1247 slots->node_idx = j;
1248
1249 /* Generations must be different for each address space. */
1250 slots->generation = i;
1251 }
1252
1253 rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]);
1254 }
1255
1256 for (i = 0; i < KVM_NR_BUSES; i++) {
1257 rcu_assign_pointer(kvm->buses[i],
1258 kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
1259 if (!kvm->buses[i])
1260 goto out_err_no_arch_destroy_vm;
1261 }
1262
1263 r = kvm_arch_init_vm(kvm, type);
1264 if (r)
1265 goto out_err_no_arch_destroy_vm;
1266
1267 r = hardware_enable_all();
1268 if (r)
1269 goto out_err_no_disable;
1270
1271 #ifdef CONFIG_HAVE_KVM_IRQCHIP
1272 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
1273 #endif
1274
1275 r = kvm_init_mmu_notifier(kvm);
1276 if (r)
1277 goto out_err_no_mmu_notifier;
1278
1279 r = kvm_coalesced_mmio_init(kvm);
1280 if (r < 0)
1281 goto out_no_coalesced_mmio;
1282
1283 r = kvm_create_vm_debugfs(kvm, fdname);
1284 if (r)
1285 goto out_err_no_debugfs;
1286
1287 r = kvm_arch_post_init_vm(kvm);
1288 if (r)
1289 goto out_err;
1290
1291 mutex_lock(&kvm_lock);
1292 list_add(&kvm->vm_list, &vm_list);
1293 mutex_unlock(&kvm_lock);
1294
1295 preempt_notifier_inc();
1296 kvm_init_pm_notifier(kvm);
1297
1298 return kvm;
1299
1300 out_err:
1301 kvm_destroy_vm_debugfs(kvm);
1302 out_err_no_debugfs:
1303 kvm_coalesced_mmio_free(kvm);
1304 out_no_coalesced_mmio:
1305 #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
1306 if (kvm->mmu_notifier.ops)
1307 mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
1308 #endif
1309 out_err_no_mmu_notifier:
1310 hardware_disable_all();
1311 out_err_no_disable:
1312 kvm_arch_destroy_vm(kvm);
1313 out_err_no_arch_destroy_vm:
1314 WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
1315 for (i = 0; i < KVM_NR_BUSES; i++)
1316 kfree(kvm_get_bus(kvm, i));
1317 cleanup_srcu_struct(&kvm->irq_srcu);
1318 out_err_no_irq_srcu:
1319 cleanup_srcu_struct(&kvm->srcu);
1320 out_err_no_srcu:
1321 kvm_arch_free_vm(kvm);
1322 mmdrop(current->mm);
1323 return ERR_PTR(r);
1324 }
1325
1326 static void kvm_destroy_devices(struct kvm *kvm)
1327 {
1328 struct kvm_device *dev, *tmp;
1329
1330 /*
1331 * We do not need to take the kvm->lock here, because nobody else
1332 * has a reference to the struct kvm at this point and therefore
1333 * cannot access the devices list anyhow.
1334 */
1335 list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
1336 list_del(&dev->vm_node);
1337 dev->ops->destroy(dev);
1338 }
1339 }
1340
1341 static void kvm_destroy_vm(struct kvm *kvm)
1342 {
1343 int i;
1344 struct mm_struct *mm = kvm->mm;
1345
1346 kvm_destroy_pm_notifier(kvm);
1347 kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
1348 kvm_destroy_vm_debugfs(kvm);
1349 kvm_arch_sync_events(kvm);
1350 mutex_lock(&kvm_lock);
1351 list_del(&kvm->vm_list);
1352 mutex_unlock(&kvm_lock);
1353 kvm_arch_pre_destroy_vm(kvm);
1354
1355 kvm_free_irq_routing(kvm);
1356 for (i = 0; i < KVM_NR_BUSES; i++) {
1357 struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
1358
1359 if (bus)
1360 kvm_io_bus_destroy(bus);
1361 kvm->buses[i] = NULL;
1362 }
1363 kvm_coalesced_mmio_free(kvm);
1364 #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
1365 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
1366 /*
1367 * At this point, pending calls to invalidate_range_start()
1368 * have completed but no more MMU notifiers will run, so
1369 * mn_active_invalidate_count may remain unbalanced.
1370 * No threads can be waiting in kvm_swap_active_memslots() as the
1371 * last reference on KVM has been dropped, but freeing
1372 * memslots would deadlock without this manual intervention.
1373 *
1374 * If the count isn't unbalanced, i.e. KVM did NOT unregister its MMU
1375 * notifier between a start() and end(), then there shouldn't be any
1376 * in-progress invalidations.
1377 */
1378 WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
1379 if (kvm->mn_active_invalidate_count)
1380 kvm->mn_active_invalidate_count = 0;
1381 else
1382 WARN_ON(kvm->mmu_invalidate_in_progress);
1383 #else
1384 kvm_flush_shadow_all(kvm);
1385 #endif
1386 kvm_arch_destroy_vm(kvm);
1387 kvm_destroy_devices(kvm);
1388 for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
1389 kvm_free_memslots(kvm, &kvm->__memslots[i][0]);
1390 kvm_free_memslots(kvm, &kvm->__memslots[i][1]);
1391 }
1392 cleanup_srcu_struct(&kvm->irq_srcu);
1393 cleanup_srcu_struct(&kvm->srcu);
1394 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
1395 xa_destroy(&kvm->mem_attr_array);
1396 #endif
1397 kvm_arch_free_vm(kvm);
1398 preempt_notifier_dec();
1399 hardware_disable_all();
1400 mmdrop(mm);
1401 }
1402
1403 void kvm_get_kvm(struct kvm *kvm)
1404 {
1405 refcount_inc(&kvm->users_count);
1406 }
1407 EXPORT_SYMBOL_GPL(kvm_get_kvm);
1408
1409 /*
1410  * Make sure the VM is not being destroyed; this is a safe version of
1411  * kvm_get_kvm(). Returns true if kvm was referenced successfully, false otherwise.
1412 */
1413 bool kvm_get_kvm_safe(struct kvm *kvm)
1414 {
1415 return refcount_inc_not_zero(&kvm->users_count);
1416 }
1417 EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);
1418
1419 void kvm_put_kvm(struct kvm *kvm)
1420 {
1421 if (refcount_dec_and_test(&kvm->users_count))
1422 kvm_destroy_vm(kvm);
1423 }
1424 EXPORT_SYMBOL_GPL(kvm_put_kvm);
1425
1426 /*
1427 * Used to put a reference that was taken on behalf of an object associated
1428 * with a user-visible file descriptor, e.g. a vcpu or device, if installation
1429 * of the new file descriptor fails and the reference cannot be transferred to
1430 * its final owner. In such cases, the caller is still actively using @kvm and
1431 * will fail miserably if the refcount unexpectedly hits zero.
1432 */
1433 void kvm_put_kvm_no_destroy(struct kvm *kvm)
1434 {
1435 WARN_ON(refcount_dec_and_test(&kvm->users_count));
1436 }
1437 EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);
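/*
 * Illustrative sketch (hypothetical helper, not upstream KVM code): an
 * asynchronous worker that may race with VM destruction takes its reference
 * with kvm_get_kvm_safe() and drops it with kvm_put_kvm() when done.
 */
static void __maybe_unused example_async_use_of_kvm(struct kvm *kvm)
{
	if (!kvm_get_kvm_safe(kvm))
		return;		/* the VM is already being destroyed */

	/* ... safely use @kvm ... */

	kvm_put_kvm(kvm);
}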
1438
1439 static int kvm_vm_release(struct inode *inode, struct file *filp)
1440 {
1441 struct kvm *kvm = filp->private_data;
1442
1443 kvm_irqfd_release(kvm);
1444
1445 kvm_put_kvm(kvm);
1446 return 0;
1447 }
1448
1449 /*
1450 * Allocation size is twice as large as the actual dirty bitmap size.
1451  * See kvm_vm_ioctl_get_dirty_log() for why this is needed.
1452 */
1453 static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
1454 {
1455 unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot);
1456
1457 memslot->dirty_bitmap = __vcalloc(2, dirty_bytes, GFP_KERNEL_ACCOUNT);
1458 if (!memslot->dirty_bitmap)
1459 return -ENOMEM;
1460
1461 return 0;
1462 }
1463
1464 static struct kvm_memslots *kvm_get_inactive_memslots(struct kvm *kvm, int as_id)
1465 {
1466 struct kvm_memslots *active = __kvm_memslots(kvm, as_id);
1467 int node_idx_inactive = active->node_idx ^ 1;
1468
1469 return &kvm->__memslots[as_id][node_idx_inactive];
1470 }
1471
1472 /*
1473  * Helper to get the address space ID when one of the memslot pointers may be NULL.
1474  * This also serves as a sanity check that at least one of the pointers is non-NULL,
1475 * and that their address space IDs don't diverge.
1476 */
1477 static int kvm_memslots_get_as_id(struct kvm_memory_slot *a,
1478 struct kvm_memory_slot *b)
1479 {
1480 if (WARN_ON_ONCE(!a && !b))
1481 return 0;
1482
1483 if (!a)
1484 return b->as_id;
1485 if (!b)
1486 return a->as_id;
1487
1488 WARN_ON_ONCE(a->as_id != b->as_id);
1489 return a->as_id;
1490 }
1491
1492 static void kvm_insert_gfn_node(struct kvm_memslots *slots,
1493 struct kvm_memory_slot *slot)
1494 {
1495 struct rb_root *gfn_tree = &slots->gfn_tree;
1496 struct rb_node **node, *parent;
1497 int idx = slots->node_idx;
1498
1499 parent = NULL;
1500 for (node = &gfn_tree->rb_node; *node; ) {
1501 struct kvm_memory_slot *tmp;
1502
1503 tmp = container_of(*node, struct kvm_memory_slot, gfn_node[idx]);
1504 parent = *node;
1505 if (slot->base_gfn < tmp->base_gfn)
1506 node = &(*node)->rb_left;
1507 else if (slot->base_gfn > tmp->base_gfn)
1508 node = &(*node)->rb_right;
1509 else
1510 BUG();
1511 }
1512
1513 rb_link_node(&slot->gfn_node[idx], parent, node);
1514 rb_insert_color(&slot->gfn_node[idx], gfn_tree);
1515 }
1516
1517 static void kvm_erase_gfn_node(struct kvm_memslots *slots,
1518 struct kvm_memory_slot *slot)
1519 {
1520 rb_erase(&slot->gfn_node[slots->node_idx], &slots->gfn_tree);
1521 }
1522
1523 static void kvm_replace_gfn_node(struct kvm_memslots *slots,
1524 struct kvm_memory_slot *old,
1525 struct kvm_memory_slot *new)
1526 {
1527 int idx = slots->node_idx;
1528
1529 WARN_ON_ONCE(old->base_gfn != new->base_gfn);
1530
1531 rb_replace_node(&old->gfn_node[idx], &new->gfn_node[idx],
1532 &slots->gfn_tree);
1533 }
1534
1535 /*
1536 * Replace @old with @new in the inactive memslots.
1537 *
1538 * With NULL @old this simply adds @new.
1539 * With NULL @new this simply removes @old.
1540 *
1541 * If @new is non-NULL its hva_node[slots_idx] range has to be set
1542 * appropriately.
1543 */
1544 static void kvm_replace_memslot(struct kvm *kvm,
1545 struct kvm_memory_slot *old,
1546 struct kvm_memory_slot *new)
1547 {
1548 int as_id = kvm_memslots_get_as_id(old, new);
1549 struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
1550 int idx = slots->node_idx;
1551
1552 if (old) {
1553 hash_del(&old->id_node[idx]);
1554 interval_tree_remove(&old->hva_node[idx], &slots->hva_tree);
1555
1556 if ((long)old == atomic_long_read(&slots->last_used_slot))
1557 atomic_long_set(&slots->last_used_slot, (long)new);
1558
1559 if (!new) {
1560 kvm_erase_gfn_node(slots, old);
1561 return;
1562 }
1563 }
1564
1565 /*
1566  * Initialize @new's hva range. Do this even when replacing an @old
1567  * slot, as kvm_copy_memslot() deliberately does not touch node data.
1568 */
1569 new->hva_node[idx].start = new->userspace_addr;
1570 new->hva_node[idx].last = new->userspace_addr +
1571 (new->npages << PAGE_SHIFT) - 1;
1572
1573 /*
1574  * (Re)Add the new memslot. There is no O(1) interval_tree_replace(), so
1575  * the hva_node needs to be swapped with remove+insert even though the hva
1576  * can't change when replacing an existing slot.
1577 */
1578 hash_add(slots->id_hash, &new->id_node[idx], new->id);
1579 interval_tree_insert(&new->hva_node[idx], &slots->hva_tree);
1580
1581 /*
1582 * If the memslot gfn is unchanged, rb_replace_node() can be used to
1583 * switch the node in the gfn tree instead of removing the old and
1584 * inserting the new as two separate operations. Replacement is a
1585 * single O(1) operation versus two O(log(n)) operations for
1586 * remove+insert.
1587 */
1588 if (old && old->base_gfn == new->base_gfn) {
1589 kvm_replace_gfn_node(slots, old, new);
1590 } else {
1591 if (old)
1592 kvm_erase_gfn_node(slots, old);
1593 kvm_insert_gfn_node(slots, new);
1594 }
1595 }
1596
1597 /*
1598 * Flags that do not access any of the extra space of struct
1599 * kvm_userspace_memory_region2. KVM_SET_USER_MEMORY_REGION_V1_FLAGS
1600 * only allows these.
1601 */
1602 #define KVM_SET_USER_MEMORY_REGION_V1_FLAGS \
1603 (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)
1604
1605 static int check_memory_region_flags(struct kvm *kvm,
1606 const struct kvm_userspace_memory_region2 *mem)
1607 {
1608 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
1609
1610 if (kvm_arch_has_private_mem(kvm))
1611 valid_flags |= KVM_MEM_GUEST_MEMFD;
1612
1613 /* Dirty logging private memory is not currently supported. */
1614 if (mem->flags & KVM_MEM_GUEST_MEMFD)
1615 valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
1616
1617 #ifdef __KVM_HAVE_READONLY_MEM
1618 valid_flags |= KVM_MEM_READONLY;
1619 #endif
1620
1621 if (mem->flags & ~valid_flags)
1622 return -EINVAL;
1623
1624 return 0;
1625 }
1626
1627 static void kvm_swap_active_memslots(struct kvm *kvm, int as_id)
1628 {
1629 struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
1630
1631 	/* Grab the generation from the active memslots. */
1632 u64 gen = __kvm_memslots(kvm, as_id)->generation;
1633
1634 WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
1635 slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1636
1637 /*
1638 * Do not store the new memslots while there are invalidations in
1639 * progress, otherwise the locking in invalidate_range_start and
1640 * invalidate_range_end will be unbalanced.
1641 */
1642 spin_lock(&kvm->mn_invalidate_lock);
1643 prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
1644 while (kvm->mn_active_invalidate_count) {
1645 set_current_state(TASK_UNINTERRUPTIBLE);
1646 spin_unlock(&kvm->mn_invalidate_lock);
1647 schedule();
1648 spin_lock(&kvm->mn_invalidate_lock);
1649 }
1650 finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
1651 rcu_assign_pointer(kvm->memslots[as_id], slots);
1652 spin_unlock(&kvm->mn_invalidate_lock);
1653
1654 /*
1655  * Acquired in kvm_set_memslot. Must be released before the SRCU
1656  * synchronization below in order to avoid deadlock with another thread
1657  * acquiring the slots_arch_lock in an SRCU critical section.
1658 */
1659 mutex_unlock(&kvm->slots_arch_lock);
1660
1661 synchronize_srcu_expedited(&kvm->srcu);
1662
1663 /*
1664 * Increment the new memslot generation a second time, dropping the
1665 * update in-progress flag and incrementing the generation based on
1666 * the number of address spaces. This provides a unique and easily
1667 * identifiable generation number while the memslots are in flux.
1668 */
1669 gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1670
1671 /*
1672  * Generations must be unique even across address spaces. We do not need
1673  * a global counter for that; instead the generation space is evenly split
1674 * across address spaces. For example, with two address spaces, address
1675 * space 0 will use generations 0, 2, 4, ... while address space 1 will
1676 * use generations 1, 3, 5, ...
1677 */
1678 gen += kvm_arch_nr_memslot_as_ids(kvm);
1679
1680 kvm_arch_memslots_updated(kvm, gen);
1681
1682 slots->generation = gen;
1683 }
1684
1685 static int kvm_prepare_memory_region(struct kvm *kvm,
1686 const struct kvm_memory_slot *old,
1687 struct kvm_memory_slot *new,
1688 enum kvm_mr_change change)
1689 {
1690 int r;
1691
1692 /*
1693 * If dirty logging is disabled, nullify the bitmap; the old bitmap
1694 * will be freed on "commit". If logging is enabled in both old and
1695 * new, reuse the existing bitmap. If logging is enabled only in the
1696 * new and KVM isn't using a ring buffer, allocate and initialize a
1697 * new bitmap.
1698 */
1699 if (change != KVM_MR_DELETE) {
1700 if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
1701 new->dirty_bitmap = NULL;
1702 else if (old && old->dirty_bitmap)
1703 new->dirty_bitmap = old->dirty_bitmap;
1704 else if (kvm_use_dirty_bitmap(kvm)) {
1705 r = kvm_alloc_dirty_bitmap(new);
1706 if (r)
1707 return r;
1708
1709 if (kvm_dirty_log_manual_protect_and_init_set(kvm))
1710 bitmap_set(new->dirty_bitmap, 0, new->npages);
1711 }
1712 }
1713
1714 r = kvm_arch_prepare_memory_region(kvm, old, new, change);
1715
1716 /* Free the bitmap on failure if it was allocated above. */
1717 if (r && new && new->dirty_bitmap && (!old || !old->dirty_bitmap))
1718 kvm_destroy_dirty_bitmap(new);
1719
1720 return r;
1721 }
1722
1723 static void kvm_commit_memory_region(struct kvm *kvm,
1724 struct kvm_memory_slot *old,
1725 const struct kvm_memory_slot *new,
1726 enum kvm_mr_change change)
1727 {
1728 int old_flags = old ? old->flags : 0;
1729 int new_flags = new ? new->flags : 0;
1730 /*
1731 * Update the total number of memslot pages before calling the arch
1732 * hook so that architectures can consume the result directly.
1733 */
1734 if (change == KVM_MR_DELETE)
1735 kvm->nr_memslot_pages -= old->npages;
1736 else if (change == KVM_MR_CREATE)
1737 kvm->nr_memslot_pages += new->npages;
1738
1739 if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) {
1740 int change = (new_flags & KVM_MEM_LOG_DIRTY_PAGES) ? 1 : -1;
1741 atomic_set(&kvm->nr_memslots_dirty_logging,
1742 atomic_read(&kvm->nr_memslots_dirty_logging) + change);
1743 }
1744
1745 kvm_arch_commit_memory_region(kvm, old, new, change);
1746
1747 switch (change) {
1748 case KVM_MR_CREATE:
1749 /* Nothing more to do. */
1750 break;
1751 case KVM_MR_DELETE:
1752 /* Free the old memslot and all its metadata. */
1753 kvm_free_memslot(kvm, old);
1754 break;
1755 case KVM_MR_MOVE:
1756 case KVM_MR_FLAGS_ONLY:
1757 /*
1758  * Free the dirty bitmap as needed; the below check encompasses
1759  * both the flags and whether a ring buffer is being used.
1760 */
1761 if (old->dirty_bitmap && !new->dirty_bitmap)
1762 kvm_destroy_dirty_bitmap(old);
1763
1764 /*
1765 * The final quirk. Free the detached, old slot, but only its
1766 * memory, not any metadata. Metadata, including arch specific
1767 * data, may be reused by @new.
1768 */
1769 kfree(old);
1770 break;
1771 default:
1772 BUG();
1773 }
1774 }
1775
1776 /*
1777 * Activate @new, which must be installed in the inactive slots by the caller,
1778 * by swapping the active slots and then propagating @new to @old once @old is
1779 * unreachable and can be safely modified.
1780 *
1781 * With NULL @old this simply adds @new to @active (while swapping the sets).
1782 * With NULL @new this simply removes @old from @active and frees it
1783 * (while also swapping the sets).
1784 */
1785 static void kvm_activate_memslot(struct kvm *kvm,
1786 struct kvm_memory_slot *old,
1787 struct kvm_memory_slot *new)
1788 {
1789 int as_id = kvm_memslots_get_as_id(old, new);
1790
1791 kvm_swap_active_memslots(kvm, as_id);
1792
1793 /* Propagate the new memslot to the now inactive memslots. */
1794 kvm_replace_memslot(kvm, old, new);
1795 }
1796
1797 static void kvm_copy_memslot(struct kvm_memory_slot *dest,
1798 const struct kvm_memory_slot *src)
1799 {
1800 dest->base_gfn = src->base_gfn;
1801 dest->npages = src->npages;
1802 dest->dirty_bitmap = src->dirty_bitmap;
1803 dest->arch = src->arch;
1804 dest->userspace_addr = src->userspace_addr;
1805 dest->flags = src->flags;
1806 dest->id = src->id;
1807 dest->as_id = src->as_id;
1808 }
1809
1810 static void kvm_invalidate_memslot(struct kvm *kvm,
1811 struct kvm_memory_slot *old,
1812 struct kvm_memory_slot *invalid_slot)
1813 {
1814 /*
1815 * Mark the current slot INVALID. As with all memslot modifications,
1816 * this must be done on an unreachable slot to avoid modifying the
1817 * current slot in the active tree.
1818 */
1819 kvm_copy_memslot(invalid_slot, old);
1820 invalid_slot->flags |= KVM_MEMSLOT_INVALID;
1821 kvm_replace_memslot(kvm, old, invalid_slot);
1822
1823 /*
1824 * Activate the slot that is now marked INVALID, but don't propagate
1825 * the slot to the now inactive slots. The slot is either going to be
1826 * deleted or recreated as a new slot.
1827 */
1828 kvm_swap_active_memslots(kvm, old->as_id);
1829
1830 /*
1831 * From this point no new shadow pages pointing to a deleted, or moved,
1832 * memslot will be created. Validation of sp->gfn happens in:
1833 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
1834 * - kvm_is_visible_gfn (mmu_check_root)
1835 */
1836 kvm_arch_flush_shadow_memslot(kvm, old);
1837 kvm_arch_guest_memory_reclaimed(kvm);
1838
1839 /* Was released by kvm_swap_active_memslots(), reacquire. */
1840 mutex_lock(&kvm->slots_arch_lock);
1841
1842 /*
1843 * Copy the arch-specific field of the newly-installed slot back to the
1844 * old slot as the arch data could have changed between releasing
1845 * slots_arch_lock in kvm_swap_active_memslots() and re-acquiring the lock
1846 * above. Writers are required to retrieve memslots *after* acquiring
1847 * slots_arch_lock, thus the active slot's data is guaranteed to be fresh.
1848 */
1849 old->arch = invalid_slot->arch;
1850 }
1851
1852 static void kvm_create_memslot(struct kvm *kvm,
1853 struct kvm_memory_slot *new)
1854 {
1855 /* Add the new memslot to the inactive set and activate. */
1856 kvm_replace_memslot(kvm, NULL, new);
1857 kvm_activate_memslot(kvm, NULL, new);
1858 }
1859
1860 static void kvm_delete_memslot(struct kvm *kvm,
1861 struct kvm_memory_slot *old,
1862 struct kvm_memory_slot *invalid_slot)
1863 {
1864 /*
1865 * Remove the old memslot (in the inactive memslots) by passing NULL as
1866 * the "new" slot, and do the same for the invalid version in the active slots.
1867 */
1868 kvm_replace_memslot(kvm, old, NULL);
1869 kvm_activate_memslot(kvm, invalid_slot, NULL);
1870 }
1871
1872 static void kvm_move_memslot(struct kvm *kvm,
1873 struct kvm_memory_slot *old,
1874 struct kvm_memory_slot *new,
1875 struct kvm_memory_slot *invalid_slot)
1876 {
1877 /*
1878 * Replace the old memslot in the inactive slots, and then swap slots
1879 * and replace the current INVALID with the new as well.
1880 */
1881 kvm_replace_memslot(kvm, old, new);
1882 kvm_activate_memslot(kvm, invalid_slot, new);
1883 }
1884
1885 static void kvm_update_flags_memslot(struct kvm *kvm,
1886 struct kvm_memory_slot *old,
1887 struct kvm_memory_slot *new)
1888 {
1889 /*
1890 * Similar to the MOVE case, but the slot doesn't need to be zapped as
1891 * an intermediate step. Instead, the old memslot is simply replaced
1892 * with a new, updated copy in both memslot sets.
1893 */
1894 kvm_replace_memslot(kvm, old, new);
1895 kvm_activate_memslot(kvm, old, new);
1896 }
1897
1898 static int kvm_set_memslot(struct kvm *kvm,
1899 struct kvm_memory_slot *old,
1900 struct kvm_memory_slot *new,
1901 enum kvm_mr_change change)
1902 {
1903 struct kvm_memory_slot *invalid_slot;
1904 int r;
1905
1906 /*
1907 * Released in kvm_swap_active_memslots().
1908 *
1909 * Must be held from before the current memslots are copied until after
1910 * the new memslots are installed with rcu_assign_pointer, then
1911 * released before the synchronize srcu in kvm_swap_active_memslots().
1912 *
1913 * When modifying memslots outside of the slots_lock, it must be held
1914 * before reading the pointer to the current memslots until after all
1915 * changes to those memslots are complete.
1916 *
1917 * These rules ensure that installing new memslots does not lose
1918 * changes made to the previous memslots.
1919 */
1920 mutex_lock(&kvm->slots_arch_lock);
1921
1922 /*
1923 * Invalidate the old slot if it's being deleted or moved. This is
1924 * done prior to actually deleting/moving the memslot to allow vCPUs to
1925 * continue running by ensuring there are no mappings or shadow pages
1926 * for the memslot when it is deleted/moved. Without pre-invalidation
1927 * (and without a lock), a window would exist between effecting the
1928 * delete/move and committing the changes in arch code where KVM or a
1929 * guest could access a non-existent memslot.
1930 *
1931 * Modifications are done on a temporary, unreachable slot. The old
1932 * slot needs to be preserved in case a later step fails and the
1933 * invalidation needs to be reverted.
1934 */
1935 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1936 invalid_slot = kzalloc(sizeof(*invalid_slot), GFP_KERNEL_ACCOUNT);
1937 if (!invalid_slot) {
1938 mutex_unlock(&kvm->slots_arch_lock);
1939 return -ENOMEM;
1940 }
1941 kvm_invalidate_memslot(kvm, old, invalid_slot);
1942 }
1943
1944 r = kvm_prepare_memory_region(kvm, old, new, change);
1945 if (r) {
1946 /*
1947 * For DELETE/MOVE, revert the above INVALID change. No
1948 * modifications required since the original slot was preserved
1949 * in the inactive slots. Changing the active memslots also
1950 * release slots_arch_lock.
1951 */
1952 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1953 kvm_activate_memslot(kvm, invalid_slot, old);
1954 kfree(invalid_slot);
1955 } else {
1956 mutex_unlock(&kvm->slots_arch_lock);
1957 }
1958 return r;
1959 }
1960
1961 /*
1962 * For DELETE and MOVE, the working slot is now active as the INVALID
1963 * version of the old slot. MOVE is particularly special as it reuses
1964 * the old slot and returns a copy of the old slot (in invalid_slot).
1965 * For CREATE, there is no old slot. For DELETE and FLAGS_ONLY, the
1966 * old slot is detached but otherwise preserved.
1967 */
1968 if (change == KVM_MR_CREATE)
1969 kvm_create_memslot(kvm, new);
1970 else if (change == KVM_MR_DELETE)
1971 kvm_delete_memslot(kvm, old, invalid_slot);
1972 else if (change == KVM_MR_MOVE)
1973 kvm_move_memslot(kvm, old, new, invalid_slot);
1974 else if (change == KVM_MR_FLAGS_ONLY)
1975 kvm_update_flags_memslot(kvm, old, new);
1976 else
1977 BUG();
1978
1979 /* Free the temporary INVALID slot used for DELETE and MOVE. */
1980 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
1981 kfree(invalid_slot);
1982
1983 /*
1984 * No need to refresh new->arch, changes after dropping slots_arch_lock
1985 * will directly hit the final, active memslot. Architectures are
1986 * responsible for knowing that new->arch may be stale.
1987 */
1988 kvm_commit_memory_region(kvm, old, new, change);
1989
1990 return 0;
1991 }
1992
1993 static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
1994 gfn_t start, gfn_t end)
1995 {
1996 struct kvm_memslot_iter iter;
1997
1998 kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
1999 if (iter.slot->id != id)
2000 return true;
2001 }
2002
2003 return false;
2004 }
2005
2006 /*
2007 * Allocate some memory and give it an address in the guest physical address
2008 * space.
2009 *
2010 * Discontiguous memory is allowed, mostly for framebuffers.
2011 *
2012 * Must be called holding kvm->slots_lock for write.
2013 */
2014 int __kvm_set_memory_region(struct kvm *kvm,
2015 const struct kvm_userspace_memory_region2 *mem)
2016 {
2017 struct kvm_memory_slot *old, *new;
2018 struct kvm_memslots *slots;
2019 enum kvm_mr_change change;
2020 unsigned long npages;
2021 gfn_t base_gfn;
2022 int as_id, id;
2023 int r;
2024
2025 r = check_memory_region_flags(kvm, mem);
2026 if (r)
2027 return r;
2028
2029 as_id = mem->slot >> 16;
2030 id = (u16)mem->slot;
2031
2032 /* General sanity checks */
2033 if ((mem->memory_size & (PAGE_SIZE - 1)) ||
2034 (mem->memory_size != (unsigned long)mem->memory_size))
2035 return -EINVAL;
2036 if (mem->guest_phys_addr & (PAGE_SIZE - 1))
2037 return -EINVAL;
2038 /* We can read the guest memory with __xxx_user() later on. */
2039 if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
2040 (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
2041 !access_ok((void __user *)(unsigned long)mem->userspace_addr,
2042 mem->memory_size))
2043 return -EINVAL;
2044 if (mem->flags & KVM_MEM_GUEST_MEMFD &&
2045 (mem->guest_memfd_offset & (PAGE_SIZE - 1) ||
2046 mem->guest_memfd_offset + mem->memory_size < mem->guest_memfd_offset))
2047 return -EINVAL;
2048 if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_MEM_SLOTS_NUM)
2049 return -EINVAL;
2050 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
2051 return -EINVAL;
2052 if ((mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES)
2053 return -EINVAL;
2054
2055 slots = __kvm_memslots(kvm, as_id);
2056
2057 /*
2058 * Note, the old memslot (and the pointer itself!) may be invalidated
2059 * and/or destroyed by kvm_set_memslot().
2060 */
2061 old = id_to_memslot(slots, id);
2062
2063 if (!mem->memory_size) {
2064 if (!old || !old->npages)
2065 return -EINVAL;
2066
2067 if (WARN_ON_ONCE(kvm->nr_memslot_pages < old->npages))
2068 return -EIO;
2069
2070 return kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE);
2071 }
2072
2073 base_gfn = (mem->guest_phys_addr >> PAGE_SHIFT);
2074 npages = (mem->memory_size >> PAGE_SHIFT);
2075
2076 if (!old || !old->npages) {
2077 change = KVM_MR_CREATE;
2078
2079 /*
2080 * To simplify KVM internals, the total number of pages across
2081 * all memslots must fit in an unsigned long.
2082 */
2083 if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
2084 return -EINVAL;
2085 } else { /* Modify an existing slot. */
2086 /* Private memslots are immutable, they can only be deleted. */
2087 if (mem->flags & KVM_MEM_GUEST_MEMFD)
2088 return -EINVAL;
2089 if ((mem->userspace_addr != old->userspace_addr) ||
2090 (npages != old->npages) ||
2091 ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
2092 return -EINVAL;
2093
2094 if (base_gfn != old->base_gfn)
2095 change = KVM_MR_MOVE;
2096 else if (mem->flags != old->flags)
2097 change = KVM_MR_FLAGS_ONLY;
2098 else /* Nothing to change. */
2099 return 0;
2100 }
2101
2102 if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) &&
2103 kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages))
2104 return -EEXIST;
2105
2106 /* Allocate a slot that will persist in the memslots. */
2107 new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
2108 if (!new)
2109 return -ENOMEM;
2110
2111 new->as_id = as_id;
2112 new->id = id;
2113 new->base_gfn = base_gfn;
2114 new->npages = npages;
2115 new->flags = mem->flags;
2116 new->userspace_addr = mem->userspace_addr;
2117 if (mem->flags & KVM_MEM_GUEST_MEMFD) {
2118 r = kvm_gmem_bind(kvm, new, mem->guest_memfd, mem->guest_memfd_offset);
2119 if (r)
2120 goto out;
2121 }
2122
2123 r = kvm_set_memslot(kvm, old, new, change);
2124 if (r)
2125 goto out_unbind;
2126
2127 return 0;
2128
2129 out_unbind:
2130 if (mem->flags & KVM_MEM_GUEST_MEMFD)
2131 kvm_gmem_unbind(new);
2132 out:
2133 kfree(new);
2134 return r;
2135 }
2136 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
2137
2138 int kvm_set_memory_region(struct kvm *kvm,
2139 const struct kvm_userspace_memory_region2 *mem)
2140 {
2141 int r;
2142
2143 mutex_lock(&kvm->slots_lock);
2144 r = __kvm_set_memory_region(kvm, mem);
2145 mutex_unlock(&kvm->slots_lock);
2146 return r;
2147 }
2148 EXPORT_SYMBOL_GPL(kvm_set_memory_region);
2149
2150 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
2151 struct kvm_userspace_memory_region2 *mem)
2152 {
2153 if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
2154 return -EINVAL;
2155
2156 return kvm_set_memory_region(kvm, mem);
2157 }
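
/*
 * Illustrative userspace sketch (not part of kvm_main.c): registering guest
 * RAM through the memslot path handled above.  It uses the classic
 * KVM_SET_USER_MEMORY_REGION ioctl and struct; the KVM_SET_USER_MEMORY_REGION2
 * variant with guest_memfd handled by this file is omitted.  The vm_fd is
 * assumed to come from KVM_CREATE_VM.  Note how the slot field packs the
 * address-space id into bits 31:16 and the slot id into bits 15:0, matching
 * the decoding in __kvm_set_memory_region(); passing memory_size == 0 for an
 * existing slot deletes it.
 */
#if 0	/* example only, never compiled as part of KVM */
#include <linux/kvm.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

/* Register 2 MiB of anonymous memory at GPA 0 in address space 0, slot 0. */
static int set_guest_ram(int vm_fd)
{
	const uint64_t size = 2 * 1024 * 1024;
	struct kvm_userspace_memory_region region;
	void *host_mem;

	host_mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (host_mem == MAP_FAILED)
		return -1;

	region = (struct kvm_userspace_memory_region) {
		.slot = (0 << 16) | 0,		/* as_id 0, slot id 0 */
		.flags = KVM_MEM_LOG_DIRTY_PAGES,
		.guest_phys_addr = 0,
		.memory_size = size,		/* page aligned, checked above */
		.userspace_addr = (uint64_t)(uintptr_t)host_mem,
	};

	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}
#endif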
2158
2159 #ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
2160 /**
2161 * kvm_get_dirty_log - get a snapshot of dirty pages
2162 * @kvm: pointer to kvm instance
2163 * @log: slot id and address to which we copy the log
2164 * @is_dirty: set to '1' if any dirty pages were found
2165 * @memslot: set to the associated memslot, always valid on success
2166 */
2167 int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
2168 int *is_dirty, struct kvm_memory_slot **memslot)
2169 {
2170 struct kvm_memslots *slots;
2171 int i, as_id, id;
2172 unsigned long n;
2173 unsigned long any = 0;
2174
2175 /* Dirty ring tracking may be exclusive to dirty log tracking */
2176 if (!kvm_use_dirty_bitmap(kvm))
2177 return -ENXIO;
2178
2179 *memslot = NULL;
2180 *is_dirty = 0;
2181
2182 as_id = log->slot >> 16;
2183 id = (u16)log->slot;
2184 if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
2185 return -EINVAL;
2186
2187 slots = __kvm_memslots(kvm, as_id);
2188 *memslot = id_to_memslot(slots, id);
2189 if (!(*memslot) || !(*memslot)->dirty_bitmap)
2190 return -ENOENT;
2191
2192 kvm_arch_sync_dirty_log(kvm, *memslot);
2193
2194 n = kvm_dirty_bitmap_bytes(*memslot);
2195
2196 for (i = 0; !any && i < n/sizeof(long); ++i)
2197 any = (*memslot)->dirty_bitmap[i];
2198
2199 if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
2200 return -EFAULT;
2201
2202 if (any)
2203 *is_dirty = 1;
2204 return 0;
2205 }
2206 EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
2207
2208 #else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
2209 /**
2210 * kvm_get_dirty_log_protect - get a snapshot of dirty pages
2211 * and reenable dirty page tracking for the corresponding pages.
2212 * @kvm: pointer to kvm instance
2213 * @log: slot id and address to which we copy the log
2214 *
2215 * We need to keep in mind that VCPU threads can write to the bitmap
2216 * concurrently. So, to avoid losing track of dirty pages we keep the
2217 * following order:
2218 *
2219 * 1. Take a snapshot of the bit and clear it if needed.
2220 * 2. Write protect the corresponding page.
2221 * 3. Copy the snapshot to the userspace.
2222 * 4. Upon return, the caller flushes TLBs if needed.
2223 *
2224 * Between 2 and 4, the guest may write to the page using the remaining TLB
2225 * entry. This is not a problem because the page is reported dirty using
2226 * the snapshot taken before and step 4 ensures that writes done after
2227 * exiting to userspace will be logged for the next call.
2228 *
2229 */
2230 static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
2231 {
2232 struct kvm_memslots *slots;
2233 struct kvm_memory_slot *memslot;
2234 int i, as_id, id;
2235 unsigned long n;
2236 unsigned long *dirty_bitmap;
2237 unsigned long *dirty_bitmap_buffer;
2238 bool flush;
2239
2240 /* Dirty ring tracking may be exclusive to dirty log tracking */
2241 if (!kvm_use_dirty_bitmap(kvm))
2242 return -ENXIO;
2243
2244 as_id = log->slot >> 16;
2245 id = (u16)log->slot;
2246 if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
2247 return -EINVAL;
2248
2249 slots = __kvm_memslots(kvm, as_id);
2250 memslot = id_to_memslot(slots, id);
2251 if (!memslot || !memslot->dirty_bitmap)
2252 return -ENOENT;
2253
2254 dirty_bitmap = memslot->dirty_bitmap;
2255
2256 kvm_arch_sync_dirty_log(kvm, memslot);
2257
2258 n = kvm_dirty_bitmap_bytes(memslot);
2259 flush = false;
2260 if (kvm->manual_dirty_log_protect) {
2261 /*
2262 * Unlike kvm_get_dirty_log, we always return false in *flush,
2263 * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There
2264 * is some code duplication between this function and
2265 * kvm_get_dirty_log, but hopefully all architectures will
2266 * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
2267 * can be eliminated.
2268 */
2269 dirty_bitmap_buffer = dirty_bitmap;
2270 } else {
2271 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2272 memset(dirty_bitmap_buffer, 0, n);
2273
2274 KVM_MMU_LOCK(kvm);
2275 for (i = 0; i < n / sizeof(long); i++) {
2276 unsigned long mask;
2277 gfn_t offset;
2278
2279 if (!dirty_bitmap[i])
2280 continue;
2281
2282 flush = true;
2283 mask = xchg(&dirty_bitmap[i], 0);
2284 dirty_bitmap_buffer[i] = mask;
2285
2286 offset = i * BITS_PER_LONG;
2287 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2288 offset, mask);
2289 }
2290 KVM_MMU_UNLOCK(kvm);
2291 }
2292
2293 if (flush)
2294 kvm_flush_remote_tlbs_memslot(kvm, memslot);
2295
2296 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
2297 return -EFAULT;
2298 return 0;
2299 }
2300
2301
2302 /**
2303 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
2304 * @kvm: kvm instance
2305 * @log: slot id and address to which we copy the log
2306 *
2307 * Steps 1-4 below provide a general overview of dirty page logging. See
2308 * the kvm_get_dirty_log_protect() function description for additional details.
2309 *
2310 * We call kvm_get_dirty_log_protect() to handle steps 1-3; upon return we
2311 * always flush the TLB (step 4) even if a previous step failed and the dirty
2312 * bitmap may be corrupt. Regardless of the previous outcome, the KVM logging
2313 * API does not preclude a subsequent dirty log read by userspace. Flushing the
2314 * TLB ensures writes will be marked dirty for the next log read.
2315 *
2316 * 1. Take a snapshot of the bit and clear it if needed.
2317 * 2. Write protect the corresponding page.
2318 * 3. Copy the snapshot to the userspace.
2319 * 4. Flush TLB's if needed.
2320 */
2321 static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2322 struct kvm_dirty_log *log)
2323 {
2324 int r;
2325
2326 mutex_lock(&kvm->slots_lock);
2327
2328 r = kvm_get_dirty_log_protect(kvm, log);
2329
2330 mutex_unlock(&kvm->slots_lock);
2331 return r;
2332 }
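
/*
 * Illustrative userspace sketch (not part of kvm_main.c): one round of the
 * dirty-log protocol documented above.  KVM_GET_DIRTY_LOG snapshots and (in
 * the non-manual-protect mode) re-write-protects the slot; userspace then
 * walks the returned bitmap, which holds one bit per page of the slot and is
 * sized like kvm_dirty_bitmap_bytes().  Assumes the slot was created with
 * KVM_MEM_LOG_DIRTY_PAGES and that the caller knows the slot's page count.
 */
#if 0	/* example only, never compiled as part of KVM */
#include <linux/kvm.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>

static int fetch_dirty_log(int vm_fd, uint32_t slot, uint64_t slot_npages,
			   uint64_t *bitmap)
{
	struct kvm_dirty_log log = {
		.slot = slot,			/* as_id << 16 | slot id */
		.dirty_bitmap = bitmap,
	};

	/* Round up to a multiple of 64 bits, one bit per guest page. */
	memset(bitmap, 0, ((slot_npages + 63) / 64) * sizeof(uint64_t));

	return ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
}
#endif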
2333
2334 /**
2335 * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
2336 * and reenable dirty page tracking for the corresponding pages.
2337 * @kvm: pointer to kvm instance
2338 * @log: slot id and address from which to fetch the bitmap of dirty pages
2339 */
2340 static int kvm_clear_dirty_log_protect(struct kvm *kvm,
2341 struct kvm_clear_dirty_log *log)
2342 {
2343 struct kvm_memslots *slots;
2344 struct kvm_memory_slot *memslot;
2345 int as_id, id;
2346 gfn_t offset;
2347 unsigned long i, n;
2348 unsigned long *dirty_bitmap;
2349 unsigned long *dirty_bitmap_buffer;
2350 bool flush;
2351
2352 /* Dirty ring tracking may be exclusive to dirty log tracking */
2353 if (!kvm_use_dirty_bitmap(kvm))
2354 return -ENXIO;
2355
2356 as_id = log->slot >> 16;
2357 id = (u16)log->slot;
2358 if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
2359 return -EINVAL;
2360
2361 if (log->first_page & 63)
2362 return -EINVAL;
2363
2364 slots = __kvm_memslots(kvm, as_id);
2365 memslot = id_to_memslot(slots, id);
2366 if (!memslot || !memslot->dirty_bitmap)
2367 return -ENOENT;
2368
2369 dirty_bitmap = memslot->dirty_bitmap;
2370
2371 n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
2372
2373 if (log->first_page > memslot->npages ||
2374 log->num_pages > memslot->npages - log->first_page ||
2375 (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
2376 return -EINVAL;
2377
2378 kvm_arch_sync_dirty_log(kvm, memslot);
2379
2380 flush = false;
2381 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2382 if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
2383 return -EFAULT;
2384
2385 KVM_MMU_LOCK(kvm);
2386 for (offset = log->first_page, i = offset / BITS_PER_LONG,
2387 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
2388 i++, offset += BITS_PER_LONG) {
2389 unsigned long mask = *dirty_bitmap_buffer++;
2390 atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
2391 if (!mask)
2392 continue;
2393
2394 mask &= atomic_long_fetch_andnot(mask, p);
2395
2396 /*
2397 * mask contains the bits that really have been cleared. This
2398 * never includes any bits beyond the length of the memslot (if
2399 * the length is not aligned to 64 pages), therefore it is not
2400 * a problem if userspace sets them in log->dirty_bitmap.
2401 */
2402 if (mask) {
2403 flush = true;
2404 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2405 offset, mask);
2406 }
2407 }
2408 KVM_MMU_UNLOCK(kvm);
2409
2410 if (flush)
2411 kvm_flush_remote_tlbs_memslot(kvm, memslot);
2412
2413 return 0;
2414 }
2415
2416 static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
2417 struct kvm_clear_dirty_log *log)
2418 {
2419 int r;
2420
2421 mutex_lock(&kvm->slots_lock);
2422
2423 r = kvm_clear_dirty_log_protect(kvm, log);
2424
2425 mutex_unlock(&kvm->slots_lock);
2426 return r;
2427 }
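
/*
 * Illustrative userspace sketch (not part of kvm_main.c): with manual dirty
 * log protection enabled (KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2),
 * KVM_GET_DIRTY_LOG only snapshots; dirty tracking is re-armed with
 * KVM_CLEAR_DIRTY_LOG over a sub-range.  As checked above, first_page must be
 * 64-page aligned and num_pages a multiple of 64 unless the range runs to the
 * end of the slot.  Bits set in the supplied bitmap are the pages to clear.
 */
#if 0	/* example only, never compiled as part of KVM */
#include <linux/kvm.h>
#include <stdint.h>
#include <sys/ioctl.h>

static int clear_dirty_range(int vm_fd, uint32_t slot, uint64_t first_page,
			     uint32_t num_pages, uint64_t *bitmap)
{
	struct kvm_clear_dirty_log clear = {
		.slot = slot,
		.num_pages = num_pages,
		.first_page = first_page,
		.dirty_bitmap = bitmap,
	};

	return ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear);
}
#endif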
2428 #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
2429
2430 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
2431 /*
2432 * Returns true if _all_ gfns in the range [@start, @end) have attributes
2433 * matching @attrs.
2434 */
2435 bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
2436 unsigned long attrs)
2437 {
2438 XA_STATE(xas, &kvm->mem_attr_array, start);
2439 unsigned long index;
2440 bool has_attrs;
2441 void *entry;
2442
2443 rcu_read_lock();
2444
2445 if (!attrs) {
2446 has_attrs = !xas_find(&xas, end - 1);
2447 goto out;
2448 }
2449
2450 has_attrs = true;
2451 for (index = start; index < end; index++) {
2452 do {
2453 entry = xas_next(&xas);
2454 } while (xas_retry(&xas, entry));
2455
2456 if (xas.xa_index != index || xa_to_value(entry) != attrs) {
2457 has_attrs = false;
2458 break;
2459 }
2460 }
2461
2462 out:
2463 rcu_read_unlock();
2464 return has_attrs;
2465 }
2466
2467 static u64 kvm_supported_mem_attributes(struct kvm *kvm)
2468 {
2469 if (!kvm || kvm_arch_has_private_mem(kvm))
2470 return KVM_MEMORY_ATTRIBUTE_PRIVATE;
2471
2472 return 0;
2473 }
2474
2475 static __always_inline void kvm_handle_gfn_range(struct kvm *kvm,
2476 struct kvm_mmu_notifier_range *range)
2477 {
2478 struct kvm_gfn_range gfn_range;
2479 struct kvm_memory_slot *slot;
2480 struct kvm_memslots *slots;
2481 struct kvm_memslot_iter iter;
2482 bool found_memslot = false;
2483 bool ret = false;
2484 int i;
2485
2486 gfn_range.arg = range->arg;
2487 gfn_range.may_block = range->may_block;
2488
2489 for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
2490 slots = __kvm_memslots(kvm, i);
2491
2492 kvm_for_each_memslot_in_gfn_range(&iter, slots, range->start, range->end) {
2493 slot = iter.slot;
2494 gfn_range.slot = slot;
2495
2496 gfn_range.start = max(range->start, slot->base_gfn);
2497 gfn_range.end = min(range->end, slot->base_gfn + slot->npages);
2498 if (gfn_range.start >= gfn_range.end)
2499 continue;
2500
2501 if (!found_memslot) {
2502 found_memslot = true;
2503 KVM_MMU_LOCK(kvm);
2504 if (!IS_KVM_NULL_FN(range->on_lock))
2505 range->on_lock(kvm);
2506 }
2507
2508 ret |= range->handler(kvm, &gfn_range);
2509 }
2510 }
2511
2512 if (range->flush_on_ret && ret)
2513 kvm_flush_remote_tlbs(kvm);
2514
2515 if (found_memslot)
2516 KVM_MMU_UNLOCK(kvm);
2517 }
2518
2519 static bool kvm_pre_set_memory_attributes(struct kvm *kvm,
2520 struct kvm_gfn_range *range)
2521 {
2522 /*
2523 * Unconditionally add the range to the invalidation set, regardless of
2524 * whether or not the arch callback actually needs to zap SPTEs. E.g.
2525 * if KVM supports RWX attributes in the future and the attributes are
2526 * going from R=>RW, zapping isn't strictly necessary. Unconditionally
2527 * adding the range allows KVM to require that MMU invalidations add at
2528 * least one range between begin() and end(), e.g. allows KVM to detect
2529 * bugs where the add() is missed. Relaxing the rule *might* be safe,
2530 * but it's not obvious that allowing new mappings while the attributes
2531 * are in flux is desirable or worth the complexity.
2532 */
2533 kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
2534
2535 return kvm_arch_pre_set_memory_attributes(kvm, range);
2536 }
2537
2538 /* Set @attributes for the gfn range [@start, @end). */
2539 static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
2540 unsigned long attributes)
2541 {
2542 struct kvm_mmu_notifier_range pre_set_range = {
2543 .start = start,
2544 .end = end,
2545 .handler = kvm_pre_set_memory_attributes,
2546 .on_lock = kvm_mmu_invalidate_begin,
2547 .flush_on_ret = true,
2548 .may_block = true,
2549 };
2550 struct kvm_mmu_notifier_range post_set_range = {
2551 .start = start,
2552 .end = end,
2553 .arg.attributes = attributes,
2554 .handler = kvm_arch_post_set_memory_attributes,
2555 .on_lock = kvm_mmu_invalidate_end,
2556 .may_block = true,
2557 };
2558 unsigned long i;
2559 void *entry;
2560 int r = 0;
2561
2562 entry = attributes ? xa_mk_value(attributes) : NULL;
2563
2564 mutex_lock(&kvm->slots_lock);
2565
2566 /* Nothing to do if the entire range already has the desired attributes. */
2567 if (kvm_range_has_memory_attributes(kvm, start, end, attributes))
2568 goto out_unlock;
2569
2570 /*
2571 * Reserve memory ahead of time to avoid having to deal with failures
2572 * partway through setting the new attributes.
2573 */
2574 for (i = start; i < end; i++) {
2575 r = xa_reserve(&kvm->mem_attr_array, i, GFP_KERNEL_ACCOUNT);
2576 if (r)
2577 goto out_unlock;
2578 }
2579
2580 kvm_handle_gfn_range(kvm, &pre_set_range);
2581
2582 for (i = start; i < end; i++) {
2583 r = xa_err(xa_store(&kvm->mem_attr_array, i, entry,
2584 GFP_KERNEL_ACCOUNT));
2585 KVM_BUG_ON(r, kvm);
2586 }
2587
2588 kvm_handle_gfn_range(kvm, &post_set_range);
2589
2590 out_unlock:
2591 mutex_unlock(&kvm->slots_lock);
2592
2593 return r;
2594 }
2595 static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
2596 struct kvm_memory_attributes *attrs)
2597 {
2598 gfn_t start, end;
2599
2600 /* flags is currently not used. */
2601 if (attrs->flags)
2602 return -EINVAL;
2603 if (attrs->attributes & ~kvm_supported_mem_attributes(kvm))
2604 return -EINVAL;
2605 if (attrs->size == 0 || attrs->address + attrs->size < attrs->address)
2606 return -EINVAL;
2607 if (!PAGE_ALIGNED(attrs->address) || !PAGE_ALIGNED(attrs->size))
2608 return -EINVAL;
2609
2610 start = attrs->address >> PAGE_SHIFT;
2611 end = (attrs->address + attrs->size) >> PAGE_SHIFT;
2612
2613 /*
2614 * xarray tracks data using "unsigned long", and as a result so does
2615 * KVM. For simplicity, generic attributes are supported only on 64-bit
2616 * architectures.
2617 */
2618 BUILD_BUG_ON(sizeof(attrs->attributes) != sizeof(unsigned long));
2619
2620 return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes);
2621 }
2622 #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
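
/*
 * Illustrative userspace sketch (not part of kvm_main.c): setting memory
 * attributes via the path handled above.  The field names mirror the struct
 * consumed by kvm_vm_ioctl_set_mem_attributes(); address and size must be
 * page aligned, flags must be zero, and KVM_MEMORY_ATTRIBUTE_PRIVATE is only
 * accepted when kvm_supported_mem_attributes() reports it (private memory).
 */
#if 0	/* example only, never compiled as part of KVM */
#include <linux/kvm.h>
#include <stdint.h>
#include <sys/ioctl.h>

/* Mark the gfn range backing [gpa, gpa + size) as private. */
static int set_range_private(int vm_fd, uint64_t gpa, uint64_t size)
{
	struct kvm_memory_attributes attrs = {
		.address = gpa,
		.size = size,
		.attributes = KVM_MEMORY_ATTRIBUTE_PRIVATE,
		.flags = 0,
	};

	return ioctl(vm_fd, KVM_SET_MEMORY_ATTRIBUTES, &attrs);
}
#endif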
2623
2624 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
2625 {
2626 return __gfn_to_memslot(kvm_memslots(kvm), gfn);
2627 }
2628 EXPORT_SYMBOL_GPL(gfn_to_memslot);
2629
2630 struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
2631 {
2632 struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
2633 u64 gen = slots->generation;
2634 struct kvm_memory_slot *slot;
2635
2636 /*
2637 * This also protects against using a memslot from a different address space,
2638 * since different address spaces have different generation numbers.
2639 */
2640 if (unlikely(gen != vcpu->last_used_slot_gen)) {
2641 vcpu->last_used_slot = NULL;
2642 vcpu->last_used_slot_gen = gen;
2643 }
2644
2645 slot = try_get_memslot(vcpu->last_used_slot, gfn);
2646 if (slot)
2647 return slot;
2648
2649 /*
2650 * Fall back to searching all memslots. We purposely use
2651 * search_memslots() instead of __gfn_to_memslot() to avoid
2652 * thrashing the VM-wide last_used_slot in kvm_memslots.
2653 */
2654 slot = search_memslots(slots, gfn, false);
2655 if (slot) {
2656 vcpu->last_used_slot = slot;
2657 return slot;
2658 }
2659
2660 return NULL;
2661 }
2662
2663 bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
2664 {
2665 struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
2666
2667 return kvm_is_visible_memslot(memslot);
2668 }
2669 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
2670
2671 bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2672 {
2673 struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2674
2675 return kvm_is_visible_memslot(memslot);
2676 }
2677 EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);
2678
2679 unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
2680 {
2681 struct vm_area_struct *vma;
2682 unsigned long addr, size;
2683
2684 size = PAGE_SIZE;
2685
2686 addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
2687 if (kvm_is_error_hva(addr))
2688 return PAGE_SIZE;
2689
2690 mmap_read_lock(current->mm);
2691 vma = find_vma(current->mm, addr);
2692 if (!vma)
2693 goto out;
2694
2695 size = vma_kernel_pagesize(vma);
2696
2697 out:
2698 mmap_read_unlock(current->mm);
2699
2700 return size;
2701 }
2702
2703 static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
2704 {
2705 return slot->flags & KVM_MEM_READONLY;
2706 }
2707
2708 static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
2709 gfn_t *nr_pages, bool write)
2710 {
2711 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
2712 return KVM_HVA_ERR_BAD;
2713
2714 if (memslot_is_readonly(slot) && write)
2715 return KVM_HVA_ERR_RO_BAD;
2716
2717 if (nr_pages)
2718 *nr_pages = slot->npages - (gfn - slot->base_gfn);
2719
2720 return __gfn_to_hva_memslot(slot, gfn);
2721 }
2722
2723 static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2724 gfn_t *nr_pages)
2725 {
2726 return __gfn_to_hva_many(slot, gfn, nr_pages, true);
2727 }
2728
2729 unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
2730 gfn_t gfn)
2731 {
2732 return gfn_to_hva_many(slot, gfn, NULL);
2733 }
2734 EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
2735
2736 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
2737 {
2738 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
2739 }
2740 EXPORT_SYMBOL_GPL(gfn_to_hva);
2741
2742 unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
2743 {
2744 return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
2745 }
2746 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
2747
2748 /*
2749 * Return the hva of a @gfn and the R/W attribute if possible.
2750 *
2751 * @slot: the kvm_memory_slot which contains @gfn
2752 * @gfn: the gfn to be translated
2753 * @writable: used to return the read/write attribute of the @slot if the hva
2754 * is valid and @writable is not NULL
2755 */
2756 unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
2757 gfn_t gfn, bool *writable)
2758 {
2759 unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
2760
2761 if (!kvm_is_error_hva(hva) && writable)
2762 *writable = !memslot_is_readonly(slot);
2763
2764 return hva;
2765 }
2766
2767 unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
2768 {
2769 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2770
2771 return gfn_to_hva_memslot_prot(slot, gfn, writable);
2772 }
2773
2774 unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
2775 {
2776 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2777
2778 return gfn_to_hva_memslot_prot(slot, gfn, writable);
2779 }
2780
2781 static inline int check_user_page_hwpoison(unsigned long addr)
2782 {
2783 int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
2784
2785 rc = get_user_pages(addr, 1, flags, NULL);
2786 return rc == -EHWPOISON;
2787 }
2788
2789 /*
2790 * The fast path to get the writable pfn, which will be stored in @pfn;
2791 * true indicates success, otherwise false is returned. It's also the
2792 * only part that runs if we are in atomic context.
2793 */
2794 static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
2795 bool *writable, kvm_pfn_t *pfn)
2796 {
2797 struct page *page[1];
2798
2799 /*
2800 * Fast pin a writable pfn only if it is a write fault request
2801 * or the caller allows mapping a writable pfn for a read fault
2802 * request.
2803 */
2804 if (!(write_fault || writable))
2805 return false;
2806
2807 if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
2808 *pfn = page_to_pfn(page[0]);
2809
2810 if (writable)
2811 *writable = true;
2812 return true;
2813 }
2814
2815 return false;
2816 }
2817
2818 /*
2819 * The slow path to get the pfn of the specified host virtual address;
2820 * 1 indicates success, and -errno is returned if an error is detected.
2821 */
2822 static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
2823 bool interruptible, bool *writable, kvm_pfn_t *pfn)
2824 {
2825 /*
2826 * When a VCPU accesses a page that is not mapped into the secondary
2827 * MMU, we lookup the page using GUP to map it, so the guest VCPU can
2828 * make progress. We always want to honor NUMA hinting faults in that
2829 * case, because GUP usage corresponds to memory accesses from the VCPU.
2830 * Otherwise, we'd not trigger NUMA hinting faults once a page is
2831 * mapped into the secondary MMU and gets accessed by a VCPU.
2832 *
2833 * Note that get_user_page_fast_only() and FOLL_WRITE for now
2834 * implicitly honor NUMA hinting faults and don't need this flag.
2835 */
2836 unsigned int flags = FOLL_HWPOISON | FOLL_HONOR_NUMA_FAULT;
2837 struct page *page;
2838 int npages;
2839
2840 might_sleep();
2841
2842 if (writable)
2843 *writable = write_fault;
2844
2845 if (write_fault)
2846 flags |= FOLL_WRITE;
2847 if (async)
2848 flags |= FOLL_NOWAIT;
2849 if (interruptible)
2850 flags |= FOLL_INTERRUPTIBLE;
2851
2852 npages = get_user_pages_unlocked(addr, 1, &page, flags);
2853 if (npages != 1)
2854 return npages;
2855
2856 /* map read fault as writable if possible */
2857 if (unlikely(!write_fault) && writable) {
2858 struct page *wpage;
2859
2860 if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
2861 *writable = true;
2862 put_page(page);
2863 page = wpage;
2864 }
2865 }
2866 *pfn = page_to_pfn(page);
2867 return npages;
2868 }
2869
2870 static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
2871 {
2872 if (unlikely(!(vma->vm_flags & VM_READ)))
2873 return false;
2874
2875 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
2876 return false;
2877
2878 return true;
2879 }
2880
2881 static int kvm_try_get_pfn(kvm_pfn_t pfn)
2882 {
2883 struct page *page = kvm_pfn_to_refcounted_page(pfn);
2884
2885 if (!page)
2886 return 1;
2887
2888 return get_page_unless_zero(page);
2889 }
2890
2891 static int hva_to_pfn_remapped(struct vm_area_struct *vma,
2892 unsigned long addr, bool write_fault,
2893 bool *writable, kvm_pfn_t *p_pfn)
2894 {
2895 kvm_pfn_t pfn;
2896 pte_t *ptep;
2897 pte_t pte;
2898 spinlock_t *ptl;
2899 int r;
2900
2901 r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
2902 if (r) {
2903 /*
2904 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
2905 * not call the fault handler, so do it here.
2906 */
2907 bool unlocked = false;
2908 r = fixup_user_fault(current->mm, addr,
2909 (write_fault ? FAULT_FLAG_WRITE : 0),
2910 &unlocked);
2911 if (unlocked)
2912 return -EAGAIN;
2913 if (r)
2914 return r;
2915
2916 r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
2917 if (r)
2918 return r;
2919 }
2920
2921 pte = ptep_get(ptep);
2922
2923 if (write_fault && !pte_write(pte)) {
2924 pfn = KVM_PFN_ERR_RO_FAULT;
2925 goto out;
2926 }
2927
2928 if (writable)
2929 *writable = pte_write(pte);
2930 pfn = pte_pfn(pte);
2931
2932 /*
2933 * Get a reference here because callers of *hva_to_pfn* and
2934 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
2935 * returned pfn. This is only needed if the VMA has VM_MIXEDMAP
2936 * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will
2937 * simply do nothing for reserved pfns.
2938 *
2939 * Whoever called remap_pfn_range is also going to call e.g.
2940 * unmap_mapping_range before the underlying pages are freed,
2941 * causing a call to our MMU notifier.
2942 *
2943 * Certain IO or PFNMAP mappings can be backed with valid
2944 * struct pages, but be allocated without refcounting e.g.,
2945 * tail pages of non-compound higher order allocations, which
2946 * would then underflow the refcount when the caller does the
2947 * required put_page. Don't allow those pages here.
2948 */
2949 if (!kvm_try_get_pfn(pfn))
2950 r = -EFAULT;
2951
2952 out:
2953 pte_unmap_unlock(ptep, ptl);
2954 *p_pfn = pfn;
2955
2956 return r;
2957 }
2958
2959 /*
2960 * Pin guest page in memory and return its pfn.
2961 * @addr: host virtual address which maps memory to the guest
2962 * @atomic: whether the call is in atomic context and thus must not sleep
2963 * @interruptible: whether the process can be interrupted by non-fatal signals
2964 * @async: whether this function needs to wait for IO to complete if the
2965 * host page is not in memory
2966 * @write_fault: whether we should get a writable host page
2967 * @writable: whether mapping a writable host page for !@write_fault is allowed
2968 *
2969 * The function will map a writable host page for these two cases:
2970 * 1): @write_fault = true
2971 * 2): @write_fault = false && @writable, @writable will tell the caller
2972 * whether the mapping is writable.
2973 */
2974 kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
2975 bool *async, bool write_fault, bool *writable)
2976 {
2977 struct vm_area_struct *vma;
2978 kvm_pfn_t pfn;
2979 int npages, r;
2980
2981 /* we can do it either atomically or asynchronously, not both */
2982 BUG_ON(atomic && async);
2983
2984 if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
2985 return pfn;
2986
2987 if (atomic)
2988 return KVM_PFN_ERR_FAULT;
2989
2990 npages = hva_to_pfn_slow(addr, async, write_fault, interruptible,
2991 writable, &pfn);
2992 if (npages == 1)
2993 return pfn;
2994 if (npages == -EINTR)
2995 return KVM_PFN_ERR_SIGPENDING;
2996
2997 mmap_read_lock(current->mm);
2998 if (npages == -EHWPOISON ||
2999 (!async && check_user_page_hwpoison(addr))) {
3000 pfn = KVM_PFN_ERR_HWPOISON;
3001 goto exit;
3002 }
3003
3004 retry:
3005 vma = vma_lookup(current->mm, addr);
3006
3007 if (vma == NULL)
3008 pfn = KVM_PFN_ERR_FAULT;
3009 else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
3010 r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn);
3011 if (r == -EAGAIN)
3012 goto retry;
3013 if (r < 0)
3014 pfn = KVM_PFN_ERR_FAULT;
3015 } else {
3016 if (async && vma_is_valid(vma, write_fault))
3017 *async = true;
3018 pfn = KVM_PFN_ERR_FAULT;
3019 }
3020 exit:
3021 mmap_read_unlock(current->mm);
3022 return pfn;
3023 }
3024
3025 kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
3026 bool atomic, bool interruptible, bool *async,
3027 bool write_fault, bool *writable, hva_t *hva)
3028 {
3029 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
3030
3031 if (hva)
3032 *hva = addr;
3033
3034 if (addr == KVM_HVA_ERR_RO_BAD) {
3035 if (writable)
3036 *writable = false;
3037 return KVM_PFN_ERR_RO_FAULT;
3038 }
3039
3040 if (kvm_is_error_hva(addr)) {
3041 if (writable)
3042 *writable = false;
3043 return KVM_PFN_NOSLOT;
3044 }
3045
3046 /* Do not map writable pfn in the readonly memslot. */
3047 if (writable && memslot_is_readonly(slot)) {
3048 *writable = false;
3049 writable = NULL;
3050 }
3051
3052 return hva_to_pfn(addr, atomic, interruptible, async, write_fault,
3053 writable);
3054 }
3055 EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
3056
3057 kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
3058 bool *writable)
3059 {
3060 return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false,
3061 NULL, write_fault, writable, NULL);
3062 }
3063 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
3064
3065 kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
3066 {
3067 return __gfn_to_pfn_memslot(slot, gfn, false, false, NULL, true,
3068 NULL, NULL);
3069 }
3070 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
3071
3072 kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn)
3073 {
3074 return __gfn_to_pfn_memslot(slot, gfn, true, false, NULL, true,
3075 NULL, NULL);
3076 }
3077 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
3078
3079 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
3080 {
3081 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
3082 }
3083 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
3084
3085 kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
3086 {
3087 return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
3088 }
3089 EXPORT_SYMBOL_GPL(gfn_to_pfn);
3090
3091 kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
3092 {
3093 return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
3094 }
3095 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
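
/*
 * Illustrative in-kernel sketch (not a real KVM helper): gfn_to_pfn()
 * elevates the refcount of the backing page when there is one, so every
 * successful translation should be paired with kvm_release_pfn_clean() or
 * kvm_release_pfn_dirty(), and error pfns must be checked before use.
 */
#if 0	/* example only, never compiled as part of KVM */
static int example_touch_gfn(struct kvm *kvm, gfn_t gfn)
{
	kvm_pfn_t pfn = gfn_to_pfn(kvm, gfn);

	if (is_error_noslot_pfn(pfn))
		return -EFAULT;

	/* ... access the page, e.g. via kmap_local_page(pfn_to_page(pfn)) ... */

	/* Dirty release keeps dirty logging and page aging accurate. */
	kvm_release_pfn_dirty(pfn);
	return 0;
}
#endif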
3096
3097 int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
3098 struct page **pages, int nr_pages)
3099 {
3100 unsigned long addr;
3101 gfn_t entry = 0;
3102
3103 addr = gfn_to_hva_many(slot, gfn, &entry);
3104 if (kvm_is_error_hva(addr))
3105 return -1;
3106
3107 if (entry < nr_pages)
3108 return 0;
3109
3110 return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
3111 }
3112 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
3113
3114 /*
3115 * Do not use this helper unless you are absolutely certain the gfn _must_ be
3116 * backed by 'struct page'. A valid example is if the backing memslot is
3117 * controlled by KVM. Note, if the returned page is valid, its refcount has
3118 * been elevated by gfn_to_pfn().
3119 */
3120 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
3121 {
3122 struct page *page;
3123 kvm_pfn_t pfn;
3124
3125 pfn = gfn_to_pfn(kvm, gfn);
3126
3127 if (is_error_noslot_pfn(pfn))
3128 return KVM_ERR_PTR_BAD_PAGE;
3129
3130 page = kvm_pfn_to_refcounted_page(pfn);
3131 if (!page)
3132 return KVM_ERR_PTR_BAD_PAGE;
3133
3134 return page;
3135 }
3136 EXPORT_SYMBOL_GPL(gfn_to_page);
3137
3138 void kvm_release_pfn(kvm_pfn_t pfn, bool dirty)
3139 {
3140 if (dirty)
3141 kvm_release_pfn_dirty(pfn);
3142 else
3143 kvm_release_pfn_clean(pfn);
3144 }
3145
3146 int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
3147 {
3148 kvm_pfn_t pfn;
3149 void *hva = NULL;
3150 struct page *page = KVM_UNMAPPED_PAGE;
3151
3152 if (!map)
3153 return -EINVAL;
3154
3155 pfn = gfn_to_pfn(vcpu->kvm, gfn);
3156 if (is_error_noslot_pfn(pfn))
3157 return -EINVAL;
3158
3159 if (pfn_valid(pfn)) {
3160 page = pfn_to_page(pfn);
3161 hva = kmap(page);
3162 #ifdef CONFIG_HAS_IOMEM
3163 } else {
3164 hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
3165 #endif
3166 }
3167
3168 if (!hva)
3169 return -EFAULT;
3170
3171 map->page = page;
3172 map->hva = hva;
3173 map->pfn = pfn;
3174 map->gfn = gfn;
3175
3176 return 0;
3177 }
3178 EXPORT_SYMBOL_GPL(kvm_vcpu_map);
3179
3180 void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
3181 {
3182 if (!map)
3183 return;
3184
3185 if (!map->hva)
3186 return;
3187
3188 if (map->page != KVM_UNMAPPED_PAGE)
3189 kunmap(map->page);
3190 #ifdef CONFIG_HAS_IOMEM
3191 else
3192 memunmap(map->hva);
3193 #endif
3194
3195 if (dirty)
3196 kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
3197
3198 kvm_release_pfn(map->pfn, dirty);
3199
3200 map->hva = NULL;
3201 map->page = NULL;
3202 }
3203 EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
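
/*
 * Illustrative in-kernel sketch (not a real KVM helper): kvm_vcpu_map() hides
 * whether the pfn is backed by struct page or by IO memory, and
 * kvm_vcpu_unmap() with @dirty set marks the gfn dirty before releasing the
 * pfn, so the map/unmap calls must always be paired.
 */
#if 0	/* example only, never compiled as part of KVM */
static int example_poke_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, u8 val)
{
	struct kvm_host_map map;

	if (kvm_vcpu_map(vcpu, gfn, &map))
		return -EFAULT;

	*(u8 *)map.hva = val;

	/* @dirty == true: mark the page dirty and release the pfn. */
	kvm_vcpu_unmap(vcpu, &map, true);
	return 0;
}
#endif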
3204
3205 static bool kvm_is_ad_tracked_page(struct page *page)
3206 {
3207 /*
3208 * Per page-flags.h, pages tagged PG_reserved "should in general not be
3209 * touched (e.g. set dirty) except by its owner".
3210 */
3211 return !PageReserved(page);
3212 }
3213
3214 static void kvm_set_page_dirty(struct page *page)
3215 {
3216 if (kvm_is_ad_tracked_page(page))
3217 SetPageDirty(page);
3218 }
3219
3220 static void kvm_set_page_accessed(struct page *page)
3221 {
3222 if (kvm_is_ad_tracked_page(page))
3223 mark_page_accessed(page);
3224 }
3225
3226 void kvm_release_page_clean(struct page *page)
3227 {
3228 WARN_ON(is_error_page(page));
3229
3230 kvm_set_page_accessed(page);
3231 put_page(page);
3232 }
3233 EXPORT_SYMBOL_GPL(kvm_release_page_clean);
3234
3235 void kvm_release_pfn_clean(kvm_pfn_t pfn)
3236 {
3237 struct page *page;
3238
3239 if (is_error_noslot_pfn(pfn))
3240 return;
3241
3242 page = kvm_pfn_to_refcounted_page(pfn);
3243 if (!page)
3244 return;
3245
3246 kvm_release_page_clean(page);
3247 }
3248 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
3249
3250 void kvm_release_page_dirty(struct page *page)
3251 {
3252 WARN_ON(is_error_page(page));
3253
3254 kvm_set_page_dirty(page);
3255 kvm_release_page_clean(page);
3256 }
3257 EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
3258
3259 void kvm_release_pfn_dirty(kvm_pfn_t pfn)
3260 {
3261 struct page *page;
3262
3263 if (is_error_noslot_pfn(pfn))
3264 return;
3265
3266 page = kvm_pfn_to_refcounted_page(pfn);
3267 if (!page)
3268 return;
3269
3270 kvm_release_page_dirty(page);
3271 }
3272 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
3273
3274 /*
3275 * Note, checking for an error/noslot pfn is the caller's responsibility when
3276 * directly marking a page dirty/accessed. Unlike the "release" helpers, the
3277 * "set" helpers are not to be used when the pfn might point at garbage.
3278 */
3279 void kvm_set_pfn_dirty(kvm_pfn_t pfn)
3280 {
3281 if (WARN_ON(is_error_noslot_pfn(pfn)))
3282 return;
3283
3284 if (pfn_valid(pfn))
3285 kvm_set_page_dirty(pfn_to_page(pfn));
3286 }
3287 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
3288
3289 void kvm_set_pfn_accessed(kvm_pfn_t pfn)
3290 {
3291 if (WARN_ON(is_error_noslot_pfn(pfn)))
3292 return;
3293
3294 if (pfn_valid(pfn))
3295 kvm_set_page_accessed(pfn_to_page(pfn));
3296 }
3297 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
3298
3299 static int next_segment(unsigned long len, int offset)
3300 {
3301 if (len > PAGE_SIZE - offset)
3302 return PAGE_SIZE - offset;
3303 else
3304 return len;
3305 }
3306
3307 static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
3308 void *data, int offset, int len)
3309 {
3310 int r;
3311 unsigned long addr;
3312
3313 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
3314 if (kvm_is_error_hva(addr))
3315 return -EFAULT;
3316 r = __copy_from_user(data, (void __user *)addr + offset, len);
3317 if (r)
3318 return -EFAULT;
3319 return 0;
3320 }
3321
3322 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
3323 int len)
3324 {
3325 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
3326
3327 return __kvm_read_guest_page(slot, gfn, data, offset, len);
3328 }
3329 EXPORT_SYMBOL_GPL(kvm_read_guest_page);
3330
3331 int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
3332 int offset, int len)
3333 {
3334 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3335
3336 return __kvm_read_guest_page(slot, gfn, data, offset, len);
3337 }
3338 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
3339
3340 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
3341 {
3342 gfn_t gfn = gpa >> PAGE_SHIFT;
3343 int seg;
3344 int offset = offset_in_page(gpa);
3345 int ret;
3346
3347 while ((seg = next_segment(len, offset)) != 0) {
3348 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
3349 if (ret < 0)
3350 return ret;
3351 offset = 0;
3352 len -= seg;
3353 data += seg;
3354 ++gfn;
3355 }
3356 return 0;
3357 }
3358 EXPORT_SYMBOL_GPL(kvm_read_guest);
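
/*
 * Worked example (assuming 4 KiB pages): a kvm_read_guest() of 6000 bytes at
 * gpa 0x1f00 starts at gfn 1 with offset 0xf00, so next_segment() yields
 * 256 bytes from gfn 1, then 4096 bytes from gfn 2, then the remaining
 * 1648 bytes from gfn 3.  Each per-page chunk goes through
 * kvm_read_guest_page(), so a single read may span multiple memslots.
 */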
3359
3360 int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
3361 {
3362 gfn_t gfn = gpa >> PAGE_SHIFT;
3363 int seg;
3364 int offset = offset_in_page(gpa);
3365 int ret;
3366
3367 while ((seg = next_segment(len, offset)) != 0) {
3368 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
3369 if (ret < 0)
3370 return ret;
3371 offset = 0;
3372 len -= seg;
3373 data += seg;
3374 ++gfn;
3375 }
3376 return 0;
3377 }
3378 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
3379
3380 static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
3381 void *data, int offset, unsigned long len)
3382 {
3383 int r;
3384 unsigned long addr;
3385
3386 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
3387 if (kvm_is_error_hva(addr))
3388 return -EFAULT;
3389 pagefault_disable();
3390 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
3391 pagefault_enable();
3392 if (r)
3393 return -EFAULT;
3394 return 0;
3395 }
3396
3397 int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
3398 void *data, unsigned long len)
3399 {
3400 gfn_t gfn = gpa >> PAGE_SHIFT;
3401 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3402 int offset = offset_in_page(gpa);
3403
3404 return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
3405 }
3406 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
3407
3408 static int __kvm_write_guest_page(struct kvm *kvm,
3409 struct kvm_memory_slot *memslot, gfn_t gfn,
3410 const void *data, int offset, int len)
3411 {
3412 int r;
3413 unsigned long addr;
3414
3415 addr = gfn_to_hva_memslot(memslot, gfn);
3416 if (kvm_is_error_hva(addr))
3417 return -EFAULT;
3418 r = __copy_to_user((void __user *)addr + offset, data, len);
3419 if (r)
3420 return -EFAULT;
3421 mark_page_dirty_in_slot(kvm, memslot, gfn);
3422 return 0;
3423 }
3424
3425 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
3426 const void *data, int offset, int len)
3427 {
3428 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
3429
3430 return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
3431 }
3432 EXPORT_SYMBOL_GPL(kvm_write_guest_page);
3433
3434 int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
3435 const void *data, int offset, int len)
3436 {
3437 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3438
3439 return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
3440 }
3441 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
3442
3443 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
3444 unsigned long len)
3445 {
3446 gfn_t gfn = gpa >> PAGE_SHIFT;
3447 int seg;
3448 int offset = offset_in_page(gpa);
3449 int ret;
3450
3451 while ((seg = next_segment(len, offset)) != 0) {
3452 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
3453 if (ret < 0)
3454 return ret;
3455 offset = 0;
3456 len -= seg;
3457 data += seg;
3458 ++gfn;
3459 }
3460 return 0;
3461 }
3462 EXPORT_SYMBOL_GPL(kvm_write_guest);
3463
3464 int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
3465 unsigned long len)
3466 {
3467 gfn_t gfn = gpa >> PAGE_SHIFT;
3468 int seg;
3469 int offset = offset_in_page(gpa);
3470 int ret;
3471
3472 while ((seg = next_segment(len, offset)) != 0) {
3473 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
3474 if (ret < 0)
3475 return ret;
3476 offset = 0;
3477 len -= seg;
3478 data += seg;
3479 ++gfn;
3480 }
3481 return 0;
3482 }
3483 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
3484
3485 static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
3486 struct gfn_to_hva_cache *ghc,
3487 gpa_t gpa, unsigned long len)
3488 {
3489 int offset = offset_in_page(gpa);
3490 gfn_t start_gfn = gpa >> PAGE_SHIFT;
3491 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
3492 gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
3493 gfn_t nr_pages_avail;
3494
3495 /* Update ghc->generation before performing any error checks. */
3496 ghc->generation = slots->generation;
3497
3498 if (start_gfn > end_gfn) {
3499 ghc->hva = KVM_HVA_ERR_BAD;
3500 return -EINVAL;
3501 }
3502
3503 /*
3504 * If the requested region crosses two or more memslots, we still
3505 * verify that the entire region is valid here.
3506 */
3507 for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
3508 ghc->memslot = __gfn_to_memslot(slots, start_gfn);
3509 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
3510 &nr_pages_avail);
3511 if (kvm_is_error_hva(ghc->hva))
3512 return -EFAULT;
3513 }
3514
3515 /* Use the slow path for cross page reads and writes. */
3516 if (nr_pages_needed == 1)
3517 ghc->hva += offset;
3518 else
3519 ghc->memslot = NULL;
3520
3521 ghc->gpa = gpa;
3522 ghc->len = len;
3523 return 0;
3524 }
3525
3526 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3527 gpa_t gpa, unsigned long len)
3528 {
3529 struct kvm_memslots *slots = kvm_memslots(kvm);
3530 return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
3531 }
3532 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
3533
3534 int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3535 void *data, unsigned int offset,
3536 unsigned long len)
3537 {
3538 struct kvm_memslots *slots = kvm_memslots(kvm);
3539 int r;
3540 gpa_t gpa = ghc->gpa + offset;
3541
3542 if (WARN_ON_ONCE(len + offset > ghc->len))
3543 return -EINVAL;
3544
3545 if (slots->generation != ghc->generation) {
3546 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3547 return -EFAULT;
3548 }
3549
3550 if (kvm_is_error_hva(ghc->hva))
3551 return -EFAULT;
3552
3553 if (unlikely(!ghc->memslot))
3554 return kvm_write_guest(kvm, gpa, data, len);
3555
3556 r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
3557 if (r)
3558 return -EFAULT;
3559 mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT);
3560
3561 return 0;
3562 }
3563 EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
3564
3565 int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3566 void *data, unsigned long len)
3567 {
3568 return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
3569 }
3570 EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
3571
3572 int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3573 void *data, unsigned int offset,
3574 unsigned long len)
3575 {
3576 struct kvm_memslots *slots = kvm_memslots(kvm);
3577 int r;
3578 gpa_t gpa = ghc->gpa + offset;
3579
3580 if (WARN_ON_ONCE(len + offset > ghc->len))
3581 return -EINVAL;
3582
3583 if (slots->generation != ghc->generation) {
3584 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3585 return -EFAULT;
3586 }
3587
3588 if (kvm_is_error_hva(ghc->hva))
3589 return -EFAULT;
3590
3591 if (unlikely(!ghc->memslot))
3592 return kvm_read_guest(kvm, gpa, data, len);
3593
3594 r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
3595 if (r)
3596 return -EFAULT;
3597
3598 return 0;
3599 }
3600 EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
3601
3602 int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3603 void *data, unsigned long len)
3604 {
3605 return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
3606 }
3607 EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
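
/*
 * Illustrative in-kernel sketch (not a real KVM helper): the typical
 * gfn_to_hva_cache pattern used by arch code.  The cache is initialized once
 * for a gpa/len pair; later reads and writes revalidate automatically when
 * the memslot generation changes and fall back to the uncached path for
 * cross-page ranges.
 */
#if 0	/* example only, never compiled as part of KVM */
struct example_shared {
	u64 counter;
};

static int example_publish(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			   gpa_t gpa, u64 value)
{
	struct example_shared data = { .counter = value };

	/* Usually done once, e.g. when the guest registers @gpa. */
	if (kvm_gfn_to_hva_cache_init(kvm, ghc, gpa, sizeof(data)))
		return -EFAULT;

	/* Fast path afterwards; revalidates if the memslots changed. */
	return kvm_write_guest_cached(kvm, ghc, &data, sizeof(data));
}
#endif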
3608
3609 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
3610 {
3611 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
3612 gfn_t gfn = gpa >> PAGE_SHIFT;
3613 int seg;
3614 int offset = offset_in_page(gpa);
3615 int ret;
3616
3617 while ((seg = next_segment(len, offset)) != 0) {
3618 ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg);
3619 if (ret < 0)
3620 return ret;
3621 offset = 0;
3622 len -= seg;
3623 ++gfn;
3624 }
3625 return 0;
3626 }
3627 EXPORT_SYMBOL_GPL(kvm_clear_guest);
3628
3629 void mark_page_dirty_in_slot(struct kvm *kvm,
3630 const struct kvm_memory_slot *memslot,
3631 gfn_t gfn)
3632 {
3633 struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
3634
3635 #ifdef CONFIG_HAVE_KVM_DIRTY_RING
3636 if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm))
3637 return;
3638
3639 WARN_ON_ONCE(!vcpu && !kvm_arch_allow_write_without_running_vcpu(kvm));
3640 #endif
3641
3642 if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
3643 unsigned long rel_gfn = gfn - memslot->base_gfn;
3644 u32 slot = (memslot->as_id << 16) | memslot->id;
3645
3646 if (kvm->dirty_ring_size && vcpu)
3647 kvm_dirty_ring_push(vcpu, slot, rel_gfn);
3648 else if (memslot->dirty_bitmap)
3649 set_bit_le(rel_gfn, memslot->dirty_bitmap);
3650 }
3651 }
3652 EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
3653
3654 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
3655 {
3656 struct kvm_memory_slot *memslot;
3657
3658 memslot = gfn_to_memslot(kvm, gfn);
3659 mark_page_dirty_in_slot(kvm, memslot, gfn);
3660 }
3661 EXPORT_SYMBOL_GPL(mark_page_dirty);
3662
3663 void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
3664 {
3665 struct kvm_memory_slot *memslot;
3666
3667 memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3668 mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
3669 }
3670 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
3671
3672 void kvm_sigset_activate(struct kvm_vcpu *vcpu)
3673 {
3674 if (!vcpu->sigset_active)
3675 return;
3676
3677 /*
3678 * This does a lockless modification of ->real_blocked, which is fine
3679 * because only current can change ->real_blocked and all readers of
3680 * ->real_blocked don't care as long as ->real_blocked is always a subset
3681 * of ->blocked.
3682 */
3683 sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
3684 }
3685
3686 void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
3687 {
3688 if (!vcpu->sigset_active)
3689 return;
3690
3691 sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
3692 sigemptyset(&current->real_blocked);
3693 }
3694
3695 static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
3696 {
3697 unsigned int old, val, grow, grow_start;
3698
3699 old = val = vcpu->halt_poll_ns;
3700 grow_start = READ_ONCE(halt_poll_ns_grow_start);
3701 grow = READ_ONCE(halt_poll_ns_grow);
3702 if (!grow)
3703 goto out;
3704
3705 val *= grow;
3706 if (val < grow_start)
3707 val = grow_start;
3708
3709 vcpu->halt_poll_ns = val;
3710 out:
3711 trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
3712 }
3713
3714 static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
3715 {
3716 unsigned int old, val, shrink, grow_start;
3717
3718 old = val = vcpu->halt_poll_ns;
3719 shrink = READ_ONCE(halt_poll_ns_shrink);
3720 grow_start = READ_ONCE(halt_poll_ns_grow_start);
3721 if (shrink == 0)
3722 val = 0;
3723 else
3724 val /= shrink;
3725
3726 if (val < grow_start)
3727 val = 0;
3728
3729 vcpu->halt_poll_ns = val;
3730 trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
3731 }
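/*
 * Example of the grow/shrink policy: with halt_poll_ns_grow == 2 and
 * halt_poll_ns_grow_start == 10000, a vCPU's poll window grows
 * 0 -> 10000 -> 20000 -> 40000 ns (and is clamped to the per-VM maximum by
 * the caller).  With halt_poll_ns_shrink == 0 the window resets straight
 * to 0; with halt_poll_ns_shrink == 2 it is halved, and any result below
 * halt_poll_ns_grow_start collapses to 0.
 */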
3732
3733 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
3734 {
3735 int ret = -EINTR;
3736 int idx = srcu_read_lock(&vcpu->kvm->srcu);
3737
3738 if (kvm_arch_vcpu_runnable(vcpu))
3739 goto out;
3740 if (kvm_cpu_has_pending_timer(vcpu))
3741 goto out;
3742 if (signal_pending(current))
3743 goto out;
3744 if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
3745 goto out;
3746
3747 ret = 0;
3748 out:
3749 srcu_read_unlock(&vcpu->kvm->srcu, idx);
3750 return ret;
3751 }
3752
3753 /*
3754 * Block the vCPU until the vCPU is runnable, an event arrives, or a signal is
3755 * pending. This is mostly used when halting a vCPU, but may also be used
3756 * directly for other vCPU non-runnable states, e.g. x86's Wait-For-SIPI.
3757 */
3758 bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
3759 {
3760 struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
3761 bool waited = false;
3762
3763 vcpu->stat.generic.blocking = 1;
3764
3765 preempt_disable();
3766 kvm_arch_vcpu_blocking(vcpu);
3767 prepare_to_rcuwait(wait);
3768 preempt_enable();
3769
3770 for (;;) {
3771 set_current_state(TASK_INTERRUPTIBLE);
3772
3773 if (kvm_vcpu_check_block(vcpu) < 0)
3774 break;
3775
3776 waited = true;
3777 schedule();
3778 }
3779
3780 preempt_disable();
3781 finish_rcuwait(wait);
3782 kvm_arch_vcpu_unblocking(vcpu);
3783 preempt_enable();
3784
3785 vcpu->stat.generic.blocking = 0;
3786
3787 return waited;
3788 }
3789
3790 static inline void update_halt_poll_stats(struct kvm_vcpu *vcpu, ktime_t start,
3791 ktime_t end, bool success)
3792 {
3793 struct kvm_vcpu_stat_generic *stats = &vcpu->stat.generic;
3794 u64 poll_ns = ktime_to_ns(ktime_sub(end, start));
3795
3796 ++vcpu->stat.generic.halt_attempted_poll;
3797
3798 if (success) {
3799 ++vcpu->stat.generic.halt_successful_poll;
3800
3801 if (!vcpu_valid_wakeup(vcpu))
3802 ++vcpu->stat.generic.halt_poll_invalid;
3803
3804 stats->halt_poll_success_ns += poll_ns;
3805 KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_success_hist, poll_ns);
3806 } else {
3807 stats->halt_poll_fail_ns += poll_ns;
3808 KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_fail_hist, poll_ns);
3809 }
3810 }
3811
3812 static unsigned int kvm_vcpu_max_halt_poll_ns(struct kvm_vcpu *vcpu)
3813 {
3814 struct kvm *kvm = vcpu->kvm;
3815
3816 if (kvm->override_halt_poll_ns) {
3817 /*
3818 * Ensure kvm->max_halt_poll_ns is not read before
3819 * kvm->override_halt_poll_ns.
3820 *
3821 * Pairs with the smp_wmb() when enabling KVM_CAP_HALT_POLL.
3822 */
3823 smp_rmb();
3824 return READ_ONCE(kvm->max_halt_poll_ns);
3825 }
3826
3827 return READ_ONCE(halt_poll_ns);
3828 }
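/*
 * For example, once userspace enables KVM_CAP_HALT_POLL on a VM, that VM's
 * kvm->max_halt_poll_ns is used even if the capability was set to 0, and
 * later changes to the halt_poll_ns module parameter no longer affect it.
 */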
3829
3830 /*
3831 * Emulate a vCPU halt condition, e.g. HLT on x86, WFI on arm, etc... If halt
3832 * polling is enabled, busy wait for a short time before blocking to avoid the
3833 * expensive block+unblock sequence if a wake event arrives soon after the vCPU
3834 * is halted.
3835 */
3836 void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
3837 {
3838 unsigned int max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
3839 bool halt_poll_allowed = !kvm_arch_no_poll(vcpu);
3840 ktime_t start, cur, poll_end;
3841 bool waited = false;
3842 bool do_halt_poll;
3843 u64 halt_ns;
3844
3845 if (vcpu->halt_poll_ns > max_halt_poll_ns)
3846 vcpu->halt_poll_ns = max_halt_poll_ns;
3847
3848 do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns;
3849
3850 start = cur = poll_end = ktime_get();
3851 if (do_halt_poll) {
3852 ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns);
3853
3854 do {
3855 if (kvm_vcpu_check_block(vcpu) < 0)
3856 goto out;
3857 cpu_relax();
3858 poll_end = cur = ktime_get();
3859 } while (kvm_vcpu_can_poll(cur, stop));
3860 }
3861
3862 waited = kvm_vcpu_block(vcpu);
3863
3864 cur = ktime_get();
3865 if (waited) {
3866 vcpu->stat.generic.halt_wait_ns +=
3867 ktime_to_ns(cur) - ktime_to_ns(poll_end);
3868 KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist,
3869 ktime_to_ns(cur) - ktime_to_ns(poll_end));
3870 }
3871 out:
3872 /* The total time the vCPU was "halted", including polling time. */
3873 halt_ns = ktime_to_ns(cur) - ktime_to_ns(start);
3874
3875 /*
3876 * Note, halt-polling is considered successful so long as the vCPU was
3877 * never actually scheduled out, i.e. even if the wake event arrived
3878 * after the end of the halt-polling loop itself, but before the full wait.
3879 */
3880 if (do_halt_poll)
3881 update_halt_poll_stats(vcpu, start, poll_end, !waited);
3882
3883 if (halt_poll_allowed) {
3884 /* Recompute the max halt poll time in case it changed. */
3885 max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
3886
3887 if (!vcpu_valid_wakeup(vcpu)) {
3888 shrink_halt_poll_ns(vcpu);
3889 } else if (max_halt_poll_ns) {
3890 if (halt_ns <= vcpu->halt_poll_ns)
3891 ;
3892 /* we had a long block, shrink polling */
3893 else if (vcpu->halt_poll_ns &&
3894 halt_ns > max_halt_poll_ns)
3895 shrink_halt_poll_ns(vcpu);
3896 /* we had a short halt and our poll time is too small */
3897 else if (vcpu->halt_poll_ns < max_halt_poll_ns &&
3898 halt_ns < max_halt_poll_ns)
3899 grow_halt_poll_ns(vcpu);
3900 } else {
3901 vcpu->halt_poll_ns = 0;
3902 }
3903 }
3904
3905 trace_kvm_vcpu_wakeup(halt_ns, waited, vcpu_valid_wakeup(vcpu));
3906 }
3907 EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
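/*
 * Example timeline: with a 100 us poll window, a wake event that arrives
 * 30 us into the polling loop is handled without scheduling out and is
 * counted as a successful poll.  If no event arrives the vCPU blocks; a
 * halt that ends up longer than max_halt_poll_ns shrinks the window for
 * the next halt, whereas a halt that outlasted the current window but
 * stayed below the maximum grows it.
 */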
3908
3909 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
3910 {
3911 if (__kvm_vcpu_wake_up(vcpu)) {
3912 WRITE_ONCE(vcpu->ready, true);
3913 ++vcpu->stat.generic.halt_wakeup;
3914 return true;
3915 }
3916
3917 return false;
3918 }
3919 EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
3920
3921 #ifndef CONFIG_S390
3922 /*
3923 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
3924 */
3925 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3926 {
3927 int me, cpu;
3928
3929 if (kvm_vcpu_wake_up(vcpu))
3930 return;
3931
3932 me = get_cpu();
3933 /*
3934 * The only state change done outside the vcpu mutex is IN_GUEST_MODE
3935 * to EXITING_GUEST_MODE. Therefore the moderately expensive "should
3936 * kick" check does not need atomic operations if kvm_vcpu_kick is used
3937 * within the vCPU thread itself.
3938 */
3939 if (vcpu == __this_cpu_read(kvm_running_vcpu)) {
3940 if (vcpu->mode == IN_GUEST_MODE)
3941 WRITE_ONCE(vcpu->mode, EXITING_GUEST_MODE);
3942 goto out;
3943 }
3944
3945 /*
3946 * Note, the vCPU could get migrated to a different pCPU at any point
3947 * after kvm_arch_vcpu_should_kick(), which could result in sending an
3948 * IPI to the previous pCPU. But, that's ok because the purpose of the
3949 * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the
3950 * vCPU also requires it to leave IN_GUEST_MODE.
3951 */
3952 if (kvm_arch_vcpu_should_kick(vcpu)) {
3953 cpu = READ_ONCE(vcpu->cpu);
3954 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
3955 smp_send_reschedule(cpu);
3956 }
3957 out:
3958 put_cpu();
3959 }
3960 EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
3961 #endif /* !CONFIG_S390 */
3962
3963 int kvm_vcpu_yield_to(struct kvm_vcpu *target)
3964 {
3965 struct pid *pid;
3966 struct task_struct *task = NULL;
3967 int ret = 0;
3968
3969 rcu_read_lock();
3970 pid = rcu_dereference(target->pid);
3971 if (pid)
3972 task = get_pid_task(pid, PIDTYPE_PID);
3973 rcu_read_unlock();
3974 if (!task)
3975 return ret;
3976 ret = yield_to(task, 1);
3977 put_task_struct(task);
3978
3979 return ret;
3980 }
3981 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
3982
3983 /*
3984 * Helper that checks whether a VCPU is eligible for directed yield.
3985 * The most eligible candidate to yield to is chosen by the following heuristics:
3986 *
3987 * (a) A VCPU which has not taken a PLE (pause-loop) exit or had cpu_relax()
3988 * intercepted recently (a preempted lock holder), indicated by @in_spin_loop.
3989 * Set at the beginning and cleared at the end of the interception/PLE handler.
3990 *
3991 * (b) A VCPU which has taken a PLE exit/cpu_relax() interception but did not
3992 * get a chance last time (it has most likely become eligible now, since we
3993 * probably yielded to the lock holder in the last iteration). This is done by
3994 * toggling @dy_eligible each time a VCPU is checked for eligibility.
3995 *
3996 * Yielding to a recently PLE-exited/cpu_relax()-intercepted VCPU before
3997 * yielding to a preempted lock holder could result in wrong VCPU selection and
3998 * CPU burning. Giving priority to a potential lock holder increases lock
3999 * progress.
4000 *
4001 * Since the algorithm is based on heuristics, accessing another VCPU's data
4002 * without locking does no harm. It may result in trying to yield to the same
4003 * VCPU, failing, and moving on to the next VCPU, and so on.
4004 */
4005 static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
4006 {
4007 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
4008 bool eligible;
4009
4010 eligible = !vcpu->spin_loop.in_spin_loop ||
4011 vcpu->spin_loop.dy_eligible;
4012
4013 if (vcpu->spin_loop.in_spin_loop)
4014 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
4015
4016 return eligible;
4017 #else
4018 return true;
4019 #endif
4020 }
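/*
 * Example: a candidate vCPU that is itself in a spin loop with
 * @dy_eligible == false is skipped on this round, but the check flips
 * @dy_eligible to true so the same candidate becomes eligible on the
 * next directed-yield attempt.
 */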
4021
4022 /*
4023 * Unlike kvm_arch_vcpu_runnable, this function is called outside
4024 * a vcpu_load/vcpu_put pair. However, for most architectures
4025 * kvm_arch_vcpu_runnable does not require vcpu_load.
4026 */
4027 bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
4028 {
4029 return kvm_arch_vcpu_runnable(vcpu);
4030 }
4031
4032 static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
4033 {
4034 if (kvm_arch_dy_runnable(vcpu))
4035 return true;
4036
4037 #ifdef CONFIG_KVM_ASYNC_PF
4038 if (!list_empty_careful(&vcpu->async_pf.done))
4039 return true;
4040 #endif
4041
4042 return false;
4043 }
4044
4045 bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
4046 {
4047 return false;
4048 }
4049
4050 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
4051 {
4052 struct kvm *kvm = me->kvm;
4053 struct kvm_vcpu *vcpu;
4054 int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
4055 unsigned long i;
4056 int yielded = 0;
4057 int try = 3;
4058 int pass;
4059
4060 kvm_vcpu_set_in_spin_loop(me, true);
4061 /*
4062 * We boost the priority of a VCPU that is runnable but not
4063 * currently running, because it got preempted by something
4064 * else and called schedule in __vcpu_run. Hopefully that
4065 * VCPU is holding the lock that we need and will release it.
4066 * We approximate round-robin by starting at the last boosted VCPU.
4067 */
4068 for (pass = 0; pass < 2 && !yielded && try; pass++) {
4069 kvm_for_each_vcpu(i, vcpu, kvm) {
4070 if (!pass && i <= last_boosted_vcpu) {
4071 i = last_boosted_vcpu;
4072 continue;
4073 } else if (pass && i > last_boosted_vcpu)
4074 break;
4075 if (!READ_ONCE(vcpu->ready))
4076 continue;
4077 if (vcpu == me)
4078 continue;
4079 if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
4080 continue;
4081 if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
4082 !kvm_arch_dy_has_pending_interrupt(vcpu) &&
4083 !kvm_arch_vcpu_in_kernel(vcpu))
4084 continue;
4085 if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
4086 continue;
4087
4088 yielded = kvm_vcpu_yield_to(vcpu);
4089 if (yielded > 0) {
4090 kvm->last_boosted_vcpu = i;
4091 break;
4092 } else if (yielded < 0) {
4093 try--;
4094 if (!try)
4095 break;
4096 }
4097 }
4098 }
4099 kvm_vcpu_set_in_spin_loop(me, false);
4100
4101 /* Ensure vcpu is not eligible during next spinloop */
4102 kvm_vcpu_set_dy_eligible(me, false);
4103 }
4104 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
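/*
 * Example of the round-robin scan above: with four vCPUs and
 * last_boosted_vcpu == 2, pass 0 considers only vCPU 3 and pass 1
 * considers vCPUs 0..2, so each spinning vCPU starts its search just
 * after the last one that was successfully boosted.
 */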
4105
4106 static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
4107 {
4108 #ifdef CONFIG_HAVE_KVM_DIRTY_RING
4109 return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
4110 (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
4111 kvm->dirty_ring_size / PAGE_SIZE);
4112 #else
4113 return false;
4114 #endif
4115 }
4116
4117 static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
4118 {
4119 struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
4120 struct page *page;
4121
4122 if (vmf->pgoff == 0)
4123 page = virt_to_page(vcpu->run);
4124 #ifdef CONFIG_X86
4125 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
4126 page = virt_to_page(vcpu->arch.pio_data);
4127 #endif
4128 #ifdef CONFIG_KVM_MMIO
4129 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
4130 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
4131 #endif
4132 else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
4133 page = kvm_dirty_ring_get_page(
4134 &vcpu->dirty_ring,
4135 vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
4136 else
4137 return kvm_arch_vcpu_fault(vcpu, vmf);
4138 get_page(page);
4139 vmf->page = page;
4140 return 0;
4141 }
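/*
 * The vCPU mmap layout handled above: page offset 0 maps the kvm_run
 * structure, KVM_PIO_PAGE_OFFSET maps the x86 PIO data page,
 * KVM_COALESCED_MMIO_PAGE_OFFSET maps the coalesced MMIO ring, and
 * offsets starting at KVM_DIRTY_LOG_PAGE_OFFSET map the vCPU's dirty
 * ring pages.
 */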
4142
4143 static const struct vm_operations_struct kvm_vcpu_vm_ops = {
4144 .fault = kvm_vcpu_fault,
4145 };
4146
4147 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
4148 {
4149 struct kvm_vcpu *vcpu = file->private_data;
4150 unsigned long pages = vma_pages(vma);
4151
4152 if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
4153 kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
4154 ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED)))
4155 return -EINVAL;
4156
4157 vma->vm_ops = &kvm_vcpu_vm_ops;
4158 return 0;
4159 }
4160
4161 static int kvm_vcpu_release(struct inode *inode, struct file *filp)
4162 {
4163 struct kvm_vcpu *vcpu = filp->private_data;
4164
4165 kvm_put_kvm(vcpu->kvm);
4166 return 0;
4167 }
4168
4169 static struct file_operations kvm_vcpu_fops = {
4170 .release = kvm_vcpu_release,
4171 .unlocked_ioctl = kvm_vcpu_ioctl,
4172 .mmap = kvm_vcpu_mmap,
4173 .llseek = noop_llseek,
4174 KVM_COMPAT(kvm_vcpu_compat_ioctl),
4175 };
4176
4177 /*
4178 * Allocates an inode for the vcpu.
4179 */
4180 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
4181 {
4182 char name[8 + 1 + ITOA_MAX_LEN + 1];
4183
4184 snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
4185 return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
4186 }
4187
4188 #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
4189 static int vcpu_get_pid(void *data, u64 *val)
4190 {
4191 struct kvm_vcpu *vcpu = data;
4192
4193 rcu_read_lock();
4194 *val = pid_nr(rcu_dereference(vcpu->pid));
4195 rcu_read_unlock();
4196 return 0;
4197 }
4198
4199 DEFINE_SIMPLE_ATTRIBUTE(vcpu_get_pid_fops, vcpu_get_pid, NULL, "%llu\n");
4200
4201 static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
4202 {
4203 struct dentry *debugfs_dentry;
4204 char dir_name[ITOA_MAX_LEN * 2];
4205
4206 if (!debugfs_initialized())
4207 return;
4208
4209 snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
4210 debugfs_dentry = debugfs_create_dir(dir_name,
4211 vcpu->kvm->debugfs_dentry);
4212 debugfs_create_file("pid", 0444, debugfs_dentry, vcpu,
4213 &vcpu_get_pid_fops);
4214
4215 kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
4216 }
4217 #endif
4218
4219 /*
4220 * Creates a vCPU and returns a file descriptor for it. Good luck creating more than one.
4221 */
4222 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
4223 {
4224 int r;
4225 struct kvm_vcpu *vcpu;
4226 struct page *page;
4227
4228 if (id >= KVM_MAX_VCPU_IDS)
4229 return -EINVAL;
4230
4231 mutex_lock(&kvm->lock);
4232 if (kvm->created_vcpus >= kvm->max_vcpus) {
4233 mutex_unlock(&kvm->lock);
4234 return -EINVAL;
4235 }
4236
4237 r = kvm_arch_vcpu_precreate(kvm, id);
4238 if (r) {
4239 mutex_unlock(&kvm->lock);
4240 return r;
4241 }
4242
4243 kvm->created_vcpus++;
4244 mutex_unlock(&kvm->lock);
4245
4246 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
4247 if (!vcpu) {
4248 r = -ENOMEM;
4249 goto vcpu_decrement;
4250 }
4251
4252 BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
4253 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
4254 if (!page) {
4255 r = -ENOMEM;
4256 goto vcpu_free;
4257 }
4258 vcpu->run = page_address(page);
4259
4260 kvm_vcpu_init(vcpu, kvm, id);
4261
4262 r = kvm_arch_vcpu_create(vcpu);
4263 if (r)
4264 goto vcpu_free_run_page;
4265
4266 if (kvm->dirty_ring_size) {
4267 r = kvm_dirty_ring_alloc(&vcpu->dirty_ring,
4268 id, kvm->dirty_ring_size);
4269 if (r)
4270 goto arch_vcpu_destroy;
4271 }
4272
4273 mutex_lock(&kvm->lock);
4274
4275 #ifdef CONFIG_LOCKDEP
4276 /* Ensure that lockdep knows vcpu->mutex is taken *inside* kvm->lock */
4277 mutex_lock(&vcpu->mutex);
4278 mutex_unlock(&vcpu->mutex);
4279 #endif
4280
4281 if (kvm_get_vcpu_by_id(kvm, id)) {
4282 r = -EEXIST;
4283 goto unlock_vcpu_destroy;
4284 }
4285
4286 vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
4287 r = xa_reserve(&kvm->vcpu_array, vcpu->vcpu_idx, GFP_KERNEL_ACCOUNT);
4288 if (r)
4289 goto unlock_vcpu_destroy;
4290
4291 /* Now it's all set up, let userspace reach it */
4292 kvm_get_kvm(kvm);
4293 r = create_vcpu_fd(vcpu);
4294 if (r < 0)
4295 goto kvm_put_xa_release;
4296
4297 if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
4298 r = -EINVAL;
4299 goto kvm_put_xa_release;
4300 }
4301
4302 /*
4303 * Pairs with smp_rmb() in kvm_get_vcpu. Store the vcpu
4304 * pointer before incrementing kvm->online_vcpus.
4305 */
4306 smp_wmb();
4307 atomic_inc(&kvm->online_vcpus);
4308
4309 mutex_unlock(&kvm->lock);
4310 kvm_arch_vcpu_postcreate(vcpu);
4311 kvm_create_vcpu_debugfs(vcpu);
4312 return r;
4313
4314 kvm_put_xa_release:
4315 kvm_put_kvm_no_destroy(kvm);
4316 xa_release(&kvm->vcpu_array, vcpu->vcpu_idx);
4317 unlock_vcpu_destroy:
4318 mutex_unlock(&kvm->lock);
4319 kvm_dirty_ring_free(&vcpu->dirty_ring);
4320 arch_vcpu_destroy:
4321 kvm_arch_vcpu_destroy(vcpu);
4322 vcpu_free_run_page:
4323 free_page((unsigned long)vcpu->run);
4324 vcpu_free:
4325 kmem_cache_free(kvm_vcpu_cache, vcpu);
4326 vcpu_decrement:
4327 mutex_lock(&kvm->lock);
4328 kvm->created_vcpus--;
4329 mutex_unlock(&kvm->lock);
4330 return r;
4331 }
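/*
 * A minimal userspace sketch of the path above (vm_fd and kvm_fd are
 * placeholder descriptors obtained from KVM_CREATE_VM and /dev/kvm):
 *
 *	int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
 *	long sz = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
 *	struct kvm_run *run = mmap(NULL, sz, PROT_READ | PROT_WRITE,
 *				   MAP_SHARED, vcpu_fd, 0);
 *	ioctl(vcpu_fd, KVM_RUN, 0);
 */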
4332
4333 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
4334 {
4335 if (sigset) {
4336 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
4337 vcpu->sigset_active = 1;
4338 vcpu->sigset = *sigset;
4339 } else
4340 vcpu->sigset_active = 0;
4341 return 0;
4342 }
4343
4344 static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
4345 size_t size, loff_t *offset)
4346 {
4347 struct kvm_vcpu *vcpu = file->private_data;
4348
4349 return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
4350 &kvm_vcpu_stats_desc[0], &vcpu->stat,
4351 sizeof(vcpu->stat), user_buffer, size, offset);
4352 }
4353
4354 static int kvm_vcpu_stats_release(struct inode *inode, struct file *file)
4355 {
4356 struct kvm_vcpu *vcpu = file->private_data;
4357
4358 kvm_put_kvm(vcpu->kvm);
4359 return 0;
4360 }
4361
4362 static const struct file_operations kvm_vcpu_stats_fops = {
4363 .owner = THIS_MODULE,
4364 .read = kvm_vcpu_stats_read,
4365 .release = kvm_vcpu_stats_release,
4366 .llseek = noop_llseek,
4367 };
4368
4369 static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
4370 {
4371 int fd;
4372 struct file *file;
4373 char name[15 + ITOA_MAX_LEN + 1];
4374
4375 snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);
4376
4377 fd = get_unused_fd_flags(O_CLOEXEC);
4378 if (fd < 0)
4379 return fd;
4380
4381 file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY);
4382 if (IS_ERR(file)) {
4383 put_unused_fd(fd);
4384 return PTR_ERR(file);
4385 }
4386
4387 kvm_get_kvm(vcpu->kvm);
4388
4389 file->f_mode |= FMODE_PREAD;
4390 fd_install(fd, file);
4391
4392 return fd;
4393 }
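/*
 * The returned descriptor is read-only and supports pread(), e.g.
 *
 *	int sfd = ioctl(vcpu_fd, KVM_GET_STATS_FD, 0);
 *	pread(sfd, buf, len, 0);
 *
 * where buf/len are caller-chosen and the binary layout is produced by
 * kvm_stats_read() from kvm_vcpu_stats_header and kvm_vcpu_stats_desc.
 */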
4394
4395 static long kvm_vcpu_ioctl(struct file *filp,
4396 unsigned int ioctl, unsigned long arg)
4397 {
4398 struct kvm_vcpu *vcpu = filp->private_data;
4399 void __user *argp = (void __user *)arg;
4400 int r;
4401 struct kvm_fpu *fpu = NULL;
4402 struct kvm_sregs *kvm_sregs = NULL;
4403
4404 if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
4405 return -EIO;
4406
4407 if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
4408 return -EINVAL;
4409
4410 /*
4411 * Some architectures have vcpu ioctls that are asynchronous to vcpu
4412 * execution; mutex_lock() would break them.
4413 */
4414 r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
4415 if (r != -ENOIOCTLCMD)
4416 return r;
4417
4418 if (mutex_lock_killable(&vcpu->mutex))
4419 return -EINTR;
4420 switch (ioctl) {
4421 case KVM_RUN: {
4422 struct pid *oldpid;
4423 r = -EINVAL;
4424 if (arg)
4425 goto out;
4426 oldpid = rcu_access_pointer(vcpu->pid);
4427 if (unlikely(oldpid != task_pid(current))) {
4428 /* The thread running this VCPU changed. */
4429 struct pid *newpid;
4430
4431 r = kvm_arch_vcpu_run_pid_change(vcpu);
4432 if (r)
4433 break;
4434
4435 newpid = get_task_pid(current, PIDTYPE_PID);
4436 rcu_assign_pointer(vcpu->pid, newpid);
4437 if (oldpid)
4438 synchronize_rcu();
4439 put_pid(oldpid);
4440 }
4441 r = kvm_arch_vcpu_ioctl_run(vcpu);
4442 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
4443 break;
4444 }
4445 case KVM_GET_REGS: {
4446 struct kvm_regs *kvm_regs;
4447
4448 r = -ENOMEM;
4449 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
4450 if (!kvm_regs)
4451 goto out;
4452 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
4453 if (r)
4454 goto out_free1;
4455 r = -EFAULT;
4456 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
4457 goto out_free1;
4458 r = 0;
4459 out_free1:
4460 kfree(kvm_regs);
4461 break;
4462 }
4463 case KVM_SET_REGS: {
4464 struct kvm_regs *kvm_regs;
4465
4466 kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
4467 if (IS_ERR(kvm_regs)) {
4468 r = PTR_ERR(kvm_regs);
4469 goto out;
4470 }
4471 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
4472 kfree(kvm_regs);
4473 break;
4474 }
4475 case KVM_GET_SREGS: {
4476 kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
4477 GFP_KERNEL_ACCOUNT);
4478 r = -ENOMEM;
4479 if (!kvm_sregs)
4480 goto out;
4481 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
4482 if (r)
4483 goto out;
4484 r = -EFAULT;
4485 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
4486 goto out;
4487 r = 0;
4488 break;
4489 }
4490 case KVM_SET_SREGS: {
4491 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
4492 if (IS_ERR(kvm_sregs)) {
4493 r = PTR_ERR(kvm_sregs);
4494 kvm_sregs = NULL;
4495 goto out;
4496 }
4497 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
4498 break;
4499 }
4500 case KVM_GET_MP_STATE: {
4501 struct kvm_mp_state mp_state;
4502
4503 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
4504 if (r)
4505 goto out;
4506 r = -EFAULT;
4507 if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
4508 goto out;
4509 r = 0;
4510 break;
4511 }
4512 case KVM_SET_MP_STATE: {
4513 struct kvm_mp_state mp_state;
4514
4515 r = -EFAULT;
4516 if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
4517 goto out;
4518 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
4519 break;
4520 }
4521 case KVM_TRANSLATE: {
4522 struct kvm_translation tr;
4523
4524 r = -EFAULT;
4525 if (copy_from_user(&tr, argp, sizeof(tr)))
4526 goto out;
4527 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
4528 if (r)
4529 goto out;
4530 r = -EFAULT;
4531 if (copy_to_user(argp, &tr, sizeof(tr)))
4532 goto out;
4533 r = 0;
4534 break;
4535 }
4536 case KVM_SET_GUEST_DEBUG: {
4537 struct kvm_guest_debug dbg;
4538
4539 r = -EFAULT;
4540 if (copy_from_user(&dbg, argp, sizeof(dbg)))
4541 goto out;
4542 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
4543 break;
4544 }
4545 case KVM_SET_SIGNAL_MASK: {
4546 struct kvm_signal_mask __user *sigmask_arg = argp;
4547 struct kvm_signal_mask kvm_sigmask;
4548 sigset_t sigset, *p;
4549
4550 p = NULL;
4551 if (argp) {
4552 r = -EFAULT;
4553 if (copy_from_user(&kvm_sigmask, argp,
4554 sizeof(kvm_sigmask)))
4555 goto out;
4556 r = -EINVAL;
4557 if (kvm_sigmask.len != sizeof(sigset))
4558 goto out;
4559 r = -EFAULT;
4560 if (copy_from_user(&sigset, sigmask_arg->sigset,
4561 sizeof(sigset)))
4562 goto out;
4563 p = &sigset;
4564 }
4565 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
4566 break;
4567 }
4568 case KVM_GET_FPU: {
4569 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
4570 r = -ENOMEM;
4571 if (!fpu)
4572 goto out;
4573 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
4574 if (r)
4575 goto out;
4576 r = -EFAULT;
4577 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
4578 goto out;
4579 r = 0;
4580 break;
4581 }
4582 case KVM_SET_FPU: {
4583 fpu = memdup_user(argp, sizeof(*fpu));
4584 if (IS_ERR(fpu)) {
4585 r = PTR_ERR(fpu);
4586 fpu = NULL;
4587 goto out;
4588 }
4589 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
4590 break;
4591 }
4592 case KVM_GET_STATS_FD: {
4593 r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
4594 break;
4595 }
4596 default:
4597 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
4598 }
4599 out:
4600 mutex_unlock(&vcpu->mutex);
4601 kfree(fpu);
4602 kfree(kvm_sregs);
4603 return r;
4604 }
4605
4606 #ifdef CONFIG_KVM_COMPAT
4607 static long kvm_vcpu_compat_ioctl(struct file *filp,
4608 unsigned int ioctl, unsigned long arg)
4609 {
4610 struct kvm_vcpu *vcpu = filp->private_data;
4611 void __user *argp = compat_ptr(arg);
4612 int r;
4613
4614 if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
4615 return -EIO;
4616
4617 switch (ioctl) {
4618 case KVM_SET_SIGNAL_MASK: {
4619 struct kvm_signal_mask __user *sigmask_arg = argp;
4620 struct kvm_signal_mask kvm_sigmask;
4621 sigset_t sigset;
4622
4623 if (argp) {
4624 r = -EFAULT;
4625 if (copy_from_user(&kvm_sigmask, argp,
4626 sizeof(kvm_sigmask)))
4627 goto out;
4628 r = -EINVAL;
4629 if (kvm_sigmask.len != sizeof(compat_sigset_t))
4630 goto out;
4631 r = -EFAULT;
4632 if (get_compat_sigset(&sigset,
4633 (compat_sigset_t __user *)sigmask_arg->sigset))
4634 goto out;
4635 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
4636 } else
4637 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
4638 break;
4639 }
4640 default:
4641 r = kvm_vcpu_ioctl(filp, ioctl, arg);
4642 }
4643
4644 out:
4645 return r;
4646 }
4647 #endif
4648
4649 static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
4650 {
4651 struct kvm_device *dev = filp->private_data;
4652
4653 if (dev->ops->mmap)
4654 return dev->ops->mmap(dev, vma);
4655
4656 return -ENODEV;
4657 }
4658
4659 static int kvm_device_ioctl_attr(struct kvm_device *dev,
4660 int (*accessor)(struct kvm_device *dev,
4661 struct kvm_device_attr *attr),
4662 unsigned long arg)
4663 {
4664 struct kvm_device_attr attr;
4665
4666 if (!accessor)
4667 return -EPERM;
4668
4669 if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
4670 return -EFAULT;
4671
4672 return accessor(dev, &attr);
4673 }
4674
4675 static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
4676 unsigned long arg)
4677 {
4678 struct kvm_device *dev = filp->private_data;
4679
4680 if (dev->kvm->mm != current->mm || dev->kvm->vm_dead)
4681 return -EIO;
4682
4683 switch (ioctl) {
4684 case KVM_SET_DEVICE_ATTR:
4685 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
4686 case KVM_GET_DEVICE_ATTR:
4687 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
4688 case KVM_HAS_DEVICE_ATTR:
4689 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
4690 default:
4691 if (dev->ops->ioctl)
4692 return dev->ops->ioctl(dev, ioctl, arg);
4693
4694 return -ENOTTY;
4695 }
4696 }
4697
4698 static int kvm_device_release(struct inode *inode, struct file *filp)
4699 {
4700 struct kvm_device *dev = filp->private_data;
4701 struct kvm *kvm = dev->kvm;
4702
4703 if (dev->ops->release) {
4704 mutex_lock(&kvm->lock);
4705 list_del(&dev->vm_node);
4706 dev->ops->release(dev);
4707 mutex_unlock(&kvm->lock);
4708 }
4709
4710 kvm_put_kvm(kvm);
4711 return 0;
4712 }
4713
4714 static struct file_operations kvm_device_fops = {
4715 .unlocked_ioctl = kvm_device_ioctl,
4716 .release = kvm_device_release,
4717 KVM_COMPAT(kvm_device_ioctl),
4718 .mmap = kvm_device_mmap,
4719 };
4720
4721 struct kvm_device *kvm_device_from_filp(struct file *filp)
4722 {
4723 if (filp->f_op != &kvm_device_fops)
4724 return NULL;
4725
4726 return filp->private_data;
4727 }
4728
4729 static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
4730 #ifdef CONFIG_KVM_MPIC
4731 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
4732 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
4733 #endif
4734 };
4735
4736 int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
4737 {
4738 if (type >= ARRAY_SIZE(kvm_device_ops_table))
4739 return -ENOSPC;
4740
4741 if (kvm_device_ops_table[type] != NULL)
4742 return -EEXIST;
4743
4744 kvm_device_ops_table[type] = ops;
4745 return 0;
4746 }
4747
4748 void kvm_unregister_device_ops(u32 type)
4749 {
4750 if (kvm_device_ops_table[type] != NULL)
4751 kvm_device_ops_table[type] = NULL;
4752 }
4753
4754 static int kvm_ioctl_create_device(struct kvm *kvm,
4755 struct kvm_create_device *cd)
4756 {
4757 const struct kvm_device_ops *ops;
4758 struct kvm_device *dev;
4759 bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
4760 int type;
4761 int ret;
4762
4763 if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
4764 return -ENODEV;
4765
4766 type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
4767 ops = kvm_device_ops_table[type];
4768 if (ops == NULL)
4769 return -ENODEV;
4770
4771 if (test)
4772 return 0;
4773
4774 dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
4775 if (!dev)
4776 return -ENOMEM;
4777
4778 dev->ops = ops;
4779 dev->kvm = kvm;
4780
4781 mutex_lock(&kvm->lock);
4782 ret = ops->create(dev, type);
4783 if (ret < 0) {
4784 mutex_unlock(&kvm->lock);
4785 kfree(dev);
4786 return ret;
4787 }
4788 list_add(&dev->vm_node, &kvm->devices);
4789 mutex_unlock(&kvm->lock);
4790
4791 if (ops->init)
4792 ops->init(dev);
4793
4794 kvm_get_kvm(kvm);
4795 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
4796 if (ret < 0) {
4797 kvm_put_kvm_no_destroy(kvm);
4798 mutex_lock(&kvm->lock);
4799 list_del(&dev->vm_node);
4800 if (ops->release)
4801 ops->release(dev);
4802 mutex_unlock(&kvm->lock);
4803 if (ops->destroy)
4804 ops->destroy(dev);
4805 return ret;
4806 }
4807
4808 cd->fd = ret;
4809 return 0;
4810 }
4811
4812 static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
4813 {
4814 switch (arg) {
4815 case KVM_CAP_USER_MEMORY:
4816 case KVM_CAP_USER_MEMORY2:
4817 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
4818 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
4819 case KVM_CAP_INTERNAL_ERROR_DATA:
4820 #ifdef CONFIG_HAVE_KVM_MSI
4821 case KVM_CAP_SIGNAL_MSI:
4822 #endif
4823 #ifdef CONFIG_HAVE_KVM_IRQCHIP
4824 case KVM_CAP_IRQFD:
4825 #endif
4826 case KVM_CAP_IOEVENTFD_ANY_LENGTH:
4827 case KVM_CAP_CHECK_EXTENSION_VM:
4828 case KVM_CAP_ENABLE_CAP_VM:
4829 case KVM_CAP_HALT_POLL:
4830 return 1;
4831 #ifdef CONFIG_KVM_MMIO
4832 case KVM_CAP_COALESCED_MMIO:
4833 return KVM_COALESCED_MMIO_PAGE_OFFSET;
4834 case KVM_CAP_COALESCED_PIO:
4835 return 1;
4836 #endif
4837 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4838 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
4839 return KVM_DIRTY_LOG_MANUAL_CAPS;
4840 #endif
4841 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4842 case KVM_CAP_IRQ_ROUTING:
4843 return KVM_MAX_IRQ_ROUTES;
4844 #endif
4845 #if KVM_MAX_NR_ADDRESS_SPACES > 1
4846 case KVM_CAP_MULTI_ADDRESS_SPACE:
4847 if (kvm)
4848 return kvm_arch_nr_memslot_as_ids(kvm);
4849 return KVM_MAX_NR_ADDRESS_SPACES;
4850 #endif
4851 case KVM_CAP_NR_MEMSLOTS:
4852 return KVM_USER_MEM_SLOTS;
4853 case KVM_CAP_DIRTY_LOG_RING:
4854 #ifdef CONFIG_HAVE_KVM_DIRTY_RING_TSO
4855 return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4856 #else
4857 return 0;
4858 #endif
4859 case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
4860 #ifdef CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL
4861 return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4862 #else
4863 return 0;
4864 #endif
4865 #ifdef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP
4866 case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP:
4867 #endif
4868 case KVM_CAP_BINARY_STATS_FD:
4869 case KVM_CAP_SYSTEM_EVENT_DATA:
4870 return 1;
4871 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
4872 case KVM_CAP_MEMORY_ATTRIBUTES:
4873 return kvm_supported_mem_attributes(kvm);
4874 #endif
4875 #ifdef CONFIG_KVM_PRIVATE_MEM
4876 case KVM_CAP_GUEST_MEMFD:
4877 return !kvm || kvm_arch_has_private_mem(kvm);
4878 #endif
4879 default:
4880 break;
4881 }
4882 return kvm_vm_ioctl_check_extension(kvm, arg);
4883 }
4884
4885 static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
4886 {
4887 int r;
4888
4889 if (!KVM_DIRTY_LOG_PAGE_OFFSET)
4890 return -EINVAL;
4891
4892 /* the size must be a power of 2 */
4893 if (!size || (size & (size - 1)))
4894 return -EINVAL;
4895
4896 /* Must be big enough to hold the reserved entries, and at least a page */
4897 if (size < kvm_dirty_ring_get_rsvd_entries() *
4898 sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
4899 return -EINVAL;
4900
4901 if (size > KVM_DIRTY_RING_MAX_ENTRIES *
4902 sizeof(struct kvm_dirty_gfn))
4903 return -E2BIG;
4904
4905 /* We only allow the size to be set once */
4906 if (kvm->dirty_ring_size)
4907 return -EINVAL;
4908
4909 mutex_lock(&kvm->lock);
4910
4911 if (kvm->created_vcpus) {
4912 /* We don't allow changing this value after vCPUs have been created */
4913 r = -EINVAL;
4914 } else {
4915 kvm->dirty_ring_size = size;
4916 r = 0;
4917 }
4918
4919 mutex_unlock(&kvm->lock);
4920 return r;
4921 }
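/*
 * Example: on a 4 KiB-page system, a requested ring size of 65536 bytes
 * (4096 entries of the 16-byte struct kvm_dirty_gfn) is a power of two, at
 * least one page, below the KVM_DIRTY_RING_MAX_ENTRIES limit and, assuming
 * kvm_dirty_ring_get_rsvd_entries() stays well under 4096, above the
 * reserved minimum, so it is accepted as long as no vCPU exists yet.
 */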
4922
4923 static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
4924 {
4925 unsigned long i;
4926 struct kvm_vcpu *vcpu;
4927 int cleared = 0;
4928
4929 if (!kvm->dirty_ring_size)
4930 return -EINVAL;
4931
4932 mutex_lock(&kvm->slots_lock);
4933
4934 kvm_for_each_vcpu(i, vcpu, kvm)
4935 cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
4936
4937 mutex_unlock(&kvm->slots_lock);
4938
4939 if (cleared)
4940 kvm_flush_remote_tlbs(kvm);
4941
4942 return cleared;
4943 }
4944
4945 int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
4946 struct kvm_enable_cap *cap)
4947 {
4948 return -EINVAL;
4949 }
4950
4951 bool kvm_are_all_memslots_empty(struct kvm *kvm)
4952 {
4953 int i;
4954
4955 lockdep_assert_held(&kvm->slots_lock);
4956
4957 for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
4958 if (!kvm_memslots_empty(__kvm_memslots(kvm, i)))
4959 return false;
4960 }
4961
4962 return true;
4963 }
4964 EXPORT_SYMBOL_GPL(kvm_are_all_memslots_empty);
4965
4966 static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
4967 struct kvm_enable_cap *cap)
4968 {
4969 switch (cap->cap) {
4970 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4971 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
4972 u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
4973
4974 if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
4975 allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
4976
4977 if (cap->flags || (cap->args[0] & ~allowed_options))
4978 return -EINVAL;
4979 kvm->manual_dirty_log_protect = cap->args[0];
4980 return 0;
4981 }
4982 #endif
4983 case KVM_CAP_HALT_POLL: {
4984 if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
4985 return -EINVAL;
4986
4987 kvm->max_halt_poll_ns = cap->args[0];
4988
4989 /*
4990 * Ensure kvm->override_halt_poll_ns does not become visible
4991 * before kvm->max_halt_poll_ns.
4992 *
4993 * Pairs with the smp_rmb() in kvm_vcpu_max_halt_poll_ns().
4994 */
4995 smp_wmb();
4996 kvm->override_halt_poll_ns = true;
4997
4998 return 0;
4999 }
5000 case KVM_CAP_DIRTY_LOG_RING:
5001 case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
5002 if (!kvm_vm_ioctl_check_extension_generic(kvm, cap->cap))
5003 return -EINVAL;
5004
5005 return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
5006 case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP: {
5007 int r = -EINVAL;
5008
5009 if (!IS_ENABLED(CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP) ||
5010 !kvm->dirty_ring_size || cap->flags)
5011 return r;
5012
5013 mutex_lock(&kvm->slots_lock);
5014
5015 /*
5016 * For simplicity, allow enabling ring+bitmap if and only if
5017 * there are no memslots, e.g. to ensure all memslots allocate
5018 * a bitmap after the capability is enabled.
5019 */
5020 if (kvm_are_all_memslots_empty(kvm)) {
5021 kvm->dirty_ring_with_bitmap = true;
5022 r = 0;
5023 }
5024
5025 mutex_unlock(&kvm->slots_lock);
5026
5027 return r;
5028 }
5029 default:
5030 return kvm_vm_ioctl_enable_cap(kvm, cap);
5031 }
5032 }
5033
5034 static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
5035 size_t size, loff_t *offset)
5036 {
5037 struct kvm *kvm = file->private_data;
5038
5039 return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header,
5040 &kvm_vm_stats_desc[0], &kvm->stat,
5041 sizeof(kvm->stat), user_buffer, size, offset);
5042 }
5043
5044 static int kvm_vm_stats_release(struct inode *inode, struct file *file)
5045 {
5046 struct kvm *kvm = file->private_data;
5047
5048 kvm_put_kvm(kvm);
5049 return 0;
5050 }
5051
5052 static const struct file_operations kvm_vm_stats_fops = {
5053 .owner = THIS_MODULE,
5054 .read = kvm_vm_stats_read,
5055 .release = kvm_vm_stats_release,
5056 .llseek = noop_llseek,
5057 };
5058
5059 static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
5060 {
5061 int fd;
5062 struct file *file;
5063
5064 fd = get_unused_fd_flags(O_CLOEXEC);
5065 if (fd < 0)
5066 return fd;
5067
5068 file = anon_inode_getfile("kvm-vm-stats",
5069 &kvm_vm_stats_fops, kvm, O_RDONLY);
5070 if (IS_ERR(file)) {
5071 put_unused_fd(fd);
5072 return PTR_ERR(file);
5073 }
5074
5075 kvm_get_kvm(kvm);
5076
5077 file->f_mode |= FMODE_PREAD;
5078 fd_install(fd, file);
5079
5080 return fd;
5081 }
5082
5083 #define SANITY_CHECK_MEM_REGION_FIELD(field) \
5084 do { \
5085 BUILD_BUG_ON(offsetof(struct kvm_userspace_memory_region, field) != \
5086 offsetof(struct kvm_userspace_memory_region2, field)); \
5087 BUILD_BUG_ON(sizeof_field(struct kvm_userspace_memory_region, field) != \
5088 sizeof_field(struct kvm_userspace_memory_region2, field)); \
5089 } while (0)
5090
5091 static long kvm_vm_ioctl(struct file *filp,
5092 unsigned int ioctl, unsigned long arg)
5093 {
5094 struct kvm *kvm = filp->private_data;
5095 void __user *argp = (void __user *)arg;
5096 int r;
5097
5098 if (kvm->mm != current->mm || kvm->vm_dead)
5099 return -EIO;
5100 switch (ioctl) {
5101 case KVM_CREATE_VCPU:
5102 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
5103 break;
5104 case KVM_ENABLE_CAP: {
5105 struct kvm_enable_cap cap;
5106
5107 r = -EFAULT;
5108 if (copy_from_user(&cap, argp, sizeof(cap)))
5109 goto out;
5110 r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
5111 break;
5112 }
5113 case KVM_SET_USER_MEMORY_REGION2:
5114 case KVM_SET_USER_MEMORY_REGION: {
5115 struct kvm_userspace_memory_region2 mem;
5116 unsigned long size;
5117
5118 if (ioctl == KVM_SET_USER_MEMORY_REGION) {
5119 /*
5120 * Fields beyond struct kvm_userspace_memory_region shouldn't be
5121 * accessed, but avoid leaking kernel memory in case of a bug.
5122 */
5123 memset(&mem, 0, sizeof(mem));
5124 size = sizeof(struct kvm_userspace_memory_region);
5125 } else {
5126 size = sizeof(struct kvm_userspace_memory_region2);
5127 }
5128
5129 /* Ensure the common parts of the two structs are identical. */
5130 SANITY_CHECK_MEM_REGION_FIELD(slot);
5131 SANITY_CHECK_MEM_REGION_FIELD(flags);
5132 SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr);
5133 SANITY_CHECK_MEM_REGION_FIELD(memory_size);
5134 SANITY_CHECK_MEM_REGION_FIELD(userspace_addr);
5135
5136 r = -EFAULT;
5137 if (copy_from_user(&mem, argp, size))
5138 goto out;
5139
5140 r = -EINVAL;
5141 if (ioctl == KVM_SET_USER_MEMORY_REGION &&
5142 (mem.flags & ~KVM_SET_USER_MEMORY_REGION_V1_FLAGS))
5143 goto out;
5144
5145 r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
5146 break;
5147 }
5148 case KVM_GET_DIRTY_LOG: {
5149 struct kvm_dirty_log log;
5150
5151 r = -EFAULT;
5152 if (copy_from_user(&log, argp, sizeof(log)))
5153 goto out;
5154 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
5155 break;
5156 }
5157 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
5158 case KVM_CLEAR_DIRTY_LOG: {
5159 struct kvm_clear_dirty_log log;
5160
5161 r = -EFAULT;
5162 if (copy_from_user(&log, argp, sizeof(log)))
5163 goto out;
5164 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
5165 break;
5166 }
5167 #endif
5168 #ifdef CONFIG_KVM_MMIO
5169 case KVM_REGISTER_COALESCED_MMIO: {
5170 struct kvm_coalesced_mmio_zone zone;
5171
5172 r = -EFAULT;
5173 if (copy_from_user(&zone, argp, sizeof(zone)))
5174 goto out;
5175 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
5176 break;
5177 }
5178 case KVM_UNREGISTER_COALESCED_MMIO: {
5179 struct kvm_coalesced_mmio_zone zone;
5180
5181 r = -EFAULT;
5182 if (copy_from_user(&zone, argp, sizeof(zone)))
5183 goto out;
5184 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
5185 break;
5186 }
5187 #endif
5188 case KVM_IRQFD: {
5189 struct kvm_irqfd data;
5190
5191 r = -EFAULT;
5192 if (copy_from_user(&data, argp, sizeof(data)))
5193 goto out;
5194 r = kvm_irqfd(kvm, &data);
5195 break;
5196 }
5197 case KVM_IOEVENTFD: {
5198 struct kvm_ioeventfd data;
5199
5200 r = -EFAULT;
5201 if (copy_from_user(&data, argp, sizeof(data)))
5202 goto out;
5203 r = kvm_ioeventfd(kvm, &data);
5204 break;
5205 }
5206 #ifdef CONFIG_HAVE_KVM_MSI
5207 case KVM_SIGNAL_MSI: {
5208 struct kvm_msi msi;
5209
5210 r = -EFAULT;
5211 if (copy_from_user(&msi, argp, sizeof(msi)))
5212 goto out;
5213 r = kvm_send_userspace_msi(kvm, &msi);
5214 break;
5215 }
5216 #endif
5217 #ifdef __KVM_HAVE_IRQ_LINE
5218 case KVM_IRQ_LINE_STATUS:
5219 case KVM_IRQ_LINE: {
5220 struct kvm_irq_level irq_event;
5221
5222 r = -EFAULT;
5223 if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
5224 goto out;
5225
5226 r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
5227 ioctl == KVM_IRQ_LINE_STATUS);
5228 if (r)
5229 goto out;
5230
5231 r = -EFAULT;
5232 if (ioctl == KVM_IRQ_LINE_STATUS) {
5233 if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
5234 goto out;
5235 }
5236
5237 r = 0;
5238 break;
5239 }
5240 #endif
5241 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
5242 case KVM_SET_GSI_ROUTING: {
5243 struct kvm_irq_routing routing;
5244 struct kvm_irq_routing __user *urouting;
5245 struct kvm_irq_routing_entry *entries = NULL;
5246
5247 r = -EFAULT;
5248 if (copy_from_user(&routing, argp, sizeof(routing)))
5249 goto out;
5250 r = -EINVAL;
5251 if (!kvm_arch_can_set_irq_routing(kvm))
5252 goto out;
5253 if (routing.nr > KVM_MAX_IRQ_ROUTES)
5254 goto out;
5255 if (routing.flags)
5256 goto out;
5257 if (routing.nr) {
5258 urouting = argp;
5259 entries = vmemdup_user(urouting->entries,
5260 array_size(sizeof(*entries),
5261 routing.nr));
5262 if (IS_ERR(entries)) {
5263 r = PTR_ERR(entries);
5264 goto out;
5265 }
5266 }
5267 r = kvm_set_irq_routing(kvm, entries, routing.nr,
5268 routing.flags);
5269 kvfree(entries);
5270 break;
5271 }
5272 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
5273 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
5274 case KVM_SET_MEMORY_ATTRIBUTES: {
5275 struct kvm_memory_attributes attrs;
5276
5277 r = -EFAULT;
5278 if (copy_from_user(&attrs, argp, sizeof(attrs)))
5279 goto out;
5280
5281 r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs);
5282 break;
5283 }
5284 #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
5285 case KVM_CREATE_DEVICE: {
5286 struct kvm_create_device cd;
5287
5288 r = -EFAULT;
5289 if (copy_from_user(&cd, argp, sizeof(cd)))
5290 goto out;
5291
5292 r = kvm_ioctl_create_device(kvm, &cd);
5293 if (r)
5294 goto out;
5295
5296 r = -EFAULT;
5297 if (copy_to_user(argp, &cd, sizeof(cd)))
5298 goto out;
5299
5300 r = 0;
5301 break;
5302 }
5303 case KVM_CHECK_EXTENSION:
5304 r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
5305 break;
5306 case KVM_RESET_DIRTY_RINGS:
5307 r = kvm_vm_ioctl_reset_dirty_pages(kvm);
5308 break;
5309 case KVM_GET_STATS_FD:
5310 r = kvm_vm_ioctl_get_stats_fd(kvm);
5311 break;
5312 #ifdef CONFIG_KVM_PRIVATE_MEM
5313 case KVM_CREATE_GUEST_MEMFD: {
5314 struct kvm_create_guest_memfd guest_memfd;
5315
5316 r = -EFAULT;
5317 if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd)))
5318 goto out;
5319
5320 r = kvm_gmem_create(kvm, &guest_memfd);
5321 break;
5322 }
5323 #endif
5324 default:
5325 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
5326 }
5327 out:
5328 return r;
5329 }
5330
5331 #ifdef CONFIG_KVM_COMPAT
5332 struct compat_kvm_dirty_log {
5333 __u32 slot;
5334 __u32 padding1;
5335 union {
5336 compat_uptr_t dirty_bitmap; /* one bit per page */
5337 __u64 padding2;
5338 };
5339 };
5340
5341 struct compat_kvm_clear_dirty_log {
5342 __u32 slot;
5343 __u32 num_pages;
5344 __u64 first_page;
5345 union {
5346 compat_uptr_t dirty_bitmap; /* one bit per page */
5347 __u64 padding2;
5348 };
5349 };
5350
5351 long __weak kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
5352 unsigned long arg)
5353 {
5354 return -ENOTTY;
5355 }
5356
5357 static long kvm_vm_compat_ioctl(struct file *filp,
5358 unsigned int ioctl, unsigned long arg)
5359 {
5360 struct kvm *kvm = filp->private_data;
5361 int r;
5362
5363 if (kvm->mm != current->mm || kvm->vm_dead)
5364 return -EIO;
5365
5366 r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg);
5367 if (r != -ENOTTY)
5368 return r;
5369
5370 switch (ioctl) {
5371 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
5372 case KVM_CLEAR_DIRTY_LOG: {
5373 struct compat_kvm_clear_dirty_log compat_log;
5374 struct kvm_clear_dirty_log log;
5375
5376 if (copy_from_user(&compat_log, (void __user *)arg,
5377 sizeof(compat_log)))
5378 return -EFAULT;
5379 log.slot = compat_log.slot;
5380 log.num_pages = compat_log.num_pages;
5381 log.first_page = compat_log.first_page;
5382 log.padding2 = compat_log.padding2;
5383 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
5384
5385 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
5386 break;
5387 }
5388 #endif
5389 case KVM_GET_DIRTY_LOG: {
5390 struct compat_kvm_dirty_log compat_log;
5391 struct kvm_dirty_log log;
5392
5393 if (copy_from_user(&compat_log, (void __user *)arg,
5394 sizeof(compat_log)))
5395 return -EFAULT;
5396 log.slot = compat_log.slot;
5397 log.padding1 = compat_log.padding1;
5398 log.padding2 = compat_log.padding2;
5399 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
5400
5401 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
5402 break;
5403 }
5404 default:
5405 r = kvm_vm_ioctl(filp, ioctl, arg);
5406 }
5407 return r;
5408 }
5409 #endif
5410
5411 static struct file_operations kvm_vm_fops = {
5412 .release = kvm_vm_release,
5413 .unlocked_ioctl = kvm_vm_ioctl,
5414 .llseek = noop_llseek,
5415 KVM_COMPAT(kvm_vm_compat_ioctl),
5416 };
5417
5418 bool file_is_kvm(struct file *file)
5419 {
5420 return file && file->f_op == &kvm_vm_fops;
5421 }
5422 EXPORT_SYMBOL_GPL(file_is_kvm);
5423
5424 static int kvm_dev_ioctl_create_vm(unsigned long type)
5425 {
5426 char fdname[ITOA_MAX_LEN + 1];
5427 int r, fd;
5428 struct kvm *kvm;
5429 struct file *file;
5430
5431 fd = get_unused_fd_flags(O_CLOEXEC);
5432 if (fd < 0)
5433 return fd;
5434
5435 snprintf(fdname, sizeof(fdname), "%d", fd);
5436
5437 kvm = kvm_create_vm(type, fdname);
5438 if (IS_ERR(kvm)) {
5439 r = PTR_ERR(kvm);
5440 goto put_fd;
5441 }
5442
5443 file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
5444 if (IS_ERR(file)) {
5445 r = PTR_ERR(file);
5446 goto put_kvm;
5447 }
5448
5449 /*
5450 * Don't call kvm_put_kvm anymore at this point; file->f_op is
5451 * already set, with ->release() being kvm_vm_release(). In error
5452 * cases it will be called by the final fput(file) and will take
5453 * care of doing kvm_put_kvm(kvm).
5454 */
5455 kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
5456
5457 fd_install(fd, file);
5458 return fd;
5459
5460 put_kvm:
5461 kvm_put_kvm(kvm);
5462 put_fd:
5463 put_unused_fd(fd);
5464 return r;
5465 }
5466
5467 static long kvm_dev_ioctl(struct file *filp,
5468 unsigned int ioctl, unsigned long arg)
5469 {
5470 int r = -EINVAL;
5471
5472 switch (ioctl) {
5473 case KVM_GET_API_VERSION:
5474 if (arg)
5475 goto out;
5476 r = KVM_API_VERSION;
5477 break;
5478 case KVM_CREATE_VM:
5479 r = kvm_dev_ioctl_create_vm(arg);
5480 break;
5481 case KVM_CHECK_EXTENSION:
5482 r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
5483 break;
5484 case KVM_GET_VCPU_MMAP_SIZE:
5485 if (arg)
5486 goto out;
5487 r = PAGE_SIZE; /* struct kvm_run */
5488 #ifdef CONFIG_X86
5489 r += PAGE_SIZE; /* pio data page */
5490 #endif
5491 #ifdef CONFIG_KVM_MMIO
5492 r += PAGE_SIZE; /* coalesced mmio ring page */
5493 #endif
5494 break;
5495 default:
5496 return kvm_arch_dev_ioctl(filp, ioctl, arg);
5497 }
5498 out:
5499 return r;
5500 }
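/*
 * For example, with CONFIG_X86 and CONFIG_KVM_MMIO both set,
 * KVM_GET_VCPU_MMAP_SIZE reports three pages: the kvm_run page, the PIO
 * data page and the coalesced MMIO ring page, matching the offsets served
 * by kvm_vcpu_fault().
 */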
5501
5502 static struct file_operations kvm_chardev_ops = {
5503 .unlocked_ioctl = kvm_dev_ioctl,
5504 .llseek = noop_llseek,
5505 KVM_COMPAT(kvm_dev_ioctl),
5506 };
5507
5508 static struct miscdevice kvm_dev = {
5509 KVM_MINOR,
5510 "kvm",
5511 &kvm_chardev_ops,
5512 };
5513
5514 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
5515 __visible bool kvm_rebooting;
5516 EXPORT_SYMBOL_GPL(kvm_rebooting);
5517
5518 static DEFINE_PER_CPU(bool, hardware_enabled);
5519 static int kvm_usage_count;
5520
5521 static int __hardware_enable_nolock(void)
5522 {
5523 if (__this_cpu_read(hardware_enabled))
5524 return 0;
5525
5526 if (kvm_arch_hardware_enable()) {
5527 pr_info("kvm: enabling virtualization on CPU%d failed\n",
5528 raw_smp_processor_id());
5529 return -EIO;
5530 }
5531
5532 __this_cpu_write(hardware_enabled, true);
5533 return 0;
5534 }
5535
5536 static void hardware_enable_nolock(void *failed)
5537 {
5538 if (__hardware_enable_nolock())
5539 atomic_inc(failed);
5540 }
5541
5542 static int kvm_online_cpu(unsigned int cpu)
5543 {
5544 int ret = 0;
5545
5546 /*
5547 * Abort the CPU online process if hardware virtualization cannot
5548 * be enabled. Otherwise running VMs would encounter unrecoverable
5549 * errors when scheduled to this CPU.
5550 */
5551 mutex_lock(&kvm_lock);
5552 if (kvm_usage_count)
5553 ret = __hardware_enable_nolock();
5554 mutex_unlock(&kvm_lock);
5555 return ret;
5556 }
5557
5558 static void hardware_disable_nolock(void *junk)
5559 {
5560 /*
5561 * Note, hardware_disable_all_nolock() tells all online CPUs to disable
5562 * hardware, not just CPUs that successfully enabled hardware!
5563 */
5564 if (!__this_cpu_read(hardware_enabled))
5565 return;
5566
5567 kvm_arch_hardware_disable();
5568
5569 __this_cpu_write(hardware_enabled, false);
5570 }
5571
5572 static int kvm_offline_cpu(unsigned int cpu)
5573 {
5574 mutex_lock(&kvm_lock);
5575 if (kvm_usage_count)
5576 hardware_disable_nolock(NULL);
5577 mutex_unlock(&kvm_lock);
5578 return 0;
5579 }
5580
5581 static void hardware_disable_all_nolock(void)
5582 {
5583 BUG_ON(!kvm_usage_count);
5584
5585 kvm_usage_count--;
5586 if (!kvm_usage_count)
5587 on_each_cpu(hardware_disable_nolock, NULL, 1);
5588 }
5589
5590 static void hardware_disable_all(void)
5591 {
5592 cpus_read_lock();
5593 mutex_lock(&kvm_lock);
5594 hardware_disable_all_nolock();
5595 mutex_unlock(&kvm_lock);
5596 cpus_read_unlock();
5597 }
5598
5599 static int hardware_enable_all(void)
5600 {
5601 atomic_t failed = ATOMIC_INIT(0);
5602 int r;
5603
5604 /*
5605 * Do not enable hardware virtualization if the system is going down.
5606 * If userspace initiated a forced reboot, e.g. reboot -f, then it's
5607 * possible for an in-flight KVM_CREATE_VM to trigger hardware enabling
5608 * after kvm_reboot() is called. Note, this relies on system_state
5609 * being set _before_ kvm_reboot(), which is why KVM uses a syscore ops
5610 * hook instead of registering a dedicated reboot notifier (the latter
5611 * runs before system_state is updated).
5612 */
5613 if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
5614 system_state == SYSTEM_RESTART)
5615 return -EBUSY;
5616
5617 /*
5618 * When onlining a CPU, cpu_online_mask is set before kvm_online_cpu()
5619 * is called, and so on_each_cpu() between them includes the CPU that
5620 * is being onlined. As a result, hardware_enable_nolock() may get
5621 * invoked before kvm_online_cpu(), which also enables hardware if the
5622 * usage count is non-zero. Disable CPU hotplug to avoid attempting to
5623 * enable hardware multiple times.
5624 */
5625 cpus_read_lock();
5626 mutex_lock(&kvm_lock);
5627
5628 r = 0;
5629
5630 kvm_usage_count++;
5631 if (kvm_usage_count == 1) {
5632 on_each_cpu(hardware_enable_nolock, &failed, 1);
5633
5634 if (atomic_read(&failed)) {
5635 hardware_disable_all_nolock();
5636 r = -EBUSY;
5637 }
5638 }
5639
5640 mutex_unlock(&kvm_lock);
5641 cpus_read_unlock();
5642
5643 return r;
5644 }
5645
5646 static void kvm_shutdown(void)
5647 {
5648 /*
5649 * Disable hardware virtualization and set kvm_rebooting to indicate
5650 * that KVM has asynchronously disabled hardware virtualization, i.e.
5651 * that relevant errors and exceptions aren't entirely unexpected.
5652 * Some flavors of hardware virtualization need to be disabled before
5653 * transferring control to firmware (to perform shutdown/reboot), e.g.
5654 * on x86, virtualization can block INIT interrupts, which are used by
5655 * firmware to pull APs back under firmware control. Note, this path
5656 * is used for both shutdown and reboot scenarios, i.e. neither name is
5657 * 100% comprehensive.
5658 */
5659 pr_info("kvm: exiting hardware virtualization\n");
5660 kvm_rebooting = true;
5661 on_each_cpu(hardware_disable_nolock, NULL, 1);
5662 }
5663
5664 static int kvm_suspend(void)
5665 {
5666 /*
5667 * Secondary CPUs and CPU hotplug are disabled across the suspend/resume
5668 * callbacks, i.e. no need to acquire kvm_lock to ensure the usage count
5669 * is stable. Assert that kvm_lock is not held to ensure the system
5670 * isn't suspended while KVM is enabling hardware. Hardware enabling
5671 * can be preempted, but the task cannot be frozen until it has dropped
5672 * all locks (userspace tasks are frozen via a fake signal).
5673 */
5674 lockdep_assert_not_held(&kvm_lock);
5675 lockdep_assert_irqs_disabled();
5676
5677 if (kvm_usage_count)
5678 hardware_disable_nolock(NULL);
5679 return 0;
5680 }
5681
5682 static void kvm_resume(void)
5683 {
5684 lockdep_assert_not_held(&kvm_lock);
5685 lockdep_assert_irqs_disabled();
5686
5687 if (kvm_usage_count)
5688 WARN_ON_ONCE(__hardware_enable_nolock());
5689 }
5690
5691 static struct syscore_ops kvm_syscore_ops = {
5692 .suspend = kvm_suspend,
5693 .resume = kvm_resume,
5694 .shutdown = kvm_shutdown,
5695 };
5696 #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
5697 static int hardware_enable_all(void)
5698 {
5699 return 0;
5700 }
5701
5702 static void hardware_disable_all(void)
5703 {
5704
5705 }
5706 #endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
5707
5708 static void kvm_iodevice_destructor(struct kvm_io_device *dev)
5709 {
5710 if (dev->ops->destructor)
5711 dev->ops->destructor(dev);
5712 }
5713
5714 static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
5715 {
5716 int i;
5717
5718 for (i = 0; i < bus->dev_count; i++) {
5719 struct kvm_io_device *pos = bus->range[i].dev;
5720
5721 kvm_iodevice_destructor(pos);
5722 }
5723 kfree(bus);
5724 }
5725
5726 static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
5727 const struct kvm_io_range *r2)
5728 {
5729 gpa_t addr1 = r1->addr;
5730 gpa_t addr2 = r2->addr;
5731
5732 if (addr1 < addr2)
5733 return -1;
5734
5735 /* If r2->len == 0, match the exact address. If r2->len != 0,
5736 * accept any overlapping write. Any order is acceptable for
5737 * overlapping ranges, because kvm_io_bus_get_first_dev ensures
5738 * we process all of them.
5739 */
5740 if (r2->len) {
5741 addr1 += r1->len;
5742 addr2 += r2->len;
5743 }
5744
5745 if (addr1 > addr2)
5746 return 1;
5747
5748 return 0;
5749 }
5750
5751 static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
5752 {
5753 return kvm_io_bus_cmp(p1, p2);
5754 }
5755
5756 static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
5757 gpa_t addr, int len)
5758 {
5759 struct kvm_io_range *range, key;
5760 int off;
5761
5762 key = (struct kvm_io_range) {
5763 .addr = addr,
5764 .len = len,
5765 };
5766
5767 range = bsearch(&key, bus->range, bus->dev_count,
5768 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
5769 if (range == NULL)
5770 return -ENOENT;
5771
5772 off = range - bus->range;
5773
5774 while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
5775 off--;
5776
5777 return off;
5778 }
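/*
 * Example: bus->range is kept sorted by kvm_io_bus_register_dev(), so for
 * several zero-length ranges registered at the same address the bsearch()
 * above may land in the middle of the run; the walk-back loop then returns
 * the first matching entry so that callers can iterate over every
 * overlapping device in order.
 */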
5779
5780 static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
5781 struct kvm_io_range *range, const void *val)
5782 {
5783 int idx;
5784
5785 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
5786 if (idx < 0)
5787 return -EOPNOTSUPP;
5788
5789 while (idx < bus->dev_count &&
5790 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
5791 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
5792 range->len, val))
5793 return idx;
5794 idx++;
5795 }
5796
5797 return -EOPNOTSUPP;
5798 }
5799
5800 /* kvm_io_bus_write - called under kvm->slots_lock */
5801 int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
5802 int len, const void *val)
5803 {
5804 struct kvm_io_bus *bus;
5805 struct kvm_io_range range;
5806 int r;
5807
5808 range = (struct kvm_io_range) {
5809 .addr = addr,
5810 .len = len,
5811 };
5812
5813 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5814 if (!bus)
5815 return -ENOMEM;
5816 r = __kvm_io_bus_write(vcpu, bus, &range, val);
5817 return r < 0 ? r : 0;
5818 }
5819 EXPORT_SYMBOL_GPL(kvm_io_bus_write);
5820
5821 /* kvm_io_bus_write_cookie - called under kvm->slots_lock */
5822 int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
5823 gpa_t addr, int len, const void *val, long cookie)
5824 {
5825 struct kvm_io_bus *bus;
5826 struct kvm_io_range range;
5827
5828 range = (struct kvm_io_range) {
5829 .addr = addr,
5830 .len = len,
5831 };
5832
5833 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5834 if (!bus)
5835 return -ENOMEM;
5836
5837 /* First try the device referenced by cookie. */
5838 if ((cookie >= 0) && (cookie < bus->dev_count) &&
5839 (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
5840 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
5841 val))
5842 return cookie;
5843
5844 /*
5845 * cookie contained garbage; fall back to search and return the
5846 * correct cookie value.
5847 */
5848 return __kvm_io_bus_write(vcpu, bus, &range, val);
5849 }
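The cookie is nothing more than the index returned by a previous lookup; the caller passes it back so the common case skips the binary search entirely and only falls back when the cached index no longer compares equal. A minimal userspace sketch of that cached-index fast path (the table contents and the lookup() helper are made up for illustration):

#include <stdio.h>
#include <string.h>

#define NENTRIES 4

static const char * const table[NENTRIES] = { "pit", "pic", "ioapic", "pio" };

/* Trust the caller's cached index ("cookie") if it still matches; otherwise
 * search and hand back a fresh index for the caller to cache. */
static long lookup(const char *name, long cookie)
{
	long i;

	if (cookie >= 0 && cookie < NENTRIES && !strcmp(table[cookie], name))
		return cookie;		/* fast path, no search */

	for (i = 0; i < NENTRIES; i++)
		if (!strcmp(table[i], name))
			return i;	/* slow path, fresh cookie */
	return -1;
}

int main(void)
{
	long cookie = -1;

	cookie = lookup("ioapic", cookie);	/* stale cookie: full search */
	cookie = lookup("ioapic", cookie);	/* valid cookie: no search */
	printf("cookie = %ld\n", cookie);
	return 0;
}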
5850
5851 static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
5852 struct kvm_io_range *range, void *val)
5853 {
5854 int idx;
5855
5856 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
5857 if (idx < 0)
5858 return -EOPNOTSUPP;
5859
5860 while (idx < bus->dev_count &&
5861 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
5862 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
5863 range->len, val))
5864 return idx;
5865 idx++;
5866 }
5867
5868 return -EOPNOTSUPP;
5869 }
5870
5871 /* kvm_io_bus_read - called under kvm->slots_lock */
5872 int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
5873 int len, void *val)
5874 {
5875 struct kvm_io_bus *bus;
5876 struct kvm_io_range range;
5877 int r;
5878
5879 range = (struct kvm_io_range) {
5880 .addr = addr,
5881 .len = len,
5882 };
5883
5884 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5885 if (!bus)
5886 return -ENOMEM;
5887 r = __kvm_io_bus_read(vcpu, bus, &range, val);
5888 return r < 0 ? r : 0;
5889 }
5890
5891 /* Caller must hold slots_lock. */
5892 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
5893 int len, struct kvm_io_device *dev)
5894 {
5895 int i;
5896 struct kvm_io_bus *new_bus, *bus;
5897 struct kvm_io_range range;
5898
5899 bus = kvm_get_bus(kvm, bus_idx);
5900 if (!bus)
5901 return -ENOMEM;
5902
5903 /* Exclude ioeventfds, which are already bounded by the fd limit. */
5904 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
5905 return -ENOSPC;
5906
5907 new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
5908 GFP_KERNEL_ACCOUNT);
5909 if (!new_bus)
5910 return -ENOMEM;
5911
5912 range = (struct kvm_io_range) {
5913 .addr = addr,
5914 .len = len,
5915 .dev = dev,
5916 };
5917
5918 for (i = 0; i < bus->dev_count; i++)
5919 if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
5920 break;
5921
5922 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
5923 new_bus->dev_count++;
5924 new_bus->range[i] = range;
5925 memcpy(new_bus->range + i + 1, bus->range + i,
5926 (bus->dev_count - i) * sizeof(struct kvm_io_range));
5927 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5928 synchronize_srcu_expedited(&kvm->srcu);
5929 kfree(bus);
5930
5931 return 0;
5932 }
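Registration never edits the bus that a reader might be walking under SRCU: a new, one-slot-larger array is built with the range spliced in at its sorted position, published with rcu_assign_pointer(), and the old array is freed only after synchronize_srcu_expedited() returns. A userspace sketch of just the copy-and-splice step (plain ints instead of kvm_io_range, no RCU; insert_sorted() is a made-up name):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Build a new array with 'val' inserted at its sorted position; the old
 * array is left untouched, as readers may still be using it. */
static int *insert_sorted(const int *old, size_t count, int val)
{
	int *new_arr = malloc((count + 1) * sizeof(*new_arr));
	size_t i;

	if (!new_arr)
		return NULL;

	for (i = 0; i < count; i++)
		if (old[i] > val)
			break;

	memcpy(new_arr, old, i * sizeof(*new_arr));
	new_arr[i] = val;
	memcpy(new_arr + i + 1, old + i, (count - i) * sizeof(*new_arr));
	return new_arr;
}

int main(void)
{
	const int old[] = { 10, 20, 40 };
	int *new_arr = insert_sorted(old, 3, 30);
	size_t i;

	if (!new_arr)
		return 1;
	for (i = 0; i < 4; i++)
		printf("%d ", new_arr[i]);	/* 10 20 30 40 */
	printf("\n");
	free(new_arr);	/* the kernel instead publishes the new bus, waits, frees the old one */
	return 0;
}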
5933
5934 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5935 struct kvm_io_device *dev)
5936 {
5937 int i;
5938 struct kvm_io_bus *new_bus, *bus;
5939
5940 lockdep_assert_held(&kvm->slots_lock);
5941
5942 bus = kvm_get_bus(kvm, bus_idx);
5943 if (!bus)
5944 return 0;
5945
5946 for (i = 0; i < bus->dev_count; i++) {
5947 if (bus->range[i].dev == dev) {
5948 break;
5949 }
5950 }
5951
5952 if (i == bus->dev_count)
5953 return 0;
5954
5955 new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
5956 GFP_KERNEL_ACCOUNT);
5957 if (new_bus) {
5958 memcpy(new_bus, bus, struct_size(bus, range, i));
5959 new_bus->dev_count--;
5960 memcpy(new_bus->range + i, bus->range + i + 1,
5961 flex_array_size(new_bus, range, new_bus->dev_count - i));
5962 }
5963
5964 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5965 synchronize_srcu_expedited(&kvm->srcu);
5966
5967 /*
5968 * If the NULL bus was installed, destroy the old bus, including all
5969 * the attached devices. Otherwise, destroy only the caller's device.
5970 */
5971 if (!new_bus) {
5972 pr_err("kvm: failed to shrink bus, removing it completely\n");
5973 kvm_io_bus_destroy(bus);
5974 return -ENOMEM;
5975 }
5976
5977 kvm_iodevice_destructor(dev);
5978 kfree(bus);
5979 return 0;
5980 }
5981
5982 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5983 gpa_t addr)
5984 {
5985 struct kvm_io_bus *bus;
5986 int dev_idx, srcu_idx;
5987 struct kvm_io_device *iodev = NULL;
5988
5989 srcu_idx = srcu_read_lock(&kvm->srcu);
5990
5991 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
5992 if (!bus)
5993 goto out_unlock;
5994
5995 dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
5996 if (dev_idx < 0)
5997 goto out_unlock;
5998
5999 iodev = bus->range[dev_idx].dev;
6000
6001 out_unlock:
6002 srcu_read_unlock(&kvm->srcu, srcu_idx);
6003
6004 return iodev;
6005 }
6006 EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
6007
6008 static int kvm_debugfs_open(struct inode *inode, struct file *file,
6009 int (*get)(void *, u64 *), int (*set)(void *, u64),
6010 const char *fmt)
6011 {
6012 int ret;
6013 struct kvm_stat_data *stat_data = inode->i_private;
6014
6015 /*
6016 * The debugfs files are a reference to the kvm struct, which is
6017 * still valid when kvm_destroy_vm() is called. kvm_get_kvm_safe()
6018 * avoids the race between open and the removal of the debugfs directory.
6019 */
6020 if (!kvm_get_kvm_safe(stat_data->kvm))
6021 return -ENOENT;
6022
6023 ret = simple_attr_open(inode, file, get,
6024 kvm_stats_debugfs_mode(stat_data->desc) & 0222
6025 ? set : NULL, fmt);
6026 if (ret)
6027 kvm_put_kvm(stat_data->kvm);
6028
6029 return ret;
6030 }
6031
6032 static int kvm_debugfs_release(struct inode *inode, struct file *file)
6033 {
6034 struct kvm_stat_data *stat_data = inode->i_private;
6035
6036 simple_attr_release(inode, file);
6037 kvm_put_kvm(stat_data->kvm);
6038
6039 return 0;
6040 }
6041
6042 static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
6043 {
6044 *val = *(u64 *)((void *)(&kvm->stat) + offset);
6045
6046 return 0;
6047 }
6048
6049 static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
6050 {
6051 *(u64 *)((void *)(&kvm->stat) + offset) = 0;
6052
6053 return 0;
6054 }
6055
6056 static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
6057 {
6058 unsigned long i;
6059 struct kvm_vcpu *vcpu;
6060
6061 *val = 0;
6062
6063 kvm_for_each_vcpu(i, vcpu, kvm)
6064 *val += *(u64 *)((void *)(&vcpu->stat) + offset);
6065
6066 return 0;
6067 }
6068
6069 static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
6070 {
6071 unsigned long i;
6072 struct kvm_vcpu *vcpu;
6073
6074 kvm_for_each_vcpu(i, vcpu, kvm)
6075 *(u64 *)((void *)(&vcpu->stat) + offset) = 0;
6076
6077 return 0;
6078 }
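All of the stat accessors above work on a byte offset into the stats structure, which is what lets a single set of debugfs fops serve every counter; the offset comes from the stats descriptor. A standalone sketch of the same offsetof()-based access over a made-up vcpu_stats struct (not the actual KVM stats layout):

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct vcpu_stats {
	uint64_t halt_exits;
	uint64_t io_exits;
	uint64_t mmio_exits;
};

/* One generic accessor per operation; the field is chosen by byte offset. */
static uint64_t get_stat(const struct vcpu_stats *stats, size_t offset)
{
	return *(const uint64_t *)((const char *)stats + offset);
}

static void clear_stat(struct vcpu_stats *stats, size_t offset)
{
	*(uint64_t *)((char *)stats + offset) = 0;
}

int main(void)
{
	struct vcpu_stats stats = { .halt_exits = 7, .io_exits = 3 };
	size_t off = offsetof(struct vcpu_stats, io_exits);

	printf("io_exits = %llu\n", (unsigned long long)get_stat(&stats, off));
	clear_stat(&stats, off);
	printf("io_exits after clear = %llu\n",
	       (unsigned long long)get_stat(&stats, off));
	return 0;
}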
6079
6080 static int kvm_stat_data_get(void *data, u64 *val)
6081 {
6082 int r = -EFAULT;
6083 struct kvm_stat_data *stat_data = data;
6084
6085 switch (stat_data->kind) {
6086 case KVM_STAT_VM:
6087 r = kvm_get_stat_per_vm(stat_data->kvm,
6088 stat_data->desc->desc.offset, val);
6089 break;
6090 case KVM_STAT_VCPU:
6091 r = kvm_get_stat_per_vcpu(stat_data->kvm,
6092 stat_data->desc->desc.offset, val);
6093 break;
6094 }
6095
6096 return r;
6097 }
6098
6099 static int kvm_stat_data_clear(void *data, u64 val)
6100 {
6101 int r = -EFAULT;
6102 struct kvm_stat_data *stat_data = data;
6103
6104 if (val)
6105 return -EINVAL;
6106
6107 switch (stat_data->kind) {
6108 case KVM_STAT_VM:
6109 r = kvm_clear_stat_per_vm(stat_data->kvm,
6110 stat_data->desc->desc.offset);
6111 break;
6112 case KVM_STAT_VCPU:
6113 r = kvm_clear_stat_per_vcpu(stat_data->kvm,
6114 stat_data->desc->desc.offset);
6115 break;
6116 }
6117
6118 return r;
6119 }
6120
6121 static int kvm_stat_data_open(struct inode *inode, struct file *file)
6122 {
6123 __simple_attr_check_format("%llu\n", 0ull);
6124 return kvm_debugfs_open(inode, file, kvm_stat_data_get,
6125 kvm_stat_data_clear, "%llu\n");
6126 }
6127
6128 static const struct file_operations stat_fops_per_vm = {
6129 .owner = THIS_MODULE,
6130 .open = kvm_stat_data_open,
6131 .release = kvm_debugfs_release,
6132 .read = simple_attr_read,
6133 .write = simple_attr_write,
6134 .llseek = no_llseek,
6135 };
6136
6137 static int vm_stat_get(void *_offset, u64 *val)
6138 {
6139 unsigned offset = (long)_offset;
6140 struct kvm *kvm;
6141 u64 tmp_val;
6142
6143 *val = 0;
6144 mutex_lock(&kvm_lock);
6145 list_for_each_entry(kvm, &vm_list, vm_list) {
6146 kvm_get_stat_per_vm(kvm, offset, &tmp_val);
6147 *val += tmp_val;
6148 }
6149 mutex_unlock(&kvm_lock);
6150 return 0;
6151 }
6152
6153 static int vm_stat_clear(void *_offset, u64 val)
6154 {
6155 unsigned offset = (long)_offset;
6156 struct kvm *kvm;
6157
6158 if (val)
6159 return -EINVAL;
6160
6161 mutex_lock(&kvm_lock);
6162 list_for_each_entry(kvm, &vm_list, vm_list) {
6163 kvm_clear_stat_per_vm(kvm, offset);
6164 }
6165 mutex_unlock(&kvm_lock);
6166
6167 return 0;
6168 }
6169
6170 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
6171 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");
6172
6173 static int vcpu_stat_get(void *_offset, u64 *val)
6174 {
6175 unsigned offset = (long)_offset;
6176 struct kvm *kvm;
6177 u64 tmp_val;
6178
6179 *val = 0;
6180 mutex_lock(&kvm_lock);
6181 list_for_each_entry(kvm, &vm_list, vm_list) {
6182 kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
6183 *val += tmp_val;
6184 }
6185 mutex_unlock(&kvm_lock);
6186 return 0;
6187 }
6188
6189 static int vcpu_stat_clear(void *_offset, u64 val)
6190 {
6191 unsigned offset = (long)_offset;
6192 struct kvm *kvm;
6193
6194 if (val)
6195 return -EINVAL;
6196
6197 mutex_lock(&kvm_lock);
6198 list_for_each_entry(kvm, &vm_list, vm_list) {
6199 kvm_clear_stat_per_vcpu(kvm, offset);
6200 }
6201 mutex_unlock(&kvm_lock);
6202
6203 return 0;
6204 }
6205
6206 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
6207 "%llu\n");
6208 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");
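These attributes back the aggregate files that kvm_init_debug() below creates under the kvm debugfs directory (typically /sys/kernel/debug/kvm/ when debugfs is mounted); stats whose mode allows writing can be reset by writing 0. A hedged userspace sketch of reading and clearing one counter; the mount point and the stat name are assumptions, check the directory on the running system:

/* Requires root and a mounted debugfs; "halt_exits" is only an example name. */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/kvm/halt_exits";
	unsigned long long val;
	FILE *f = fopen(path, "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%llu", &val) == 1)
		printf("%s = %llu\n", path, val);
	fclose(f);

	/* Writable stats accept only "0", which clears the counter
	 * (see vm_stat_clear()/vcpu_stat_clear() above). */
	f = fopen(path, "w");
	if (f) {
		fputs("0\n", f);
		fclose(f);
	}
	return 0;
}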
6209
6210 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
6211 {
6212 struct kobj_uevent_env *env;
6213 unsigned long long created, active;
6214
6215 if (!kvm_dev.this_device || !kvm)
6216 return;
6217
6218 mutex_lock(&kvm_lock);
6219 if (type == KVM_EVENT_CREATE_VM) {
6220 kvm_createvm_count++;
6221 kvm_active_vms++;
6222 } else if (type == KVM_EVENT_DESTROY_VM) {
6223 kvm_active_vms--;
6224 }
6225 created = kvm_createvm_count;
6226 active = kvm_active_vms;
6227 mutex_unlock(&kvm_lock);
6228
6229 env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
6230 if (!env)
6231 return;
6232
6233 add_uevent_var(env, "CREATED=%llu", created);
6234 add_uevent_var(env, "COUNT=%llu", active);
6235
6236 if (type == KVM_EVENT_CREATE_VM) {
6237 add_uevent_var(env, "EVENT=create");
6238 kvm->userspace_pid = task_pid_nr(current);
6239 } else if (type == KVM_EVENT_DESTROY_VM) {
6240 add_uevent_var(env, "EVENT=destroy");
6241 }
6242 add_uevent_var(env, "PID=%d", kvm->userspace_pid);
6243
6244 if (!IS_ERR(kvm->debugfs_dentry)) {
6245 char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
6246
6247 if (p) {
6248 tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
6249 if (!IS_ERR(tmp))
6250 add_uevent_var(env, "STATS_PATH=%s", tmp);
6251 kfree(p);
6252 }
6253 }
6254 /* No need for overflow checks, since we add at most 5 keys. */
6255 env->envp[env->envp_idx++] = NULL;
6256 kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
6257 kfree(env);
6258 }
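The created/active counters are published as a KOBJ_CHANGE uevent on the kvm misc device, so VM lifecycle events can be watched from userspace on the kernel uevent netlink group. A minimal listener sketch (raw NETLINK_KOBJECT_UEVENT, multicast group 1, error handling trimmed; typically needs root, and the "kvm" substring filter is an assumption about the device path):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>

int main(void)
{
	struct sockaddr_nl addr = {
		.nl_family = AF_NETLINK,
		.nl_groups = 1,		/* kernel uevent multicast group */
	};
	char buf[4096];
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_KOBJECT_UEVENT);

	if (fd < 0 || bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("netlink");
		return 1;
	}

	for (;;) {
		ssize_t i, n = recv(fd, buf, sizeof(buf) - 1, 0);

		if (n <= 0)
			break;
		buf[n] = '\0';
		/* The payload is "action@devpath" followed by NUL-separated
		 * KEY=VALUE strings (EVENT=, CREATED=, COUNT=, PID=, ...). */
		if (strstr(buf, "kvm"))
			for (i = 0; i < n; i += strlen(buf + i) + 1)
				printf("%s\n", buf + i);
	}
	close(fd);
	return 0;
}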
6259
6260 static void kvm_init_debug(void)
6261 {
6262 const struct file_operations *fops;
6263 const struct _kvm_stats_desc *pdesc;
6264 int i;
6265
6266 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
6267
6268 for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
6269 pdesc = &kvm_vm_stats_desc[i];
6270 if (kvm_stats_debugfs_mode(pdesc) & 0222)
6271 fops = &vm_stat_fops;
6272 else
6273 fops = &vm_stat_readonly_fops;
6274 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
6275 kvm_debugfs_dir,
6276 (void *)(long)pdesc->desc.offset, fops);
6277 }
6278
6279 for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
6280 pdesc = &kvm_vcpu_stats_desc[i];
6281 if (kvm_stats_debugfs_mode(pdesc) & 0222)
6282 fops = &vcpu_stat_fops;
6283 else
6284 fops = &vcpu_stat_readonly_fops;
6285 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
6286 kvm_debugfs_dir,
6287 (void *)(long)pdesc->desc.offset, fops);
6288 }
6289 }
6290
6291 static inline
6292 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
6293 {
6294 return container_of(pn, struct kvm_vcpu, preempt_notifier);
6295 }
6296
6297 static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
6298 {
6299 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
6300
6301 WRITE_ONCE(vcpu->preempted, false);
6302 WRITE_ONCE(vcpu->ready, false);
6303
6304 __this_cpu_write(kvm_running_vcpu, vcpu);
6305 kvm_arch_sched_in(vcpu, cpu);
6306 kvm_arch_vcpu_load(vcpu, cpu);
6307 }
6308
6309 static void kvm_sched_out(struct preempt_notifier *pn,
6310 struct task_struct *next)
6311 {
6312 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
6313
6314 if (current->on_rq) {
6315 WRITE_ONCE(vcpu->preempted, true);
6316 WRITE_ONCE(vcpu->ready, true);
6317 }
6318 kvm_arch_vcpu_put(vcpu);
6319 __this_cpu_write(kvm_running_vcpu, NULL);
6320 }
6321
6322 /**
6323 * kvm_get_running_vcpu - get the vcpu running on the current CPU.
6324 *
6325 * We can disable preemption locally around accessing the per-CPU variable,
6326 * and use the resolved vcpu pointer after enabling preemption again,
6327 * because even if the current thread is migrated to another CPU, reading
6328 * the per-CPU value later will give us the same value, since we update
6329 * the per-CPU variable in the preempt notifier handlers.
6330 */
6331 struct kvm_vcpu *kvm_get_running_vcpu(void)
6332 {
6333 struct kvm_vcpu *vcpu;
6334
6335 preempt_disable();
6336 vcpu = __this_cpu_read(kvm_running_vcpu);
6337 preempt_enable();
6338
6339 return vcpu;
6340 }
6341 EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
6342
6343 /**
6344 * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
6345 */
6346 struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
6347 {
6348 return &kvm_running_vcpu;
6349 }
6350
6351 #ifdef CONFIG_GUEST_PERF_EVENTS
6352 static unsigned int kvm_guest_state(void)
6353 {
6354 struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
6355 unsigned int state;
6356
6357 if (!kvm_arch_pmi_in_guest(vcpu))
6358 return 0;
6359
6360 state = PERF_GUEST_ACTIVE;
6361 if (!kvm_arch_vcpu_in_kernel(vcpu))
6362 state |= PERF_GUEST_USER;
6363
6364 return state;
6365 }
6366
6367 static unsigned long kvm_guest_get_ip(void)
6368 {
6369 struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
6370
6371 /* Retrieving the IP must be guarded by a call to kvm_guest_state(). */
6372 if (WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu)))
6373 return 0;
6374
6375 return kvm_arch_vcpu_get_ip(vcpu);
6376 }
6377
6378 static struct perf_guest_info_callbacks kvm_guest_cbs = {
6379 .state = kvm_guest_state,
6380 .get_ip = kvm_guest_get_ip,
6381 .handle_intel_pt_intr = NULL,
6382 };
6383
6384 void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void))
6385 {
6386 kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler;
6387 perf_register_guest_info_callbacks(&kvm_guest_cbs);
6388 }
6389 void kvm_unregister_perf_callbacks(void)
6390 {
6391 perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
6392 }
6393 #endif
6394
6395 int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
6396 {
6397 int r;
6398 int cpu;
6399
6400 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
6401 r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
6402 kvm_online_cpu, kvm_offline_cpu);
6403 if (r)
6404 return r;
6405
6406 register_syscore_ops(&kvm_syscore_ops);
6407 #endif
6408
6409 /* A kmem cache lets us meet the alignment requirements of fx_save. */
6410 if (!vcpu_align)
6411 vcpu_align = __alignof__(struct kvm_vcpu);
6412 kvm_vcpu_cache =
6413 kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
6414 SLAB_ACCOUNT,
6415 offsetof(struct kvm_vcpu, arch),
6416 offsetofend(struct kvm_vcpu, stats_id)
6417 - offsetof(struct kvm_vcpu, arch),
6418 NULL);
6419 if (!kvm_vcpu_cache) {
6420 r = -ENOMEM;
6421 goto err_vcpu_cache;
6422 }
6423
6424 for_each_possible_cpu(cpu) {
6425 if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
6426 GFP_KERNEL, cpu_to_node(cpu))) {
6427 r = -ENOMEM;
6428 goto err_cpu_kick_mask;
6429 }
6430 }
6431
6432 r = kvm_irqfd_init();
6433 if (r)
6434 goto err_irqfd;
6435
6436 r = kvm_async_pf_init();
6437 if (r)
6438 goto err_async_pf;
6439
6440 kvm_chardev_ops.owner = module;
6441 kvm_vm_fops.owner = module;
6442 kvm_vcpu_fops.owner = module;
6443 kvm_device_fops.owner = module;
6444
6445 kvm_preempt_ops.sched_in = kvm_sched_in;
6446 kvm_preempt_ops.sched_out = kvm_sched_out;
6447
6448 kvm_init_debug();
6449
6450 r = kvm_vfio_ops_init();
6451 if (WARN_ON_ONCE(r))
6452 goto err_vfio;
6453
6454 kvm_gmem_init(module);
6455
6456 /*
6457 * Registration _must_ be the very last thing done, as this exposes
6458 * /dev/kvm to userspace, i.e. all infrastructure must be set up!
6459 */
6460 r = misc_register(&kvm_dev);
6461 if (r) {
6462 pr_err("kvm: misc device register failed\n");
6463 goto err_register;
6464 }
6465
6466 return 0;
6467
6468 err_register:
6469 kvm_vfio_ops_exit();
6470 err_vfio:
6471 kvm_async_pf_deinit();
6472 err_async_pf:
6473 kvm_irqfd_exit();
6474 err_irqfd:
6475 err_cpu_kick_mask:
6476 for_each_possible_cpu(cpu)
6477 free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
6478 kmem_cache_destroy(kvm_vcpu_cache);
6479 err_vcpu_cache:
6480 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
6481 unregister_syscore_ops(&kvm_syscore_ops);
6482 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
6483 #endif
6484 return r;
6485 }
6486 EXPORT_SYMBOL_GPL(kvm_init);
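kvm_init() unwinds through the goto ladder above in strict reverse order of setup, so each failure label releases exactly what was already acquired. A small standalone sketch of that unwind idiom with made-up resources (not the KVM init sequence):

#include <stdio.h>
#include <stdlib.h>

/* Each failure jumps to a label that releases only what was already set up,
 * in reverse order; the resources here are invented for illustration. */
static int setup_everything(void)
{
	char *cache, *mask;
	FILE *dev;

	cache = malloc(64);
	if (!cache)
		goto err_cache;

	mask = malloc(64);
	if (!mask)
		goto err_mask;

	dev = fopen("/dev/null", "w");	/* stands in for misc_register() */
	if (!dev)
		goto err_register;

	/* Success; a real init would keep these, the toy just releases them. */
	fclose(dev);
	free(mask);
	free(cache);
	return 0;

err_register:
	free(mask);
err_mask:
	free(cache);
err_cache:
	return -1;
}

int main(void)
{
	return setup_everything() ? 1 : 0;
}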
6487
6488 void kvm_exit(void)
6489 {
6490 int cpu;
6491
6492 /*
6493 * Note, unregistering /dev/kvm doesn't strictly need to come first, as
6494 * fops_get(), a.k.a. try_module_get(), prevents acquiring references
6495 * to KVM while the module is being stopped.
6496 */
6497 misc_deregister(&kvm_dev);
6498
6499 debugfs_remove_recursive(kvm_debugfs_dir);
6500 for_each_possible_cpu(cpu)
6501 free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
6502 kmem_cache_destroy(kvm_vcpu_cache);
6503 kvm_vfio_ops_exit();
6504 kvm_async_pf_deinit();
6505 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
6506 unregister_syscore_ops(&kvm_syscore_ops);
6507 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
6508 #endif
6509 kvm_irqfd_exit();
6510 }
6511 EXPORT_SYMBOL_GPL(kvm_exit);
6512
6513 struct kvm_vm_worker_thread_context {
6514 struct kvm *kvm;
6515 struct task_struct *parent;
6516 struct completion init_done;
6517 kvm_vm_thread_fn_t thread_fn;
6518 uintptr_t data;
6519 int err;
6520 };
6521
6522 static int kvm_vm_worker_thread(void *context)
6523 {
6524 /*
6525 * The init_context is allocated on the stack of the parent thread, so
6526 * we have to locally copy anything that is needed beyond initialization.
6527 */
6528 struct kvm_vm_worker_thread_context *init_context = context;
6529 struct task_struct *parent;
6530 struct kvm *kvm = init_context->kvm;
6531 kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
6532 uintptr_t data = init_context->data;
6533 int err;
6534
6535 err = kthread_park(current);
6536 /* kthread_park(current) is never supposed to return an error */
6537 WARN_ON(err != 0);
6538 if (err)
6539 goto init_complete;
6540
6541 err = cgroup_attach_task_all(init_context->parent, current);
6542 if (err) {
6543 kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
6544 __func__, err);
6545 goto init_complete;
6546 }
6547
6548 set_user_nice(current, task_nice(init_context->parent));
6549
6550 init_complete:
6551 init_context->err = err;
6552 complete(&init_context->init_done);
6553 init_context = NULL;
6554
6555 if (err)
6556 goto out;
6557
6558 /* Wait to be woken up by the spawner before proceeding. */
6559 kthread_parkme();
6560
6561 if (!kthread_should_stop())
6562 err = thread_fn(kvm, data);
6563
6564 out:
6565 /*
6566 * Move kthread back to its original cgroup to prevent it lingering in
6567 * the cgroup of the VM process, after the latter finishes its
6568 * execution.
6569 *
6570 * kthread_stop() waits on the 'exited' completion condition which is
6571 * set in exit_mm(), via mm_release(), in do_exit(). However, the
6572 * kthread is removed from the cgroup in cgroup_exit(), which is
6573 * called after exit_mm(). This causes kthread_stop() to return
6574 * before the kthread actually quits the cgroup.
6575 */
6576 rcu_read_lock();
6577 parent = rcu_dereference(current->real_parent);
6578 get_task_struct(parent);
6579 rcu_read_unlock();
6580 cgroup_attach_task_all(parent, current);
6581 put_task_struct(parent);
6582
6583 return err;
6584 }
6585
6586 int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
6587 uintptr_t data, const char *name,
6588 struct task_struct **thread_ptr)
6589 {
6590 struct kvm_vm_worker_thread_context init_context = {};
6591 struct task_struct *thread;
6592
6593 *thread_ptr = NULL;
6594 init_context.kvm = kvm;
6595 init_context.parent = current;
6596 init_context.thread_fn = thread_fn;
6597 init_context.data = data;
6598 init_completion(&init_context.init_done);
6599
6600 thread = kthread_run(kvm_vm_worker_thread, &init_context,
6601 "%s-%d", name, task_pid_nr(current));
6602 if (IS_ERR(thread))
6603 return PTR_ERR(thread);
6604
6605 /* kthread_run is never supposed to return NULL */
6606 WARN_ON(thread == NULL);
6607
6608 wait_for_completion(&init_context.init_done);
6609
6610 if (!init_context.err)
6611 *thread_ptr = thread;
6612
6613 return init_context.err;
6614 }
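The handshake here mirrors the comment at the top of kvm_vm_worker_thread(): the init context lives on the creator's stack, the worker copies what it needs, reports its status through the completion, and only then may the creator's stack frame go away. A userspace sketch of the same handshake using POSIX threads in place of kthreads (all names, including the example thread name, are illustrative):

/* Compile with -pthread. A condition variable stands in for struct completion. */
#include <pthread.h>
#include <stdio.h>

struct worker_init_context {
	pthread_mutex_t lock;
	pthread_cond_t init_done;
	int done;
	int err;
	const char *name;	/* data the worker must copy out */
};

static void *worker(void *arg)
{
	struct worker_init_context *init = arg;
	char name[32];

	/* Copy anything needed beyond initialization; 'init' dies soon. */
	snprintf(name, sizeof(name), "%s", init->name);

	pthread_mutex_lock(&init->lock);
	init->err = 0;
	init->done = 1;
	pthread_cond_signal(&init->init_done);
	pthread_mutex_unlock(&init->lock);
	init = NULL;		/* must not be touched from here on */

	printf("worker '%s' running\n", name);
	return NULL;
}

int main(void)
{
	struct worker_init_context init = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.init_done = PTHREAD_COND_INITIALIZER,
		.name = "kvm-nx-lpage-recovery",	/* example name only */
	};
	pthread_t thread;

	if (pthread_create(&thread, NULL, worker, &init))
		return 1;

	/* Wait for the worker's status before letting 'init' go out of scope. */
	pthread_mutex_lock(&init.lock);
	while (!init.done)
		pthread_cond_wait(&init.init_done, &init.lock);
	pthread_mutex_unlock(&init.lock);

	pthread_join(thread, NULL);
	return init.err;
}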