// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 */

#include <kvm/iodev.h>

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/syscore_ops.h>
#include <linux/cpu.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/sched/stat.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/compat.h>
#include <linux/srcu.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/bsearch.h>
#include <linux/io.h>
#include <linux/lockdep.h>
#include <linux/kthread.h>
#include <linux/suspend.h>

#include <asm/processor.h>
#include <asm/ioctl.h>
#include <linux/uaccess.h>

#include "coalesced_mmio.h"
#include "async_pf.h"
#include "kvm_mm.h"
#include "vfio.h"

#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>

#include <linux/kvm_dirty_ring.h>

/* Worst case buffer size needed for holding an integer. */
#define ITOA_MAX_LEN 12

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

/* Architectures should define their poll value according to the halt latency */
unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
module_param(halt_poll_ns, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns);

/* Default doubles per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_grow = 2;
module_param(halt_poll_ns_grow, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow);

/* The start value to grow halt_poll_ns from */
unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
module_param(halt_poll_ns_grow_start, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);

/* Default resets per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_shrink;
module_param(halt_poll_ns_shrink, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);

/*
 * Ordering of locks:
 *
 *	kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 */

DEFINE_MUTEX(kvm_lock);
static DEFINE_RAW_SPINLOCK(kvm_count_lock);
LIST_HEAD(vm_list);

static cpumask_var_t cpus_hardware_enabled;
static int kvm_usage_count;
static atomic_t hardware_enable_failed;

static struct kmem_cache *kvm_vcpu_cache;

static __read_mostly struct preempt_ops kvm_preempt_ops;
static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);

struct dentry *kvm_debugfs_dir;
EXPORT_SYMBOL_GPL(kvm_debugfs_dir);

static const struct file_operations stat_fops_per_vm;

static struct file_operations kvm_chardev_ops;

static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);
#ifdef CONFIG_KVM_COMPAT
static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
				  unsigned long arg);
#define KVM_COMPAT(c)	.compat_ioctl	= (c)
#else
/*
 * For architectures that don't implement a compat infrastructure,
 * adopt a double line of defense:
 * - Prevent a compat task from opening /dev/kvm
 * - If the open has been done by a 64bit task, and the KVM fd
 *   passed to a compat task, let the ioctls fail.
 */
static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
				unsigned long arg) { return -EINVAL; }

static int kvm_no_compat_open(struct inode *inode, struct file *file)
{
	return is_compat_task() ? -ENODEV : 0;
}
#define KVM_COMPAT(c)	.compat_ioctl	= kvm_no_compat_ioctl,	\
			.open		= kvm_no_compat_open
#endif
static int hardware_enable_all(void);
static void hardware_disable_all(void);

static void kvm_io_bus_destroy(struct kvm_io_bus *bus);

__visible bool kvm_rebooting;
EXPORT_SYMBOL_GPL(kvm_rebooting);

#define KVM_EVENT_CREATE_VM 0
#define KVM_EVENT_DESTROY_VM 1
static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
static unsigned long long kvm_createvm_count;
static unsigned long long kvm_active_vms;

static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);

__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
						   unsigned long start, unsigned long end)
{
}

__weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
{
}

bool kvm_is_zone_device_page(struct page *page)
{
	/*
	 * The metadata used by is_zone_device_page() to determine whether or
	 * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
	 * the device has been pinned, e.g. by get_user_pages().  WARN if the
	 * page_count() is zero to help detect bad usage of this helper.
	 */
	if (WARN_ON_ONCE(!page_count(page)))
		return false;

	return is_zone_device_page(page);
}

/*
 * Returns a 'struct page' if the pfn is "valid" and backed by a refcounted
 * page, NULL otherwise.  Note, the list of refcounted PG_reserved page types
 * is likely incomplete, it has been compiled purely through people wanting to
 * back guest with a certain type of memory and encountering issues.
 */
struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn)
{
	struct page *page;

	if (!pfn_valid(pfn))
		return NULL;

	page = pfn_to_page(pfn);
	if (!PageReserved(page))
		return page;

	/* The ZERO_PAGE(s) is marked PG_reserved, but is refcounted. */
	if (is_zero_pfn(pfn))
		return page;

	/*
	 * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
	 * perspective they are "normal" pages, albeit with slightly different
	 * usage rules.
	 */
	if (kvm_is_zone_device_page(page))
		return page;

	return NULL;
}

/*
 * Switches to specified vcpu, until a matching vcpu_put()
 */
void vcpu_load(struct kvm_vcpu *vcpu)
{
	int cpu = get_cpu();

	__this_cpu_write(kvm_running_vcpu, vcpu);
	preempt_notifier_register(&vcpu->preempt_notifier);
	kvm_arch_vcpu_load(vcpu, cpu);
	put_cpu();
}
EXPORT_SYMBOL_GPL(vcpu_load);

void vcpu_put(struct kvm_vcpu *vcpu)
{
	preempt_disable();
	kvm_arch_vcpu_put(vcpu);
	preempt_notifier_unregister(&vcpu->preempt_notifier);
	__this_cpu_write(kvm_running_vcpu, NULL);
	preempt_enable();
}
EXPORT_SYMBOL_GPL(vcpu_put);

/* TODO: merge with kvm_arch_vcpu_should_kick */
static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
{
	int mode = kvm_vcpu_exiting_guest_mode(vcpu);

	/*
	 * We need to wait for the VCPU to reenable interrupts and get out of
	 * READING_SHADOW_PAGE_TABLES mode.
	 */
	if (req & KVM_REQUEST_WAIT)
		return mode != OUTSIDE_GUEST_MODE;

	/*
	 * Need to kick a running VCPU, but otherwise there is nothing to do.
	 */
	return mode == IN_GUEST_MODE;
}

static void ack_kick(void *_completed)
{
}

static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
{
	if (cpumask_empty(cpus))
		return false;

	smp_call_function_many(cpus, ack_kick, NULL, wait);
	return true;
}

static void kvm_make_vcpu_request(struct kvm_vcpu *vcpu, unsigned int req,
				  struct cpumask *tmp, int current_cpu)
{
	int cpu;

	if (likely(!(req & KVM_REQUEST_NO_ACTION)))
		__kvm_make_request(req, vcpu);

	if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
		return;

	/*
	 * Note, the vCPU could get migrated to a different pCPU at any point
	 * after kvm_request_needs_ipi(), which could result in sending an IPI
	 * to the previous pCPU.  But, that's OK because the purpose of the IPI
	 * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is
	 * satisfied if the vCPU migrates. Entering READING_SHADOW_PAGE_TABLES
	 * after this point is also OK, as the requirement is only that KVM wait
	 * for vCPUs that were reading SPTEs _before_ any changes were
	 * finalized. See kvm_vcpu_kick() for more details on handling requests.
	 */
	if (kvm_request_needs_ipi(vcpu, req)) {
		cpu = READ_ONCE(vcpu->cpu);
		if (cpu != -1 && cpu != current_cpu)
			__cpumask_set_cpu(cpu, tmp);
	}
}

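/*
 * Raise @req on every vCPU whose index is set in @vcpu_bitmap, and kick (send
 * an IPI to) any target that is currently running in guest mode.  Returns
 * true if at least one pCPU was kicked.
 */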
bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
				 unsigned long *vcpu_bitmap)
{
	struct kvm_vcpu *vcpu;
	struct cpumask *cpus;
	int i, me;
	bool called;

	me = get_cpu();

	cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
	cpumask_clear(cpus);

	for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
		vcpu = kvm_get_vcpu(kvm, i);
		if (!vcpu)
			continue;
		kvm_make_vcpu_request(vcpu, req, cpus, me);
	}

	called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
	put_cpu();

	return called;
}

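/*
 * Raise @req on every vCPU in the VM except @except, kicking any vCPUs that
 * are currently in guest mode.  Returns true if at least one IPI was sent.
 */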
bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
				      struct kvm_vcpu *except)
{
	struct kvm_vcpu *vcpu;
	struct cpumask *cpus;
	unsigned long i;
	bool called;
	int me;

	me = get_cpu();

	cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
	cpumask_clear(cpus);

	kvm_for_each_vcpu(i, vcpu, kvm) {
		if (vcpu == except)
			continue;
		kvm_make_vcpu_request(vcpu, req, cpus, me);
	}

	called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
	put_cpu();

	return called;
}

bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
{
	return kvm_make_all_cpus_request_except(kvm, req, NULL);
}
EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);

#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	++kvm->stat.generic.remote_tlb_flush_requests;

	/*
	 * We want to publish modifications to the page tables before reading
	 * mode. Pairs with a memory barrier in arch-specific code.
	 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
	 * and smp_mb in walk_shadow_page_lockless_begin/end.
	 * - powerpc: smp_mb in kvmppc_prepare_to_enter.
	 *
	 * There is already an smp_mb__after_atomic() before
	 * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
	 * barrier here.
	 */
	if (!kvm_arch_flush_remote_tlb(kvm)
	    || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
		++kvm->stat.generic.remote_tlb_flush;
}
EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
#endif

static void kvm_flush_shadow_all(struct kvm *kvm)
{
	kvm_arch_flush_shadow_all(kvm);
	kvm_arch_guest_memory_reclaimed(kvm);
}

#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
					       gfp_t gfp_flags)
{
	gfp_flags |= mc->gfp_zero;

	if (mc->kmem_cache)
		return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
	else
		return (void *)__get_free_page(gfp_flags);
}

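/*
 * Pre-fill @mc with at least @min objects so that later allocations from the
 * cache cannot fail.  The backing array is allocated on first use and its
 * @capacity must be the same on every topup.
 */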
int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
{
	gfp_t gfp = mc->gfp_custom ? mc->gfp_custom : GFP_KERNEL_ACCOUNT;
	void *obj;

	if (mc->nobjs >= min)
		return 0;

	if (unlikely(!mc->objects)) {
		if (WARN_ON_ONCE(!capacity))
			return -EIO;

		mc->objects = kvmalloc_array(sizeof(void *), capacity, gfp);
		if (!mc->objects)
			return -ENOMEM;

		mc->capacity = capacity;
	}

	/* It is illegal to request a different capacity across topups. */
	if (WARN_ON_ONCE(mc->capacity != capacity))
		return -EIO;

	while (mc->nobjs < mc->capacity) {
		obj = mmu_memory_cache_alloc_obj(mc, gfp);
		if (!obj)
			return mc->nobjs >= min ? 0 : -ENOMEM;
		mc->objects[mc->nobjs++] = obj;
	}
	return 0;
}

int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
{
	return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min);
}

int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
{
	return mc->nobjs;
}

void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs) {
		if (mc->kmem_cache)
			kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
		else
			free_page((unsigned long)mc->objects[--mc->nobjs]);
	}

	kvfree(mc->objects);

	mc->objects = NULL;
	mc->capacity = 0;
}

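/*
 * Pop an object from a previously topped-up cache.  Falls back to an atomic
 * allocation if the cache is unexpectedly empty, and BUGs if even that fails.
 */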
void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
{
	void *p;

	if (WARN_ON(!mc->nobjs))
		p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
	else
		p = mc->objects[--mc->nobjs];
	BUG_ON(!p);
	return p;
}
#endif

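/* One-time initialization of the architecture-neutral portion of a vCPU. */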
static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
	mutex_init(&vcpu->mutex);
	vcpu->cpu = -1;
	vcpu->kvm = kvm;
	vcpu->vcpu_id = id;
	vcpu->pid = NULL;
#ifndef __KVM_HAVE_ARCH_WQP
	rcuwait_init(&vcpu->wait);
#endif
	kvm_async_pf_vcpu_init(vcpu);

	kvm_vcpu_set_in_spin_loop(vcpu, false);
	kvm_vcpu_set_dy_eligible(vcpu, false);
	vcpu->preempted = false;
	vcpu->ready = false;
	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
	vcpu->last_used_slot = NULL;

	/* Fill the stats id string for the vcpu */
	snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
		 task_pid_nr(current), id);
}

static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_arch_vcpu_destroy(vcpu);
	kvm_dirty_ring_free(&vcpu->dirty_ring);

	/*
	 * No need for rcu_read_lock as VCPU_RUN is the only place that changes
	 * the vcpu->pid pointer, and at destruction time all file descriptors
	 * are already gone.
	 */
	put_pid(rcu_dereference_protected(vcpu->pid, 1));

	free_page((unsigned long)vcpu->run);
	kmem_cache_free(kvm_vcpu_cache, vcpu);
}

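/* Destroy all vCPUs of @kvm and drop them from the vcpu_array xarray. */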
void kvm_destroy_vcpus(struct kvm *kvm)
{
	unsigned long i;
	struct kvm_vcpu *vcpu;

	kvm_for_each_vcpu(i, vcpu, kvm) {
		kvm_vcpu_destroy(vcpu);
		xa_erase(&kvm->vcpu_array, i);
	}

	atomic_set(&kvm->online_vcpus, 0);
}
EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);

#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
	return container_of(mn, struct kvm, mmu_notifier);
}

static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long start, unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
	srcu_read_unlock(&kvm->srcu, idx);
}

typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);

typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
			     unsigned long end);

typedef void (*on_unlock_fn_t)(struct kvm *kvm);

struct kvm_hva_range {
	unsigned long start;
	unsigned long end;
	pte_t pte;
	hva_handler_t handler;
	on_lock_fn_t on_lock;
	on_unlock_fn_t on_unlock;
	bool flush_on_ret;
	bool may_block;
};

/*
 * Use a dedicated stub instead of NULL to indicate that there is no callback
 * function/handler.  The compiler technically can't guarantee that a real
 * function will have a non-zero address, and so it will generate code to
 * check for !NULL, whereas comparing against a stub will be elided at compile
 * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
 */
static void kvm_null_fn(void)
{

}
#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)

/* Iterate over each memslot intersecting [start, last] (inclusive) range */
#define kvm_for_each_memslot_in_hva_range(node, slots, start, last)	     \
	for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \
	     node;							     \
	     node = interval_tree_iter_next(node, start, last))

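/*
 * Invoke the handler on every memslot, in every address space, that overlaps
 * the given hva range, converting the hva range into a gfn range for each
 * slot.  mmu_lock is taken lazily, i.e. only if an overlapping memslot is
 * found, and a remote TLB flush is issued if any handler returns true and
 * flush_on_ret is set.
 */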
static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
						  const struct kvm_hva_range *range)
{
	bool ret = false, locked = false;
	struct kvm_gfn_range gfn_range;
	struct kvm_memory_slot *slot;
	struct kvm_memslots *slots;
	int i, idx;

	if (WARN_ON_ONCE(range->end <= range->start))
		return 0;

	/* A null handler is allowed if and only if on_lock() is provided. */
	if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
			 IS_KVM_NULL_FN(range->handler)))
		return 0;

	idx = srcu_read_lock(&kvm->srcu);

	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		struct interval_tree_node *node;

		slots = __kvm_memslots(kvm, i);
		kvm_for_each_memslot_in_hva_range(node, slots,
						  range->start, range->end - 1) {
			unsigned long hva_start, hva_end;

			slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]);
			hva_start = max(range->start, slot->userspace_addr);
			hva_end = min(range->end, slot->userspace_addr +
						  (slot->npages << PAGE_SHIFT));

			/*
			 * To optimize for the likely case where the address
			 * range is covered by zero or one memslots, don't
			 * bother making these conditional (to avoid writes on
			 * the second or later invocation of the handler).
			 */
			gfn_range.pte = range->pte;
			gfn_range.may_block = range->may_block;

			/*
			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
			 */
			gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
			gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
			gfn_range.slot = slot;

			if (!locked) {
				locked = true;
				KVM_MMU_LOCK(kvm);
				if (!IS_KVM_NULL_FN(range->on_lock))
					range->on_lock(kvm, range->start, range->end);
				if (IS_KVM_NULL_FN(range->handler))
					break;
			}
			ret |= range->handler(kvm, &gfn_range);
		}
	}

	if (range->flush_on_ret && ret)
		kvm_flush_remote_tlbs(kvm);

	if (locked) {
		KVM_MMU_UNLOCK(kvm);
		if (!IS_KVM_NULL_FN(range->on_unlock))
			range->on_unlock(kvm);
	}

	srcu_read_unlock(&kvm->srcu, idx);

	/* The notifiers are averse to booleans. :-( */
	return (int)ret;
}

static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
						unsigned long start,
						unsigned long end,
						pte_t pte,
						hva_handler_t handler)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_hva_range range = {
		.start		= start,
		.end		= end,
		.pte		= pte,
		.handler	= handler,
		.on_lock	= (void *)kvm_null_fn,
		.on_unlock	= (void *)kvm_null_fn,
		.flush_on_ret	= true,
		.may_block	= false,
	};

	return __kvm_handle_hva_range(kvm, &range);
}

static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
							 unsigned long start,
							 unsigned long end,
							 hva_handler_t handler)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_hva_range range = {
		.start		= start,
		.end		= end,
		.pte		= __pte(0),
		.handler	= handler,
		.on_lock	= (void *)kvm_null_fn,
		.on_unlock	= (void *)kvm_null_fn,
		.flush_on_ret	= false,
		.may_block	= false,
	};

	return __kvm_handle_hva_range(kvm, &range);
}
static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long address,
					pte_t pte)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);

	trace_kvm_set_spte_hva(address);

	/*
	 * .change_pte() must be surrounded by .invalidate_range_{start,end}().
	 * If mmu_invalidate_in_progress is zero, then no in-progress
	 * invalidations, including this one, found a relevant memslot at
	 * start(); rechecking memslots here is unnecessary.  Note, a false
	 * positive (count elevated by a different invalidation) is sub-optimal
	 * but functionally ok.
	 */
	WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
	if (!READ_ONCE(kvm->mmu_invalidate_in_progress))
		return;

	kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
}

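/*
 * Called under mmu_lock when an mmu_notifier invalidation starts: bump the
 * in-progress count and record (or expand) the range being invalidated so
 * that concurrent page faults can detect the collision and retry.
 */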
void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start,
			      unsigned long end)
{
	/*
	 * The count increase must become visible at unlock time as no
	 * spte can be established without taking the mmu_lock and
	 * count is also read inside the mmu_lock critical section.
	 */
	kvm->mmu_invalidate_in_progress++;
	if (likely(kvm->mmu_invalidate_in_progress == 1)) {
		kvm->mmu_invalidate_range_start = start;
		kvm->mmu_invalidate_range_end = end;
	} else {
		/*
		 * Fully tracking multiple concurrent ranges has diminishing
		 * returns. Keep things simple and just find the minimal range
		 * which includes the current and new ranges. As there won't be
		 * enough information to subtract a range after its invalidate
		 * completes, any ranges invalidated concurrently will
		 * accumulate and persist until all outstanding invalidates
		 * complete.
		 */
		kvm->mmu_invalidate_range_start =
			min(kvm->mmu_invalidate_range_start, start);
		kvm->mmu_invalidate_range_end =
			max(kvm->mmu_invalidate_range_end, end);
	}
}

static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
					const struct mmu_notifier_range *range)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_hva_range hva_range = {
		.start		= range->start,
		.end		= range->end,
		.pte		= __pte(0),
		.handler	= kvm_unmap_gfn_range,
		.on_lock	= kvm_mmu_invalidate_begin,
		.on_unlock	= kvm_arch_guest_memory_reclaimed,
		.flush_on_ret	= true,
		.may_block	= mmu_notifier_range_blockable(range),
	};

	trace_kvm_unmap_hva_range(range->start, range->end);

	/*
	 * Prevent memslot modification between range_start() and range_end()
	 * so that conditionally locking provides the same result in both
	 * functions.  Without that guarantee, the mmu_invalidate_in_progress
	 * adjustments will be imbalanced.
	 *
	 * Pairs with the decrement in range_end().
	 */
	spin_lock(&kvm->mn_invalidate_lock);
	kvm->mn_active_invalidate_count++;
	spin_unlock(&kvm->mn_invalidate_lock);

	/*
	 * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e.
	 * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring
	 * each cache's lock.  There are relatively few caches in existence at
	 * any given time, and the caches themselves can check for hva overlap,
	 * i.e. don't need to rely on memslot overlap checks for performance.
	 * Because this runs without holding mmu_lock, the pfn caches must use
	 * mn_active_invalidate_count (see above) instead of
	 * mmu_invalidate_in_progress.
	 */
	gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end,
					  hva_range.may_block);

	__kvm_handle_hva_range(kvm, &hva_range);

	return 0;
}

void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start,
			    unsigned long end)
{
	/*
	 * This sequence increase will notify the kvm page fault that
	 * the page that is going to be mapped in the spte could have
	 * been freed.
	 */
	kvm->mmu_invalidate_seq++;
	smp_wmb();
	/*
	 * The above sequence increase must be visible before the
	 * below count decrease, which is ensured by the smp_wmb above
	 * in conjunction with the smp_rmb in mmu_invalidate_retry().
	 */
	kvm->mmu_invalidate_in_progress--;
}

static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
					const struct mmu_notifier_range *range)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_hva_range hva_range = {
		.start		= range->start,
		.end		= range->end,
		.pte		= __pte(0),
		.handler	= (void *)kvm_null_fn,
		.on_lock	= kvm_mmu_invalidate_end,
		.on_unlock	= (void *)kvm_null_fn,
		.flush_on_ret	= false,
		.may_block	= mmu_notifier_range_blockable(range),
	};
	bool wake;

	__kvm_handle_hva_range(kvm, &hva_range);

	/* Pairs with the increment in range_start(). */
	spin_lock(&kvm->mn_invalidate_lock);
	wake = (--kvm->mn_active_invalidate_count == 0);
	spin_unlock(&kvm->mn_invalidate_lock);

	/*
	 * There can only be one waiter, since the wait happens under
	 * slots_lock.
	 */
	if (wake)
		rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);

	BUG_ON(kvm->mmu_invalidate_in_progress < 0);
}

static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long start,
					      unsigned long end)
{
	trace_kvm_age_hva(start, end);

	return kvm_handle_hva_range(mn, start, end, __pte(0), kvm_age_gfn);
}

static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long start,
					unsigned long end)
{
	trace_kvm_age_hva(start, end);

	/*
	 * Even though we do not flush TLB, this will still adversely
	 * affect performance on pre-Haswell Intel EPT, where there is
	 * no EPT Access Bit to clear so that we have to tear down EPT
	 * tables instead. If we find this unacceptable, we can always
	 * add a parameter to kvm_age_hva so that it effectively doesn't
	 * do anything on clear_young.
	 *
	 * Also note that currently we never issue secondary TLB flushes
	 * from clear_young, leaving this job up to the regular system
	 * cadence. If we find this inaccurate, we might come up with a
	 * more sophisticated heuristic later.
	 */
	return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
}

static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long address)
{
	trace_kvm_test_age_hva(address);

	return kvm_handle_hva_range_no_flush(mn, address, address + 1,
					     kvm_test_age_gfn);
}

static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	kvm_flush_shadow_all(kvm);
	srcu_read_unlock(&kvm->srcu, idx);
}

static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
	.invalidate_range	= kvm_mmu_notifier_invalidate_range,
	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
	.clear_young		= kvm_mmu_notifier_clear_young,
	.test_young		= kvm_mmu_notifier_test_young,
	.change_pte		= kvm_mmu_notifier_change_pte,
	.release		= kvm_mmu_notifier_release,
};

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
}

#else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	return 0;
}

#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */

#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
static int kvm_pm_notifier_call(struct notifier_block *bl,
				unsigned long state,
				void *unused)
{
	struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);

	return kvm_arch_pm_notifier(kvm, state);
}

static void kvm_init_pm_notifier(struct kvm *kvm)
{
	kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
	/* Suspend KVM before we suspend ftrace, RCU, etc. */
	kvm->pm_notifier.priority = INT_MAX;
	register_pm_notifier(&kvm->pm_notifier);
}

static void kvm_destroy_pm_notifier(struct kvm *kvm)
{
	unregister_pm_notifier(&kvm->pm_notifier);
}
#else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
static void kvm_init_pm_notifier(struct kvm *kvm)
{
}

static void kvm_destroy_pm_notifier(struct kvm *kvm)
{
}
#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */

static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
{
	if (!memslot->dirty_bitmap)
		return;

	kvfree(memslot->dirty_bitmap);
	memslot->dirty_bitmap = NULL;
}

/* This does not remove the slot from struct kvm_memslots data structures */
static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	kvm_destroy_dirty_bitmap(slot);

	kvm_arch_free_memslot(kvm, slot);

	kfree(slot);
}

static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
{
	struct hlist_node *idnode;
	struct kvm_memory_slot *memslot;
	int bkt;

	/*
	 * The same memslot objects live in both active and inactive sets,
	 * arbitrarily free using index '1' so the second invocation of this
	 * function isn't operating over a structure with dangling pointers
	 * (even though this function isn't actually touching them).
	 */
	if (!slots->node_idx)
		return;

	hash_for_each_safe(slots->id_hash, bkt, idnode, memslot, id_node[1])
		kvm_free_memslot(kvm, memslot);
}

static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
{
	switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
	case KVM_STATS_TYPE_INSTANT:
		return 0444;
	case KVM_STATS_TYPE_CUMULATIVE:
	case KVM_STATS_TYPE_PEAK:
	default:
		return 0644;
	}
}

static void kvm_destroy_vm_debugfs(struct kvm *kvm)
{
	int i;
	int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
				      kvm_vcpu_stats_header.num_desc;

	if (IS_ERR(kvm->debugfs_dentry))
		return;

	debugfs_remove_recursive(kvm->debugfs_dentry);

	if (kvm->debugfs_stat_data) {
		for (i = 0; i < kvm_debugfs_num_entries; i++)
			kfree(kvm->debugfs_stat_data[i]);
		kfree(kvm->debugfs_stat_data);
	}
}

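/*
 * Create the per-VM debugfs directory ("<pid>-<fdname>") and populate it with
 * one file per VM and per-vCPU stat.  Failure to create the directory itself
 * is not fatal; the VM simply ends up without debugfs entries.
 */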
static int kvm_create_vm_debugfs(struct kvm *kvm, const char *fdname)
{
	static DEFINE_MUTEX(kvm_debugfs_lock);
	struct dentry *dent;
	char dir_name[ITOA_MAX_LEN * 2];
	struct kvm_stat_data *stat_data;
	const struct _kvm_stats_desc *pdesc;
	int i, ret = -ENOMEM;
	int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
				      kvm_vcpu_stats_header.num_desc;

	if (!debugfs_initialized())
		return 0;

	snprintf(dir_name, sizeof(dir_name), "%d-%s", task_pid_nr(current), fdname);
	mutex_lock(&kvm_debugfs_lock);
	dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
	if (dent) {
		pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
		dput(dent);
		mutex_unlock(&kvm_debugfs_lock);
		return 0;
	}
	dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
	mutex_unlock(&kvm_debugfs_lock);
	if (IS_ERR(dent))
		return 0;

	kvm->debugfs_dentry = dent;
	kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
					 sizeof(*kvm->debugfs_stat_data),
					 GFP_KERNEL_ACCOUNT);
	if (!kvm->debugfs_stat_data)
		goto out_err;

	for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
		pdesc = &kvm_vm_stats_desc[i];
		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
		if (!stat_data)
			goto out_err;

		stat_data->kvm = kvm;
		stat_data->desc = pdesc;
		stat_data->kind = KVM_STAT_VM;
		kvm->debugfs_stat_data[i] = stat_data;
		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
				    kvm->debugfs_dentry, stat_data,
				    &stat_fops_per_vm);
	}

	for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
		pdesc = &kvm_vcpu_stats_desc[i];
		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
		if (!stat_data)
			goto out_err;

		stat_data->kvm = kvm;
		stat_data->desc = pdesc;
		stat_data->kind = KVM_STAT_VCPU;
		kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
				    kvm->debugfs_dentry, stat_data,
				    &stat_fops_per_vm);
	}

	ret = kvm_arch_create_vm_debugfs(kvm);
	if (ret)
		goto out_err;

	return 0;
out_err:
	kvm_destroy_vm_debugfs(kvm);
	return ret;
}

/*
 * Called after the VM is otherwise initialized, but just before adding it to
 * the vm_list.
 */
int __weak kvm_arch_post_init_vm(struct kvm *kvm)
{
	return 0;
}

/*
 * Called just after removing the VM from the vm_list, but before doing any
 * other destruction.
 */
void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
{
}

/*
 * Called after per-vm debugfs created.  When called kvm->debugfs_dentry should
 * be setup already, so we can create arch-specific debugfs entries under it.
 * Cleanup is done automatically in kvm_destroy_vm_debugfs() recursively, so
 * a per-arch destroy interface is not needed.
 */
int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
{
	return 0;
}

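/*
 * Allocate and initialize a new VM: locks, memslots, I/O buses, MMU notifier,
 * debugfs and arch state.  On success the VM is on vm_list with a single
 * reference held; on failure everything is unwound and an ERR_PTR is returned.
 */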
static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
{
	struct kvm *kvm = kvm_arch_alloc_vm();
	struct kvm_memslots *slots;
	int r = -ENOMEM;
	int i, j;

	if (!kvm)
		return ERR_PTR(-ENOMEM);

	/* KVM is pinned via open("/dev/kvm"), the fd passed to this ioctl(). */
	__module_get(kvm_chardev_ops.owner);

	KVM_MMU_LOCK_INIT(kvm);
	mmgrab(current->mm);
	kvm->mm = current->mm;
	kvm_eventfd_init(kvm);
	mutex_init(&kvm->lock);
	mutex_init(&kvm->irq_lock);
	mutex_init(&kvm->slots_lock);
	mutex_init(&kvm->slots_arch_lock);
	spin_lock_init(&kvm->mn_invalidate_lock);
	rcuwait_init(&kvm->mn_memslots_update_rcuwait);
	xa_init(&kvm->vcpu_array);

	INIT_LIST_HEAD(&kvm->gpc_list);
	spin_lock_init(&kvm->gpc_lock);

	INIT_LIST_HEAD(&kvm->devices);
	kvm->max_vcpus = KVM_MAX_VCPUS;

	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);

	/*
	 * Force subsequent debugfs file creations to fail if the VM directory
	 * is not created (by kvm_create_vm_debugfs()).
	 */
	kvm->debugfs_dentry = ERR_PTR(-ENOENT);

	snprintf(kvm->stats_id, sizeof(kvm->stats_id), "kvm-%d",
		 task_pid_nr(current));

	if (init_srcu_struct(&kvm->srcu))
		goto out_err_no_srcu;
	if (init_srcu_struct(&kvm->irq_srcu))
		goto out_err_no_irq_srcu;

	refcount_set(&kvm->users_count, 1);
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		for (j = 0; j < 2; j++) {
			slots = &kvm->__memslots[i][j];

			atomic_long_set(&slots->last_used_slot, (unsigned long)NULL);
			slots->hva_tree = RB_ROOT_CACHED;
			slots->gfn_tree = RB_ROOT;
			hash_init(slots->id_hash);
			slots->node_idx = j;

			/* Generations must be different for each address space. */
			slots->generation = i;
		}

		rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]);
	}

	for (i = 0; i < KVM_NR_BUSES; i++) {
		rcu_assign_pointer(kvm->buses[i],
			kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
		if (!kvm->buses[i])
			goto out_err_no_arch_destroy_vm;
	}

	r = kvm_arch_init_vm(kvm, type);
	if (r)
		goto out_err_no_arch_destroy_vm;

	r = hardware_enable_all();
	if (r)
		goto out_err_no_disable;

#ifdef CONFIG_HAVE_KVM_IRQFD
	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
#endif

	r = kvm_init_mmu_notifier(kvm);
	if (r)
		goto out_err_no_mmu_notifier;

	r = kvm_coalesced_mmio_init(kvm);
	if (r < 0)
		goto out_no_coalesced_mmio;

	r = kvm_create_vm_debugfs(kvm, fdname);
	if (r)
		goto out_err_no_debugfs;

	r = kvm_arch_post_init_vm(kvm);
	if (r)
		goto out_err;

	mutex_lock(&kvm_lock);
	list_add(&kvm->vm_list, &vm_list);
	mutex_unlock(&kvm_lock);

	preempt_notifier_inc();
	kvm_init_pm_notifier(kvm);

	return kvm;

out_err:
	kvm_destroy_vm_debugfs(kvm);
out_err_no_debugfs:
	kvm_coalesced_mmio_free(kvm);
out_no_coalesced_mmio:
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
	if (kvm->mmu_notifier.ops)
		mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
#endif
out_err_no_mmu_notifier:
	hardware_disable_all();
out_err_no_disable:
	kvm_arch_destroy_vm(kvm);
out_err_no_arch_destroy_vm:
	WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
	for (i = 0; i < KVM_NR_BUSES; i++)
		kfree(kvm_get_bus(kvm, i));
	cleanup_srcu_struct(&kvm->irq_srcu);
out_err_no_irq_srcu:
	cleanup_srcu_struct(&kvm->srcu);
out_err_no_srcu:
	kvm_arch_free_vm(kvm);
	mmdrop(current->mm);
	module_put(kvm_chardev_ops.owner);
	return ERR_PTR(r);
}

static void kvm_destroy_devices(struct kvm *kvm)
{
	struct kvm_device *dev, *tmp;

	/*
	 * We do not need to take the kvm->lock here, because nobody else
	 * has a reference to the struct kvm at this point and therefore
	 * cannot access the devices list anyhow.
	 */
	list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
		list_del(&dev->vm_node);
		dev->ops->destroy(dev);
	}
}

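/*
 * Tear down a VM once its last reference has been dropped: unregister
 * notifiers, destroy devices, buses and memslots, release arch state, and
 * finally drop the references on the mm and on the kvm module itself.
 */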
static void kvm_destroy_vm(struct kvm *kvm)
{
	int i;
	struct mm_struct *mm = kvm->mm;

	kvm_destroy_pm_notifier(kvm);
	kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
	kvm_destroy_vm_debugfs(kvm);
	kvm_arch_sync_events(kvm);
	mutex_lock(&kvm_lock);
	list_del(&kvm->vm_list);
	mutex_unlock(&kvm_lock);
	kvm_arch_pre_destroy_vm(kvm);

	kvm_free_irq_routing(kvm);
	for (i = 0; i < KVM_NR_BUSES; i++) {
		struct kvm_io_bus *bus = kvm_get_bus(kvm, i);

		if (bus)
			kvm_io_bus_destroy(bus);
		kvm->buses[i] = NULL;
	}
	kvm_coalesced_mmio_free(kvm);
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
	/*
	 * At this point, pending calls to invalidate_range_start()
	 * have completed but no more MMU notifiers will run, so
	 * mn_active_invalidate_count may remain unbalanced.
	 * No threads can be waiting in install_new_memslots as the
	 * last reference on KVM has been dropped, but freeing
	 * memslots would deadlock without this manual intervention.
	 */
	WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
	kvm->mn_active_invalidate_count = 0;
#else
	kvm_flush_shadow_all(kvm);
#endif
	kvm_arch_destroy_vm(kvm);
	kvm_destroy_devices(kvm);
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		kvm_free_memslots(kvm, &kvm->__memslots[i][0]);
		kvm_free_memslots(kvm, &kvm->__memslots[i][1]);
	}
	cleanup_srcu_struct(&kvm->irq_srcu);
	cleanup_srcu_struct(&kvm->srcu);
	kvm_arch_free_vm(kvm);
	preempt_notifier_dec();
	hardware_disable_all();
	mmdrop(mm);
	module_put(kvm_chardev_ops.owner);
}

void kvm_get_kvm(struct kvm *kvm)
{
	refcount_inc(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm);

/*
 * Make sure the vm is not during destruction, which is a safe version of
 * kvm_get_kvm().  Return true if kvm referenced successfully, false otherwise.
 */
bool kvm_get_kvm_safe(struct kvm *kvm)
{
	return refcount_inc_not_zero(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);

void kvm_put_kvm(struct kvm *kvm)
{
	if (refcount_dec_and_test(&kvm->users_count))
		kvm_destroy_vm(kvm);
}
EXPORT_SYMBOL_GPL(kvm_put_kvm);

/*
 * Used to put a reference that was taken on behalf of an object associated
 * with a user-visible file descriptor, e.g. a vcpu or device, if installation
 * of the new file descriptor fails and the reference cannot be transferred to
 * its final owner.  In such cases, the caller is still actively using @kvm and
 * will fail miserably if the refcount unexpectedly hits zero.
 */
void kvm_put_kvm_no_destroy(struct kvm *kvm)
{
	WARN_ON(refcount_dec_and_test(&kvm->users_count));
}
EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);

static int kvm_vm_release(struct inode *inode, struct file *filp)
{
	struct kvm *kvm = filp->private_data;

	kvm_irqfd_release(kvm);

	kvm_put_kvm(kvm);
	return 0;
}

/*
 * Allocation size is twice as large as the actual dirty bitmap size.
 * See kvm_vm_ioctl_get_dirty_log() why this is needed.
 */
static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
{
	unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot);

	memslot->dirty_bitmap = __vcalloc(2, dirty_bytes, GFP_KERNEL_ACCOUNT);
	if (!memslot->dirty_bitmap)
		return -ENOMEM;

	return 0;
}

static struct kvm_memslots *kvm_get_inactive_memslots(struct kvm *kvm, int as_id)
{
	struct kvm_memslots *active = __kvm_memslots(kvm, as_id);
	int node_idx_inactive = active->node_idx ^ 1;

	return &kvm->__memslots[as_id][node_idx_inactive];
}

/*
 * Helper to get the address space ID when one of memslot pointers may be NULL.
 * This also serves as a sanity check that at least one of the pointers is
 * non-NULL, and that their address space IDs don't diverge.
 */
static int kvm_memslots_get_as_id(struct kvm_memory_slot *a,
				  struct kvm_memory_slot *b)
{
	if (WARN_ON_ONCE(!a && !b))
		return 0;

	if (!a)
		return b->as_id;
	if (!b)
		return a->as_id;

	WARN_ON_ONCE(a->as_id != b->as_id);
	return a->as_id;
}

static void kvm_insert_gfn_node(struct kvm_memslots *slots,
				struct kvm_memory_slot *slot)
{
	struct rb_root *gfn_tree = &slots->gfn_tree;
	struct rb_node **node, *parent;
	int idx = slots->node_idx;

	parent = NULL;
	for (node = &gfn_tree->rb_node; *node; ) {
		struct kvm_memory_slot *tmp;

		tmp = container_of(*node, struct kvm_memory_slot, gfn_node[idx]);
		parent = *node;
		if (slot->base_gfn < tmp->base_gfn)
			node = &(*node)->rb_left;
		else if (slot->base_gfn > tmp->base_gfn)
			node = &(*node)->rb_right;
		else
			BUG();
	}

	rb_link_node(&slot->gfn_node[idx], parent, node);
	rb_insert_color(&slot->gfn_node[idx], gfn_tree);
}

static void kvm_erase_gfn_node(struct kvm_memslots *slots,
			       struct kvm_memory_slot *slot)
{
	rb_erase(&slot->gfn_node[slots->node_idx], &slots->gfn_tree);
}

static void kvm_replace_gfn_node(struct kvm_memslots *slots,
				 struct kvm_memory_slot *old,
				 struct kvm_memory_slot *new)
{
	int idx = slots->node_idx;

	WARN_ON_ONCE(old->base_gfn != new->base_gfn);

	rb_replace_node(&old->gfn_node[idx], &new->gfn_node[idx],
			&slots->gfn_tree);
}

/*
 * Replace @old with @new in the inactive memslots.
 *
 * With NULL @old this simply adds @new.
 * With NULL @new this simply removes @old.
 *
 * If @new is non-NULL its hva_node[slots_idx] range has to be set
 * appropriately.
 */
static void kvm_replace_memslot(struct kvm *kvm,
				struct kvm_memory_slot *old,
				struct kvm_memory_slot *new)
{
	int as_id = kvm_memslots_get_as_id(old, new);
	struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
	int idx = slots->node_idx;

	if (old) {
		hash_del(&old->id_node[idx]);
		interval_tree_remove(&old->hva_node[idx], &slots->hva_tree);

		if ((long)old == atomic_long_read(&slots->last_used_slot))
			atomic_long_set(&slots->last_used_slot, (long)new);

		if (!new) {
			kvm_erase_gfn_node(slots, old);
			return;
		}
	}

	/*
	 * Initialize @new's hva range.  Do this even when replacing an @old
	 * slot, kvm_copy_memslot() deliberately does not touch node data.
	 */
	new->hva_node[idx].start = new->userspace_addr;
	new->hva_node[idx].last = new->userspace_addr +
				  (new->npages << PAGE_SHIFT) - 1;

	/*
	 * (Re)Add the new memslot.  There is no O(1) interval_tree_replace(),
	 * hva_node needs to be swapped with remove+insert even though hva can't
	 * change when replacing an existing slot.
	 */
	hash_add(slots->id_hash, &new->id_node[idx], new->id);
	interval_tree_insert(&new->hva_node[idx], &slots->hva_tree);

	/*
	 * If the memslot gfn is unchanged, rb_replace_node() can be used to
	 * switch the node in the gfn tree instead of removing the old and
	 * inserting the new as two separate operations. Replacement is a
	 * single O(1) operation versus two O(log(n)) operations for
	 * remove+insert.
	 */
	if (old && old->base_gfn == new->base_gfn) {
		kvm_replace_gfn_node(slots, old, new);
	} else {
		if (old)
			kvm_erase_gfn_node(slots, old);
		kvm_insert_gfn_node(slots, new);
	}
}

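/* Reject userspace flags that this build of KVM does not support. */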
static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
{
	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;

#ifdef __KVM_HAVE_READONLY_MEM
	valid_flags |= KVM_MEM_READONLY;
#endif

	if (mem->flags & ~valid_flags)
		return -EINVAL;

	return 0;
}

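/*
 * Install the inactive memslot set as the active one for @as_id: wait out any
 * in-flight MMU notifier invalidations, publish the new set, synchronize SRCU
 * so that no reader still sees the old set, and update the generation.
 */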
static void kvm_swap_active_memslots(struct kvm *kvm, int as_id)
{
	struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);

	/* Grab the generation from the active memslots. */
	u64 gen = __kvm_memslots(kvm, as_id)->generation;

	WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
	slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;

	/*
	 * Do not store the new memslots while there are invalidations in
	 * progress, otherwise the locking in invalidate_range_start and
	 * invalidate_range_end will be unbalanced.
	 */
	spin_lock(&kvm->mn_invalidate_lock);
	prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
	while (kvm->mn_active_invalidate_count) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		spin_unlock(&kvm->mn_invalidate_lock);
		schedule();
		spin_lock(&kvm->mn_invalidate_lock);
	}
	finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
	rcu_assign_pointer(kvm->memslots[as_id], slots);
	spin_unlock(&kvm->mn_invalidate_lock);

	/*
	 * Acquired in kvm_set_memslot. Must be released before synchronize
	 * SRCU below in order to avoid deadlock with another thread
	 * acquiring the slots_arch_lock in an srcu critical section.
	 */
	mutex_unlock(&kvm->slots_arch_lock);

	synchronize_srcu_expedited(&kvm->srcu);

	/*
	 * Increment the new memslot generation a second time, dropping the
	 * update in-progress flag and incrementing the generation based on
	 * the number of address spaces.  This provides a unique and easily
	 * identifiable generation number while the memslots are in flux.
	 */
	gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;

	/*
	 * Generations must be unique even across address spaces.  We do not need
	 * a global counter for that, instead the generation space is evenly split
	 * across address spaces.  For example, with two address spaces, address
	 * space 0 will use generations 0, 2, 4, ... while address space 1 will
	 * use generations 1, 3, 5, ...
	 */
	gen += KVM_ADDRESS_SPACE_NUM;

	kvm_arch_memslots_updated(kvm, gen);

	slots->generation = gen;
}

07921665
SC
1599static int kvm_prepare_memory_region(struct kvm *kvm,
1600 const struct kvm_memory_slot *old,
1601 struct kvm_memory_slot *new,
1602 enum kvm_mr_change change)
ddc12f2a 1603{
07921665
SC
1604 int r;
1605
1606 /*
1607 * If dirty logging is disabled, nullify the bitmap; the old bitmap
1608 * will be freed on "commit". If logging is enabled in both old and
1609 * new, reuse the existing bitmap. If logging is enabled only in the
1610 * new and KVM isn't using a ring buffer, allocate and initialize a
1611 * new bitmap.
1612 */
244893fa
SC
1613 if (change != KVM_MR_DELETE) {
1614 if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
1615 new->dirty_bitmap = NULL;
1616 else if (old && old->dirty_bitmap)
1617 new->dirty_bitmap = old->dirty_bitmap;
86bdf3eb 1618 else if (kvm_use_dirty_bitmap(kvm)) {
244893fa
SC
1619 r = kvm_alloc_dirty_bitmap(new);
1620 if (r)
1621 return r;
1622
1623 if (kvm_dirty_log_manual_protect_and_init_set(kvm))
1624 bitmap_set(new->dirty_bitmap, 0, new->npages);
1625 }
07921665
SC
1626 }
1627
1628 r = kvm_arch_prepare_memory_region(kvm, old, new, change);
1629
1630 /* Free the bitmap on failure if it was allocated above. */
c87661f8 1631 if (r && new && new->dirty_bitmap && (!old || !old->dirty_bitmap))
07921665
SC
1632 kvm_destroy_dirty_bitmap(new);
1633
1634 return r;
ddc12f2a
BG
1635}
1636
07921665
SC
1637static void kvm_commit_memory_region(struct kvm *kvm,
1638 struct kvm_memory_slot *old,
1639 const struct kvm_memory_slot *new,
1640 enum kvm_mr_change change)
ddc12f2a 1641{
6c7b2202
PB
1642 int old_flags = old ? old->flags : 0;
1643 int new_flags = new ? new->flags : 0;
07921665
SC
1644 /*
1645 * Update the total number of memslot pages before calling the arch
1646 * hook so that architectures can consume the result directly.
1647 */
1648 if (change == KVM_MR_DELETE)
1649 kvm->nr_memslot_pages -= old->npages;
1650 else if (change == KVM_MR_CREATE)
1651 kvm->nr_memslot_pages += new->npages;
1652
6c7b2202
PB
1653 if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) {
1654 int change = (new_flags & KVM_MEM_LOG_DIRTY_PAGES) ? 1 : -1;
1655 atomic_set(&kvm->nr_memslots_dirty_logging,
1656 atomic_read(&kvm->nr_memslots_dirty_logging) + change);
1657 }
1658
07921665
SC
1659 kvm_arch_commit_memory_region(kvm, old, new, change);
1660
a54d8066
MS
1661 switch (change) {
1662 case KVM_MR_CREATE:
1663 /* Nothing more to do. */
1664 break;
1665 case KVM_MR_DELETE:
1666 /* Free the old memslot and all its metadata. */
1667 kvm_free_memslot(kvm, old);
1668 break;
1669 case KVM_MR_MOVE:
1670 case KVM_MR_FLAGS_ONLY:
1671 /*
1672 * Free the dirty bitmap as needed; the below check encompasses
 1673 * both the flags and whether a ring buffer is being used.
1674 */
1675 if (old->dirty_bitmap && !new->dirty_bitmap)
1676 kvm_destroy_dirty_bitmap(old);
1677
1678 /*
1679 * The final quirk. Free the detached, old slot, but only its
1680 * memory, not any metadata. Metadata, including arch specific
1681 * data, may be reused by @new.
1682 */
1683 kfree(old);
1684 break;
1685 default:
1686 BUG();
1687 }
ddc12f2a
BG
1688}
1689
36947254 1690/*
a54d8066
MS
1691 * Activate @new, which must be installed in the inactive slots by the caller,
1692 * by swapping the active slots and then propagating @new to @old once @old is
1693 * unreachable and can be safely modified.
1694 *
1695 * With NULL @old this simply adds @new to @active (while swapping the sets).
1696 * With NULL @new this simply removes @old from @active and frees it
1697 * (while also swapping the sets).
36947254 1698 */
a54d8066
MS
1699static void kvm_activate_memslot(struct kvm *kvm,
1700 struct kvm_memory_slot *old,
1701 struct kvm_memory_slot *new)
36947254 1702{
a54d8066 1703 int as_id = kvm_memslots_get_as_id(old, new);
36947254 1704
a54d8066
MS
1705 kvm_swap_active_memslots(kvm, as_id);
1706
1707 /* Propagate the new memslot to the now inactive memslots. */
1708 kvm_replace_memslot(kvm, old, new);
1709}
1710
1711static void kvm_copy_memslot(struct kvm_memory_slot *dest,
1712 const struct kvm_memory_slot *src)
1713{
1714 dest->base_gfn = src->base_gfn;
1715 dest->npages = src->npages;
1716 dest->dirty_bitmap = src->dirty_bitmap;
1717 dest->arch = src->arch;
1718 dest->userspace_addr = src->userspace_addr;
1719 dest->flags = src->flags;
1720 dest->id = src->id;
1721 dest->as_id = src->as_id;
1722}
1723
1724static void kvm_invalidate_memslot(struct kvm *kvm,
1725 struct kvm_memory_slot *old,
244893fa 1726 struct kvm_memory_slot *invalid_slot)
a54d8066 1727{
07921665 1728 /*
a54d8066
MS
1729 * Mark the current slot INVALID. As with all memslot modifications,
1730 * this must be done on an unreachable slot to avoid modifying the
1731 * current slot in the active tree.
07921665 1732 */
244893fa
SC
1733 kvm_copy_memslot(invalid_slot, old);
1734 invalid_slot->flags |= KVM_MEMSLOT_INVALID;
1735 kvm_replace_memslot(kvm, old, invalid_slot);
a54d8066
MS
1736
1737 /*
1738 * Activate the slot that is now marked INVALID, but don't propagate
1739 * the slot to the now inactive slots. The slot is either going to be
1740 * deleted or recreated as a new slot.
1741 */
1742 kvm_swap_active_memslots(kvm, old->as_id);
1743
1744 /*
1745 * From this point no new shadow pages pointing to a deleted, or moved,
1746 * memslot will be created. Validation of sp->gfn happens in:
1747 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
1748 * - kvm_is_visible_gfn (mmu_check_root)
1749 */
bcb63dcd 1750 kvm_arch_flush_shadow_memslot(kvm, old);
683412cc 1751 kvm_arch_guest_memory_reclaimed(kvm);
a54d8066
MS
1752
1753 /* Was released by kvm_swap_active_memslots, reacquire. */
1754 mutex_lock(&kvm->slots_arch_lock);
1755
1756 /*
1757 * Copy the arch-specific field of the newly-installed slot back to the
1758 * old slot as the arch data could have changed between releasing
 1759 * slots_arch_lock in kvm_swap_active_memslots() and re-acquiring the lock
1760 * above. Writers are required to retrieve memslots *after* acquiring
1761 * slots_arch_lock, thus the active slot's data is guaranteed to be fresh.
1762 */
244893fa 1763 old->arch = invalid_slot->arch;
a54d8066
MS
1764}
1765
1766static void kvm_create_memslot(struct kvm *kvm,
244893fa 1767 struct kvm_memory_slot *new)
a54d8066 1768{
244893fa
SC
1769 /* Add the new memslot to the inactive set and activate. */
1770 kvm_replace_memslot(kvm, NULL, new);
1771 kvm_activate_memslot(kvm, NULL, new);
a54d8066
MS
1772}
1773
1774static void kvm_delete_memslot(struct kvm *kvm,
1775 struct kvm_memory_slot *old,
1776 struct kvm_memory_slot *invalid_slot)
1777{
1778 /*
1779 * Remove the old memslot (in the inactive memslots) by passing NULL as
244893fa 1780 * the "new" slot, then do the same for the invalid version in the active slots.
a54d8066
MS
1781 */
1782 kvm_replace_memslot(kvm, old, NULL);
a54d8066 1783 kvm_activate_memslot(kvm, invalid_slot, NULL);
a54d8066 1784}
36947254 1785
244893fa
SC
1786static void kvm_move_memslot(struct kvm *kvm,
1787 struct kvm_memory_slot *old,
1788 struct kvm_memory_slot *new,
1789 struct kvm_memory_slot *invalid_slot)
a54d8066 1790{
a54d8066 1791 /*
244893fa
SC
1792 * Replace the old memslot in the inactive slots, and then swap slots
1793 * and replace the current INVALID with the new as well.
a54d8066 1794 */
244893fa
SC
1795 kvm_replace_memslot(kvm, old, new);
1796 kvm_activate_memslot(kvm, invalid_slot, new);
a54d8066 1797}
36947254 1798
a54d8066
MS
1799static void kvm_update_flags_memslot(struct kvm *kvm,
1800 struct kvm_memory_slot *old,
244893fa 1801 struct kvm_memory_slot *new)
a54d8066
MS
1802{
1803 /*
1804 * Similar to the MOVE case, but the slot doesn't need to be zapped as
1805 * an intermediate step. Instead, the old memslot is simply replaced
1806 * with a new, updated copy in both memslot sets.
1807 */
244893fa
SC
1808 kvm_replace_memslot(kvm, old, new);
1809 kvm_activate_memslot(kvm, old, new);
36947254
SC
1810}
1811
cf47f50b 1812static int kvm_set_memslot(struct kvm *kvm,
a54d8066 1813 struct kvm_memory_slot *old,
ce5f0215 1814 struct kvm_memory_slot *new,
cf47f50b
SC
1815 enum kvm_mr_change change)
1816{
244893fa 1817 struct kvm_memory_slot *invalid_slot;
cf47f50b
SC
1818 int r;
1819
b10a038e 1820 /*
a54d8066 1821 * Released in kvm_swap_active_memslots.
b10a038e
BG
1822 *
1823 * Must be held from before the current memslots are copied until
1824 * after the new memslots are installed with rcu_assign_pointer,
a54d8066 1825 * then released before the synchronize srcu in kvm_swap_active_memslots.
b10a038e
BG
1826 *
1827 * When modifying memslots outside of the slots_lock, must be held
1828 * before reading the pointer to the current memslots until after all
1829 * changes to those memslots are complete.
1830 *
1831 * These rules ensure that installing new memslots does not lose
1832 * changes made to the previous memslots.
1833 */
1834 mutex_lock(&kvm->slots_arch_lock);
1835
a54d8066
MS
1836 /*
1837 * Invalidate the old slot if it's being deleted or moved. This is
1838 * done prior to actually deleting/moving the memslot to allow vCPUs to
1839 * continue running by ensuring there are no mappings or shadow pages
1840 * for the memslot when it is deleted/moved. Without pre-invalidation
1841 * (and without a lock), a window would exist between effecting the
1842 * delete/move and committing the changes in arch code where KVM or a
1843 * guest could access a non-existent memslot.
244893fa
SC
1844 *
1845 * Modifications are done on a temporary, unreachable slot. The old
1846 * slot needs to be preserved in case a later step fails and the
1847 * invalidation needs to be reverted.
a54d8066 1848 */
cf47f50b 1849 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
244893fa
SC
1850 invalid_slot = kzalloc(sizeof(*invalid_slot), GFP_KERNEL_ACCOUNT);
1851 if (!invalid_slot) {
1852 mutex_unlock(&kvm->slots_arch_lock);
1853 return -ENOMEM;
1854 }
1855 kvm_invalidate_memslot(kvm, old, invalid_slot);
1856 }
b10a038e 1857
a54d8066
MS
1858 r = kvm_prepare_memory_region(kvm, old, new, change);
1859 if (r) {
b10a038e 1860 /*
a54d8066
MS
1861 * For DELETE/MOVE, revert the above INVALID change. No
1862 * modifications required since the original slot was preserved
1863 * in the inactive slots. Changing the active memslots also
 1864 * releases slots_arch_lock.
b10a038e 1865 */
244893fa
SC
1866 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1867 kvm_activate_memslot(kvm, invalid_slot, old);
1868 kfree(invalid_slot);
1869 } else {
a54d8066 1870 mutex_unlock(&kvm->slots_arch_lock);
244893fa 1871 }
a54d8066 1872 return r;
cf47f50b
SC
1873 }
1874
bda44d84 1875 /*
a54d8066
MS
 1876 * For DELETE and MOVE, the temporary slot is now active as the INVALID
 1877 * version of the old slot. MOVE is particularly special as it reuses
 1878 * the old slot and returns a copy of the old slot (in invalid_slot).
1879 * For CREATE, there is no old slot. For DELETE and FLAGS_ONLY, the
1880 * old slot is detached but otherwise preserved.
bda44d84 1881 */
a54d8066 1882 if (change == KVM_MR_CREATE)
244893fa 1883 kvm_create_memslot(kvm, new);
a54d8066 1884 else if (change == KVM_MR_DELETE)
244893fa 1885 kvm_delete_memslot(kvm, old, invalid_slot);
a54d8066 1886 else if (change == KVM_MR_MOVE)
244893fa 1887 kvm_move_memslot(kvm, old, new, invalid_slot);
a54d8066 1888 else if (change == KVM_MR_FLAGS_ONLY)
244893fa 1889 kvm_update_flags_memslot(kvm, old, new);
a54d8066
MS
1890 else
1891 BUG();
cf47f50b 1892
244893fa
SC
1893 /* Free the temporary INVALID slot used for DELETE and MOVE. */
1894 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
1895 kfree(invalid_slot);
bda44d84 1896
a54d8066
MS
1897 /*
1898 * No need to refresh new->arch, changes after dropping slots_arch_lock
a413a625 1899 * will directly hit the final, active memslot. Architectures are
a54d8066
MS
1900 * responsible for knowing that new->arch may be stale.
1901 */
1902 kvm_commit_memory_region(kvm, old, new, change);
cf47f50b 1903
cf47f50b 1904 return 0;
cf47f50b
SC
1905}
1906
44401a20
MS
1907static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
1908 gfn_t start, gfn_t end)
5c0b4f3d 1909{
44401a20 1910 struct kvm_memslot_iter iter;
5c0b4f3d 1911
44401a20
MS
1912 kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
1913 if (iter.slot->id != id)
1914 return true;
1915 }
5c0b4f3d 1916
44401a20 1917 return false;
5c0b4f3d
SC
1918}
1919
6aa8b732
AK
1920/*
1921 * Allocate some memory and give it an address in the guest physical address
1922 * space.
1923 *
1924 * Discontiguous memory is allowed, mostly for framebuffers.
f78e0e2e 1925 *
02d5d55b 1926 * Must be called holding kvm->slots_lock for write.
6aa8b732 1927 */
f78e0e2e 1928int __kvm_set_memory_region(struct kvm *kvm,
09170a49 1929 const struct kvm_userspace_memory_region *mem)
6aa8b732 1930{
244893fa 1931 struct kvm_memory_slot *old, *new;
44401a20 1932 struct kvm_memslots *slots;
f64c0398 1933 enum kvm_mr_change change;
0f9bdef3
SC
1934 unsigned long npages;
1935 gfn_t base_gfn;
163da372
SC
1936 int as_id, id;
1937 int r;
6aa8b732 1938
a50d64d6
XG
1939 r = check_memory_region_flags(mem);
1940 if (r)
71a4c30b 1941 return r;
a50d64d6 1942
f481b069
PB
1943 as_id = mem->slot >> 16;
1944 id = (u16)mem->slot;
1945
6aa8b732 1946 /* General sanity checks */
6b285a55
SC
1947 if ((mem->memory_size & (PAGE_SIZE - 1)) ||
1948 (mem->memory_size != (unsigned long)mem->memory_size))
71a4c30b 1949 return -EINVAL;
6aa8b732 1950 if (mem->guest_phys_addr & (PAGE_SIZE - 1))
71a4c30b 1951 return -EINVAL;
fa3d315a 1952 /* We can read the guest memory with __xxx_user() later on. */
09d952c9 1953 if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
139bc8a6 1954 (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
96d4f267 1955 !access_ok((void __user *)(unsigned long)mem->userspace_addr,
09d952c9 1956 mem->memory_size))
71a4c30b 1957 return -EINVAL;
f481b069 1958 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
71a4c30b 1959 return -EINVAL;
6aa8b732 1960 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
71a4c30b 1961 return -EINVAL;
0f9bdef3
SC
1962 if ((mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES)
1963 return -EINVAL;
6aa8b732 1964
44401a20 1965 slots = __kvm_memslots(kvm, as_id);
6aa8b732 1966
5c0b4f3d 1967 /*
7cd08553
SC
1968 * Note, the old memslot (and the pointer itself!) may be invalidated
1969 * and/or destroyed by kvm_set_memslot().
5c0b4f3d 1970 */
44401a20 1971 old = id_to_memslot(slots, id);
163da372 1972
47ea7d90 1973 if (!mem->memory_size) {
7cd08553 1974 if (!old || !old->npages)
47ea7d90 1975 return -EINVAL;
5c0b4f3d 1976
7cd08553 1977 if (WARN_ON_ONCE(kvm->nr_memslot_pages < old->npages))
47ea7d90 1978 return -EIO;
6aa8b732 1979
244893fa 1980 return kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE);
47ea7d90 1981 }
5c0b4f3d 1982
0f9bdef3
SC
1983 base_gfn = (mem->guest_phys_addr >> PAGE_SHIFT);
1984 npages = (mem->memory_size >> PAGE_SHIFT);
163da372 1985
7cd08553 1986 if (!old || !old->npages) {
5c0b4f3d 1987 change = KVM_MR_CREATE;
afa319a5
SC
1988
1989 /*
1990 * To simplify KVM internals, the total number of pages across
1991 * all memslots must fit in an unsigned long.
1992 */
0f9bdef3 1993 if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
afa319a5 1994 return -EINVAL;
5c0b4f3d 1995 } else { /* Modify an existing slot. */
0f9bdef3
SC
1996 if ((mem->userspace_addr != old->userspace_addr) ||
1997 (npages != old->npages) ||
1998 ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
71a4c30b 1999 return -EINVAL;
09170a49 2000
0f9bdef3 2001 if (base_gfn != old->base_gfn)
5c0b4f3d 2002 change = KVM_MR_MOVE;
0f9bdef3 2003 else if (mem->flags != old->flags)
5c0b4f3d
SC
2004 change = KVM_MR_FLAGS_ONLY;
2005 else /* Nothing to change. */
2006 return 0;
09170a49 2007 }
6aa8b732 2008
44401a20 2009 if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) &&
0f9bdef3 2010 kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages))
44401a20 2011 return -EEXIST;
6aa8b732 2012
244893fa
SC
2013 /* Allocate a slot that will persist in the memslot. */
2014 new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
2015 if (!new)
2016 return -ENOMEM;
3c9bd400 2017
244893fa
SC
2018 new->as_id = as_id;
2019 new->id = id;
2020 new->base_gfn = base_gfn;
2021 new->npages = npages;
2022 new->flags = mem->flags;
2023 new->userspace_addr = mem->userspace_addr;
6aa8b732 2024
244893fa 2025 r = kvm_set_memslot(kvm, old, new, change);
cf47f50b 2026 if (r)
244893fa 2027 kfree(new);
6aa8b732 2028 return r;
210c7c4d 2029}
f78e0e2e
SY
2030EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
2031
2032int kvm_set_memory_region(struct kvm *kvm,
09170a49 2033 const struct kvm_userspace_memory_region *mem)
f78e0e2e
SY
2034{
2035 int r;
2036
79fac95e 2037 mutex_lock(&kvm->slots_lock);
47ae31e2 2038 r = __kvm_set_memory_region(kvm, mem);
79fac95e 2039 mutex_unlock(&kvm->slots_lock);
f78e0e2e
SY
2040 return r;
2041}
210c7c4d
IE
2042EXPORT_SYMBOL_GPL(kvm_set_memory_region);
2043
7940876e
SH
2044static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
2045 struct kvm_userspace_memory_region *mem)
210c7c4d 2046{
f481b069 2047 if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
e0d62c7f 2048 return -EINVAL;
09170a49 2049
47ae31e2 2050 return kvm_set_memory_region(kvm, mem);
6aa8b732
AK
2051}
2052
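A hedged user-space sketch of driving the path above through the KVM_SET_USER_MEMORY_REGION ioctl; vm_fd, the guest physical address and the size are made-up values. Note how the slot field packs the address-space id into the upper 16 bits, matching the as_id = mem->slot >> 16 decoding in __kvm_set_memory_region().

/* User-space sketch: register one slot with dirty logging enabled.
 * vm_fd is assumed to be a VM fd obtained via KVM_CREATE_VM.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <stdint.h>

static int set_slot(int vm_fd, uint16_t as_id, uint16_t id,
		    uint64_t gpa, uint64_t size)
{
	void *backing = mmap(NULL, size, PROT_READ | PROT_WRITE,
			     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct kvm_userspace_memory_region region = {
		.slot = ((uint32_t)as_id << 16) | id,	/* as_id | slot id */
		.flags = KVM_MEM_LOG_DIRTY_PAGES,
		.guest_phys_addr = gpa,
		.memory_size = size,			/* page aligned */
		.userspace_addr = (uint64_t)(uintptr_t)backing,
	};

	if (backing == MAP_FAILED)
		return -1;

	/* Deleting the slot later is the same ioctl with memory_size = 0. */
	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}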
0dff0846 2053#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
2a49f61d
SC
2054/**
2055 * kvm_get_dirty_log - get a snapshot of dirty pages
2056 * @kvm: pointer to kvm instance
2057 * @log: slot id and address to which we copy the log
2058 * @is_dirty: set to '1' if any dirty pages were found
2059 * @memslot: set to the associated memslot, always valid on success
2060 */
2061int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
2062 int *is_dirty, struct kvm_memory_slot **memslot)
6aa8b732 2063{
9f6b8029 2064 struct kvm_memslots *slots;
843574a3 2065 int i, as_id, id;
87bf6e7d 2066 unsigned long n;
6aa8b732
AK
2067 unsigned long any = 0;
2068
86bdf3eb
GS
2069 /* Dirty ring tracking may be exclusive to dirty log tracking */
2070 if (!kvm_use_dirty_bitmap(kvm))
b2cc64c4
PX
2071 return -ENXIO;
2072
2a49f61d
SC
2073 *memslot = NULL;
2074 *is_dirty = 0;
2075
f481b069
PB
2076 as_id = log->slot >> 16;
2077 id = (u16)log->slot;
2078 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
843574a3 2079 return -EINVAL;
6aa8b732 2080
f481b069 2081 slots = __kvm_memslots(kvm, as_id);
2a49f61d 2082 *memslot = id_to_memslot(slots, id);
0577d1ab 2083 if (!(*memslot) || !(*memslot)->dirty_bitmap)
843574a3 2084 return -ENOENT;
6aa8b732 2085
2a49f61d
SC
2086 kvm_arch_sync_dirty_log(kvm, *memslot);
2087
2088 n = kvm_dirty_bitmap_bytes(*memslot);
6aa8b732 2089
cd1a4a98 2090 for (i = 0; !any && i < n/sizeof(long); ++i)
2a49f61d 2091 any = (*memslot)->dirty_bitmap[i];
6aa8b732 2092
2a49f61d 2093 if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
843574a3 2094 return -EFAULT;
6aa8b732 2095
5bb064dc
ZX
2096 if (any)
2097 *is_dirty = 1;
843574a3 2098 return 0;
6aa8b732 2099}
2ba9f0d8 2100EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
6aa8b732 2101
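A user-space sketch of the matching KVM_GET_DIRTY_LOG call; vm_fd and the slot geometry are assumed. The buffer must hold one bit per page of the slot, rounded up to a multiple of 64 bits, mirroring what kvm_dirty_bitmap_bytes() computes on a 64-bit host.

/* User-space sketch: fetch the dirty bitmap snapshot for one slot. */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdint.h>
#include <stdlib.h>

static int get_dirty_log(int vm_fd, uint32_t slot, uint64_t npages,
			 uint64_t **bitmap_out)
{
	size_t bytes = ((npages + 63) / 64) * 8;	/* one bit per page */
	uint64_t *bitmap = calloc(1, bytes);
	struct kvm_dirty_log log = {
		.slot = slot,			/* as_id << 16 | slot id */
		.dirty_bitmap = bitmap,
	};

	if (!bitmap)
		return -1;
	if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) < 0) {
		free(bitmap);
		return -1;
	}
	*bitmap_out = bitmap;
	return 0;
}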
0dff0846 2102#else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
ba0513b5 2103/**
b8b00220 2104 * kvm_get_dirty_log_protect - get a snapshot of dirty pages
2a31b9db 2105 * and reenable dirty page tracking for the corresponding pages.
ba0513b5
MS
2106 * @kvm: pointer to kvm instance
2107 * @log: slot id and address to which we copy the log
ba0513b5
MS
2108 *
2109 * We need to keep it in mind that VCPU threads can write to the bitmap
2110 * concurrently. So, to avoid losing track of dirty pages we keep the
2111 * following order:
2112 *
2113 * 1. Take a snapshot of the bit and clear it if needed.
2114 * 2. Write protect the corresponding page.
2115 * 3. Copy the snapshot to the userspace.
2116 * 4. Upon return caller flushes TLB's if needed.
2117 *
2118 * Between 2 and 4, the guest may write to the page using the remaining TLB
2119 * entry. This is not a problem because the page is reported dirty using
2120 * the snapshot taken before and step 4 ensures that writes done after
2121 * exiting to userspace will be logged for the next call.
2122 *
2123 */
0dff0846 2124static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
ba0513b5 2125{
9f6b8029 2126 struct kvm_memslots *slots;
ba0513b5 2127 struct kvm_memory_slot *memslot;
58d6db34 2128 int i, as_id, id;
ba0513b5
MS
2129 unsigned long n;
2130 unsigned long *dirty_bitmap;
2131 unsigned long *dirty_bitmap_buffer;
0dff0846 2132 bool flush;
ba0513b5 2133
86bdf3eb
GS
2134 /* Dirty ring tracking may be exclusive to dirty log tracking */
2135 if (!kvm_use_dirty_bitmap(kvm))
b2cc64c4
PX
2136 return -ENXIO;
2137
f481b069
PB
2138 as_id = log->slot >> 16;
2139 id = (u16)log->slot;
2140 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
58d6db34 2141 return -EINVAL;
ba0513b5 2142
f481b069
PB
2143 slots = __kvm_memslots(kvm, as_id);
2144 memslot = id_to_memslot(slots, id);
0577d1ab
SC
2145 if (!memslot || !memslot->dirty_bitmap)
2146 return -ENOENT;
ba0513b5
MS
2147
2148 dirty_bitmap = memslot->dirty_bitmap;
ba0513b5 2149
0dff0846
SC
2150 kvm_arch_sync_dirty_log(kvm, memslot);
2151
ba0513b5 2152 n = kvm_dirty_bitmap_bytes(memslot);
0dff0846 2153 flush = false;
2a31b9db
PB
2154 if (kvm->manual_dirty_log_protect) {
2155 /*
 2156 * Unlike kvm_get_dirty_log, flush is always left false here,
 2157 * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There
 2158 * is some code duplication between this function and
 2159 * kvm_get_dirty_log, but hopefully all architectures will
 2160 * transition to kvm_get_dirty_log_protect so that kvm_get_dirty_log
 2161 * can be eliminated.
2162 */
2163 dirty_bitmap_buffer = dirty_bitmap;
2164 } else {
2165 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2166 memset(dirty_bitmap_buffer, 0, n);
ba0513b5 2167
531810ca 2168 KVM_MMU_LOCK(kvm);
2a31b9db
PB
2169 for (i = 0; i < n / sizeof(long); i++) {
2170 unsigned long mask;
2171 gfn_t offset;
ba0513b5 2172
2a31b9db
PB
2173 if (!dirty_bitmap[i])
2174 continue;
2175
0dff0846 2176 flush = true;
2a31b9db
PB
2177 mask = xchg(&dirty_bitmap[i], 0);
2178 dirty_bitmap_buffer[i] = mask;
2179
a67794ca
LT
2180 offset = i * BITS_PER_LONG;
2181 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2182 offset, mask);
2a31b9db 2183 }
531810ca 2184 KVM_MMU_UNLOCK(kvm);
2a31b9db
PB
2185 }
2186
0dff0846
SC
2187 if (flush)
2188 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
2189
2a31b9db
PB
2190 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
2191 return -EFAULT;
2192 return 0;
2193}
0dff0846
SC
2194
2195
2196/**
2197 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
2198 * @kvm: kvm instance
2199 * @log: slot id and address to which we copy the log
2200 *
 2201 * Steps 1-4 below provide a general overview of dirty page logging. See
2202 * kvm_get_dirty_log_protect() function description for additional details.
2203 *
2204 * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
2205 * always flush the TLB (step 4) even if previous step failed and the dirty
 2206 * bitmap may be corrupt. Regardless of the previous outcome, the KVM logging API
 2207 * does not preclude a subsequent user-space dirty log read. Flushing the TLB ensures
 2208 * that writes will be marked dirty for the next log read.
2209 *
2210 * 1. Take a snapshot of the bit and clear it if needed.
2211 * 2. Write protect the corresponding page.
2212 * 3. Copy the snapshot to the userspace.
2213 * 4. Flush TLB's if needed.
2214 */
2215static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2216 struct kvm_dirty_log *log)
2217{
2218 int r;
2219
2220 mutex_lock(&kvm->slots_lock);
2221
2222 r = kvm_get_dirty_log_protect(kvm, log);
2223
2224 mutex_unlock(&kvm->slots_lock);
2225 return r;
2226}
2a31b9db
PB
2227
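Continuing the user-space side, a sketch of how the snapshot copied out above is typically consumed: walk the bitmap and recover the dirty gfns. base_gfn and npages are assumed slot parameters.

/* User-space sketch: turn the snapshot bitmap into dirty gfns. */
#include <stdint.h>
#include <stdio.h>

static void walk_dirty_bitmap(const uint64_t *bitmap, uint64_t base_gfn,
			      uint64_t npages)
{
	for (uint64_t i = 0; i < (npages + 63) / 64; i++) {
		uint64_t word = bitmap[i];

		while (word) {
			int bit = __builtin_ctzll(word);

			printf("dirty gfn 0x%llx\n",
			       (unsigned long long)(base_gfn + i * 64 + bit));
			word &= word - 1;	/* clear the lowest set bit */
		}
	}
}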
2228/**
2229 * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
2230 * and reenable dirty page tracking for the corresponding pages.
2231 * @kvm: pointer to kvm instance
2232 * @log: slot id and address from which to fetch the bitmap of dirty pages
2233 */
0dff0846
SC
2234static int kvm_clear_dirty_log_protect(struct kvm *kvm,
2235 struct kvm_clear_dirty_log *log)
2a31b9db
PB
2236{
2237 struct kvm_memslots *slots;
2238 struct kvm_memory_slot *memslot;
98938aa8 2239 int as_id, id;
2a31b9db 2240 gfn_t offset;
98938aa8 2241 unsigned long i, n;
2a31b9db
PB
2242 unsigned long *dirty_bitmap;
2243 unsigned long *dirty_bitmap_buffer;
0dff0846 2244 bool flush;
2a31b9db 2245
86bdf3eb
GS
2246 /* Dirty ring tracking may be exclusive to dirty log tracking */
2247 if (!kvm_use_dirty_bitmap(kvm))
b2cc64c4
PX
2248 return -ENXIO;
2249
2a31b9db
PB
2250 as_id = log->slot >> 16;
2251 id = (u16)log->slot;
2252 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
2253 return -EINVAL;
2254
76d58e0f 2255 if (log->first_page & 63)
2a31b9db
PB
2256 return -EINVAL;
2257
2258 slots = __kvm_memslots(kvm, as_id);
2259 memslot = id_to_memslot(slots, id);
0577d1ab
SC
2260 if (!memslot || !memslot->dirty_bitmap)
2261 return -ENOENT;
2a31b9db
PB
2262
2263 dirty_bitmap = memslot->dirty_bitmap;
2a31b9db 2264
4ddc9204 2265 n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
98938aa8
TB
2266
2267 if (log->first_page > memslot->npages ||
76d58e0f
PB
2268 log->num_pages > memslot->npages - log->first_page ||
2269 (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
2270 return -EINVAL;
98938aa8 2271
0dff0846
SC
2272 kvm_arch_sync_dirty_log(kvm, memslot);
2273
2274 flush = false;
2a31b9db
PB
2275 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2276 if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
2277 return -EFAULT;
ba0513b5 2278
531810ca 2279 KVM_MMU_LOCK(kvm);
53eac7a8
PX
2280 for (offset = log->first_page, i = offset / BITS_PER_LONG,
2281 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
2a31b9db
PB
2282 i++, offset += BITS_PER_LONG) {
2283 unsigned long mask = *dirty_bitmap_buffer++;
2284 atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
2285 if (!mask)
ba0513b5
MS
2286 continue;
2287
2a31b9db 2288 mask &= atomic_long_fetch_andnot(mask, p);
ba0513b5 2289
2a31b9db
PB
2290 /*
2291 * mask contains the bits that really have been cleared. This
2292 * never includes any bits beyond the length of the memslot (if
2293 * the length is not aligned to 64 pages), therefore it is not
2294 * a problem if userspace sets them in log->dirty_bitmap.
2295 */
58d2930f 2296 if (mask) {
0dff0846 2297 flush = true;
58d2930f
TY
2298 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2299 offset, mask);
2300 }
ba0513b5 2301 }
531810ca 2302 KVM_MMU_UNLOCK(kvm);
2a31b9db 2303
0dff0846
SC
2304 if (flush)
2305 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
2306
58d6db34 2307 return 0;
ba0513b5 2308}
0dff0846
SC
2309
2310static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
2311 struct kvm_clear_dirty_log *log)
2312{
2313 int r;
2314
2315 mutex_lock(&kvm->slots_lock);
2316
2317 r = kvm_clear_dirty_log_protect(kvm, log);
2318
2319 mutex_unlock(&kvm->slots_lock);
2320 return r;
2321}
2322#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
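A hedged user-space sketch of KVM_CLEAR_DIRTY_LOG, the manual re-protection step checked above: first_page must be 64-page aligned and num_pages a multiple of 64 unless the range reaches the end of the slot. It assumes KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 was enabled on the VM; vm_fd, slot and the range are illustrative.

/* User-space sketch: re-arm write protection after processing a range. */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdint.h>

static int clear_dirty_log(int vm_fd, uint32_t slot, uint64_t first_page,
			   uint32_t num_pages, uint64_t *bitmap)
{
	struct kvm_clear_dirty_log log = {
		.slot = slot,
		.num_pages = num_pages,		/* multiple of 64 unless last chunk */
		.first_page = first_page,	/* must be 64-page aligned */
		.dirty_bitmap = bitmap,		/* bits to clear, as copied out earlier */
	};

	return ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &log);
}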
ba0513b5 2323
49c7754c
GN
2324struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
2325{
2326 return __gfn_to_memslot(kvm_memslots(kvm), gfn);
2327}
a1f4d395 2328EXPORT_SYMBOL_GPL(gfn_to_memslot);
6aa8b732 2329
8e73485c
PB
2330struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
2331{
fe22ed82 2332 struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
a54d8066 2333 u64 gen = slots->generation;
fe22ed82 2334 struct kvm_memory_slot *slot;
fe22ed82 2335
a54d8066
MS
2336 /*
2337 * This also protects against using a memslot from a different address space,
2338 * since different address spaces have different generation numbers.
2339 */
2340 if (unlikely(gen != vcpu->last_used_slot_gen)) {
2341 vcpu->last_used_slot = NULL;
2342 vcpu->last_used_slot_gen = gen;
2343 }
2344
2345 slot = try_get_memslot(vcpu->last_used_slot, gfn);
fe22ed82
DM
2346 if (slot)
2347 return slot;
2348
2349 /*
2350 * Fall back to searching all memslots. We purposely use
2351 * search_memslots() instead of __gfn_to_memslot() to avoid
a54d8066 2352 * thrashing the VM-wide last_used_slot in kvm_memslots.
fe22ed82 2353 */
a54d8066 2354 slot = search_memslots(slots, gfn, false);
fe22ed82 2355 if (slot) {
a54d8066 2356 vcpu->last_used_slot = slot;
fe22ed82
DM
2357 return slot;
2358 }
2359
2360 return NULL;
8e73485c
PB
2361}
2362
33e94154 2363bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
e0d62c7f 2364{
bf3e05bc 2365 struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
e0d62c7f 2366
c36b7150 2367 return kvm_is_visible_memslot(memslot);
e0d62c7f
IE
2368}
2369EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
2370
995decb6
VK
2371bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2372{
2373 struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2374
2375 return kvm_is_visible_memslot(memslot);
2376}
2377EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);
2378
f9b84e19 2379unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
8f0b1ab6
JR
2380{
2381 struct vm_area_struct *vma;
2382 unsigned long addr, size;
2383
2384 size = PAGE_SIZE;
2385
42cde48b 2386 addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
8f0b1ab6
JR
2387 if (kvm_is_error_hva(addr))
2388 return PAGE_SIZE;
2389
d8ed45c5 2390 mmap_read_lock(current->mm);
8f0b1ab6
JR
2391 vma = find_vma(current->mm, addr);
2392 if (!vma)
2393 goto out;
2394
2395 size = vma_kernel_pagesize(vma);
2396
2397out:
d8ed45c5 2398 mmap_read_unlock(current->mm);
8f0b1ab6
JR
2399
2400 return size;
2401}
2402
8283e36a 2403static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
4d8b81ab
XG
2404{
2405 return slot->flags & KVM_MEM_READONLY;
2406}
2407
8283e36a 2408static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
4d8b81ab 2409 gfn_t *nr_pages, bool write)
539cb660 2410{
bc6678a3 2411 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
ca3a490c 2412 return KVM_HVA_ERR_BAD;
48987781 2413
4d8b81ab
XG
2414 if (memslot_is_readonly(slot) && write)
2415 return KVM_HVA_ERR_RO_BAD;
48987781
XG
2416
2417 if (nr_pages)
2418 *nr_pages = slot->npages - (gfn - slot->base_gfn);
2419
4d8b81ab 2420 return __gfn_to_hva_memslot(slot, gfn);
539cb660 2421}
48987781 2422
4d8b81ab
XG
2423static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2424 gfn_t *nr_pages)
2425{
2426 return __gfn_to_hva_many(slot, gfn, nr_pages, true);
539cb660 2427}
48987781 2428
4d8b81ab 2429unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
7940876e 2430 gfn_t gfn)
4d8b81ab
XG
2431{
2432 return gfn_to_hva_many(slot, gfn, NULL);
2433}
2434EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
2435
48987781
XG
2436unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
2437{
49c7754c 2438 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
48987781 2439}
0d150298 2440EXPORT_SYMBOL_GPL(gfn_to_hva);
539cb660 2441
8e73485c
PB
2442unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
2443{
2444 return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
2445}
2446EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
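Once a memslot has been found, the gfn-to-hva helpers above reduce to simple arithmetic; a standalone sketch of the equivalent of __gfn_to_hva_memslot(), using a simplified slot struct rather than the kernel's.

/* Standalone sketch of the gfn -> hva arithmetic; not kernel code. */
#include <stdint.h>

#define TOY_PAGE_SHIFT 12

struct toy_memslot {
	uint64_t base_gfn;		/* first guest frame covered by the slot */
	uint64_t npages;
	uint64_t userspace_addr;	/* host VA that backs base_gfn */
};

static uint64_t toy_gfn_to_hva(const struct toy_memslot *slot, uint64_t gfn)
{
	/* Caller must ensure base_gfn <= gfn < base_gfn + npages. */
	return slot->userspace_addr + ((gfn - slot->base_gfn) << TOY_PAGE_SHIFT);
}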
2447
86ab8cff 2448/*
970c0d4b
WY
2449 * Return the hva of a @gfn and the R/W attribute if possible.
2450 *
2451 * @slot: the kvm_memory_slot which contains @gfn
2452 * @gfn: the gfn to be translated
2453 * @writable: used to return the read/write attribute of the @slot if the hva
2454 * is valid and @writable is not NULL
86ab8cff 2455 */
64d83126
CD
2456unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
2457 gfn_t gfn, bool *writable)
86ab8cff 2458{
a2ac07fe
GN
2459 unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
2460
2461 if (!kvm_is_error_hva(hva) && writable)
ba6a3541
PB
2462 *writable = !memslot_is_readonly(slot);
2463
a2ac07fe 2464 return hva;
86ab8cff
XG
2465}
2466
64d83126
CD
2467unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
2468{
2469 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2470
2471 return gfn_to_hva_memslot_prot(slot, gfn, writable);
2472}
2473
8e73485c
PB
2474unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
2475{
2476 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2477
2478 return gfn_to_hva_memslot_prot(slot, gfn, writable);
2479}
2480
fafc3dba
HY
2481static inline int check_user_page_hwpoison(unsigned long addr)
2482{
0d731759 2483 int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
fafc3dba 2484
0d731759 2485 rc = get_user_pages(addr, 1, flags, NULL, NULL);
fafc3dba
HY
2486 return rc == -EHWPOISON;
2487}
2488
2fc84311 2489/*
b9b33da2
PB
2490 * The fast path to get the writable pfn which will be stored in @pfn,
2491 * true indicates success, otherwise false is returned. It's also the
311497e0 2492 * only part that runs when we are in atomic context.
2fc84311 2493 */
b9b33da2
PB
2494static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
2495 bool *writable, kvm_pfn_t *pfn)
954bbbc2 2496{
8d4e1288 2497 struct page *page[1];
954bbbc2 2498
12ce13fe
XG
2499 /*
2500 * Fast pin a writable pfn only if it is a write fault request
2501 * or the caller allows to map a writable pfn for a read fault
2502 * request.
2503 */
2504 if (!(write_fault || writable))
2505 return false;
612819c3 2506
dadbb612 2507 if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
2fc84311 2508 *pfn = page_to_pfn(page[0]);
612819c3 2509
2fc84311
XG
2510 if (writable)
2511 *writable = true;
2512 return true;
2513 }
af585b92 2514
2fc84311
XG
2515 return false;
2516}
612819c3 2517
2fc84311
XG
2518/*
2519 * The slow path to get the pfn of the specified host virtual address,
2520 * 1 indicates success, -errno is returned if error is detected.
2521 */
2522static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
c8b88b33 2523 bool interruptible, bool *writable, kvm_pfn_t *pfn)
2fc84311 2524{
ce53053c
AV
2525 unsigned int flags = FOLL_HWPOISON;
2526 struct page *page;
28249139 2527 int npages;
612819c3 2528
2fc84311
XG
2529 might_sleep();
2530
2531 if (writable)
2532 *writable = write_fault;
2533
ce53053c
AV
2534 if (write_fault)
2535 flags |= FOLL_WRITE;
2536 if (async)
2537 flags |= FOLL_NOWAIT;
c8b88b33
PX
2538 if (interruptible)
2539 flags |= FOLL_INTERRUPTIBLE;
d4944b0e 2540
ce53053c 2541 npages = get_user_pages_unlocked(addr, 1, &page, flags);
2fc84311
XG
2542 if (npages != 1)
2543 return npages;
2544
2545 /* map read fault as writable if possible */
12ce13fe 2546 if (unlikely(!write_fault) && writable) {
ce53053c 2547 struct page *wpage;
2fc84311 2548
dadbb612 2549 if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
2fc84311 2550 *writable = true;
ce53053c
AV
2551 put_page(page);
2552 page = wpage;
612819c3 2553 }
887c08ac 2554 }
ce53053c 2555 *pfn = page_to_pfn(page);
2fc84311
XG
2556 return npages;
2557}
539cb660 2558
4d8b81ab
XG
2559static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
2560{
2561 if (unlikely(!(vma->vm_flags & VM_READ)))
2562 return false;
2e2e3738 2563
4d8b81ab
XG
2564 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
2565 return false;
887c08ac 2566
4d8b81ab
XG
2567 return true;
2568}
bf998156 2569
f8be156b
NP
2570static int kvm_try_get_pfn(kvm_pfn_t pfn)
2571{
b14b2690
SC
2572 struct page *page = kvm_pfn_to_refcounted_page(pfn);
2573
2574 if (!page)
f8be156b 2575 return 1;
b14b2690
SC
2576
2577 return get_page_unless_zero(page);
f8be156b
NP
2578}
2579
92176a8e 2580static int hva_to_pfn_remapped(struct vm_area_struct *vma,
1625566e
XT
2581 unsigned long addr, bool write_fault,
2582 bool *writable, kvm_pfn_t *p_pfn)
92176a8e 2583{
a9545779 2584 kvm_pfn_t pfn;
bd2fae8d
PB
2585 pte_t *ptep;
2586 spinlock_t *ptl;
add6a0cd
PB
2587 int r;
2588
9fd6dad1 2589 r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
add6a0cd
PB
2590 if (r) {
2591 /*
2592 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
2593 * not call the fault handler, so do it here.
2594 */
2595 bool unlocked = false;
64019a2e 2596 r = fixup_user_fault(current->mm, addr,
add6a0cd
PB
2597 (write_fault ? FAULT_FLAG_WRITE : 0),
2598 &unlocked);
a8387d0b
PB
2599 if (unlocked)
2600 return -EAGAIN;
add6a0cd
PB
2601 if (r)
2602 return r;
2603
9fd6dad1 2604 r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
add6a0cd
PB
2605 if (r)
2606 return r;
bd2fae8d 2607 }
add6a0cd 2608
bd2fae8d
PB
2609 if (write_fault && !pte_write(*ptep)) {
2610 pfn = KVM_PFN_ERR_RO_FAULT;
2611 goto out;
add6a0cd
PB
2612 }
2613
a340b3e2 2614 if (writable)
bd2fae8d
PB
2615 *writable = pte_write(*ptep);
2616 pfn = pte_pfn(*ptep);
add6a0cd
PB
2617
2618 /*
2619 * Get a reference here because callers of *hva_to_pfn* and
2620 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
2621 * returned pfn. This is only needed if the VMA has VM_MIXEDMAP
36c3ce6c 2622 * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will
add6a0cd
PB
2623 * simply do nothing for reserved pfns.
2624 *
2625 * Whoever called remap_pfn_range is also going to call e.g.
2626 * unmap_mapping_range before the underlying pages are freed,
2627 * causing a call to our MMU notifier.
f8be156b
NP
2628 *
2629 * Certain IO or PFNMAP mappings can be backed with valid
2630 * struct pages, but be allocated without refcounting e.g.,
2631 * tail pages of non-compound higher order allocations, which
2632 * would then underflow the refcount when the caller does the
2633 * required put_page. Don't allow those pages here.
add6a0cd 2634 */
f8be156b
NP
2635 if (!kvm_try_get_pfn(pfn))
2636 r = -EFAULT;
add6a0cd 2637
bd2fae8d
PB
2638out:
2639 pte_unmap_unlock(ptep, ptl);
add6a0cd 2640 *p_pfn = pfn;
f8be156b
NP
2641
2642 return r;
92176a8e
PB
2643}
2644
12ce13fe
XG
2645/*
2646 * Pin guest page in memory and return its pfn.
2647 * @addr: host virtual address which maps memory to the guest
 2648 * @atomic: whether this function must not sleep
c8b88b33 2649 * @interruptible: whether the process can be interrupted by non-fatal signals
12ce13fe
XG
 2650 * @async: whether this function needs to wait for IO to complete if the
 2651 * host page is not in memory
2652 * @write_fault: whether we should get a writable host page
2653 * @writable: whether it allows to map a writable host page for !@write_fault
2654 *
2655 * The function will map a writable host page for these two cases:
2656 * 1): @write_fault = true
2657 * 2): @write_fault = false && @writable, @writable will tell the caller
2658 * whether the mapping is writable.
2659 */
c8b88b33
PX
2660kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
2661 bool *async, bool write_fault, bool *writable)
2fc84311
XG
2662{
2663 struct vm_area_struct *vma;
943dfea8 2664 kvm_pfn_t pfn;
92176a8e 2665 int npages, r;
2e2e3738 2666
2fc84311
XG
2667 /* we can do it either atomically or asynchronously, not both */
2668 BUG_ON(atomic && async);
8d4e1288 2669
b9b33da2 2670 if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
2fc84311
XG
2671 return pfn;
2672
2673 if (atomic)
2674 return KVM_PFN_ERR_FAULT;
2675
c8b88b33
PX
2676 npages = hva_to_pfn_slow(addr, async, write_fault, interruptible,
2677 writable, &pfn);
2fc84311
XG
2678 if (npages == 1)
2679 return pfn;
fe5ed56c
PX
2680 if (npages == -EINTR)
2681 return KVM_PFN_ERR_SIGPENDING;
8d4e1288 2682
d8ed45c5 2683 mmap_read_lock(current->mm);
2fc84311
XG
2684 if (npages == -EHWPOISON ||
2685 (!async && check_user_page_hwpoison(addr))) {
2686 pfn = KVM_PFN_ERR_HWPOISON;
2687 goto exit;
2688 }
2689
a8387d0b 2690retry:
fc98c03b 2691 vma = vma_lookup(current->mm, addr);
2fc84311
XG
2692
2693 if (vma == NULL)
2694 pfn = KVM_PFN_ERR_FAULT;
92176a8e 2695 else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
1625566e 2696 r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn);
a8387d0b
PB
2697 if (r == -EAGAIN)
2698 goto retry;
92176a8e
PB
2699 if (r < 0)
2700 pfn = KVM_PFN_ERR_FAULT;
2fc84311 2701 } else {
4d8b81ab 2702 if (async && vma_is_valid(vma, write_fault))
2fc84311
XG
2703 *async = true;
2704 pfn = KVM_PFN_ERR_FAULT;
2705 }
2706exit:
d8ed45c5 2707 mmap_read_unlock(current->mm);
2e2e3738 2708 return pfn;
35149e21
AL
2709}
2710
8283e36a 2711kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
c8b88b33
PX
2712 bool atomic, bool interruptible, bool *async,
2713 bool write_fault, bool *writable, hva_t *hva)
887c08ac 2714{
4d8b81ab
XG
2715 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
2716
4a42d848
DS
2717 if (hva)
2718 *hva = addr;
2719
b2740d35
PB
2720 if (addr == KVM_HVA_ERR_RO_BAD) {
2721 if (writable)
2722 *writable = false;
4d8b81ab 2723 return KVM_PFN_ERR_RO_FAULT;
b2740d35 2724 }
4d8b81ab 2725
b2740d35
PB
2726 if (kvm_is_error_hva(addr)) {
2727 if (writable)
2728 *writable = false;
81c52c56 2729 return KVM_PFN_NOSLOT;
b2740d35 2730 }
4d8b81ab
XG
2731
2732 /* Do not map writable pfn in the readonly memslot. */
2733 if (writable && memslot_is_readonly(slot)) {
2734 *writable = false;
2735 writable = NULL;
2736 }
2737
c8b88b33 2738 return hva_to_pfn(addr, atomic, interruptible, async, write_fault,
4d8b81ab 2739 writable);
887c08ac 2740}
3520469d 2741EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
887c08ac 2742
ba049e93 2743kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
612819c3
MT
2744 bool *writable)
2745{
c8b88b33
PX
2746 return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false,
2747 NULL, write_fault, writable, NULL);
612819c3
MT
2748}
2749EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
2750
8283e36a 2751kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
506f0d6f 2752{
c8b88b33
PX
2753 return __gfn_to_pfn_memslot(slot, gfn, false, false, NULL, true,
2754 NULL, NULL);
506f0d6f 2755}
e37afc6e 2756EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
506f0d6f 2757
8283e36a 2758kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn)
506f0d6f 2759{
c8b88b33
PX
2760 return __gfn_to_pfn_memslot(slot, gfn, true, false, NULL, true,
2761 NULL, NULL);
506f0d6f 2762}
037d92dc 2763EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
506f0d6f 2764
ba049e93 2765kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
8e73485c
PB
2766{
2767 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2768}
2769EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
2770
ba049e93 2771kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
e37afc6e
PB
2772{
2773 return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
2774}
2775EXPORT_SYMBOL_GPL(gfn_to_pfn);
2776
ba049e93 2777kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
8e73485c
PB
2778{
2779 return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2780}
2781EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
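A hedged kernel-style sketch of how a hypothetical caller pairs gfn_to_pfn() with the release helpers further down; the function name and the read of a single byte are invented for illustration.

/* Kernel-style sketch (hypothetical caller); not a function in this file. */
static int example_read_guest_byte(struct kvm *kvm, gfn_t gfn, u8 *byte_out)
{
	kvm_pfn_t pfn = gfn_to_pfn(kvm, gfn);
	void *hva;

	if (is_error_noslot_pfn(pfn))
		return -EFAULT;

	/* Only pfns backed by a refcounted struct page can be kmap'd. */
	if (!pfn_valid(pfn)) {
		kvm_release_pfn_clean(pfn);
		return -EINVAL;
	}

	hva = kmap(pfn_to_page(pfn));
	*byte_out = *(u8 *)hva;
	kunmap(pfn_to_page(pfn));

	/* Drop the reference taken by gfn_to_pfn(); the page was only read. */
	kvm_release_pfn_clean(pfn);
	return 0;
}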
2782
d9ef13c2
PB
2783int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2784 struct page **pages, int nr_pages)
48987781
XG
2785{
2786 unsigned long addr;
076b925d 2787 gfn_t entry = 0;
48987781 2788
d9ef13c2 2789 addr = gfn_to_hva_many(slot, gfn, &entry);
48987781
XG
2790 if (kvm_is_error_hva(addr))
2791 return -1;
2792
2793 if (entry < nr_pages)
2794 return 0;
2795
dadbb612 2796 return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
48987781
XG
2797}
2798EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
2799
b1624f99
SC
2800/*
2801 * Do not use this helper unless you are absolutely certain the gfn _must_ be
2802 * backed by 'struct page'. A valid example is if the backing memslot is
 2803 * controlled by KVM. Note, if the returned page is valid, its refcount has
2804 * been elevated by gfn_to_pfn().
2805 */
35149e21
AL
2806struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
2807{
b14b2690 2808 struct page *page;
ba049e93 2809 kvm_pfn_t pfn;
2e2e3738
AL
2810
2811 pfn = gfn_to_pfn(kvm, gfn);
2e2e3738 2812
81c52c56 2813 if (is_error_noslot_pfn(pfn))
cb9aaa30 2814 return KVM_ERR_PTR_BAD_PAGE;
a2766325 2815
b14b2690
SC
2816 page = kvm_pfn_to_refcounted_page(pfn);
2817 if (!page)
6cede2e6 2818 return KVM_ERR_PTR_BAD_PAGE;
a2766325 2819
b14b2690 2820 return page;
954bbbc2
AK
2821}
2822EXPORT_SYMBOL_GPL(gfn_to_page);
2823
357a18ad 2824void kvm_release_pfn(kvm_pfn_t pfn, bool dirty)
91724814 2825{
91724814
BO
2826 if (dirty)
2827 kvm_release_pfn_dirty(pfn);
2828 else
2829 kvm_release_pfn_clean(pfn);
2830}
2831
357a18ad 2832int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
e45adf66
KA
2833{
2834 kvm_pfn_t pfn;
2835 void *hva = NULL;
2836 struct page *page = KVM_UNMAPPED_PAGE;
2837
2838 if (!map)
2839 return -EINVAL;
2840
357a18ad 2841 pfn = gfn_to_pfn(vcpu->kvm, gfn);
e45adf66
KA
2842 if (is_error_noslot_pfn(pfn))
2843 return -EINVAL;
2844
2845 if (pfn_valid(pfn)) {
2846 page = pfn_to_page(pfn);
357a18ad 2847 hva = kmap(page);
d30b214d 2848#ifdef CONFIG_HAS_IOMEM
91724814 2849 } else {
357a18ad 2850 hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
d30b214d 2851#endif
e45adf66
KA
2852 }
2853
2854 if (!hva)
2855 return -EFAULT;
2856
2857 map->page = page;
2858 map->hva = hva;
2859 map->pfn = pfn;
2860 map->gfn = gfn;
2861
2862 return 0;
2863}
e45adf66
KA
2864EXPORT_SYMBOL_GPL(kvm_vcpu_map);
2865
357a18ad 2866void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
e45adf66
KA
2867{
2868 if (!map)
2869 return;
2870
2871 if (!map->hva)
2872 return;
2873
357a18ad
DW
2874 if (map->page != KVM_UNMAPPED_PAGE)
2875 kunmap(map->page);
eb1f2f38 2876#ifdef CONFIG_HAS_IOMEM
91724814 2877 else
357a18ad 2878 memunmap(map->hva);
eb1f2f38 2879#endif
e45adf66 2880
91724814 2881 if (dirty)
357a18ad 2882 kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
91724814 2883
357a18ad 2884 kvm_release_pfn(map->pfn, dirty);
e45adf66
KA
2885
2886 map->hva = NULL;
2887 map->page = NULL;
2888}
2889EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
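A hedged kernel-style sketch of the map/unmap pairing the two helpers above provide; the function name, gfn and written value are invented for illustration.

/* Kernel-style sketch (hypothetical caller) of kvm_vcpu_map()/kvm_vcpu_unmap(). */
static int example_write_guest_byte(struct kvm_vcpu *vcpu, gfn_t gfn, u8 val)
{
	struct kvm_host_map map;

	if (kvm_vcpu_map(vcpu, gfn, &map))
		return -EFAULT;

	/* map.hva points at the start of the mapped guest page. */
	*(u8 *)map.hva = val;

	/* dirty = true marks the page dirty before releasing the pfn. */
	kvm_vcpu_unmap(vcpu, &map, true);
	return 0;
}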
2890
8e1c6914 2891static bool kvm_is_ad_tracked_page(struct page *page)
8e73485c 2892{
8e1c6914
SC
2893 /*
2894 * Per page-flags.h, pages tagged PG_reserved "should in general not be
2895 * touched (e.g. set dirty) except by its owner".
2896 */
2897 return !PageReserved(page);
2898}
8e73485c 2899
8e1c6914
SC
2900static void kvm_set_page_dirty(struct page *page)
2901{
2902 if (kvm_is_ad_tracked_page(page))
2903 SetPageDirty(page);
2904}
8e73485c 2905
8e1c6914
SC
2906static void kvm_set_page_accessed(struct page *page)
2907{
2908 if (kvm_is_ad_tracked_page(page))
2909 mark_page_accessed(page);
8e73485c 2910}
8e73485c 2911
b4231d61
IE
2912void kvm_release_page_clean(struct page *page)
2913{
32cad84f
XG
2914 WARN_ON(is_error_page(page));
2915
8e1c6914
SC
2916 kvm_set_page_accessed(page);
2917 put_page(page);
b4231d61
IE
2918}
2919EXPORT_SYMBOL_GPL(kvm_release_page_clean);
2920
ba049e93 2921void kvm_release_pfn_clean(kvm_pfn_t pfn)
35149e21 2922{
b14b2690
SC
2923 struct page *page;
2924
2925 if (is_error_noslot_pfn(pfn))
2926 return;
2927
2928 page = kvm_pfn_to_refcounted_page(pfn);
2929 if (!page)
2930 return;
2931
2932 kvm_release_page_clean(page);
35149e21
AL
2933}
2934EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
2935
b4231d61 2936void kvm_release_page_dirty(struct page *page)
8a7ae055 2937{
a2766325
XG
2938 WARN_ON(is_error_page(page));
2939
8e1c6914
SC
2940 kvm_set_page_dirty(page);
2941 kvm_release_page_clean(page);
35149e21
AL
2942}
2943EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
2944
f7a6509f 2945void kvm_release_pfn_dirty(kvm_pfn_t pfn)
35149e21 2946{
b14b2690
SC
2947 struct page *page;
2948
2949 if (is_error_noslot_pfn(pfn))
2950 return;
2951
2952 page = kvm_pfn_to_refcounted_page(pfn);
2953 if (!page)
2954 return;
2955
2956 kvm_release_page_dirty(page);
35149e21 2957}
f7a6509f 2958EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
35149e21 2959
8e1c6914
SC
2960/*
2961 * Note, checking for an error/noslot pfn is the caller's responsibility when
2962 * directly marking a page dirty/accessed. Unlike the "release" helpers, the
2963 * "set" helpers are not to be used when the pfn might point at garbage.
2964 */
ba049e93 2965void kvm_set_pfn_dirty(kvm_pfn_t pfn)
35149e21 2966{
8e1c6914
SC
2967 if (WARN_ON(is_error_noslot_pfn(pfn)))
2968 return;
2969
2970 if (pfn_valid(pfn))
2971 kvm_set_page_dirty(pfn_to_page(pfn));
8a7ae055 2972}
35149e21
AL
2973EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
2974
ba049e93 2975void kvm_set_pfn_accessed(kvm_pfn_t pfn)
35149e21 2976{
8e1c6914
SC
2977 if (WARN_ON(is_error_noslot_pfn(pfn)))
2978 return;
2979
2980 if (pfn_valid(pfn))
2981 kvm_set_page_accessed(pfn_to_page(pfn));
35149e21
AL
2982}
2983EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
2984
195aefde
IE
2985static int next_segment(unsigned long len, int offset)
2986{
2987 if (len > PAGE_SIZE - offset)
2988 return PAGE_SIZE - offset;
2989 else
2990 return len;
2991}
2992
8e73485c
PB
2993static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
2994 void *data, int offset, int len)
195aefde 2995{
e0506bcb
IE
2996 int r;
2997 unsigned long addr;
195aefde 2998
8e73485c 2999 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
e0506bcb
IE
3000 if (kvm_is_error_hva(addr))
3001 return -EFAULT;
3180a7fc 3002 r = __copy_from_user(data, (void __user *)addr + offset, len);
e0506bcb 3003 if (r)
195aefde 3004 return -EFAULT;
195aefde
IE
3005 return 0;
3006}
8e73485c
PB
3007
3008int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
3009 int len)
3010{
3011 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
3012
3013 return __kvm_read_guest_page(slot, gfn, data, offset, len);
3014}
195aefde
IE
3015EXPORT_SYMBOL_GPL(kvm_read_guest_page);
3016
8e73485c
PB
3017int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
3018 int offset, int len)
3019{
3020 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3021
3022 return __kvm_read_guest_page(slot, gfn, data, offset, len);
3023}
3024EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
3025
195aefde
IE
3026int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
3027{
3028 gfn_t gfn = gpa >> PAGE_SHIFT;
3029 int seg;
3030 int offset = offset_in_page(gpa);
3031 int ret;
3032
3033 while ((seg = next_segment(len, offset)) != 0) {
3034 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
3035 if (ret < 0)
3036 return ret;
3037 offset = 0;
3038 len -= seg;
3039 data += seg;
3040 ++gfn;
3041 }
3042 return 0;
3043}
3044EXPORT_SYMBOL_GPL(kvm_read_guest);
3045
8e73485c 3046int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
7ec54588 3047{
7ec54588 3048 gfn_t gfn = gpa >> PAGE_SHIFT;
8e73485c 3049 int seg;
7ec54588 3050 int offset = offset_in_page(gpa);
8e73485c
PB
3051 int ret;
3052
3053 while ((seg = next_segment(len, offset)) != 0) {
3054 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
3055 if (ret < 0)
3056 return ret;
3057 offset = 0;
3058 len -= seg;
3059 data += seg;
3060 ++gfn;
3061 }
3062 return 0;
3063}
3064EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
7ec54588 3065
8e73485c
PB
3066static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
3067 void *data, int offset, unsigned long len)
3068{
3069 int r;
3070 unsigned long addr;
3071
3072 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
7ec54588
MT
3073 if (kvm_is_error_hva(addr))
3074 return -EFAULT;
0aac03f0 3075 pagefault_disable();
3180a7fc 3076 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
0aac03f0 3077 pagefault_enable();
7ec54588
MT
3078 if (r)
3079 return -EFAULT;
3080 return 0;
3081}
7ec54588 3082
8e73485c
PB
3083int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
3084 void *data, unsigned long len)
3085{
3086 gfn_t gfn = gpa >> PAGE_SHIFT;
3087 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3088 int offset = offset_in_page(gpa);
3089
3090 return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
3091}
3092EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
3093
28bd726a
PX
3094static int __kvm_write_guest_page(struct kvm *kvm,
3095 struct kvm_memory_slot *memslot, gfn_t gfn,
8e73485c 3096 const void *data, int offset, int len)
195aefde 3097{
e0506bcb
IE
3098 int r;
3099 unsigned long addr;
195aefde 3100
251eb841 3101 addr = gfn_to_hva_memslot(memslot, gfn);
e0506bcb
IE
3102 if (kvm_is_error_hva(addr))
3103 return -EFAULT;
8b0cedff 3104 r = __copy_to_user((void __user *)addr + offset, data, len);
e0506bcb 3105 if (r)
195aefde 3106 return -EFAULT;
28bd726a 3107 mark_page_dirty_in_slot(kvm, memslot, gfn);
195aefde
IE
3108 return 0;
3109}
8e73485c
PB
3110
3111int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
3112 const void *data, int offset, int len)
3113{
3114 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
3115
28bd726a 3116 return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
8e73485c 3117}
195aefde
IE
3118EXPORT_SYMBOL_GPL(kvm_write_guest_page);
3119
8e73485c
PB
3120int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
3121 const void *data, int offset, int len)
3122{
3123 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3124
28bd726a 3125 return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
8e73485c
PB
3126}
3127EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
3128
195aefde
IE
3129int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
3130 unsigned long len)
3131{
3132 gfn_t gfn = gpa >> PAGE_SHIFT;
3133 int seg;
3134 int offset = offset_in_page(gpa);
3135 int ret;
3136
3137 while ((seg = next_segment(len, offset)) != 0) {
3138 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
3139 if (ret < 0)
3140 return ret;
3141 offset = 0;
3142 len -= seg;
3143 data += seg;
3144 ++gfn;
3145 }
3146 return 0;
3147}
ff651cb6 3148EXPORT_SYMBOL_GPL(kvm_write_guest);
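A hedged kernel-style sketch of the gpa-based copy helpers above; the structure layout, function name and gpa are hypothetical. Because both helpers loop over pages internally, the structure may straddle a page (and therefore a memslot) boundary.

/* Kernel-style sketch (hypothetical caller) of kvm_read_guest()/kvm_write_guest(). */
struct example_shared_info {
	u64 sequence;
	u64 flags;
};

static int example_bump_sequence(struct kvm *kvm, gpa_t gpa)
{
	struct example_shared_info info;
	int r;

	r = kvm_read_guest(kvm, gpa, &info, sizeof(info));
	if (r)
		return r;

	info.sequence++;
	return kvm_write_guest(kvm, gpa, &info, sizeof(info));
}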
195aefde 3149
8e73485c
PB
3150int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
3151 unsigned long len)
3152{
3153 gfn_t gfn = gpa >> PAGE_SHIFT;
3154 int seg;
3155 int offset = offset_in_page(gpa);
3156 int ret;
3157
3158 while ((seg = next_segment(len, offset)) != 0) {
3159 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
3160 if (ret < 0)
3161 return ret;
3162 offset = 0;
3163 len -= seg;
3164 data += seg;
3165 ++gfn;
3166 }
3167 return 0;
3168}
3169EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
3170
5a2d4365
PB
3171static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
3172 struct gfn_to_hva_cache *ghc,
3173 gpa_t gpa, unsigned long len)
49c7754c 3174{
49c7754c 3175 int offset = offset_in_page(gpa);
8f964525
AH
3176 gfn_t start_gfn = gpa >> PAGE_SHIFT;
3177 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
3178 gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
3179 gfn_t nr_pages_avail;
49c7754c 3180
6ad1e29f 3181 /* Update ghc->generation before performing any error checks. */
49c7754c 3182 ghc->generation = slots->generation;
6ad1e29f
SC
3183
3184 if (start_gfn > end_gfn) {
3185 ghc->hva = KVM_HVA_ERR_BAD;
3186 return -EINVAL;
3187 }
f1b9dd5e
JM
3188
3189 /*
3190 * If the requested region crosses two memslots, we still
3191 * verify that the entire region is valid here.
3192 */
6ad1e29f 3193 for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
f1b9dd5e
JM
3194 ghc->memslot = __gfn_to_memslot(slots, start_gfn);
3195 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
3196 &nr_pages_avail);
3197 if (kvm_is_error_hva(ghc->hva))
6ad1e29f 3198 return -EFAULT;
f1b9dd5e
JM
3199 }
3200
3201 /* Use the slow path for cross page reads and writes. */
6ad1e29f 3202 if (nr_pages_needed == 1)
49c7754c 3203 ghc->hva += offset;
f1b9dd5e 3204 else
8f964525 3205 ghc->memslot = NULL;
f1b9dd5e 3206
6ad1e29f
SC
3207 ghc->gpa = gpa;
3208 ghc->len = len;
3209 return 0;
49c7754c 3210}
5a2d4365 3211
4e335d9e 3212int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
5a2d4365
PB
3213 gpa_t gpa, unsigned long len)
3214{
4e335d9e 3215 struct kvm_memslots *slots = kvm_memslots(kvm);
5a2d4365
PB
3216 return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
3217}
4e335d9e 3218EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
49c7754c 3219
4e335d9e 3220int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
7a86dab8
JM
3221 void *data, unsigned int offset,
3222 unsigned long len)
49c7754c 3223{
4e335d9e 3224 struct kvm_memslots *slots = kvm_memslots(kvm);
49c7754c 3225 int r;
4ec6e863 3226 gpa_t gpa = ghc->gpa + offset;
49c7754c 3227
5f25e71e
PB
3228 if (WARN_ON_ONCE(len + offset > ghc->len))
3229 return -EINVAL;
8f964525 3230
dc9ce71e
SC
3231 if (slots->generation != ghc->generation) {
3232 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3233 return -EFAULT;
3234 }
8f964525 3235
49c7754c
GN
3236 if (kvm_is_error_hva(ghc->hva))
3237 return -EFAULT;
3238
fcfbc617
SC
3239 if (unlikely(!ghc->memslot))
3240 return kvm_write_guest(kvm, gpa, data, len);
3241
4ec6e863 3242 r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
49c7754c
GN
3243 if (r)
3244 return -EFAULT;
28bd726a 3245 mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT);
49c7754c
GN
3246
3247 return 0;
3248}
4e335d9e 3249EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
4ec6e863 3250
4e335d9e
PB
3251int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3252 void *data, unsigned long len)
4ec6e863 3253{
4e335d9e 3254 return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
4ec6e863 3255}
4e335d9e 3256EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
49c7754c 3257
0958f0ce
VK
3258int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3259 void *data, unsigned int offset,
3260 unsigned long len)
e03b644f 3261{
4e335d9e 3262 struct kvm_memslots *slots = kvm_memslots(kvm);
e03b644f 3263 int r;
0958f0ce 3264 gpa_t gpa = ghc->gpa + offset;
e03b644f 3265
5f25e71e
PB
3266 if (WARN_ON_ONCE(len + offset > ghc->len))
3267 return -EINVAL;
8f964525 3268
dc9ce71e
SC
3269 if (slots->generation != ghc->generation) {
3270 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3271 return -EFAULT;
3272 }
8f964525 3273
e03b644f
GN
3274 if (kvm_is_error_hva(ghc->hva))
3275 return -EFAULT;
3276
fcfbc617 3277 if (unlikely(!ghc->memslot))
0958f0ce 3278 return kvm_read_guest(kvm, gpa, data, len);
fcfbc617 3279
0958f0ce 3280 r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
e03b644f
GN
3281 if (r)
3282 return -EFAULT;
3283
3284 return 0;
3285}
0958f0ce
VK
3286EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
3287
3288int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3289 void *data, unsigned long len)
3290{
3291 return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
3292}
4e335d9e 3293EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
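/*
 * Illustrative sketch (not from this file): typical use of the hva-cache API
 * above. A caller initializes the cache once for a fixed GPA/length, then
 * issues repeated cached writes; the cache is transparently re-initialized
 * when the memslot generation changes. "demo_shared" and "demo_publish" are
 * made-up names used only for illustration.
 */
struct demo_shared {
	u64 seq;
	u64 payload;
};

static int demo_publish(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			gpa_t gpa, u64 payload)
{
	struct demo_shared s = { .seq = 1, .payload = payload };
	int r;

	/* One-time setup; validates the GPA range against the memslots. */
	r = kvm_gfn_to_hva_cache_init(kvm, ghc, gpa, sizeof(s));
	if (r)
		return r;

	/* Fast path: copies via the cached HVA and marks the page dirty. */
	return kvm_write_guest_cached(kvm, ghc, &s, sizeof(s));
}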
e03b644f 3294
195aefde
IE
3295int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
3296{
2f541442 3297 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
195aefde
IE
3298 gfn_t gfn = gpa >> PAGE_SHIFT;
3299 int seg;
3300 int offset = offset_in_page(gpa);
3301 int ret;
3302
bfda0e84 3303 while ((seg = next_segment(len, offset)) != 0) {
2f541442 3304 ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg);
195aefde
IE
3305 if (ret < 0)
3306 return ret;
3307 offset = 0;
3308 len -= seg;
3309 ++gfn;
3310 }
3311 return 0;
3312}
3313EXPORT_SYMBOL_GPL(kvm_clear_guest);
3314
28bd726a 3315void mark_page_dirty_in_slot(struct kvm *kvm,
8283e36a 3316 const struct kvm_memory_slot *memslot,
28bd726a 3317 gfn_t gfn)
6aa8b732 3318{
2efd61a6
DW
3319 struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
3320
e09fccb5 3321#ifdef CONFIG_HAVE_KVM_DIRTY_RING
86bdf3eb 3322 if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm))
2efd61a6 3323 return;
86bdf3eb 3324
c57351a7 3325 WARN_ON_ONCE(!vcpu && !kvm_arch_allow_write_without_running_vcpu(kvm));
e09fccb5 3326#endif
2efd61a6 3327
044c59c4 3328 if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
7e9d619d 3329 unsigned long rel_gfn = gfn - memslot->base_gfn;
fb04a1ed 3330 u32 slot = (memslot->as_id << 16) | memslot->id;
6aa8b732 3331
86bdf3eb 3332 if (kvm->dirty_ring_size && vcpu)
cf87ac73 3333 kvm_dirty_ring_push(vcpu, slot, rel_gfn);
c57351a7 3334 else if (memslot->dirty_bitmap)
fb04a1ed 3335 set_bit_le(rel_gfn, memslot->dirty_bitmap);
6aa8b732
AK
3336 }
3337}
a6a0b05d 3338EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
6aa8b732 3339
49c7754c
GN
3340void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
3341{
3342 struct kvm_memory_slot *memslot;
3343
3344 memslot = gfn_to_memslot(kvm, gfn);
28bd726a 3345 mark_page_dirty_in_slot(kvm, memslot, gfn);
49c7754c 3346}
2ba9f0d8 3347EXPORT_SYMBOL_GPL(mark_page_dirty);
49c7754c 3348
8e73485c
PB
3349void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
3350{
3351 struct kvm_memory_slot *memslot;
3352
3353 memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
28bd726a 3354 mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
8e73485c
PB
3355}
3356EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
3357
20b7035c
JS
3358void kvm_sigset_activate(struct kvm_vcpu *vcpu)
3359{
3360 if (!vcpu->sigset_active)
3361 return;
3362
3363 /*
3364 * This does a lockless modification of ->real_blocked, which is fine
 3365 * because only current can change ->real_blocked and all readers of
 3366 * ->real_blocked don't care as long as ->real_blocked is always a subset
3367 * of ->blocked.
3368 */
3369 sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
3370}
3371
3372void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
3373{
3374 if (!vcpu->sigset_active)
3375 return;
3376
3377 sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
3378 sigemptyset(&current->real_blocked);
3379}
3380
aca6ff29
WL
3381static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
3382{
dee339b5 3383 unsigned int old, val, grow, grow_start;
aca6ff29 3384
2cbd7824 3385 old = val = vcpu->halt_poll_ns;
dee339b5 3386 grow_start = READ_ONCE(halt_poll_ns_grow_start);
6b6de68c 3387 grow = READ_ONCE(halt_poll_ns_grow);
7fa08e71
NW
3388 if (!grow)
3389 goto out;
3390
dee339b5
NW
3391 val *= grow;
3392 if (val < grow_start)
3393 val = grow_start;
aca6ff29
WL
3394
3395 vcpu->halt_poll_ns = val;
7fa08e71 3396out:
2cbd7824 3397 trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
aca6ff29
WL
3398}
3399
3400static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
3401{
ae232ea4 3402 unsigned int old, val, shrink, grow_start;
aca6ff29 3403
2cbd7824 3404 old = val = vcpu->halt_poll_ns;
6b6de68c 3405 shrink = READ_ONCE(halt_poll_ns_shrink);
ae232ea4 3406 grow_start = READ_ONCE(halt_poll_ns_grow_start);
6b6de68c 3407 if (shrink == 0)
aca6ff29
WL
3408 val = 0;
3409 else
6b6de68c 3410 val /= shrink;
aca6ff29 3411
ae232ea4
SS
3412 if (val < grow_start)
3413 val = 0;
3414
aca6ff29 3415 vcpu->halt_poll_ns = val;
2cbd7824 3416 trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
aca6ff29
WL
3417}
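/*
 * Worked example using the default module parameters declared above
 * (halt_poll_ns_grow = 2, halt_poll_ns_grow_start = 10000,
 *  halt_poll_ns_shrink = 0), for illustration only:
 *
 *   grow:   0 -> 10000 -> 20000 -> 40000 -> ... (effectively capped by the
 *           max halt-poll time enforced by the callers in kvm_vcpu_halt())
 *   shrink: any value -> 0, because shrink == 0 resets the polling window
 *
 * With halt_poll_ns_shrink = 2 instead, shrinking halves the window:
 * 40000 -> 20000 -> 10000 -> 0; the last step drops below grow_start,
 * so the window is reset to zero rather than kept at 5000.
 */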
3418
f7819512
PB
3419static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
3420{
50c28f21
JS
3421 int ret = -EINTR;
3422 int idx = srcu_read_lock(&vcpu->kvm->srcu);
3423
c59fb127 3424 if (kvm_arch_vcpu_runnable(vcpu))
50c28f21 3425 goto out;
f7819512 3426 if (kvm_cpu_has_pending_timer(vcpu))
50c28f21 3427 goto out;
f7819512 3428 if (signal_pending(current))
50c28f21 3429 goto out;
084071d5
MT
3430 if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
3431 goto out;
f7819512 3432
50c28f21
JS
3433 ret = 0;
3434out:
3435 srcu_read_unlock(&vcpu->kvm->srcu, idx);
3436 return ret;
f7819512
PB
3437}
3438
fac42688
SC
3439/*
3440 * Block the vCPU until the vCPU is runnable, an event arrives, or a signal is
3441 * pending. This is mostly used when halting a vCPU, but may also be used
3442 * directly for other vCPU non-runnable states, e.g. x86's Wait-For-SIPI.
3443 */
3444bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
cb953129 3445{
fac42688
SC
3446 struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
3447 bool waited = false;
3448
c3858335
JZ
3449 vcpu->stat.generic.blocking = 1;
3450
18869f26 3451 preempt_disable();
fac42688 3452 kvm_arch_vcpu_blocking(vcpu);
fac42688 3453 prepare_to_rcuwait(wait);
18869f26
ML
3454 preempt_enable();
3455
fac42688
SC
3456 for (;;) {
3457 set_current_state(TASK_INTERRUPTIBLE);
3458
3459 if (kvm_vcpu_check_block(vcpu) < 0)
3460 break;
3461
3462 waited = true;
3463 schedule();
3464 }
fac42688 3465
18869f26
ML
3466 preempt_disable();
3467 finish_rcuwait(wait);
fac42688 3468 kvm_arch_vcpu_unblocking(vcpu);
18869f26 3469 preempt_enable();
fac42688 3470
c3858335
JZ
3471 vcpu->stat.generic.blocking = 0;
3472
fac42688
SC
3473 return waited;
3474}
3475
29e72893
SC
3476static inline void update_halt_poll_stats(struct kvm_vcpu *vcpu, ktime_t start,
3477 ktime_t end, bool success)
cb953129 3478{
30c94347 3479 struct kvm_vcpu_stat_generic *stats = &vcpu->stat.generic;
29e72893
SC
3480 u64 poll_ns = ktime_to_ns(ktime_sub(end, start));
3481
30c94347
SC
3482 ++vcpu->stat.generic.halt_attempted_poll;
3483
3484 if (success) {
3485 ++vcpu->stat.generic.halt_successful_poll;
3486
3487 if (!vcpu_valid_wakeup(vcpu))
3488 ++vcpu->stat.generic.halt_poll_invalid;
3489
3490 stats->halt_poll_success_ns += poll_ns;
3491 KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_success_hist, poll_ns);
3492 } else {
3493 stats->halt_poll_fail_ns += poll_ns;
3494 KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_fail_hist, poll_ns);
3495 }
cb953129
DM
3496}
3497
175d5dc7
DM
3498static unsigned int kvm_vcpu_max_halt_poll_ns(struct kvm_vcpu *vcpu)
3499{
9eb8ca04
DM
3500 struct kvm *kvm = vcpu->kvm;
3501
3502 if (kvm->override_halt_poll_ns) {
3503 /*
3504 * Ensure kvm->max_halt_poll_ns is not read before
3505 * kvm->override_halt_poll_ns.
3506 *
3507 * Pairs with the smp_wmb() when enabling KVM_CAP_HALT_POLL.
3508 */
3509 smp_rmb();
3510 return READ_ONCE(kvm->max_halt_poll_ns);
3511 }
3512
3513 return READ_ONCE(halt_poll_ns);
175d5dc7
DM
3514}
3515
b6958ce4 3516/*
fac42688
SC
3517 * Emulate a vCPU halt condition, e.g. HLT on x86, WFI on arm, etc... If halt
3518 * polling is enabled, busy wait for a short time before blocking to avoid the
3519 * expensive block+unblock sequence if a wake event arrives soon after the vCPU
3520 * is halted.
b6958ce4 3521 */
91b99ea7 3522void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
d3bef15f 3523{
175d5dc7 3524 unsigned int max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
6f390916 3525 bool halt_poll_allowed = !kvm_arch_no_poll(vcpu);
cb953129 3526 ktime_t start, cur, poll_end;
f7819512 3527 bool waited = false;
97b6847a 3528 bool do_halt_poll;
91b99ea7 3529 u64 halt_ns;
07ab0f8d 3530
175d5dc7
DM
3531 if (vcpu->halt_poll_ns > max_halt_poll_ns)
3532 vcpu->halt_poll_ns = max_halt_poll_ns;
97b6847a
DM
3533
3534 do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns;
3535
cb953129 3536 start = cur = poll_end = ktime_get();
8df6a61c 3537 if (do_halt_poll) {
109a9826 3538 ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns);
f95ef0cd 3539
f7819512 3540 do {
30c94347 3541 if (kvm_vcpu_check_block(vcpu) < 0)
f7819512 3542 goto out;
74775654 3543 cpu_relax();
cb953129 3544 poll_end = cur = ktime_get();
6bd5b743 3545 } while (kvm_vcpu_can_poll(cur, stop));
f7819512 3546 }
e5c239cf 3547
fac42688 3548 waited = kvm_vcpu_block(vcpu);
8ccba534 3549
f7819512 3550 cur = ktime_get();
87bcc5fa
JZ
3551 if (waited) {
3552 vcpu->stat.generic.halt_wait_ns +=
3553 ktime_to_ns(cur) - ktime_to_ns(poll_end);
8ccba534
JZ
3554 KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist,
3555 ktime_to_ns(cur) - ktime_to_ns(poll_end));
87bcc5fa 3556 }
f7819512 3557out:
91b99ea7
SC
3558 /* The total time the vCPU was "halted", including polling time. */
3559 halt_ns = ktime_to_ns(cur) - ktime_to_ns(start);
aca6ff29 3560
29e72893
SC
3561 /*
3562 * Note, halt-polling is considered successful so long as the vCPU was
3563 * never actually scheduled out, i.e. even if the wake event arrived
 3564 * after the halt-polling loop itself, but before the full wait.
3565 */
8df6a61c 3566 if (do_halt_poll)
29e72893 3567 update_halt_poll_stats(vcpu, start, poll_end, !waited);
cb953129 3568
6f390916 3569 if (halt_poll_allowed) {
175d5dc7
DM
3570 /* Recompute the max halt poll time in case it changed. */
3571 max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
3572
44551b2f 3573 if (!vcpu_valid_wakeup(vcpu)) {
aca6ff29 3574 shrink_halt_poll_ns(vcpu);
175d5dc7 3575 } else if (max_halt_poll_ns) {
91b99ea7 3576 if (halt_ns <= vcpu->halt_poll_ns)
44551b2f
WL
3577 ;
3578 /* we had a long block, shrink polling */
acd05785 3579 else if (vcpu->halt_poll_ns &&
175d5dc7 3580 halt_ns > max_halt_poll_ns)
44551b2f
WL
3581 shrink_halt_poll_ns(vcpu);
3582 /* we had a short halt and our poll time is too small */
175d5dc7
DM
3583 else if (vcpu->halt_poll_ns < max_halt_poll_ns &&
3584 halt_ns < max_halt_poll_ns)
44551b2f
WL
3585 grow_halt_poll_ns(vcpu);
3586 } else {
3587 vcpu->halt_poll_ns = 0;
3588 }
3589 }
aca6ff29 3590
91b99ea7 3591 trace_kvm_vcpu_wakeup(halt_ns, waited, vcpu_valid_wakeup(vcpu));
b6958ce4 3592}
91b99ea7 3593EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
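/*
 * Illustrative sketch (not from this file): an architecture's HLT/WFI exit
 * handler typically just calls kvm_vcpu_halt() and lets the polling/blocking
 * policy above decide whether to spin or schedule out. "demo_handle_halt" is
 * a made-up name; real handlers also update guest state and may return a
 * different value to request an exit to userspace.
 */
static int demo_handle_halt(struct kvm_vcpu *vcpu)
{
	kvm_vcpu_halt(vcpu);
	return 1;	/* keep running the vCPU loop */
}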
b6958ce4 3594
178f02ff 3595bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
b6d33834 3596{
d92a5d1c 3597 if (__kvm_vcpu_wake_up(vcpu)) {
d73eb57b 3598 WRITE_ONCE(vcpu->ready, true);
0193cc90 3599 ++vcpu->stat.generic.halt_wakeup;
178f02ff 3600 return true;
b6d33834
CD
3601 }
3602
178f02ff 3603 return false;
dd1a4cc1
RK
3604}
3605EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
3606
0266c894 3607#ifndef CONFIG_S390
dd1a4cc1
RK
3608/*
3609 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
3610 */
3611void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3612{
85b64045 3613 int me, cpu;
dd1a4cc1 3614
178f02ff
RK
3615 if (kvm_vcpu_wake_up(vcpu))
3616 return;
3617
aefdc2ed
PB
3618 me = get_cpu();
3619 /*
3620 * The only state change done outside the vcpu mutex is IN_GUEST_MODE
3621 * to EXITING_GUEST_MODE. Therefore the moderately expensive "should
3622 * kick" check does not need atomic operations if kvm_vcpu_kick is used
3623 * within the vCPU thread itself.
3624 */
3625 if (vcpu == __this_cpu_read(kvm_running_vcpu)) {
3626 if (vcpu->mode == IN_GUEST_MODE)
3627 WRITE_ONCE(vcpu->mode, EXITING_GUEST_MODE);
3628 goto out;
3629 }
3630
85b64045
SC
3631 /*
3632 * Note, the vCPU could get migrated to a different pCPU at any point
3633 * after kvm_arch_vcpu_should_kick(), which could result in sending an
3634 * IPI to the previous pCPU. But, that's ok because the purpose of the
3635 * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the
3636 * vCPU also requires it to leave IN_GUEST_MODE.
3637 */
85b64045
SC
3638 if (kvm_arch_vcpu_should_kick(vcpu)) {
3639 cpu = READ_ONCE(vcpu->cpu);
3640 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
b6d33834 3641 smp_send_reschedule(cpu);
85b64045 3642 }
aefdc2ed 3643out:
b6d33834
CD
3644 put_cpu();
3645}
a20ed54d 3646EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
0266c894 3647#endif /* !CONFIG_S390 */
b6d33834 3648
fa93384f 3649int kvm_vcpu_yield_to(struct kvm_vcpu *target)
41628d33
KW
3650{
3651 struct pid *pid;
3652 struct task_struct *task = NULL;
fa93384f 3653 int ret = 0;
41628d33
KW
3654
3655 rcu_read_lock();
3656 pid = rcu_dereference(target->pid);
3657 if (pid)
27fbe64b 3658 task = get_pid_task(pid, PIDTYPE_PID);
41628d33
KW
3659 rcu_read_unlock();
3660 if (!task)
c45c528e 3661 return ret;
c45c528e 3662 ret = yield_to(task, 1);
41628d33 3663 put_task_struct(task);
c45c528e
R
3664
3665 return ret;
41628d33
KW
3666}
3667EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
3668
06e48c51
R
3669/*
3670 * Helper that checks whether a VCPU is eligible for directed yield.
 3671 * The most eligible candidate to yield to is decided by these heuristics:
3672 *
 3673 * (a) A VCPU which has not had a PL-exit or a cpu-relax intercept recently
3674 * (preempted lock holder), indicated by @in_spin_loop.
656012c7 3675 * Set at the beginning and cleared at the end of interception/PLE handler.
06e48c51
R
3676 *
 3677 * (b) A VCPU which has had a PL-exit/cpu-relax intercept but did not get a
 3678 * chance last time (it has most likely become eligible now, since we probably
 3679 * yielded to the lock holder in the last iteration. This is done by toggling
 3680 * @dy_eligible each time a VCPU is checked for eligibility.)
 3681 *
 3682 * Yielding to a recently PL-exited/cpu-relax intercepted VCPU before yielding
 3683 * to the preempted lock holder could result in wrong VCPU selection and CPU
 3684 * burning. Giving priority to a potential lock holder increases lock
 3685 * progress.
 3686 *
 3687 * Since the algorithm is based on heuristics, accessing another VCPU's data
 3688 * without locking does no harm. It may result in trying to yield to the same
 3689 * VCPU, failing, and continuing with the next VCPU, and so on.
3690 */
7940876e 3691static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
06e48c51 3692{
4a55dd72 3693#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
06e48c51
R
3694 bool eligible;
3695
3696 eligible = !vcpu->spin_loop.in_spin_loop ||
34656113 3697 vcpu->spin_loop.dy_eligible;
06e48c51
R
3698
3699 if (vcpu->spin_loop.in_spin_loop)
3700 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
3701
3702 return eligible;
4a55dd72
SW
3703#else
3704 return true;
06e48c51 3705#endif
4a55dd72 3706}
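/*
 * Worked trace of the heuristic above: a vCPU with in_spin_loop == true and
 * dy_eligible == false is skipped once, but the check flips dy_eligible to
 * true, so the next boosting pass will consider it; a vCPU that did not
 * PLE-exit recently (in_spin_loop == false) is always eligible and its flag
 * is left untouched.
 */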
c45c528e 3707
17e433b5
WL
3708/*
3709 * Unlike kvm_arch_vcpu_runnable, this function is called outside
3710 * a vcpu_load/vcpu_put pair. However, for most architectures
3711 * kvm_arch_vcpu_runnable does not require vcpu_load.
3712 */
3713bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
3714{
3715 return kvm_arch_vcpu_runnable(vcpu);
3716}
3717
3718static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
3719{
3720 if (kvm_arch_dy_runnable(vcpu))
3721 return true;
3722
3723#ifdef CONFIG_KVM_ASYNC_PF
3724 if (!list_empty_careful(&vcpu->async_pf.done))
3725 return true;
3726#endif
3727
3728 return false;
3729}
3730
52acd22f
WL
3731bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
3732{
3733 return false;
3734}
3735
199b5763 3736void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
d255f4f2 3737{
217ece61
RR
3738 struct kvm *kvm = me->kvm;
3739 struct kvm_vcpu *vcpu;
3740 int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
46808a4c 3741 unsigned long i;
217ece61 3742 int yielded = 0;
c45c528e 3743 int try = 3;
217ece61 3744 int pass;
d255f4f2 3745
4c088493 3746 kvm_vcpu_set_in_spin_loop(me, true);
217ece61
RR
3747 /*
3748 * We boost the priority of a VCPU that is runnable but not
3749 * currently running, because it got preempted by something
3750 * else and called schedule in __vcpu_run. Hopefully that
3751 * VCPU is holding the lock that we need and will release it.
3752 * We approximate round-robin by starting at the last boosted VCPU.
3753 */
c45c528e 3754 for (pass = 0; pass < 2 && !yielded && try; pass++) {
217ece61 3755 kvm_for_each_vcpu(i, vcpu, kvm) {
5cfc2aab 3756 if (!pass && i <= last_boosted_vcpu) {
217ece61
RR
3757 i = last_boosted_vcpu;
3758 continue;
3759 } else if (pass && i > last_boosted_vcpu)
3760 break;
d73eb57b 3761 if (!READ_ONCE(vcpu->ready))
7bc7ae25 3762 continue;
217ece61
RR
3763 if (vcpu == me)
3764 continue;
d92a5d1c 3765 if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
217ece61 3766 continue;
046ddeed 3767 if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
52acd22f
WL
3768 !kvm_arch_dy_has_pending_interrupt(vcpu) &&
3769 !kvm_arch_vcpu_in_kernel(vcpu))
199b5763 3770 continue;
06e48c51
R
3771 if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
3772 continue;
c45c528e
R
3773
3774 yielded = kvm_vcpu_yield_to(vcpu);
3775 if (yielded > 0) {
217ece61 3776 kvm->last_boosted_vcpu = i;
217ece61 3777 break;
c45c528e
R
3778 } else if (yielded < 0) {
3779 try--;
3780 if (!try)
3781 break;
217ece61 3782 }
217ece61
RR
3783 }
3784 }
4c088493 3785 kvm_vcpu_set_in_spin_loop(me, false);
06e48c51
R
3786
3787 /* Ensure vcpu is not eligible during next spinloop */
3788 kvm_vcpu_set_dy_eligible(me, false);
d255f4f2
ZE
3789}
3790EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
3791
fb04a1ed
PX
3792static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
3793{
dc70ec21 3794#ifdef CONFIG_HAVE_KVM_DIRTY_RING
fb04a1ed
PX
3795 return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
3796 (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
3797 kvm->dirty_ring_size / PAGE_SIZE);
3798#else
3799 return false;
3800#endif
3801}
3802
1499fa80 3803static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
9a2bb7f4 3804{
11bac800 3805 struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
9a2bb7f4
AK
3806 struct page *page;
3807
e4a533a4 3808 if (vmf->pgoff == 0)
039576c0 3809 page = virt_to_page(vcpu->run);
09566765 3810#ifdef CONFIG_X86
e4a533a4 3811 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
ad312c7c 3812 page = virt_to_page(vcpu->arch.pio_data);
5f94c174 3813#endif
4b4357e0 3814#ifdef CONFIG_KVM_MMIO
5f94c174
LV
3815 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
3816 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
09566765 3817#endif
fb04a1ed
PX
3818 else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
3819 page = kvm_dirty_ring_get_page(
3820 &vcpu->dirty_ring,
3821 vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
039576c0 3822 else
5b1c1493 3823 return kvm_arch_vcpu_fault(vcpu, vmf);
9a2bb7f4 3824 get_page(page);
e4a533a4
NP
3825 vmf->page = page;
3826 return 0;
9a2bb7f4
AK
3827}
3828
f0f37e2f 3829static const struct vm_operations_struct kvm_vcpu_vm_ops = {
e4a533a4 3830 .fault = kvm_vcpu_fault,
9a2bb7f4
AK
3831};
3832
3833static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
3834{
fb04a1ed 3835 struct kvm_vcpu *vcpu = file->private_data;
11476d27 3836 unsigned long pages = vma_pages(vma);
fb04a1ed
PX
3837
3838 if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
3839 kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
3840 ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED)))
3841 return -EINVAL;
3842
9a2bb7f4
AK
3843 vma->vm_ops = &kvm_vcpu_vm_ops;
3844 return 0;
3845}
3846
bccf2150
AK
3847static int kvm_vcpu_release(struct inode *inode, struct file *filp)
3848{
3849 struct kvm_vcpu *vcpu = filp->private_data;
3850
66c0b394 3851 kvm_put_kvm(vcpu->kvm);
bccf2150
AK
3852 return 0;
3853}
3854
70375c2d 3855static const struct file_operations kvm_vcpu_fops = {
bccf2150
AK
3856 .release = kvm_vcpu_release,
3857 .unlocked_ioctl = kvm_vcpu_ioctl,
9a2bb7f4 3858 .mmap = kvm_vcpu_mmap,
6038f373 3859 .llseek = noop_llseek,
7ddfd3e0 3860 KVM_COMPAT(kvm_vcpu_compat_ioctl),
bccf2150
AK
3861};
3862
3863/*
3864 * Allocates an inode for the vcpu.
3865 */
3866static int create_vcpu_fd(struct kvm_vcpu *vcpu)
3867{
e46b4692
MY
3868 char name[8 + 1 + ITOA_MAX_LEN + 1];
3869
3870 snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
3871 return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
bccf2150
AK
3872}
3873
e36de87d
VP
3874#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
3875static int vcpu_get_pid(void *data, u64 *val)
3876{
3877 struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
3878 *val = pid_nr(rcu_access_pointer(vcpu->pid));
3879 return 0;
3880}
3881
3882DEFINE_SIMPLE_ATTRIBUTE(vcpu_get_pid_fops, vcpu_get_pid, NULL, "%llu\n");
3883
3e7093d0 3884static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
45b5939e 3885{
d56f5136 3886 struct dentry *debugfs_dentry;
45b5939e 3887 char dir_name[ITOA_MAX_LEN * 2];
45b5939e 3888
45b5939e 3889 if (!debugfs_initialized())
3e7093d0 3890 return;
45b5939e
LC
3891
3892 snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
d56f5136
PB
3893 debugfs_dentry = debugfs_create_dir(dir_name,
3894 vcpu->kvm->debugfs_dentry);
e36de87d
VP
3895 debugfs_create_file("pid", 0444, debugfs_dentry, vcpu,
3896 &vcpu_get_pid_fops);
45b5939e 3897
d56f5136 3898 kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
45b5939e 3899}
e36de87d 3900#endif
45b5939e 3901
c5ea7660
AK
3902/*
3903 * Creates some virtual cpus. Good luck creating more than one.
3904 */
73880c80 3905static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
c5ea7660
AK
3906{
3907 int r;
e09fefde 3908 struct kvm_vcpu *vcpu;
8bd826d6 3909 struct page *page;
c5ea7660 3910
a1c42dde 3911 if (id >= KVM_MAX_VCPU_IDS)
338c7dba
AH
3912 return -EINVAL;
3913
6c7caebc 3914 mutex_lock(&kvm->lock);
f502cc56 3915 if (kvm->created_vcpus >= kvm->max_vcpus) {
6c7caebc
PB
3916 mutex_unlock(&kvm->lock);
3917 return -EINVAL;
3918 }
3919
1d5e740d
ZG
3920 r = kvm_arch_vcpu_precreate(kvm, id);
3921 if (r) {
3922 mutex_unlock(&kvm->lock);
3923 return r;
3924 }
3925
6c7caebc
PB
3926 kvm->created_vcpus++;
3927 mutex_unlock(&kvm->lock);
3928
85f47930 3929 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
e529ef66
SC
3930 if (!vcpu) {
3931 r = -ENOMEM;
6c7caebc
PB
3932 goto vcpu_decrement;
3933 }
c5ea7660 3934
fcd97ad5 3935 BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
93bb59ca 3936 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
8bd826d6
SC
3937 if (!page) {
3938 r = -ENOMEM;
e529ef66 3939 goto vcpu_free;
8bd826d6
SC
3940 }
3941 vcpu->run = page_address(page);
3942
3943 kvm_vcpu_init(vcpu, kvm, id);
e529ef66
SC
3944
3945 r = kvm_arch_vcpu_create(vcpu);
3946 if (r)
8bd826d6 3947 goto vcpu_free_run_page;
e529ef66 3948
fb04a1ed
PX
3949 if (kvm->dirty_ring_size) {
3950 r = kvm_dirty_ring_alloc(&vcpu->dirty_ring,
3951 id, kvm->dirty_ring_size);
3952 if (r)
3953 goto arch_vcpu_destroy;
3954 }
3955
11ec2804 3956 mutex_lock(&kvm->lock);
e09fefde
DH
3957 if (kvm_get_vcpu_by_id(kvm, id)) {
3958 r = -EEXIST;
3959 goto unlock_vcpu_destroy;
3960 }
73880c80 3961
8750e72a 3962 vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
c5b07754
MZ
3963 r = xa_insert(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, GFP_KERNEL_ACCOUNT);
3964 BUG_ON(r == -EBUSY);
3965 if (r)
3966 goto unlock_vcpu_destroy;
c5ea7660 3967
fb3f0f51 3968 /* Now it's all set up, let userspace reach it */
66c0b394 3969 kvm_get_kvm(kvm);
bccf2150 3970 r = create_vcpu_fd(vcpu);
73880c80 3971 if (r < 0) {
c5b07754 3972 xa_erase(&kvm->vcpu_array, vcpu->vcpu_idx);
149487bd 3973 kvm_put_kvm_no_destroy(kvm);
d780592b 3974 goto unlock_vcpu_destroy;
73880c80
GN
3975 }
3976
dd489240 3977 /*
c5b07754
MZ
 3978 * Pairs with smp_rmb() in kvm_get_vcpu. Store the vcpu
 3979 * pointer before the incremented value of kvm->online_vcpus.
dd489240 3980 */
73880c80
GN
3981 smp_wmb();
3982 atomic_inc(&kvm->online_vcpus);
3983
73880c80 3984 mutex_unlock(&kvm->lock);
42897d86 3985 kvm_arch_vcpu_postcreate(vcpu);
63d04348 3986 kvm_create_vcpu_debugfs(vcpu);
fb3f0f51 3987 return r;
39c3b86e 3988
d780592b 3989unlock_vcpu_destroy:
7d8fece6 3990 mutex_unlock(&kvm->lock);
fb04a1ed
PX
3991 kvm_dirty_ring_free(&vcpu->dirty_ring);
3992arch_vcpu_destroy:
d40ccc62 3993 kvm_arch_vcpu_destroy(vcpu);
8bd826d6
SC
3994vcpu_free_run_page:
3995 free_page((unsigned long)vcpu->run);
e529ef66
SC
3996vcpu_free:
3997 kmem_cache_free(kvm_vcpu_cache, vcpu);
6c7caebc
PB
3998vcpu_decrement:
3999 mutex_lock(&kvm->lock);
4000 kvm->created_vcpus--;
4001 mutex_unlock(&kvm->lock);
c5ea7660
AK
4002 return r;
4003}
4004
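/*
 * Illustrative userspace sketch (not part of this file): creating a vCPU and
 * mapping its shared kvm_run area, which is what the fd returned by
 * KVM_CREATE_VCPU and kvm_vcpu_mmap() above serve. "vm_fd" and "kvm_fd" are
 * assumed to be an open VM fd and /dev/kvm fd; error handling omitted.
 *
 *	int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
 *	long sz = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
 *	struct kvm_run *run = mmap(NULL, sz, PROT_READ | PROT_WRITE,
 *				   MAP_SHARED, vcpu_fd, 0);
 */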
1961d276
AK
4005static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
4006{
4007 if (sigset) {
4008 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
4009 vcpu->sigset_active = 1;
4010 vcpu->sigset = *sigset;
4011 } else
4012 vcpu->sigset_active = 0;
4013 return 0;
4014}
4015
ce55c049
JZ
4016static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
4017 size_t size, loff_t *offset)
4018{
4019 struct kvm_vcpu *vcpu = file->private_data;
4020
4021 return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
4022 &kvm_vcpu_stats_desc[0], &vcpu->stat,
4023 sizeof(vcpu->stat), user_buffer, size, offset);
4024}
4025
4026static const struct file_operations kvm_vcpu_stats_fops = {
4027 .read = kvm_vcpu_stats_read,
4028 .llseek = noop_llseek,
4029};
4030
4031static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
4032{
4033 int fd;
4034 struct file *file;
4035 char name[15 + ITOA_MAX_LEN + 1];
4036
4037 snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);
4038
4039 fd = get_unused_fd_flags(O_CLOEXEC);
4040 if (fd < 0)
4041 return fd;
4042
4043 file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY);
4044 if (IS_ERR(file)) {
4045 put_unused_fd(fd);
4046 return PTR_ERR(file);
4047 }
4048 file->f_mode |= FMODE_PREAD;
4049 fd_install(fd, file);
4050
4051 return fd;
4052}
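/*
 * Illustrative userspace sketch (not part of this file): the returned fd is
 * read-only and pread()-capable; the binary layout begins with
 * struct kvm_stats_header as served by kvm_stats_read() above.
 *
 *	int stats_fd = ioctl(vcpu_fd, KVM_GET_STATS_FD, NULL);
 *	struct kvm_stats_header hdr;
 *	pread(stats_fd, &hdr, sizeof(hdr), 0);
 */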
4053
bccf2150
AK
4054static long kvm_vcpu_ioctl(struct file *filp,
4055 unsigned int ioctl, unsigned long arg)
6aa8b732 4056{
bccf2150 4057 struct kvm_vcpu *vcpu = filp->private_data;
2f366987 4058 void __user *argp = (void __user *)arg;
313a3dc7 4059 int r;
fa3795a7
DH
4060 struct kvm_fpu *fpu = NULL;
4061 struct kvm_sregs *kvm_sregs = NULL;
6aa8b732 4062
f4d31653 4063 if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
6d4e4c4f 4064 return -EIO;
2122ff5e 4065
2ea75be3
DM
4066 if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
4067 return -EINVAL;
4068
2122ff5e 4069 /*
5cb0944c
PB
4070 * Some architectures have vcpu ioctls that are asynchronous to vcpu
4071 * execution; mutex_lock() would break them.
2122ff5e 4072 */
5cb0944c
PB
4073 r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
4074 if (r != -ENOIOCTLCMD)
9fc77441 4075 return r;
2122ff5e 4076
ec7660cc
CD
4077 if (mutex_lock_killable(&vcpu->mutex))
4078 return -EINTR;
6aa8b732 4079 switch (ioctl) {
0e4524a5
CB
4080 case KVM_RUN: {
4081 struct pid *oldpid;
f0fe5108
AK
4082 r = -EINVAL;
4083 if (arg)
4084 goto out;
0e4524a5 4085 oldpid = rcu_access_pointer(vcpu->pid);
71dbc8a9 4086 if (unlikely(oldpid != task_pid(current))) {
7a72f7a1 4087 /* The thread running this VCPU changed. */
bd2a6394 4088 struct pid *newpid;
f95ef0cd 4089
bd2a6394
CD
4090 r = kvm_arch_vcpu_run_pid_change(vcpu);
4091 if (r)
4092 break;
4093
4094 newpid = get_task_pid(current, PIDTYPE_PID);
7a72f7a1
CB
4095 rcu_assign_pointer(vcpu->pid, newpid);
4096 if (oldpid)
4097 synchronize_rcu();
4098 put_pid(oldpid);
4099 }
1b94f6f8 4100 r = kvm_arch_vcpu_ioctl_run(vcpu);
64be5007 4101 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
6aa8b732 4102 break;
0e4524a5 4103 }
6aa8b732 4104 case KVM_GET_REGS: {
3e4bb3ac 4105 struct kvm_regs *kvm_regs;
6aa8b732 4106
3e4bb3ac 4107 r = -ENOMEM;
b12ce36a 4108 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
3e4bb3ac 4109 if (!kvm_regs)
6aa8b732 4110 goto out;
3e4bb3ac
XZ
4111 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
4112 if (r)
4113 goto out_free1;
6aa8b732 4114 r = -EFAULT;
3e4bb3ac
XZ
4115 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
4116 goto out_free1;
6aa8b732 4117 r = 0;
3e4bb3ac
XZ
4118out_free1:
4119 kfree(kvm_regs);
6aa8b732
AK
4120 break;
4121 }
4122 case KVM_SET_REGS: {
3e4bb3ac 4123 struct kvm_regs *kvm_regs;
6aa8b732 4124
ff5c2c03
SL
4125 kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
4126 if (IS_ERR(kvm_regs)) {
4127 r = PTR_ERR(kvm_regs);
6aa8b732 4128 goto out;
ff5c2c03 4129 }
3e4bb3ac 4130 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
3e4bb3ac 4131 kfree(kvm_regs);
6aa8b732
AK
4132 break;
4133 }
4134 case KVM_GET_SREGS: {
b12ce36a
BG
4135 kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
4136 GFP_KERNEL_ACCOUNT);
fa3795a7
DH
4137 r = -ENOMEM;
4138 if (!kvm_sregs)
4139 goto out;
4140 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
6aa8b732
AK
4141 if (r)
4142 goto out;
4143 r = -EFAULT;
fa3795a7 4144 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
6aa8b732
AK
4145 goto out;
4146 r = 0;
4147 break;
4148 }
4149 case KVM_SET_SREGS: {
ff5c2c03
SL
4150 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
4151 if (IS_ERR(kvm_sregs)) {
4152 r = PTR_ERR(kvm_sregs);
18595411 4153 kvm_sregs = NULL;
6aa8b732 4154 goto out;
ff5c2c03 4155 }
fa3795a7 4156 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
6aa8b732
AK
4157 break;
4158 }
62d9f0db
MT
4159 case KVM_GET_MP_STATE: {
4160 struct kvm_mp_state mp_state;
4161
4162 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
4163 if (r)
4164 goto out;
4165 r = -EFAULT;
893bdbf1 4166 if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
62d9f0db
MT
4167 goto out;
4168 r = 0;
4169 break;
4170 }
4171 case KVM_SET_MP_STATE: {
4172 struct kvm_mp_state mp_state;
4173
4174 r = -EFAULT;
893bdbf1 4175 if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
62d9f0db
MT
4176 goto out;
4177 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
62d9f0db
MT
4178 break;
4179 }
6aa8b732
AK
4180 case KVM_TRANSLATE: {
4181 struct kvm_translation tr;
4182
4183 r = -EFAULT;
893bdbf1 4184 if (copy_from_user(&tr, argp, sizeof(tr)))
6aa8b732 4185 goto out;
8b006791 4186 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
6aa8b732
AK
4187 if (r)
4188 goto out;
4189 r = -EFAULT;
893bdbf1 4190 if (copy_to_user(argp, &tr, sizeof(tr)))
6aa8b732
AK
4191 goto out;
4192 r = 0;
4193 break;
4194 }
d0bfb940
JK
4195 case KVM_SET_GUEST_DEBUG: {
4196 struct kvm_guest_debug dbg;
6aa8b732
AK
4197
4198 r = -EFAULT;
893bdbf1 4199 if (copy_from_user(&dbg, argp, sizeof(dbg)))
6aa8b732 4200 goto out;
d0bfb940 4201 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
6aa8b732
AK
4202 break;
4203 }
1961d276
AK
4204 case KVM_SET_SIGNAL_MASK: {
4205 struct kvm_signal_mask __user *sigmask_arg = argp;
4206 struct kvm_signal_mask kvm_sigmask;
4207 sigset_t sigset, *p;
4208
4209 p = NULL;
4210 if (argp) {
4211 r = -EFAULT;
4212 if (copy_from_user(&kvm_sigmask, argp,
893bdbf1 4213 sizeof(kvm_sigmask)))
1961d276
AK
4214 goto out;
4215 r = -EINVAL;
893bdbf1 4216 if (kvm_sigmask.len != sizeof(sigset))
1961d276
AK
4217 goto out;
4218 r = -EFAULT;
4219 if (copy_from_user(&sigset, sigmask_arg->sigset,
893bdbf1 4220 sizeof(sigset)))
1961d276
AK
4221 goto out;
4222 p = &sigset;
4223 }
376d41ff 4224 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
1961d276
AK
4225 break;
4226 }
b8836737 4227 case KVM_GET_FPU: {
b12ce36a 4228 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
fa3795a7
DH
4229 r = -ENOMEM;
4230 if (!fpu)
4231 goto out;
4232 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
b8836737
AK
4233 if (r)
4234 goto out;
4235 r = -EFAULT;
fa3795a7 4236 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
b8836737
AK
4237 goto out;
4238 r = 0;
4239 break;
4240 }
4241 case KVM_SET_FPU: {
ff5c2c03
SL
4242 fpu = memdup_user(argp, sizeof(*fpu));
4243 if (IS_ERR(fpu)) {
4244 r = PTR_ERR(fpu);
18595411 4245 fpu = NULL;
b8836737 4246 goto out;
ff5c2c03 4247 }
fa3795a7 4248 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
b8836737
AK
4249 break;
4250 }
ce55c049
JZ
4251 case KVM_GET_STATS_FD: {
4252 r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
4253 break;
4254 }
bccf2150 4255 default:
313a3dc7 4256 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
bccf2150
AK
4257 }
4258out:
ec7660cc 4259 mutex_unlock(&vcpu->mutex);
fa3795a7
DH
4260 kfree(fpu);
4261 kfree(kvm_sregs);
bccf2150
AK
4262 return r;
4263}
4264
de8e5d74 4265#ifdef CONFIG_KVM_COMPAT
1dda606c
AG
4266static long kvm_vcpu_compat_ioctl(struct file *filp,
4267 unsigned int ioctl, unsigned long arg)
4268{
4269 struct kvm_vcpu *vcpu = filp->private_data;
4270 void __user *argp = compat_ptr(arg);
4271 int r;
4272
f4d31653 4273 if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
1dda606c
AG
4274 return -EIO;
4275
4276 switch (ioctl) {
4277 case KVM_SET_SIGNAL_MASK: {
4278 struct kvm_signal_mask __user *sigmask_arg = argp;
4279 struct kvm_signal_mask kvm_sigmask;
1dda606c
AG
4280 sigset_t sigset;
4281
4282 if (argp) {
4283 r = -EFAULT;
4284 if (copy_from_user(&kvm_sigmask, argp,
893bdbf1 4285 sizeof(kvm_sigmask)))
1dda606c
AG
4286 goto out;
4287 r = -EINVAL;
3968cf62 4288 if (kvm_sigmask.len != sizeof(compat_sigset_t))
1dda606c
AG
4289 goto out;
4290 r = -EFAULT;
1393b4aa
PB
4291 if (get_compat_sigset(&sigset,
4292 (compat_sigset_t __user *)sigmask_arg->sigset))
1dda606c 4293 goto out;
760a9a30
AC
4294 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
4295 } else
4296 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
1dda606c
AG
4297 break;
4298 }
4299 default:
4300 r = kvm_vcpu_ioctl(filp, ioctl, arg);
4301 }
4302
4303out:
4304 return r;
4305}
4306#endif
4307
a1cd3f08
CLG
4308static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
4309{
4310 struct kvm_device *dev = filp->private_data;
4311
4312 if (dev->ops->mmap)
4313 return dev->ops->mmap(dev, vma);
4314
4315 return -ENODEV;
4316}
4317
852b6d57
SW
4318static int kvm_device_ioctl_attr(struct kvm_device *dev,
4319 int (*accessor)(struct kvm_device *dev,
4320 struct kvm_device_attr *attr),
4321 unsigned long arg)
4322{
4323 struct kvm_device_attr attr;
4324
4325 if (!accessor)
4326 return -EPERM;
4327
4328 if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
4329 return -EFAULT;
4330
4331 return accessor(dev, &attr);
4332}
4333
4334static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
4335 unsigned long arg)
4336{
4337 struct kvm_device *dev = filp->private_data;
4338
f4d31653 4339 if (dev->kvm->mm != current->mm || dev->kvm->vm_dead)
ddba9180
SC
4340 return -EIO;
4341
852b6d57
SW
4342 switch (ioctl) {
4343 case KVM_SET_DEVICE_ATTR:
4344 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
4345 case KVM_GET_DEVICE_ATTR:
4346 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
4347 case KVM_HAS_DEVICE_ATTR:
4348 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
4349 default:
4350 if (dev->ops->ioctl)
4351 return dev->ops->ioctl(dev, ioctl, arg);
4352
4353 return -ENOTTY;
4354 }
4355}
4356
852b6d57
SW
4357static int kvm_device_release(struct inode *inode, struct file *filp)
4358{
4359 struct kvm_device *dev = filp->private_data;
4360 struct kvm *kvm = dev->kvm;
4361
2bde9b3e
CLG
4362 if (dev->ops->release) {
4363 mutex_lock(&kvm->lock);
4364 list_del(&dev->vm_node);
4365 dev->ops->release(dev);
4366 mutex_unlock(&kvm->lock);
4367 }
4368
852b6d57
SW
4369 kvm_put_kvm(kvm);
4370 return 0;
4371}
4372
4373static const struct file_operations kvm_device_fops = {
4374 .unlocked_ioctl = kvm_device_ioctl,
4375 .release = kvm_device_release,
7ddfd3e0 4376 KVM_COMPAT(kvm_device_ioctl),
a1cd3f08 4377 .mmap = kvm_device_mmap,
852b6d57
SW
4378};
4379
4380struct kvm_device *kvm_device_from_filp(struct file *filp)
4381{
4382 if (filp->f_op != &kvm_device_fops)
4383 return NULL;
4384
4385 return filp->private_data;
4386}
4387
8538cb22 4388static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
5df554ad 4389#ifdef CONFIG_KVM_MPIC
d60eacb0
WD
4390 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
4391 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
5975a2e0 4392#endif
d60eacb0
WD
4393};
4394
8538cb22 4395int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
d60eacb0
WD
4396{
4397 if (type >= ARRAY_SIZE(kvm_device_ops_table))
4398 return -ENOSPC;
4399
4400 if (kvm_device_ops_table[type] != NULL)
4401 return -EEXIST;
4402
4403 kvm_device_ops_table[type] = ops;
4404 return 0;
4405}
4406
571ee1b6
WL
4407void kvm_unregister_device_ops(u32 type)
4408{
4409 if (kvm_device_ops_table[type] != NULL)
4410 kvm_device_ops_table[type] = NULL;
4411}
4412
852b6d57
SW
4413static int kvm_ioctl_create_device(struct kvm *kvm,
4414 struct kvm_create_device *cd)
4415{
eceb6e1d 4416 const struct kvm_device_ops *ops;
852b6d57
SW
4417 struct kvm_device *dev;
4418 bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
1d487e9b 4419 int type;
852b6d57
SW
4420 int ret;
4421
d60eacb0
WD
4422 if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
4423 return -ENODEV;
4424
1d487e9b
PB
4425 type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
4426 ops = kvm_device_ops_table[type];
d60eacb0 4427 if (ops == NULL)
852b6d57 4428 return -ENODEV;
852b6d57
SW
4429
4430 if (test)
4431 return 0;
4432
b12ce36a 4433 dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
852b6d57
SW
4434 if (!dev)
4435 return -ENOMEM;
4436
4437 dev->ops = ops;
4438 dev->kvm = kvm;
852b6d57 4439
a28ebea2 4440 mutex_lock(&kvm->lock);
1d487e9b 4441 ret = ops->create(dev, type);
852b6d57 4442 if (ret < 0) {
a28ebea2 4443 mutex_unlock(&kvm->lock);
852b6d57
SW
4444 kfree(dev);
4445 return ret;
4446 }
a28ebea2
CD
4447 list_add(&dev->vm_node, &kvm->devices);
4448 mutex_unlock(&kvm->lock);
852b6d57 4449
023e9fdd
CD
4450 if (ops->init)
4451 ops->init(dev);
4452
cfa39381 4453 kvm_get_kvm(kvm);
24009b05 4454 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
852b6d57 4455 if (ret < 0) {
149487bd 4456 kvm_put_kvm_no_destroy(kvm);
a28ebea2
CD
4457 mutex_lock(&kvm->lock);
4458 list_del(&dev->vm_node);
e8bc2427
AK
4459 if (ops->release)
4460 ops->release(dev);
a28ebea2 4461 mutex_unlock(&kvm->lock);
e8bc2427
AK
4462 if (ops->destroy)
4463 ops->destroy(dev);
852b6d57
SW
4464 return ret;
4465 }
4466
852b6d57
SW
4467 cd->fd = ret;
4468 return 0;
4469}
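/*
 * Illustrative userspace sketch (not part of this file): probing a device
 * type with KVM_CREATE_DEVICE_TEST before actually creating it, mirroring
 * the "test" handling above. The device type shown is just an example and
 * its availability depends on the architecture and kernel config. On
 * success, the second call fills cd.fd with the new device fd.
 *
 *	struct kvm_create_device cd = {
 *		.type = KVM_DEV_TYPE_FSL_MPIC_20,
 *		.flags = KVM_CREATE_DEVICE_TEST,
 *	};
 *	if (!ioctl(vm_fd, KVM_CREATE_DEVICE, &cd)) {
 *		cd.flags = 0;
 *		ioctl(vm_fd, KVM_CREATE_DEVICE, &cd);
 *	}
 */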
4470
92b591a4
AG
4471static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
4472{
4473 switch (arg) {
4474 case KVM_CAP_USER_MEMORY:
4475 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
4476 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
92b591a4
AG
4477 case KVM_CAP_INTERNAL_ERROR_DATA:
4478#ifdef CONFIG_HAVE_KVM_MSI
4479 case KVM_CAP_SIGNAL_MSI:
4480#endif
297e2105 4481#ifdef CONFIG_HAVE_KVM_IRQFD
dc9be0fa 4482 case KVM_CAP_IRQFD:
92b591a4
AG
4483 case KVM_CAP_IRQFD_RESAMPLE:
4484#endif
e9ea5069 4485 case KVM_CAP_IOEVENTFD_ANY_LENGTH:
92b591a4 4486 case KVM_CAP_CHECK_EXTENSION_VM:
e5d83c74 4487 case KVM_CAP_ENABLE_CAP_VM:
acd05785 4488 case KVM_CAP_HALT_POLL:
92b591a4 4489 return 1;
4b4357e0 4490#ifdef CONFIG_KVM_MMIO
30422558
PB
4491 case KVM_CAP_COALESCED_MMIO:
4492 return KVM_COALESCED_MMIO_PAGE_OFFSET;
0804c849
PH
4493 case KVM_CAP_COALESCED_PIO:
4494 return 1;
30422558 4495#endif
3c9bd400
JZ
4496#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4497 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
4498 return KVM_DIRTY_LOG_MANUAL_CAPS;
4499#endif
92b591a4
AG
4500#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4501 case KVM_CAP_IRQ_ROUTING:
4502 return KVM_MAX_IRQ_ROUTES;
f481b069
PB
4503#endif
4504#if KVM_ADDRESS_SPACE_NUM > 1
4505 case KVM_CAP_MULTI_ADDRESS_SPACE:
4506 return KVM_ADDRESS_SPACE_NUM;
92b591a4 4507#endif
c110ae57
PB
4508 case KVM_CAP_NR_MEMSLOTS:
4509 return KVM_USER_MEM_SLOTS;
fb04a1ed 4510 case KVM_CAP_DIRTY_LOG_RING:
17601bfe
MZ
4511#ifdef CONFIG_HAVE_KVM_DIRTY_RING_TSO
4512 return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4513#else
4514 return 0;
4515#endif
4516 case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
4517#ifdef CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL
fb04a1ed
PX
4518 return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4519#else
4520 return 0;
86bdf3eb
GS
4521#endif
4522#ifdef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP
4523 case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP:
fb04a1ed 4524#endif
ce55c049 4525 case KVM_CAP_BINARY_STATS_FD:
d495f942 4526 case KVM_CAP_SYSTEM_EVENT_DATA:
ce55c049 4527 return 1;
92b591a4
AG
4528 default:
4529 break;
4530 }
4531 return kvm_vm_ioctl_check_extension(kvm, arg);
4532}
4533
fb04a1ed
PX
4534static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
4535{
4536 int r;
4537
4538 if (!KVM_DIRTY_LOG_PAGE_OFFSET)
4539 return -EINVAL;
4540
 4541 /* the size should be a power of 2 */
4542 if (!size || (size & (size - 1)))
4543 return -EINVAL;
4544
 4545 /* Must be at least a page and large enough to hold the reserved entries */
4546 if (size < kvm_dirty_ring_get_rsvd_entries() *
4547 sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
4548 return -EINVAL;
4549
4550 if (size > KVM_DIRTY_RING_MAX_ENTRIES *
4551 sizeof(struct kvm_dirty_gfn))
4552 return -E2BIG;
4553
 4554 /* We only allow it to be set once */
4555 if (kvm->dirty_ring_size)
4556 return -EINVAL;
4557
4558 mutex_lock(&kvm->lock);
4559
4560 if (kvm->created_vcpus) {
 4561 /* We don't allow changing this value after vCPUs are created */
4562 r = -EINVAL;
4563 } else {
4564 kvm->dirty_ring_size = size;
4565 r = 0;
4566 }
4567
4568 mutex_unlock(&kvm->lock);
4569 return r;
4570}
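/*
 * Illustrative userspace sketch (not part of this file): enabling the dirty
 * ring on the VM fd before any vCPU is created. The size is in bytes, must
 * be a power of two, at least a page, and large enough for the reserved
 * entries; 65536 is just an example value.
 *
 *	struct kvm_enable_cap cap = {
 *		.cap = KVM_CAP_DIRTY_LOG_RING,
 *		.args = { 65536 },
 *	};
 *	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
 */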
4571
4572static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
4573{
46808a4c 4574 unsigned long i;
fb04a1ed
PX
4575 struct kvm_vcpu *vcpu;
4576 int cleared = 0;
4577
4578 if (!kvm->dirty_ring_size)
4579 return -EINVAL;
4580
4581 mutex_lock(&kvm->slots_lock);
4582
4583 kvm_for_each_vcpu(i, vcpu, kvm)
4584 cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
4585
4586 mutex_unlock(&kvm->slots_lock);
4587
4588 if (cleared)
4589 kvm_flush_remote_tlbs(kvm);
4590
4591 return cleared;
4592}
4593
e5d83c74
PB
4594int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
4595 struct kvm_enable_cap *cap)
4596{
4597 return -EINVAL;
4598}
4599
86bdf3eb
GS
4600static bool kvm_are_all_memslots_empty(struct kvm *kvm)
4601{
4602 int i;
4603
4604 lockdep_assert_held(&kvm->slots_lock);
4605
4606 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
4607 if (!kvm_memslots_empty(__kvm_memslots(kvm, i)))
4608 return false;
4609 }
4610
4611 return true;
4612}
4613
e5d83c74
PB
4614static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
4615 struct kvm_enable_cap *cap)
4616{
4617 switch (cap->cap) {
2a31b9db 4618#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3c9bd400
JZ
4619 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
4620 u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
4621
4622 if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
4623 allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
4624
4625 if (cap->flags || (cap->args[0] & ~allowed_options))
2a31b9db
PB
4626 return -EINVAL;
4627 kvm->manual_dirty_log_protect = cap->args[0];
4628 return 0;
3c9bd400 4629 }
2a31b9db 4630#endif
acd05785
DM
4631 case KVM_CAP_HALT_POLL: {
4632 if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
4633 return -EINVAL;
4634
4635 kvm->max_halt_poll_ns = cap->args[0];
9eb8ca04
DM
4636
4637 /*
4638 * Ensure kvm->override_halt_poll_ns does not become visible
4639 * before kvm->max_halt_poll_ns.
4640 *
4641 * Pairs with the smp_rmb() in kvm_vcpu_max_halt_poll_ns().
4642 */
4643 smp_wmb();
4644 kvm->override_halt_poll_ns = true;
4645
acd05785
DM
4646 return 0;
4647 }
fb04a1ed 4648 case KVM_CAP_DIRTY_LOG_RING:
17601bfe 4649 case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
7a2726ec
GS
4650 if (!kvm_vm_ioctl_check_extension_generic(kvm, cap->cap))
4651 return -EINVAL;
4652
fb04a1ed 4653 return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
86bdf3eb
GS
4654 case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP: {
4655 int r = -EINVAL;
4656
4657 if (!IS_ENABLED(CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP) ||
4658 !kvm->dirty_ring_size || cap->flags)
4659 return r;
4660
4661 mutex_lock(&kvm->slots_lock);
4662
4663 /*
4664 * For simplicity, allow enabling ring+bitmap if and only if
4665 * there are no memslots, e.g. to ensure all memslots allocate
4666 * a bitmap after the capability is enabled.
4667 */
4668 if (kvm_are_all_memslots_empty(kvm)) {
4669 kvm->dirty_ring_with_bitmap = true;
4670 r = 0;
4671 }
4672
4673 mutex_unlock(&kvm->slots_lock);
4674
4675 return r;
4676 }
e5d83c74
PB
4677 default:
4678 return kvm_vm_ioctl_enable_cap(kvm, cap);
4679 }
4680}
4681
fcfe1bae
JZ
4682static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
4683 size_t size, loff_t *offset)
4684{
4685 struct kvm *kvm = file->private_data;
4686
4687 return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header,
4688 &kvm_vm_stats_desc[0], &kvm->stat,
4689 sizeof(kvm->stat), user_buffer, size, offset);
4690}
4691
4692static const struct file_operations kvm_vm_stats_fops = {
4693 .read = kvm_vm_stats_read,
4694 .llseek = noop_llseek,
4695};
4696
4697static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
4698{
4699 int fd;
4700 struct file *file;
4701
4702 fd = get_unused_fd_flags(O_CLOEXEC);
4703 if (fd < 0)
4704 return fd;
4705
4706 file = anon_inode_getfile("kvm-vm-stats",
4707 &kvm_vm_stats_fops, kvm, O_RDONLY);
4708 if (IS_ERR(file)) {
4709 put_unused_fd(fd);
4710 return PTR_ERR(file);
4711 }
4712 file->f_mode |= FMODE_PREAD;
4713 fd_install(fd, file);
4714
4715 return fd;
4716}
4717
bccf2150
AK
4718static long kvm_vm_ioctl(struct file *filp,
4719 unsigned int ioctl, unsigned long arg)
4720{
4721 struct kvm *kvm = filp->private_data;
4722 void __user *argp = (void __user *)arg;
1fe779f8 4723 int r;
bccf2150 4724
f4d31653 4725 if (kvm->mm != current->mm || kvm->vm_dead)
6d4e4c4f 4726 return -EIO;
bccf2150
AK
4727 switch (ioctl) {
4728 case KVM_CREATE_VCPU:
4729 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
bccf2150 4730 break;
e5d83c74
PB
4731 case KVM_ENABLE_CAP: {
4732 struct kvm_enable_cap cap;
4733
4734 r = -EFAULT;
4735 if (copy_from_user(&cap, argp, sizeof(cap)))
4736 goto out;
4737 r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
4738 break;
4739 }
6fc138d2
IE
4740 case KVM_SET_USER_MEMORY_REGION: {
4741 struct kvm_userspace_memory_region kvm_userspace_mem;
4742
4743 r = -EFAULT;
4744 if (copy_from_user(&kvm_userspace_mem, argp,
893bdbf1 4745 sizeof(kvm_userspace_mem)))
6fc138d2
IE
4746 goto out;
4747
47ae31e2 4748 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
6aa8b732
AK
4749 break;
4750 }
4751 case KVM_GET_DIRTY_LOG: {
4752 struct kvm_dirty_log log;
4753
4754 r = -EFAULT;
893bdbf1 4755 if (copy_from_user(&log, argp, sizeof(log)))
6aa8b732 4756 goto out;
2c6f5df9 4757 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
6aa8b732
AK
4758 break;
4759 }
2a31b9db
PB
4760#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4761 case KVM_CLEAR_DIRTY_LOG: {
4762 struct kvm_clear_dirty_log log;
4763
4764 r = -EFAULT;
4765 if (copy_from_user(&log, argp, sizeof(log)))
4766 goto out;
4767 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4768 break;
4769 }
4770#endif
4b4357e0 4771#ifdef CONFIG_KVM_MMIO
5f94c174
LV
4772 case KVM_REGISTER_COALESCED_MMIO: {
4773 struct kvm_coalesced_mmio_zone zone;
f95ef0cd 4774
5f94c174 4775 r = -EFAULT;
893bdbf1 4776 if (copy_from_user(&zone, argp, sizeof(zone)))
5f94c174 4777 goto out;
5f94c174 4778 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
5f94c174
LV
4779 break;
4780 }
4781 case KVM_UNREGISTER_COALESCED_MMIO: {
4782 struct kvm_coalesced_mmio_zone zone;
f95ef0cd 4783
5f94c174 4784 r = -EFAULT;
893bdbf1 4785 if (copy_from_user(&zone, argp, sizeof(zone)))
5f94c174 4786 goto out;
5f94c174 4787 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
5f94c174
LV
4788 break;
4789 }
4790#endif
721eecbf
GH
4791 case KVM_IRQFD: {
4792 struct kvm_irqfd data;
4793
4794 r = -EFAULT;
893bdbf1 4795 if (copy_from_user(&data, argp, sizeof(data)))
721eecbf 4796 goto out;
d4db2935 4797 r = kvm_irqfd(kvm, &data);
721eecbf
GH
4798 break;
4799 }
d34e6b17
GH
4800 case KVM_IOEVENTFD: {
4801 struct kvm_ioeventfd data;
4802
4803 r = -EFAULT;
893bdbf1 4804 if (copy_from_user(&data, argp, sizeof(data)))
d34e6b17
GH
4805 goto out;
4806 r = kvm_ioeventfd(kvm, &data);
4807 break;
4808 }
07975ad3
JK
4809#ifdef CONFIG_HAVE_KVM_MSI
4810 case KVM_SIGNAL_MSI: {
4811 struct kvm_msi msi;
4812
4813 r = -EFAULT;
893bdbf1 4814 if (copy_from_user(&msi, argp, sizeof(msi)))
07975ad3
JK
4815 goto out;
4816 r = kvm_send_userspace_msi(kvm, &msi);
4817 break;
4818 }
23d43cf9
CD
4819#endif
4820#ifdef __KVM_HAVE_IRQ_LINE
4821 case KVM_IRQ_LINE_STATUS:
4822 case KVM_IRQ_LINE: {
4823 struct kvm_irq_level irq_event;
4824
4825 r = -EFAULT;
893bdbf1 4826 if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
23d43cf9
CD
4827 goto out;
4828
aa2fbe6d
YZ
4829 r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
4830 ioctl == KVM_IRQ_LINE_STATUS);
23d43cf9
CD
4831 if (r)
4832 goto out;
4833
4834 r = -EFAULT;
4835 if (ioctl == KVM_IRQ_LINE_STATUS) {
893bdbf1 4836 if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
23d43cf9
CD
4837 goto out;
4838 }
4839
4840 r = 0;
4841 break;
4842 }
73880c80 4843#endif
aa8d5944
AG
4844#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4845 case KVM_SET_GSI_ROUTING: {
4846 struct kvm_irq_routing routing;
4847 struct kvm_irq_routing __user *urouting;
f8c1b85b 4848 struct kvm_irq_routing_entry *entries = NULL;
aa8d5944
AG
4849
4850 r = -EFAULT;
4851 if (copy_from_user(&routing, argp, sizeof(routing)))
4852 goto out;
4853 r = -EINVAL;
5c0aea0e
DH
4854 if (!kvm_arch_can_set_irq_routing(kvm))
4855 goto out;
caf1ff26 4856 if (routing.nr > KVM_MAX_IRQ_ROUTES)
aa8d5944
AG
4857 goto out;
4858 if (routing.flags)
4859 goto out;
f8c1b85b 4860 if (routing.nr) {
f8c1b85b 4861 urouting = argp;
7ec28e26
DE
4862 entries = vmemdup_user(urouting->entries,
4863 array_size(sizeof(*entries),
4864 routing.nr));
4865 if (IS_ERR(entries)) {
4866 r = PTR_ERR(entries);
4867 goto out;
4868 }
f8c1b85b 4869 }
aa8d5944
AG
4870 r = kvm_set_irq_routing(kvm, entries, routing.nr,
4871 routing.flags);
7ec28e26 4872 kvfree(entries);
aa8d5944
AG
4873 break;
4874 }
4875#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
852b6d57
SW
4876 case KVM_CREATE_DEVICE: {
4877 struct kvm_create_device cd;
4878
4879 r = -EFAULT;
4880 if (copy_from_user(&cd, argp, sizeof(cd)))
4881 goto out;
4882
4883 r = kvm_ioctl_create_device(kvm, &cd);
4884 if (r)
4885 goto out;
4886
4887 r = -EFAULT;
4888 if (copy_to_user(argp, &cd, sizeof(cd)))
4889 goto out;
4890
4891 r = 0;
4892 break;
4893 }
92b591a4
AG
4894 case KVM_CHECK_EXTENSION:
4895 r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
4896 break;
fb04a1ed
PX
4897 case KVM_RESET_DIRTY_RINGS:
4898 r = kvm_vm_ioctl_reset_dirty_pages(kvm);
4899 break;
fcfe1bae
JZ
4900 case KVM_GET_STATS_FD:
4901 r = kvm_vm_ioctl_get_stats_fd(kvm);
4902 break;
f17abe9a 4903 default:
1fe779f8 4904 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
f17abe9a
AK
4905 }
4906out:
4907 return r;
4908}
4909
de8e5d74 4910#ifdef CONFIG_KVM_COMPAT
6ff5894c
AB
4911struct compat_kvm_dirty_log {
4912 __u32 slot;
4913 __u32 padding1;
4914 union {
4915 compat_uptr_t dirty_bitmap; /* one bit per page */
4916 __u64 padding2;
4917 };
4918};
4919
8750f9bb
PB
4920struct compat_kvm_clear_dirty_log {
4921 __u32 slot;
4922 __u32 num_pages;
4923 __u64 first_page;
4924 union {
4925 compat_uptr_t dirty_bitmap; /* one bit per page */
4926 __u64 padding2;
4927 };
4928};
4929
ed51862f
AG
4930long __weak kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
4931 unsigned long arg)
4932{
4933 return -ENOTTY;
4934}
4935
6ff5894c
AB
4936static long kvm_vm_compat_ioctl(struct file *filp,
4937 unsigned int ioctl, unsigned long arg)
4938{
4939 struct kvm *kvm = filp->private_data;
4940 int r;
4941
f4d31653 4942 if (kvm->mm != current->mm || kvm->vm_dead)
6ff5894c 4943 return -EIO;
ed51862f
AG
4944
4945 r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg);
4946 if (r != -ENOTTY)
4947 return r;
4948
6ff5894c 4949 switch (ioctl) {
8750f9bb
PB
4950#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4951 case KVM_CLEAR_DIRTY_LOG: {
4952 struct compat_kvm_clear_dirty_log compat_log;
4953 struct kvm_clear_dirty_log log;
4954
4955 if (copy_from_user(&compat_log, (void __user *)arg,
4956 sizeof(compat_log)))
4957 return -EFAULT;
4958 log.slot = compat_log.slot;
4959 log.num_pages = compat_log.num_pages;
4960 log.first_page = compat_log.first_page;
4961 log.padding2 = compat_log.padding2;
4962 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
4963
4964 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4965 break;
4966 }
4967#endif
6ff5894c
AB
4968 case KVM_GET_DIRTY_LOG: {
4969 struct compat_kvm_dirty_log compat_log;
4970 struct kvm_dirty_log log;
4971
6ff5894c
AB
4972 if (copy_from_user(&compat_log, (void __user *)arg,
4973 sizeof(compat_log)))
f6a3b168 4974 return -EFAULT;
6ff5894c
AB
4975 log.slot = compat_log.slot;
4976 log.padding1 = compat_log.padding1;
4977 log.padding2 = compat_log.padding2;
4978 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
4979
4980 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
6ff5894c
AB
4981 break;
4982 }
4983 default:
4984 r = kvm_vm_ioctl(filp, ioctl, arg);
4985 }
6ff5894c
AB
4986 return r;
4987}
4988#endif
4989
70375c2d 4990static const struct file_operations kvm_vm_fops = {
f17abe9a
AK
4991 .release = kvm_vm_release,
4992 .unlocked_ioctl = kvm_vm_ioctl,
6038f373 4993 .llseek = noop_llseek,
7ddfd3e0 4994 KVM_COMPAT(kvm_vm_compat_ioctl),
f17abe9a
AK
4995};
4996
54526d1f
NT
4997bool file_is_kvm(struct file *file)
4998{
4999 return file && file->f_op == &kvm_vm_fops;
5000}
5001EXPORT_SYMBOL_GPL(file_is_kvm);
5002
e08b9637 5003static int kvm_dev_ioctl_create_vm(unsigned long type)
f17abe9a 5004{
59f82aad 5005 char fdname[ITOA_MAX_LEN + 1];
20020f4c 5006 int r, fd;
f17abe9a 5007 struct kvm *kvm;
506cfba9 5008 struct file *file;
f17abe9a 5009
20020f4c
OU
5010 fd = get_unused_fd_flags(O_CLOEXEC);
5011 if (fd < 0)
5012 return fd;
5013
59f82aad
OU
5014 snprintf(fdname, sizeof(fdname), "%d", fd);
5015
b74ed7a6 5016 kvm = kvm_create_vm(type, fdname);
20020f4c
OU
5017 if (IS_ERR(kvm)) {
5018 r = PTR_ERR(kvm);
5019 goto put_fd;
5020 }
5021
506cfba9
AV
5022 file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
5023 if (IS_ERR(file)) {
78588335
ME
5024 r = PTR_ERR(file);
5025 goto put_kvm;
506cfba9 5026 }
536a6f88 5027
525df861
PB
5028 /*
5029 * Don't call kvm_put_kvm anymore at this point; file->f_op is
5030 * already set, with ->release() being kvm_vm_release(). In error
5031 * cases it will be called by the final fput(file) and will take
5032 * care of doing kvm_put_kvm(kvm).
5033 */
286de8f6 5034 kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
f17abe9a 5035
20020f4c
OU
5036 fd_install(fd, file);
5037 return fd;
78588335
ME
5038
5039put_kvm:
5040 kvm_put_kvm(kvm);
20020f4c
OU
5041put_fd:
5042 put_unused_fd(fd);
78588335 5043 return r;
f17abe9a
AK
5044}
5045
5046static long kvm_dev_ioctl(struct file *filp,
5047 unsigned int ioctl, unsigned long arg)
5048{
07c45a36 5049 long r = -EINVAL;
f17abe9a
AK
5050
5051 switch (ioctl) {
5052 case KVM_GET_API_VERSION:
f0fe5108
AK
5053 if (arg)
5054 goto out;
f17abe9a
AK
5055 r = KVM_API_VERSION;
5056 break;
5057 case KVM_CREATE_VM:
e08b9637 5058 r = kvm_dev_ioctl_create_vm(arg);
f17abe9a 5059 break;
018d00d2 5060 case KVM_CHECK_EXTENSION:
784aa3d7 5061 r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
5d308f45 5062 break;
07c45a36 5063 case KVM_GET_VCPU_MMAP_SIZE:
07c45a36
AK
5064 if (arg)
5065 goto out;
adb1ff46
AK
5066 r = PAGE_SIZE; /* struct kvm_run */
5067#ifdef CONFIG_X86
5068 r += PAGE_SIZE; /* pio data page */
5f94c174 5069#endif
4b4357e0 5070#ifdef CONFIG_KVM_MMIO
5f94c174 5071 r += PAGE_SIZE; /* coalesced mmio ring page */
adb1ff46 5072#endif
07c45a36 5073 break;
d4c9ff2d
FEL
5074 case KVM_TRACE_ENABLE:
5075 case KVM_TRACE_PAUSE:
5076 case KVM_TRACE_DISABLE:
2023a29c 5077 r = -EOPNOTSUPP;
d4c9ff2d 5078 break;
6aa8b732 5079 default:
043405e1 5080 return kvm_arch_dev_ioctl(filp, ioctl, arg);
6aa8b732
AK
5081 }
5082out:
5083 return r;
5084}
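/*
 * Illustrative userspace sketch (not part of this file): the minimal
 * /dev/kvm handshake served by kvm_dev_ioctl() above; machine type 0 is the
 * default on most architectures, error handling omitted.
 *
 *	int kvm_fd = open("/dev/kvm", O_RDWR | O_CLOEXEC);
 *	int vm_fd = -1;
 *
 *	if (ioctl(kvm_fd, KVM_GET_API_VERSION, 0) == KVM_API_VERSION)
 *		vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);
 */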
5085
6aa8b732 5086static struct file_operations kvm_chardev_ops = {
6aa8b732 5087 .unlocked_ioctl = kvm_dev_ioctl,
6038f373 5088 .llseek = noop_llseek,
7ddfd3e0 5089 KVM_COMPAT(kvm_dev_ioctl),
6aa8b732
AK
5090};
5091
5092static struct miscdevice kvm_dev = {
bbe4432e 5093 KVM_MINOR,
6aa8b732
AK
5094 "kvm",
5095 &kvm_chardev_ops,
5096};
5097
75b7127c 5098static void hardware_enable_nolock(void *junk)
1b6c0168
AK
5099{
5100 int cpu = raw_smp_processor_id();
10474ae8 5101 int r;
1b6c0168 5102
7f59f492 5103 if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
1b6c0168 5104 return;
10474ae8 5105
7f59f492 5106 cpumask_set_cpu(cpu, cpus_hardware_enabled);
10474ae8 5107
13a34e06 5108 r = kvm_arch_hardware_enable();
10474ae8
AG
5109
5110 if (r) {
5111 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
5112 atomic_inc(&hardware_enable_failed);
1170adc6 5113 pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu);
10474ae8 5114 }
1b6c0168
AK
5115}
5116
8c18b2d2 5117static int kvm_starting_cpu(unsigned int cpu)
75b7127c 5118{
4a937f96 5119 raw_spin_lock(&kvm_count_lock);
4fa92fb2
PB
5120 if (kvm_usage_count)
5121 hardware_enable_nolock(NULL);
4a937f96 5122 raw_spin_unlock(&kvm_count_lock);
8c18b2d2 5123 return 0;
75b7127c
TY
5124}
5125
5126static void hardware_disable_nolock(void *junk)
1b6c0168
AK
5127{
5128 int cpu = raw_smp_processor_id();
5129
7f59f492 5130 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
1b6c0168 5131 return;
7f59f492 5132 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
13a34e06 5133 kvm_arch_hardware_disable();
1b6c0168
AK
5134}
5135
8c18b2d2 5136static int kvm_dying_cpu(unsigned int cpu)
75b7127c 5137{
4a937f96 5138 raw_spin_lock(&kvm_count_lock);
4fa92fb2
PB
5139 if (kvm_usage_count)
5140 hardware_disable_nolock(NULL);
4a937f96 5141 raw_spin_unlock(&kvm_count_lock);
8c18b2d2 5142 return 0;
75b7127c
TY
5143}
5144
10474ae8
AG
5145static void hardware_disable_all_nolock(void)
5146{
5147 BUG_ON(!kvm_usage_count);
5148
5149 kvm_usage_count--;
5150 if (!kvm_usage_count)
75b7127c 5151 on_each_cpu(hardware_disable_nolock, NULL, 1);
10474ae8
AG
5152}
5153
5154static void hardware_disable_all(void)
5155{
4a937f96 5156 raw_spin_lock(&kvm_count_lock);
10474ae8 5157 hardware_disable_all_nolock();
4a937f96 5158 raw_spin_unlock(&kvm_count_lock);
10474ae8
AG
5159}
5160
5161static int hardware_enable_all(void)
5162{
5163 int r = 0;
5164
4a937f96 5165 raw_spin_lock(&kvm_count_lock);
10474ae8
AG
5166
5167 kvm_usage_count++;
5168 if (kvm_usage_count == 1) {
5169 atomic_set(&hardware_enable_failed, 0);
75b7127c 5170 on_each_cpu(hardware_enable_nolock, NULL, 1);
10474ae8
AG
5171
5172 if (atomic_read(&hardware_enable_failed)) {
5173 hardware_disable_all_nolock();
5174 r = -EBUSY;
5175 }
5176 }
5177
4a937f96 5178 raw_spin_unlock(&kvm_count_lock);
10474ae8
AG
5179
5180 return r;
5181}
5182
9a2b85c6 5183static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
d77c26fc 5184 void *v)
9a2b85c6 5185{
8e1c1815
SY
5186 /*
5187 * Some (well, at least mine) BIOSes hang on reboot if
5188 * in vmx root mode.
5189 *
5190 * Intel TXT also requires VMX to be off on all CPUs when the system shuts down.
5191 */
1170adc6 5192 pr_info("kvm: exiting hardware virtualization\n");
8e1c1815 5193 kvm_rebooting = true;
75b7127c 5194 on_each_cpu(hardware_disable_nolock, NULL, 1);
9a2b85c6
RR
5195 return NOTIFY_OK;
5196}
5197
5198static struct notifier_block kvm_reboot_notifier = {
5199 .notifier_call = kvm_reboot,
5200 .priority = 0,
5201};
5202
e93f8a0f 5203static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
2eeb2e94
GH
5204{
5205 int i;
5206
5207 for (i = 0; i < bus->dev_count; i++) {
743eeb0b 5208 struct kvm_io_device *pos = bus->range[i].dev;
2eeb2e94
GH
5209
5210 kvm_iodevice_destructor(pos);
5211 }
e93f8a0f 5212 kfree(bus);
2eeb2e94
GH
5213}
5214
c21fbff1 5215static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
20e87b72 5216 const struct kvm_io_range *r2)
743eeb0b 5217{
8f4216c7
JW
5218 gpa_t addr1 = r1->addr;
5219 gpa_t addr2 = r2->addr;
5220
5221 if (addr1 < addr2)
743eeb0b 5222 return -1;
8f4216c7
JW
5223
5224 /* If r2->len == 0, match the exact address. If r2->len != 0,
5225 * accept any overlapping write. Any order is acceptable for
5226 * overlapping ranges, because kvm_io_bus_get_first_dev ensures
5227 * we process all of them.
5228 */
5229 if (r2->len) {
5230 addr1 += r1->len;
5231 addr2 += r2->len;
5232 }
5233
5234 if (addr1 > addr2)
743eeb0b 5235 return 1;
8f4216c7 5236
743eeb0b
SL
5237 return 0;
5238}
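
To make the ordering rules above concrete, here is a small self-contained user-space sketch (plain C, not kernel code) that mirrors kvm_io_bus_cmp(): with a non-zero r2->len, the access r1 matches only if it fits entirely inside [r2->addr, r2->addr + r2->len), while a zero r2->len matches on the exact start address regardless of r1->len.

#include <assert.h>
#include <stdint.h>

struct io_range { uint64_t addr; int len; };

/* Same logic as kvm_io_bus_cmp(), re-typed for a stand-alone test. */
static int io_range_cmp(const struct io_range *r1, const struct io_range *r2)
{
	uint64_t addr1 = r1->addr, addr2 = r2->addr;

	if (addr1 < addr2)
		return -1;
	if (r2->len) {
		addr1 += r1->len;
		addr2 += r2->len;
	}
	return addr1 > addr2 ? 1 : 0;
}

int main(void)
{
	struct io_range dev   = { .addr = 0x100, .len = 4 };
	struct io_range exact = { .addr = 0x100, .len = 0 };

	/* A 2-byte access at 0x102 lies inside [0x100, 0x104): match. */
	assert(io_range_cmp(&(struct io_range){ 0x102, 2 }, &dev) == 0);
	/* A 2-byte access at 0x103 spills past 0x104: no match. */
	assert(io_range_cmp(&(struct io_range){ 0x103, 2 }, &dev) == 1);
	/* Zero-length key: only the exact start address matters. */
	assert(io_range_cmp(&(struct io_range){ 0x100, 8 }, &exact) == 0);
	assert(io_range_cmp(&(struct io_range){ 0x101, 1 }, &exact) == 1);
	return 0;
}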
5239
a343c9b7
PB
5240static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
5241{
c21fbff1 5242 return kvm_io_bus_cmp(p1, p2);
a343c9b7
PB
5243}
5244
39369f7a 5245static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
743eeb0b
SL
5246 gpa_t addr, int len)
5247{
5248 struct kvm_io_range *range, key;
5249 int off;
5250
5251 key = (struct kvm_io_range) {
5252 .addr = addr,
5253 .len = len,
5254 };
5255
5256 range = bsearch(&key, bus->range, bus->dev_count,
5257 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
5258 if (range == NULL)
5259 return -ENOENT;
5260
5261 off = range - bus->range;
5262
c21fbff1 5263 while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
743eeb0b
SL
5264 off--;
5265
5266 return off;
5267}
5268
e32edf4f 5269static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
126a5af5
CH
5270 struct kvm_io_range *range, const void *val)
5271{
5272 int idx;
5273
5274 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
5275 if (idx < 0)
5276 return -EOPNOTSUPP;
5277
5278 while (idx < bus->dev_count &&
c21fbff1 5279 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
e32edf4f 5280 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
126a5af5
CH
5281 range->len, val))
5282 return idx;
5283 idx++;
5284 }
5285
5286 return -EOPNOTSUPP;
5287}
5288
bda9020e 5289/* kvm_io_bus_write - called under kvm->slots_lock */
e32edf4f 5290int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
bda9020e 5291 int len, const void *val)
2eeb2e94 5292{
90d83dc3 5293 struct kvm_io_bus *bus;
743eeb0b 5294 struct kvm_io_range range;
126a5af5 5295 int r;
743eeb0b
SL
5296
5297 range = (struct kvm_io_range) {
5298 .addr = addr,
5299 .len = len,
5300 };
90d83dc3 5301
e32edf4f 5302 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
90db1043
DH
5303 if (!bus)
5304 return -ENOMEM;
e32edf4f 5305 r = __kvm_io_bus_write(vcpu, bus, &range, val);
126a5af5
CH
5306 return r < 0 ? r : 0;
5307}
a2420107 5308EXPORT_SYMBOL_GPL(kvm_io_bus_write);
126a5af5
CH
5309
5310/* kvm_io_bus_write_cookie - called under kvm->slots_lock */
e32edf4f
NN
5311int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
5312 gpa_t addr, int len, const void *val, long cookie)
126a5af5
CH
5313{
5314 struct kvm_io_bus *bus;
5315 struct kvm_io_range range;
5316
5317 range = (struct kvm_io_range) {
5318 .addr = addr,
5319 .len = len,
5320 };
5321
e32edf4f 5322 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
90db1043
DH
5323 if (!bus)
5324 return -ENOMEM;
126a5af5
CH
5325
5326 /* First try the device referenced by cookie. */
5327 if ((cookie >= 0) && (cookie < bus->dev_count) &&
c21fbff1 5328 (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
e32edf4f 5329 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
126a5af5
CH
5330 val))
5331 return cookie;
5332
5333 /*
5334 * cookie contained garbage; fall back to search and return the
5335 * correct cookie value.
5336 */
e32edf4f 5337 return __kvm_io_bus_write(vcpu, bus, &range, val);
126a5af5
CH
5338}
5339
e32edf4f
NN
5340static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
5341 struct kvm_io_range *range, void *val)
126a5af5
CH
5342{
5343 int idx;
5344
5345 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
743eeb0b
SL
5346 if (idx < 0)
5347 return -EOPNOTSUPP;
5348
5349 while (idx < bus->dev_count &&
c21fbff1 5350 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
e32edf4f 5351 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
126a5af5
CH
5352 range->len, val))
5353 return idx;
743eeb0b
SL
5354 idx++;
5355 }
5356
bda9020e
MT
5357 return -EOPNOTSUPP;
5358}
2eeb2e94 5359
bda9020e 5360/* kvm_io_bus_read - called under kvm->slots_lock */
e32edf4f 5361int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
e93f8a0f 5362 int len, void *val)
bda9020e 5363{
90d83dc3 5364 struct kvm_io_bus *bus;
743eeb0b 5365 struct kvm_io_range range;
126a5af5 5366 int r;
743eeb0b
SL
5367
5368 range = (struct kvm_io_range) {
5369 .addr = addr,
5370 .len = len,
5371 };
e93f8a0f 5372
e32edf4f 5373 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
90db1043
DH
5374 if (!bus)
5375 return -ENOMEM;
e32edf4f 5376 r = __kvm_io_bus_read(vcpu, bus, &range, val);
126a5af5
CH
5377 return r < 0 ? r : 0;
5378}
743eeb0b 5379
79fac95e 5380/* Caller must hold slots_lock. */
743eeb0b
SL
5381int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
5382 int len, struct kvm_io_device *dev)
6c474694 5383{
d4c67a7a 5384 int i;
e93f8a0f 5385 struct kvm_io_bus *new_bus, *bus;
d4c67a7a 5386 struct kvm_io_range range;
090b7aff 5387
4a12f951 5388 bus = kvm_get_bus(kvm, bus_idx);
90db1043
DH
5389 if (!bus)
5390 return -ENOMEM;
5391
6ea34c9b
AK
5392 /* exclude ioeventfd which is limited by maximum fd */
5393 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
090b7aff 5394 return -ENOSPC;
2eeb2e94 5395
90952cd3 5396 new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
b12ce36a 5397 GFP_KERNEL_ACCOUNT);
e93f8a0f
MT
5398 if (!new_bus)
5399 return -ENOMEM;
d4c67a7a
GH
5400
5401 range = (struct kvm_io_range) {
5402 .addr = addr,
5403 .len = len,
5404 .dev = dev,
5405 };
5406
5407 for (i = 0; i < bus->dev_count; i++)
5408 if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
5409 break;
5410
5411 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
5412 new_bus->dev_count++;
5413 new_bus->range[i] = range;
5414 memcpy(new_bus->range + i + 1, bus->range + i,
5415 (bus->dev_count - i) * sizeof(struct kvm_io_range));
e93f8a0f
MT
5416 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5417 synchronize_srcu_expedited(&kvm->srcu);
5418 kfree(bus);
090b7aff
GH
5419
5420 return 0;
5421}
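
For reference, in-kernel users such as the coalesced-MMIO and ioeventfd code attach devices to a bus roughly as sketched below. This is a hedged sketch, not code from this file: the kvm_io_device_ops / kvm_iodevice_init definitions are assumed from <kvm/iodev.h>, and example_mmio_dev, example_mmio_write and example_register are hypothetical names.

#include <kvm/iodev.h>
#include <linux/kvm_host.h>

struct example_mmio_dev {
	struct kvm_io_device dev;
	u64 last_write;
};

static int example_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
			      gpa_t addr, int len, const void *val)
{
	struct example_mmio_dev *d =
		container_of(this, struct example_mmio_dev, dev);

	memcpy(&d->last_write, val, min(len, 8));
	return 0;	/* 0 = handled; non-zero lets the bus keep searching */
}

static const struct kvm_io_device_ops example_mmio_ops = {
	.write = example_mmio_write,
};

/* Hypothetical helper: claim the GPA range [addr, addr + len) on the MMIO bus. */
static int example_register(struct kvm *kvm, struct example_mmio_dev *d,
			    gpa_t addr, int len)
{
	int ret;

	kvm_iodevice_init(&d->dev, &example_mmio_ops);

	mutex_lock(&kvm->slots_lock);	/* caller must hold slots_lock, see above */
	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, addr, len, &d->dev);
	mutex_unlock(&kvm->slots_lock);

	return ret;
}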
5422
5d3c4c79
SC
5423int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5424 struct kvm_io_device *dev)
090b7aff 5425{
f6588660 5426 int i, j;
e93f8a0f 5427 struct kvm_io_bus *new_bus, *bus;
090b7aff 5428
7c896d37
SC
5429 lockdep_assert_held(&kvm->slots_lock);
5430
4a12f951 5431 bus = kvm_get_bus(kvm, bus_idx);
df630b8c 5432 if (!bus)
5d3c4c79 5433 return 0;
df630b8c 5434
7c896d37 5435 for (i = 0; i < bus->dev_count; i++) {
a1300716 5436 if (bus->range[i].dev == dev) {
090b7aff
GH
5437 break;
5438 }
7c896d37 5439 }
e93f8a0f 5440
90db1043 5441 if (i == bus->dev_count)
5d3c4c79 5442 return 0;
a1300716 5443
90952cd3 5444 new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
b12ce36a 5445 GFP_KERNEL_ACCOUNT);
f6588660 5446 if (new_bus) {
871c433b 5447 memcpy(new_bus, bus, struct_size(bus, range, i));
f6588660
RK
5448 new_bus->dev_count--;
5449 memcpy(new_bus->range + i, bus->range + i + 1,
871c433b 5450 flex_array_size(new_bus, range, new_bus->dev_count - i));
2ee37574
SC
5451 }
5452
5453 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5454 synchronize_srcu_expedited(&kvm->srcu);
5455
5456 /* Destroy the old bus _after_ installing the (null) bus. */
5457 if (!new_bus) {
90db1043 5458 pr_err("kvm: failed to shrink bus, removing it completely\n");
f6588660
RK
5459 for (j = 0; j < bus->dev_count; j++) {
5460 if (j == i)
5461 continue;
5462 kvm_iodevice_destructor(bus->range[j].dev);
5463 }
90db1043 5464 }
a1300716 5465
e93f8a0f 5466 kfree(bus);
5d3c4c79 5467 return new_bus ? 0 : -ENOMEM;
2eeb2e94
GH
5468}
5469
8a39d006
AP
5470struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5471 gpa_t addr)
5472{
5473 struct kvm_io_bus *bus;
5474 int dev_idx, srcu_idx;
5475 struct kvm_io_device *iodev = NULL;
5476
5477 srcu_idx = srcu_read_lock(&kvm->srcu);
5478
5479 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
90db1043
DH
5480 if (!bus)
5481 goto out_unlock;
8a39d006
AP
5482
5483 dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
5484 if (dev_idx < 0)
5485 goto out_unlock;
5486
5487 iodev = bus->range[dev_idx].dev;
5488
5489out_unlock:
5490 srcu_read_unlock(&kvm->srcu, srcu_idx);
5491
5492 return iodev;
5493}
5494EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
5495
536a6f88
JF
5496static int kvm_debugfs_open(struct inode *inode, struct file *file,
5497 int (*get)(void *, u64 *), int (*set)(void *, u64),
5498 const char *fmt)
5499{
180418e2 5500 int ret;
536a6f88
JF
5501 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
5502 inode->i_private;
5503
605c7130
PX
5504 /*
5505 * The debugfs files hold a reference to the kvm struct, which is
5506 * still valid when kvm_destroy_vm is called. kvm_get_kvm_safe()
5507 * avoids the race between open and the removal of the debugfs directory.
536a6f88 5508 */
605c7130 5509 if (!kvm_get_kvm_safe(stat_data->kvm))
536a6f88
JF
5510 return -ENOENT;
5511
180418e2
HW
5512 ret = simple_attr_open(inode, file, get,
5513 kvm_stats_debugfs_mode(stat_data->desc) & 0222
5514 ? set : NULL, fmt);
5515 if (ret)
536a6f88 5516 kvm_put_kvm(stat_data->kvm);
536a6f88 5517
180418e2 5518 return ret;
536a6f88
JF
5519}
5520
5521static int kvm_debugfs_release(struct inode *inode, struct file *file)
5522{
5523 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
5524 inode->i_private;
5525
5526 simple_attr_release(inode, file);
5527 kvm_put_kvm(stat_data->kvm);
5528
5529 return 0;
5530}
5531
09cbcef6 5532static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
536a6f88 5533{
bc9e9e67 5534 *val = *(u64 *)((void *)(&kvm->stat) + offset);
536a6f88 5535
09cbcef6
MP
5536 return 0;
5537}
5538
5539static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
5540{
bc9e9e67 5541 *(u64 *)((void *)(&kvm->stat) + offset) = 0;
536a6f88
JF
5542
5543 return 0;
5544}
5545
09cbcef6 5546static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
ce35ef27 5547{
46808a4c 5548 unsigned long i;
09cbcef6 5549 struct kvm_vcpu *vcpu;
ce35ef27 5550
09cbcef6 5551 *val = 0;
ce35ef27 5552
09cbcef6 5553 kvm_for_each_vcpu(i, vcpu, kvm)
bc9e9e67 5554 *val += *(u64 *)((void *)(&vcpu->stat) + offset);
ce35ef27
SJS
5555
5556 return 0;
5557}
5558
09cbcef6 5559static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
536a6f88 5560{
46808a4c 5561 unsigned long i;
09cbcef6 5562 struct kvm_vcpu *vcpu;
536a6f88 5563
09cbcef6 5564 kvm_for_each_vcpu(i, vcpu, kvm)
bc9e9e67 5565 *(u64 *)((void *)(&vcpu->stat) + offset) = 0;
09cbcef6
MP
5566
5567 return 0;
5568}
536a6f88 5569
09cbcef6 5570static int kvm_stat_data_get(void *data, u64 *val)
536a6f88 5571{
09cbcef6 5572 int r = -EFAULT;
536a6f88 5573 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
536a6f88 5574
bc9e9e67 5575 switch (stat_data->kind) {
09cbcef6
MP
5576 case KVM_STAT_VM:
5577 r = kvm_get_stat_per_vm(stat_data->kvm,
bc9e9e67 5578 stat_data->desc->desc.offset, val);
09cbcef6
MP
5579 break;
5580 case KVM_STAT_VCPU:
5581 r = kvm_get_stat_per_vcpu(stat_data->kvm,
bc9e9e67 5582 stat_data->desc->desc.offset, val);
09cbcef6
MP
5583 break;
5584 }
536a6f88 5585
09cbcef6 5586 return r;
536a6f88
JF
5587}
5588
09cbcef6 5589static int kvm_stat_data_clear(void *data, u64 val)
ce35ef27 5590{
09cbcef6 5591 int r = -EFAULT;
ce35ef27 5592 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
ce35ef27
SJS
5593
5594 if (val)
5595 return -EINVAL;
5596
bc9e9e67 5597 switch (stat_data->kind) {
09cbcef6
MP
5598 case KVM_STAT_VM:
5599 r = kvm_clear_stat_per_vm(stat_data->kvm,
bc9e9e67 5600 stat_data->desc->desc.offset);
09cbcef6
MP
5601 break;
5602 case KVM_STAT_VCPU:
5603 r = kvm_clear_stat_per_vcpu(stat_data->kvm,
bc9e9e67 5604 stat_data->desc->desc.offset);
09cbcef6
MP
5605 break;
5606 }
ce35ef27 5607
09cbcef6 5608 return r;
ce35ef27
SJS
5609}
5610
09cbcef6 5611static int kvm_stat_data_open(struct inode *inode, struct file *file)
536a6f88
JF
5612{
5613 __simple_attr_check_format("%llu\n", 0ull);
09cbcef6
MP
5614 return kvm_debugfs_open(inode, file, kvm_stat_data_get,
5615 kvm_stat_data_clear, "%llu\n");
536a6f88
JF
5616}
5617
09cbcef6
MP
5618static const struct file_operations stat_fops_per_vm = {
5619 .owner = THIS_MODULE,
5620 .open = kvm_stat_data_open,
536a6f88 5621 .release = kvm_debugfs_release,
09cbcef6
MP
5622 .read = simple_attr_read,
5623 .write = simple_attr_write,
5624 .llseek = no_llseek,
536a6f88
JF
5625};
5626
8b88b099 5627static int vm_stat_get(void *_offset, u64 *val)
ba1389b7
AK
5628{
5629 unsigned offset = (long)_offset;
ba1389b7 5630 struct kvm *kvm;
536a6f88 5631 u64 tmp_val;
ba1389b7 5632
8b88b099 5633 *val = 0;
0d9ce162 5634 mutex_lock(&kvm_lock);
536a6f88 5635 list_for_each_entry(kvm, &vm_list, vm_list) {
09cbcef6 5636 kvm_get_stat_per_vm(kvm, offset, &tmp_val);
536a6f88
JF
5637 *val += tmp_val;
5638 }
0d9ce162 5639 mutex_unlock(&kvm_lock);
8b88b099 5640 return 0;
ba1389b7
AK
5641}
5642
ce35ef27
SJS
5643static int vm_stat_clear(void *_offset, u64 val)
5644{
5645 unsigned offset = (long)_offset;
5646 struct kvm *kvm;
ce35ef27
SJS
5647
5648 if (val)
5649 return -EINVAL;
5650
0d9ce162 5651 mutex_lock(&kvm_lock);
ce35ef27 5652 list_for_each_entry(kvm, &vm_list, vm_list) {
09cbcef6 5653 kvm_clear_stat_per_vm(kvm, offset);
ce35ef27 5654 }
0d9ce162 5655 mutex_unlock(&kvm_lock);
ce35ef27
SJS
5656
5657 return 0;
5658}
5659
5660DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
bc9e9e67 5661DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");
ba1389b7 5662
8b88b099 5663static int vcpu_stat_get(void *_offset, u64 *val)
1165f5fe
AK
5664{
5665 unsigned offset = (long)_offset;
1165f5fe 5666 struct kvm *kvm;
536a6f88 5667 u64 tmp_val;
1165f5fe 5668
8b88b099 5669 *val = 0;
0d9ce162 5670 mutex_lock(&kvm_lock);
536a6f88 5671 list_for_each_entry(kvm, &vm_list, vm_list) {
09cbcef6 5672 kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
536a6f88
JF
5673 *val += tmp_val;
5674 }
0d9ce162 5675 mutex_unlock(&kvm_lock);
8b88b099 5676 return 0;
1165f5fe
AK
5677}
5678
ce35ef27
SJS
5679static int vcpu_stat_clear(void *_offset, u64 val)
5680{
5681 unsigned offset = (long)_offset;
5682 struct kvm *kvm;
ce35ef27
SJS
5683
5684 if (val)
5685 return -EINVAL;
5686
0d9ce162 5687 mutex_lock(&kvm_lock);
ce35ef27 5688 list_for_each_entry(kvm, &vm_list, vm_list) {
09cbcef6 5689 kvm_clear_stat_per_vcpu(kvm, offset);
ce35ef27 5690 }
0d9ce162 5691 mutex_unlock(&kvm_lock);
ce35ef27
SJS
5692
5693 return 0;
5694}
5695
5696DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
5697 "%llu\n");
bc9e9e67 5698DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");
1165f5fe 5699
286de8f6
CI
5700static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
5701{
5702 struct kobj_uevent_env *env;
286de8f6
CI
5703 unsigned long long created, active;
5704
5705 if (!kvm_dev.this_device || !kvm)
5706 return;
5707
0d9ce162 5708 mutex_lock(&kvm_lock);
286de8f6
CI
5709 if (type == KVM_EVENT_CREATE_VM) {
5710 kvm_createvm_count++;
5711 kvm_active_vms++;
5712 } else if (type == KVM_EVENT_DESTROY_VM) {
5713 kvm_active_vms--;
5714 }
5715 created = kvm_createvm_count;
5716 active = kvm_active_vms;
0d9ce162 5717 mutex_unlock(&kvm_lock);
286de8f6 5718
b12ce36a 5719 env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
286de8f6
CI
5720 if (!env)
5721 return;
5722
5723 add_uevent_var(env, "CREATED=%llu", created);
5724 add_uevent_var(env, "COUNT=%llu", active);
5725
fdeaf7e3 5726 if (type == KVM_EVENT_CREATE_VM) {
286de8f6 5727 add_uevent_var(env, "EVENT=create");
fdeaf7e3
CI
5728 kvm->userspace_pid = task_pid_nr(current);
5729 } else if (type == KVM_EVENT_DESTROY_VM) {
286de8f6 5730 add_uevent_var(env, "EVENT=destroy");
fdeaf7e3
CI
5731 }
5732 add_uevent_var(env, "PID=%d", kvm->userspace_pid);
286de8f6 5733
a44a4cc1 5734 if (!IS_ERR(kvm->debugfs_dentry)) {
b12ce36a 5735 char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
fdeaf7e3
CI
5736
5737 if (p) {
5738 tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
5739 if (!IS_ERR(tmp))
5740 add_uevent_var(env, "STATS_PATH=%s", tmp);
5741 kfree(p);
286de8f6
CI
5742 }
5743 }
5744 /* no need for checks, since we are adding at most 5 keys */
5745 env->envp[env->envp_idx++] = NULL;
5746 kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
5747 kfree(env);
286de8f6
CI
5748}
5749
929f45e3 5750static void kvm_init_debug(void)
6aa8b732 5751{
bc9e9e67
JZ
5752 const struct file_operations *fops;
5753 const struct _kvm_stats_desc *pdesc;
5754 int i;
6aa8b732 5755
76f7c879 5756 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
4f69b680 5757
bc9e9e67
JZ
5758 for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
5759 pdesc = &kvm_vm_stats_desc[i];
5760 if (kvm_stats_debugfs_mode(pdesc) & 0222)
5761 fops = &vm_stat_fops;
5762 else
5763 fops = &vm_stat_readonly_fops;
5764 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
5765 kvm_debugfs_dir,
5766 (void *)(long)pdesc->desc.offset, fops);
5767 }
5768
5769 for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
5770 pdesc = &kvm_vcpu_stats_desc[i];
5771 if (kvm_stats_debugfs_mode(pdesc) & 0222)
5772 fops = &vcpu_stat_fops;
5773 else
5774 fops = &vcpu_stat_readonly_fops;
5775 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
5776 kvm_debugfs_dir,
5777 (void *)(long)pdesc->desc.offset, fops);
4f69b680 5778 }
6aa8b732
AK
5779}
5780
fb3600cc 5781static int kvm_suspend(void)
59ae6c6b 5782{
10474ae8 5783 if (kvm_usage_count)
75b7127c 5784 hardware_disable_nolock(NULL);
59ae6c6b
AK
5785 return 0;
5786}
5787
fb3600cc 5788static void kvm_resume(void)
59ae6c6b 5789{
ca84d1a2 5790 if (kvm_usage_count) {
4cb9a998 5791 lockdep_assert_not_held(&kvm_count_lock);
75b7127c 5792 hardware_enable_nolock(NULL);
ca84d1a2 5793 }
59ae6c6b
AK
5794}
5795
fb3600cc 5796static struct syscore_ops kvm_syscore_ops = {
59ae6c6b
AK
5797 .suspend = kvm_suspend,
5798 .resume = kvm_resume,
5799};
5800
15ad7146
AK
5801static inline
5802struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
5803{
5804 return container_of(pn, struct kvm_vcpu, preempt_notifier);
5805}
5806
5807static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
5808{
5809 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
f95ef0cd 5810
046ddeed 5811 WRITE_ONCE(vcpu->preempted, false);
d73eb57b 5812 WRITE_ONCE(vcpu->ready, false);
15ad7146 5813
7495e22b 5814 __this_cpu_write(kvm_running_vcpu, vcpu);
e790d9ef 5815 kvm_arch_sched_in(vcpu, cpu);
e9b11c17 5816 kvm_arch_vcpu_load(vcpu, cpu);
15ad7146
AK
5817}
5818
5819static void kvm_sched_out(struct preempt_notifier *pn,
5820 struct task_struct *next)
5821{
5822 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
5823
3ba9f93b 5824 if (current->on_rq) {
046ddeed 5825 WRITE_ONCE(vcpu->preempted, true);
d73eb57b
WL
5826 WRITE_ONCE(vcpu->ready, true);
5827 }
e9b11c17 5828 kvm_arch_vcpu_put(vcpu);
7495e22b
PB
5829 __this_cpu_write(kvm_running_vcpu, NULL);
5830}
5831
5832/**
5833 * kvm_get_running_vcpu - get the vcpu running on the current CPU.
1f03b2bc
MZ
5834 *
5835 * We can disable preemption locally around accessing the per-CPU variable,
5836 * and use the resolved vcpu pointer after enabling preemption again,
5837 * because even if the current thread is migrated to another CPU, reading
5838 * the per-CPU value later will give us the same value: the preempt notifier
5839 * handlers update the per-CPU variable whenever a vCPU task is switched in or out.
7495e22b
PB
5840 */
5841struct kvm_vcpu *kvm_get_running_vcpu(void)
5842{
1f03b2bc
MZ
5843 struct kvm_vcpu *vcpu;
5844
5845 preempt_disable();
5846 vcpu = __this_cpu_read(kvm_running_vcpu);
5847 preempt_enable();
5848
5849 return vcpu;
7495e22b 5850}
379a3c8e 5851EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
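
As a hypothetical caller (example_running_vcpu_in_guest is not part of kvm_main.c), the guarantee described above lets code resolve the running vCPU once and keep using the pointer afterwards:

static bool example_running_vcpu_in_guest(void)
{
	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();

	/* NULL when no vCPU is currently loaded on this CPU. */
	if (!vcpu)
		return false;

	/* Usable even if this task is migrated after the lookup. */
	return READ_ONCE(vcpu->mode) == IN_GUEST_MODE;
}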
7495e22b
PB
5852
5853/**
5854 * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
5855 */
5856struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
5857{
5858 return &kvm_running_vcpu;
15ad7146
AK
5859}
5860
e1bfc245
SC
5861#ifdef CONFIG_GUEST_PERF_EVENTS
5862static unsigned int kvm_guest_state(void)
5863{
5864 struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
5865 unsigned int state;
5866
5867 if (!kvm_arch_pmi_in_guest(vcpu))
5868 return 0;
5869
5870 state = PERF_GUEST_ACTIVE;
5871 if (!kvm_arch_vcpu_in_kernel(vcpu))
5872 state |= PERF_GUEST_USER;
5873
5874 return state;
5875}
5876
5877static unsigned long kvm_guest_get_ip(void)
5878{
5879 struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
5880
5881 /* Retrieving the IP must be guarded by a call to kvm_guest_state(). */
5882 if (WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu)))
5883 return 0;
5884
5885 return kvm_arch_vcpu_get_ip(vcpu);
5886}
5887
5888static struct perf_guest_info_callbacks kvm_guest_cbs = {
5889 .state = kvm_guest_state,
5890 .get_ip = kvm_guest_get_ip,
5891 .handle_intel_pt_intr = NULL,
5892};
5893
5894void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void))
5895{
5896 kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler;
5897 perf_register_guest_info_callbacks(&kvm_guest_cbs);
5898}
5899void kvm_unregister_perf_callbacks(void)
5900{
5901 perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
5902}
5903#endif
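
Architecture code is expected to pair these calls during its own setup and teardown. A hedged sketch of the call pattern (example_handle_pt_intr and the two init/exit helpers are hypothetical; on x86 the Intel PT PMI handler is passed here, other architectures pass NULL):

#ifdef CONFIG_GUEST_PERF_EVENTS
static unsigned int example_handle_pt_intr(void)
{
	/* Handle a PMI that arrived while Intel PT was tracing the guest. */
	return 0;
}

static void example_arch_perf_init(void)
{
	kvm_register_perf_callbacks(example_handle_pt_intr);
}

static void example_arch_perf_exit(void)
{
	kvm_unregister_perf_callbacks();
}
#endif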
5904
b9904085
SC
5905struct kvm_cpu_compat_check {
5906 void *opaque;
5907 int *ret;
5908};
5909
5910static void check_processor_compat(void *data)
f257d6dc 5911{
b9904085
SC
5912 struct kvm_cpu_compat_check *c = data;
5913
5914 *c->ret = kvm_arch_check_processor_compat(c->opaque);
f257d6dc
SC
5915}
5916
0ee75bea 5917int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
c16f862d 5918 struct module *module)
6aa8b732 5919{
b9904085 5920 struct kvm_cpu_compat_check c;
6aa8b732 5921 int r;
002c7f7c 5922 int cpu;
6aa8b732 5923
f8c16bba
ZX
5924 r = kvm_arch_init(opaque);
5925 if (r)
d2308784 5926 goto out_fail;
cb498ea2 5927
7dac16c3
AH
5928 /*
5929 * kvm_arch_init makes sure there's at most one caller
5930 * for architectures that support multiple implementations,
5931 * like intel and amd on x86.
36343f6e
PB
5932 * kvm_arch_init must be called before kvm_irqfd_init to avoid creating
5933 * conflicts in case kvm is already setup for another implementation.
7dac16c3 5934 */
36343f6e
PB
5935 r = kvm_irqfd_init();
5936 if (r)
5937 goto out_irqfd;
7dac16c3 5938
8437a617 5939 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
7f59f492
RR
5940 r = -ENOMEM;
5941 goto out_free_0;
5942 }
5943
b9904085 5944 r = kvm_arch_hardware_setup(opaque);
6aa8b732 5945 if (r < 0)
faf0be22 5946 goto out_free_1;
6aa8b732 5947
b9904085
SC
5948 c.ret = &r;
5949 c.opaque = opaque;
002c7f7c 5950 for_each_online_cpu(cpu) {
b9904085 5951 smp_call_function_single(cpu, check_processor_compat, &c, 1);
002c7f7c 5952 if (r < 0)
faf0be22 5953 goto out_free_2;
002c7f7c
YS
5954 }
5955
73c1b41e 5956 r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
8c18b2d2 5957 kvm_starting_cpu, kvm_dying_cpu);
774c47f1 5958 if (r)
d2308784 5959 goto out_free_2;
6aa8b732
AK
5960 register_reboot_notifier(&kvm_reboot_notifier);
5961
c16f862d 5962 /* A kmem cache lets us meet the alignment requirements of fx_save. */
0ee75bea
AK
5963 if (!vcpu_align)
5964 vcpu_align = __alignof__(struct kvm_vcpu);
46515736
PB
5965 kvm_vcpu_cache =
5966 kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
5967 SLAB_ACCOUNT,
5968 offsetof(struct kvm_vcpu, arch),
ce55c049
JZ
5969 offsetofend(struct kvm_vcpu, stats_id)
5970 - offsetof(struct kvm_vcpu, arch),
46515736 5971 NULL);
c16f862d
RR
5972 if (!kvm_vcpu_cache) {
5973 r = -ENOMEM;
fb3600cc 5974 goto out_free_3;
c16f862d
RR
5975 }
5976
baff59cc
VK
5977 for_each_possible_cpu(cpu) {
5978 if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
5979 GFP_KERNEL, cpu_to_node(cpu))) {
5980 r = -ENOMEM;
5981 goto out_free_4;
5982 }
5983 }
5984
af585b92
GN
5985 r = kvm_async_pf_init();
5986 if (r)
5a2a961b 5987 goto out_free_4;
af585b92 5988
6aa8b732
AK
5989 kvm_chardev_ops.owner = module;
5990
5991 r = misc_register(&kvm_dev);
5992 if (r) {
1170adc6 5993 pr_err("kvm: misc device register failed\n");
af585b92 5994 goto out_unreg;
6aa8b732
AK
5995 }
5996
fb3600cc
RW
5997 register_syscore_ops(&kvm_syscore_ops);
5998
15ad7146
AK
5999 kvm_preempt_ops.sched_in = kvm_sched_in;
6000 kvm_preempt_ops.sched_out = kvm_sched_out;
6001
929f45e3 6002 kvm_init_debug();
0ea4ed8e 6003
3c3c29fd
PB
6004 r = kvm_vfio_ops_init();
6005 WARN_ON(r);
6006
c7addb90 6007 return 0;
6aa8b732 6008
af585b92
GN
6009out_unreg:
6010 kvm_async_pf_deinit();
5a2a961b 6011out_free_4:
baff59cc
VK
6012 for_each_possible_cpu(cpu)
6013 free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
c16f862d 6014 kmem_cache_destroy(kvm_vcpu_cache);
d2308784 6015out_free_3:
6aa8b732 6016 unregister_reboot_notifier(&kvm_reboot_notifier);
8c18b2d2 6017 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
d2308784 6018out_free_2:
e9b11c17 6019 kvm_arch_hardware_unsetup();
faf0be22 6020out_free_1:
7f59f492 6021 free_cpumask_var(cpus_hardware_enabled);
d2308784 6022out_free_0:
a0f155e9 6023 kvm_irqfd_exit();
36343f6e 6024out_irqfd:
7dac16c3
AH
6025 kvm_arch_exit();
6026out_fail:
6aa8b732
AK
6027 return r;
6028}
cb498ea2 6029EXPORT_SYMBOL_GPL(kvm_init);
6aa8b732 6030
cb498ea2 6031void kvm_exit(void)
6aa8b732 6032{
baff59cc
VK
6033 int cpu;
6034
4bd33b56 6035 debugfs_remove_recursive(kvm_debugfs_dir);
6aa8b732 6036 misc_deregister(&kvm_dev);
baff59cc
VK
6037 for_each_possible_cpu(cpu)
6038 free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
c16f862d 6039 kmem_cache_destroy(kvm_vcpu_cache);
af585b92 6040 kvm_async_pf_deinit();
fb3600cc 6041 unregister_syscore_ops(&kvm_syscore_ops);
6aa8b732 6042 unregister_reboot_notifier(&kvm_reboot_notifier);
8c18b2d2 6043 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
75b7127c 6044 on_each_cpu(hardware_disable_nolock, NULL, 1);
e9b11c17 6045 kvm_arch_hardware_unsetup();
f8c16bba 6046 kvm_arch_exit();
a0f155e9 6047 kvm_irqfd_exit();
7f59f492 6048 free_cpumask_var(cpus_hardware_enabled);
571ee1b6 6049 kvm_vfio_ops_exit();
6aa8b732 6050}
cb498ea2 6051EXPORT_SYMBOL_GPL(kvm_exit);
c57c8046
JS
6052
6053struct kvm_vm_worker_thread_context {
6054 struct kvm *kvm;
6055 struct task_struct *parent;
6056 struct completion init_done;
6057 kvm_vm_thread_fn_t thread_fn;
6058 uintptr_t data;
6059 int err;
6060};
6061
6062static int kvm_vm_worker_thread(void *context)
6063{
6064 /*
6065 * The init_context is allocated on the stack of the parent thread, so
6066 * we have to make a local copy of anything that is needed beyond initialization.
6067 */
6068 struct kvm_vm_worker_thread_context *init_context = context;
e45cce30 6069 struct task_struct *parent;
c57c8046
JS
6070 struct kvm *kvm = init_context->kvm;
6071 kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
6072 uintptr_t data = init_context->data;
6073 int err;
6074
6075 err = kthread_park(current);
6076 /* kthread_park(current) is never supposed to return an error */
6077 WARN_ON(err != 0);
6078 if (err)
6079 goto init_complete;
6080
6081 err = cgroup_attach_task_all(init_context->parent, current);
6082 if (err) {
6083 kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
6084 __func__, err);
6085 goto init_complete;
6086 }
6087
6088 set_user_nice(current, task_nice(init_context->parent));
6089
6090init_complete:
6091 init_context->err = err;
6092 complete(&init_context->init_done);
6093 init_context = NULL;
6094
6095 if (err)
e45cce30 6096 goto out;
c57c8046
JS
6097
6098 /* Wait to be woken up by the spawner before proceeding. */
6099 kthread_parkme();
6100
6101 if (!kthread_should_stop())
6102 err = thread_fn(kvm, data);
6103
e45cce30
VS
6104out:
6105 /*
6106 * Move the kthread back to its original cgroup to prevent it from
6107 * lingering in the cgroup of the VM process after the latter finishes
6108 * its execution.
6109 *
6110 * kthread_stop() waits on the 'exited' completion condition which is
6111 * set in exit_mm(), via mm_release(), in do_exit(). However, the
6112 * kthread is removed from the cgroup in the cgroup_exit() which is
6113 * called after the exit_mm(). This causes the kthread_stop() to return
6114 * before the kthread actually quits the cgroup.
6115 */
6116 rcu_read_lock();
6117 parent = rcu_dereference(current->real_parent);
6118 get_task_struct(parent);
6119 rcu_read_unlock();
6120 cgroup_attach_task_all(parent, current);
6121 put_task_struct(parent);
6122
c57c8046
JS
6123 return err;
6124}
6125
6126int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
6127 uintptr_t data, const char *name,
6128 struct task_struct **thread_ptr)
6129{
6130 struct kvm_vm_worker_thread_context init_context = {};
6131 struct task_struct *thread;
6132
6133 *thread_ptr = NULL;
6134 init_context.kvm = kvm;
6135 init_context.parent = current;
6136 init_context.thread_fn = thread_fn;
6137 init_context.data = data;
6138 init_completion(&init_context.init_done);
6139
6140 thread = kthread_run(kvm_vm_worker_thread, &init_context,
6141 "%s-%d", name, task_pid_nr(current));
6142 if (IS_ERR(thread))
6143 return PTR_ERR(thread);
6144
6145 /* kthread_run is never supposed to return NULL */
6146 WARN_ON(thread == NULL);
6147
6148 wait_for_completion(&init_context.init_done);
6149
6150 if (!init_context.err)
6151 *thread_ptr = thread;
6152
6153 return init_context.err;
6154}
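
A hedged usage sketch of the helper above: example_vm_worker and example_start_vm_worker are hypothetical, but the parked-then-unparked start sequence mirrors the in-tree user (x86's NX huge-page recovery thread).

/* Runs until kthread_stop(); only proceeds after the unpark below. */
static int example_vm_worker(struct kvm *kvm, uintptr_t data)
{
	while (!kthread_should_stop()) {
		/* ... periodic per-VM housekeeping would go here ... */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}

static int example_start_vm_worker(struct kvm *kvm, struct task_struct **out)
{
	int err;

	err = kvm_vm_create_worker_thread(kvm, example_vm_worker, 0,
					  "example-vm-worker", out);
	if (err)
		return err;

	/* The worker parks itself after init; wake it once the VM is ready. */
	kthread_unpark(*out);
	return 0;
}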