arch/x86/kvm/mmu/mmu.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 */

#include "irq.h"
#include "ioapic.h"
#include "mmu.h"
#include "mmu_internal.h"
#include "tdp_mmu.h"
#include "x86.h"
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
#include "cpuid.h"
#include "spte.h"

#include <linux/kvm_host.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/moduleparam.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/hugetlb.h>
#include <linux/compiler.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
#include <linux/hash.h>
#include <linux/kern_levels.h>
#include <linux/kthread.h>

#include <asm/page.h>
#include <asm/memtype.h>
#include <asm/cmpxchg.h>
#include <asm/io.h>
#include <asm/set_memory.h>
#include <asm/vmx.h>
#include <asm/kvm_page_track.h>
#include "trace.h"

extern bool itlb_multihit_kvm_mitigation;

int __read_mostly nx_huge_pages = -1;
static uint __read_mostly nx_huge_pages_recovery_period_ms;
#ifdef CONFIG_PREEMPT_RT
/* Recovery can cause latency spikes, disable it for PREEMPT_RT. */
static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
#else
static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
#endif

static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp);

static const struct kernel_param_ops nx_huge_pages_ops = {
        .set = set_nx_huge_pages,
        .get = param_get_bool,
};

static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = {
        .set = set_nx_huge_pages_recovery_param,
        .get = param_get_uint,
};

module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
__MODULE_PARM_TYPE(nx_huge_pages, "bool");
module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_param_ops,
                &nx_huge_pages_recovery_ratio, 0644);
__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
module_param_cb(nx_huge_pages_recovery_period_ms, &nx_huge_pages_recovery_param_ops,
                &nx_huge_pages_recovery_period_ms, 0644);
__MODULE_PARM_TYPE(nx_huge_pages_recovery_period_ms, "uint");

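/*
 * Usage note (illustrative, not part of the original source): these knobs
 * are module parameters of kvm.ko, so assuming the standard sysfs layout
 * they can be read and toggled at runtime, e.g.:
 *
 *   cat /sys/module/kvm/parameters/nx_huge_pages
 *   echo 0 > /sys/module/kvm/parameters/nx_huge_pages
 *
 * Writes are routed through set_nx_huge_pages() and
 * set_nx_huge_pages_recovery_param(), which are only declared above (their
 * definitions are outside this excerpt) and validate the value first.
 */
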
static bool __read_mostly force_flush_and_sync_on_reuse;
module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);

/*
 * When set to true, this variable enables Two-Dimensional Paging, where the
 * hardware walks two page tables:
 * 1. the guest-virtual to guest-physical
 * 2. while doing 1., it walks guest-physical to host-physical
 * If the hardware supports this, we don't need to do shadow paging.
 */
bool tdp_enabled = false;

static int max_huge_page_level __read_mostly;
static int tdp_root_level __read_mostly;
static int max_tdp_level __read_mostly;

#ifdef MMU_DEBUG
bool dbg = 0;
module_param(dbg, bool, 0644);
#endif

#define PTE_PREFETCH_NUM		8

#include <trace/events/kvm.h>

/* make pte_list_desc fit well in cache lines */
#define PTE_LIST_EXT 14

/*
 * Slight optimization of cacheline layout, by putting `more' and `spte_count'
 * at the start; then accessing it will only use one single cacheline for
 * either full (entries==PTE_LIST_EXT) case or entries<=6.
 */
struct pte_list_desc {
        struct pte_list_desc *more;
        /*
         * Stores number of entries stored in the pte_list_desc.  No need to be
         * u64 but just for easier alignment.  When PTE_LIST_EXT, means full.
         */
        u64 spte_count;
        u64 *sptes[PTE_LIST_EXT];
};

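/*
 * Illustrative arithmetic for the layout comment above (assuming a 64-bit
 * host, where pointers and u64 are both 8 bytes): sizeof(struct
 * pte_list_desc) = 8 (more) + 8 (spte_count) + 14 * 8 (sptes) = 128 bytes,
 * i.e. exactly two 64-byte cache lines.  `more', `spte_count' and
 * sptes[0..5] all land in the first cache line, so walking a chain of full
 * descriptors, or reading a descriptor with <= 6 entries, touches only one
 * cache line per descriptor.
 */
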
struct kvm_shadow_walk_iterator {
        u64 addr;
        hpa_t shadow_addr;
        u64 *sptep;
        int level;
        unsigned index;
};

#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)	\
        for (shadow_walk_init_using_root(&(_walker), (_vcpu),		\
                                         (_root), (_addr));		\
             shadow_walk_okay(&(_walker));				\
             shadow_walk_next(&(_walker)))

#define for_each_shadow_entry(_vcpu, _addr, _walker)		\
        for (shadow_walk_init(&(_walker), _vcpu, _addr);	\
             shadow_walk_okay(&(_walker));			\
             shadow_walk_next(&(_walker)))

#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)	\
        for (shadow_walk_init(&(_walker), _vcpu, _addr);		\
             shadow_walk_okay(&(_walker)) &&				\
                ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });	\
             __shadow_walk_next(&(_walker), spte))

static struct kmem_cache *pte_list_desc_cache;
struct kmem_cache *mmu_page_header_cache;
static struct percpu_counter kvm_total_used_mmu_pages;

static void mmu_spte_set(u64 *sptep, u64 spte);

struct kvm_mmu_role_regs {
        const unsigned long cr0;
        const unsigned long cr4;
        const u64 efer;
};

#define CREATE_TRACE_POINTS
#include "mmutrace.h"

/*
 * Yes, lots of underscores.  They're a hint that you probably shouldn't be
 * reading from the role_regs.  Once the root_role is constructed, it becomes
 * the single source of truth for the MMU's state.
 */
#define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag)			\
static inline bool __maybe_unused					\
____is_##reg##_##name(const struct kvm_mmu_role_regs *regs)		\
{									\
        return !!(regs->reg & flag);					\
}
BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57);
BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX);
BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);

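/*
 * For illustration, the first invocation above expands (modulo whitespace)
 * to:
 *
 *	static inline bool __maybe_unused
 *	____is_cr0_pg(const struct kvm_mmu_role_regs *regs)
 *	{
 *		return !!(regs->cr0 & X86_CR0_PG);
 *	}
 *
 * i.e. one ____is_<reg>_<name>() helper is generated per (reg, name, flag)
 * tuple listed above.
 */
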
/*
 * The MMU itself (with a valid role) is the single source of truth for the
 * MMU.  Do not use the regs used to build the MMU/role, nor the vCPU.  The
 * regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1,
 * and the vCPU may be incorrect/irrelevant.
 */
#define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name)			\
static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu)	\
{									\
        return !!(mmu->cpu_role. base_or_ext . reg##_##name);		\
}
BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pse);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smep);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smap);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pke);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, la57);
BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
BUILD_MMU_ROLE_ACCESSOR(ext, efer, lma);

static inline bool is_cr0_pg(struct kvm_mmu *mmu)
{
        return mmu->cpu_role.base.level > 0;
}

static inline bool is_cr4_pae(struct kvm_mmu *mmu)
{
        return !mmu->cpu_role.base.has_4_byte_gpte;
}

static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
{
        struct kvm_mmu_role_regs regs = {
                .cr0 = kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS),
                .cr4 = kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS),
                .efer = vcpu->arch.efer,
        };

        return regs;
}

static inline bool kvm_available_flush_tlb_with_range(void)
{
        return kvm_x86_ops.tlb_remote_flush_with_range;
}

static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
                struct kvm_tlb_range *range)
{
        int ret = -ENOTSUPP;

        if (range && kvm_x86_ops.tlb_remote_flush_with_range)
                ret = static_call(kvm_x86_tlb_remote_flush_with_range)(kvm, range);

        if (ret)
                kvm_flush_remote_tlbs(kvm);
}

void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
                u64 start_gfn, u64 pages)
{
        struct kvm_tlb_range range;

        range.start_gfn = start_gfn;
        range.pages = pages;

        kvm_flush_remote_tlbs_with_range(kvm, &range);
}

static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
                           unsigned int access)
{
        u64 spte = make_mmio_spte(vcpu, gfn, access);

        trace_mark_mmio_spte(sptep, gfn, spte);
        mmu_spte_set(sptep, spte);
}

static gfn_t get_mmio_spte_gfn(u64 spte)
{
        u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;

        gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)
               & shadow_nonpresent_or_rsvd_mask;

        return gpa >> PAGE_SHIFT;
}

static unsigned get_mmio_spte_access(u64 spte)
{
        return spte & shadow_mmio_access_mask;
}

static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
{
        u64 kvm_gen, spte_gen, gen;

        gen = kvm_vcpu_memslots(vcpu)->generation;
        if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
                return false;

        kvm_gen = gen & MMIO_SPTE_GEN_MASK;
        spte_gen = get_mmio_spte_generation(spte);

        trace_check_mmio_spte(spte, kvm_gen, spte_gen);
        return likely(kvm_gen == spte_gen);
}

static int is_cpuid_PSE36(void)
{
        return 1;
}

#ifdef CONFIG_X86_64
static void __set_spte(u64 *sptep, u64 spte)
{
        WRITE_ONCE(*sptep, spte);
}

static void __update_clear_spte_fast(u64 *sptep, u64 spte)
{
        WRITE_ONCE(*sptep, spte);
}

static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
{
        return xchg(sptep, spte);
}

static u64 __get_spte_lockless(u64 *sptep)
{
        return READ_ONCE(*sptep);
}
#else
union split_spte {
        struct {
                u32 spte_low;
                u32 spte_high;
        };
        u64 spte;
};

static void count_spte_clear(u64 *sptep, u64 spte)
{
        struct kvm_mmu_page *sp = sptep_to_sp(sptep);

        if (is_shadow_present_pte(spte))
                return;

        /* Ensure the spte is completely set before we increase the count */
        smp_wmb();
        sp->clear_spte_count++;
}

static void __set_spte(u64 *sptep, u64 spte)
{
        union split_spte *ssptep, sspte;

        ssptep = (union split_spte *)sptep;
        sspte = (union split_spte)spte;

        ssptep->spte_high = sspte.spte_high;

        /*
         * If we map the spte from nonpresent to present, we should store
         * the high bits first and only then set the present bit, so the CPU
         * cannot fetch this spte while we are setting it.
         */
        smp_wmb();

        WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
}

static void __update_clear_spte_fast(u64 *sptep, u64 spte)
{
        union split_spte *ssptep, sspte;

        ssptep = (union split_spte *)sptep;
        sspte = (union split_spte)spte;

        WRITE_ONCE(ssptep->spte_low, sspte.spte_low);

        /*
         * If we map the spte from present to nonpresent, we should clear
         * the present bit first to avoid the vcpu fetching the old high bits.
         */
        smp_wmb();

        ssptep->spte_high = sspte.spte_high;
        count_spte_clear(sptep, spte);
}

static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
{
        union split_spte *ssptep, sspte, orig;

        ssptep = (union split_spte *)sptep;
        sspte = (union split_spte)spte;

        /* xchg acts as a barrier before the setting of the high bits */
        orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
        orig.spte_high = ssptep->spte_high;
        ssptep->spte_high = sspte.spte_high;
        count_spte_clear(sptep, spte);

        return orig.spte;
}

/*
 * The idea of using this light way to get the spte on x86_32 guests comes
 * from gup_get_pte (mm/gup.c).
 *
 * An spte tlb flush may be pending, because kvm_set_pte_rmap
 * coalesces them and we are running outside of the MMU lock.  Therefore
 * we need to protect against in-progress updates of the spte.
 *
 * Reading the spte while an update is in progress may get the old value
 * for the high part of the spte.  The race is fine for a present->non-present
 * change (because the high part of the spte is ignored for non-present spte),
 * but for a present->present change we must reread the spte.
 *
 * All such changes are done in two steps (present->non-present and
 * non-present->present), hence it is enough to count the number of
 * present->non-present updates: if it changed while reading the spte,
 * we might have hit the race.  This is done using clear_spte_count.
 */
static u64 __get_spte_lockless(u64 *sptep)
{
        struct kvm_mmu_page *sp = sptep_to_sp(sptep);
        union split_spte spte, *orig = (union split_spte *)sptep;
        int count;

retry:
        count = sp->clear_spte_count;
        smp_rmb();

        spte.spte_low = orig->spte_low;
        smp_rmb();

        spte.spte_high = orig->spte_high;
        smp_rmb();

        if (unlikely(spte.spte_low != orig->spte_low ||
                     count != sp->clear_spte_count))
                goto retry;

        return spte.spte;
}
#endif

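/*
 * Summary of the two implementations above: on 64-bit hosts an SPTE can be
 * read and written atomically with plain READ_ONCE()/WRITE_ONCE()/xchg().
 * On 32-bit hosts the 64-bit SPTE is split into two 32-bit halves, so the
 * helpers order the halves around smp_wmb() (write the high half before
 * setting the present bit, clear the present bit before touching the high
 * half) and bump clear_spte_count on present->non-present transitions,
 * which lets __get_spte_lockless() detect and retry torn reads.
 */
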
/* Rules for using mmu_spte_set:
 * Set the sptep from nonpresent to present.
 * Note: the sptep being assigned *must* be either not present
 * or in a state where the hardware will not attempt to update
 * the spte.
 */
static void mmu_spte_set(u64 *sptep, u64 new_spte)
{
        WARN_ON(is_shadow_present_pte(*sptep));
        __set_spte(sptep, new_spte);
}

/*
 * Update the SPTE (excluding the PFN), but do not track changes in its
 * accessed/dirty status.
 */
static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
{
        u64 old_spte = *sptep;

        WARN_ON(!is_shadow_present_pte(new_spte));
        check_spte_writable_invariants(new_spte);

        if (!is_shadow_present_pte(old_spte)) {
                mmu_spte_set(sptep, new_spte);
                return old_spte;
        }

        if (!spte_has_volatile_bits(old_spte))
                __update_clear_spte_fast(sptep, new_spte);
        else
                old_spte = __update_clear_spte_slow(sptep, new_spte);

        WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));

        return old_spte;
}

/* Rules for using mmu_spte_update:
 * Update the state bits; the mapped pfn is not changed.
 *
 * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote
 * TLBs must be flushed.  Otherwise rmap_write_protect will find a read-only
 * spte, even though the writable spte might be cached on a CPU's TLB.
 *
 * Returns true if the TLB needs to be flushed
 */
static bool mmu_spte_update(u64 *sptep, u64 new_spte)
{
        bool flush = false;
        u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);

        if (!is_shadow_present_pte(old_spte))
                return false;

        /*
         * Updating the spte outside of mmu_lock is safe, since we always
         * update it atomically; see the comments in
         * spte_has_volatile_bits().
         */
        if (is_mmu_writable_spte(old_spte) &&
            !is_writable_pte(new_spte))
                flush = true;

        /*
         * Flush TLB when accessed/dirty states are changed in the page tables,
         * to guarantee consistency between TLB and page tables.
         */

        if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
                flush = true;
                kvm_set_pfn_accessed(spte_to_pfn(old_spte));
        }

        if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
                flush = true;
                kvm_set_pfn_dirty(spte_to_pfn(old_spte));
        }

        return flush;
}

/*
 * Rules for using mmu_spte_clear_track_bits:
 * It sets the sptep from present to nonpresent and tracks the state bits;
 * it is used to clear the last level sptep.
 * Returns the old PTE.
 */
static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
{
        kvm_pfn_t pfn;
        u64 old_spte = *sptep;
        int level = sptep_to_sp(sptep)->role.level;
        struct page *page;

        if (!is_shadow_present_pte(old_spte) ||
            !spte_has_volatile_bits(old_spte))
                __update_clear_spte_fast(sptep, 0ull);
        else
                old_spte = __update_clear_spte_slow(sptep, 0ull);

        if (!is_shadow_present_pte(old_spte))
                return old_spte;

        kvm_update_page_stats(kvm, level, -1);

        pfn = spte_to_pfn(old_spte);

        /*
         * KVM doesn't hold a reference to any pages mapped into the guest, and
         * instead uses the mmu_notifier to ensure that KVM unmaps any pages
         * before they are reclaimed.  Sanity check that, if the pfn is backed
         * by a refcounted page, the refcount is elevated.
         */
        page = kvm_pfn_to_refcounted_page(pfn);
        WARN_ON(page && !page_count(page));

        if (is_accessed_spte(old_spte))
                kvm_set_pfn_accessed(pfn);

        if (is_dirty_spte(old_spte))
                kvm_set_pfn_dirty(pfn);

        return old_spte;
}

/*
 * Rules for using mmu_spte_clear_no_track:
 * Directly clear the spte without caring about its state bits;
 * it is used to set the upper level spte.
 */
static void mmu_spte_clear_no_track(u64 *sptep)
{
        __update_clear_spte_fast(sptep, 0ull);
}

static u64 mmu_spte_get_lockless(u64 *sptep)
{
        return __get_spte_lockless(sptep);
}

/* Returns the Accessed status of the PTE and resets it at the same time. */
static bool mmu_spte_age(u64 *sptep)
{
        u64 spte = mmu_spte_get_lockless(sptep);

        if (!is_accessed_spte(spte))
                return false;

        if (spte_ad_enabled(spte)) {
                clear_bit((ffs(shadow_accessed_mask) - 1),
                          (unsigned long *)sptep);
        } else {
                /*
                 * Capture the dirty status of the page, so that it doesn't get
                 * lost when the SPTE is marked for access tracking.
                 */
                if (is_writable_pte(spte))
                        kvm_set_pfn_dirty(spte_to_pfn(spte));

                spte = mark_spte_for_access_track(spte);
                mmu_spte_update_no_track(sptep, spte);
        }

        return true;
}

static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
{
        if (is_tdp_mmu(vcpu->arch.mmu)) {
                kvm_tdp_mmu_walk_lockless_begin();
        } else {
                /*
                 * Prevent page table teardown by making any free-er wait during
                 * kvm_flush_remote_tlbs() IPI to all active vcpus.
                 */
                local_irq_disable();

                /*
                 * Make sure a following spte read is not reordered ahead of the write
                 * to vcpu->mode.
                 */
                smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
        }
}

static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
{
        if (is_tdp_mmu(vcpu->arch.mmu)) {
                kvm_tdp_mmu_walk_lockless_end();
        } else {
                /*
                 * Make sure the write to vcpu->mode is not reordered in front of
                 * reads to sptes.  If it does, kvm_mmu_commit_zap_page() can see us
                 * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
                 */
                smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
                local_irq_enable();
        }
}

static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
{
        int r;

        /* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
        r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
                                       1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
        if (r)
                return r;
        r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
                                       PT64_ROOT_MAX_LEVEL);
        if (r)
                return r;
        if (maybe_indirect) {
                r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache,
                                               PT64_ROOT_MAX_LEVEL);
                if (r)
                        return r;
        }
        return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
                                          PT64_ROOT_MAX_LEVEL);
}

static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
        kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
        kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
        kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
        kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
}

static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
{
        kmem_cache_free(pte_list_desc_cache, pte_list_desc);
}

static bool sp_has_gptes(struct kvm_mmu_page *sp);

static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
{
        if (sp->role.passthrough)
                return sp->gfn;

        if (!sp->role.direct)
                return sp->shadowed_translation[index] >> PAGE_SHIFT;

        return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS));
}
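/*
 * Worked example for the direct case above, assuming SPTE_LEVEL_BITS == 9
 * (512 entries per page table): for a direct shadow page with
 * role.level == 1, index i maps sp->gfn + i; for role.level == 2, each index
 * covers 512 gfns, so index i maps sp->gfn + i * 512, and so on up the
 * levels.
 */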

/*
 * For leaf SPTEs, fetch the *guest* access permissions being shadowed.  Note
 * that the SPTE itself may have more constrained access permissions than
 * what the guest enforces.  For example, a guest may create an executable
 * huge PTE but KVM may disallow execution to mitigate iTLB multihit.
 */
static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
{
        if (sp_has_gptes(sp))
                return sp->shadowed_translation[index] & ACC_ALL;

        /*
         * For direct MMUs (e.g. TDP or non-paging guests) or passthrough SPs,
         * KVM is not shadowing any guest page tables, so the "guest access
         * permissions" are just ACC_ALL.
         *
         * For direct SPs in indirect MMUs (shadow paging), i.e. when KVM
         * is shadowing a guest huge page with small pages, the guest access
         * permissions being shadowed are the access permissions of the huge
         * page.
         *
         * In both cases, sp->role.access contains the correct access bits.
         */
        return sp->role.access;
}

static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index,
                                         gfn_t gfn, unsigned int access)
{
        if (sp_has_gptes(sp)) {
                sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access;
                return;
        }

        WARN_ONCE(access != kvm_mmu_page_get_access(sp, index),
                  "access mismatch under %s page %llx (expected %u, got %u)\n",
                  sp->role.passthrough ? "passthrough" : "direct",
                  sp->gfn, kvm_mmu_page_get_access(sp, index), access);

        WARN_ONCE(gfn != kvm_mmu_page_get_gfn(sp, index),
                  "gfn mismatch under %s page %llx (expected %llx, got %llx)\n",
                  sp->role.passthrough ? "passthrough" : "direct",
                  sp->gfn, kvm_mmu_page_get_gfn(sp, index), gfn);
}

static void kvm_mmu_page_set_access(struct kvm_mmu_page *sp, int index,
                                    unsigned int access)
{
        gfn_t gfn = kvm_mmu_page_get_gfn(sp, index);

        kvm_mmu_page_set_translation(sp, index, gfn, access);
}

/*
 * Return the pointer to the large page information for a given gfn,
 * handling slots that are not large page aligned.
 */
static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
                const struct kvm_memory_slot *slot, int level)
{
        unsigned long idx;

        idx = gfn_to_index(gfn, slot->base_gfn, level);
        return &slot->arch.lpage_info[level - 2][idx];
}

static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
                                            gfn_t gfn, int count)
{
        struct kvm_lpage_info *linfo;
        int i;

        for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
                linfo = lpage_info_slot(gfn, slot, i);
                linfo->disallow_lpage += count;
                WARN_ON(linfo->disallow_lpage < 0);
        }
}

void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
{
        update_gfn_disallow_lpage_count(slot, gfn, 1);
}

void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
{
        update_gfn_disallow_lpage_count(slot, gfn, -1);
}

static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
        struct kvm_memslots *slots;
        struct kvm_memory_slot *slot;
        gfn_t gfn;

        kvm->arch.indirect_shadow_pages++;
        gfn = sp->gfn;
        slots = kvm_memslots_for_spte_role(kvm, sp->role);
        slot = __gfn_to_memslot(slots, gfn);

        /* the non-leaf shadow pages are kept read-only. */
        if (sp->role.level > PG_LEVEL_4K)
                return kvm_slot_page_track_add_page(kvm, slot, gfn,
                                                    KVM_PAGE_TRACK_WRITE);

        kvm_mmu_gfn_disallow_lpage(slot, gfn);

        if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K))
                kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
}

void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
        if (sp->lpage_disallowed)
                return;

        ++kvm->stat.nx_lpage_splits;
        list_add_tail(&sp->lpage_disallowed_link,
                      &kvm->arch.lpage_disallowed_mmu_pages);
        sp->lpage_disallowed = true;
}

static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
        struct kvm_memslots *slots;
        struct kvm_memory_slot *slot;
        gfn_t gfn;

        kvm->arch.indirect_shadow_pages--;
        gfn = sp->gfn;
        slots = kvm_memslots_for_spte_role(kvm, sp->role);
        slot = __gfn_to_memslot(slots, gfn);
        if (sp->role.level > PG_LEVEL_4K)
                return kvm_slot_page_track_remove_page(kvm, slot, gfn,
                                                       KVM_PAGE_TRACK_WRITE);

        kvm_mmu_gfn_allow_lpage(slot, gfn);
}

void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
        --kvm->stat.nx_lpage_splits;
        sp->lpage_disallowed = false;
        list_del(&sp->lpage_disallowed_link);
}

static struct kvm_memory_slot *
gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
                            bool no_dirty_log)
{
        struct kvm_memory_slot *slot;

        slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
        if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
                return NULL;
        if (no_dirty_log && kvm_slot_dirty_track_enabled(slot))
                return NULL;

        return slot;
}

/*
 * About rmap_head encoding:
 *
 * If the bit zero of rmap_head->val is clear, then it points to the only spte
 * in this rmap chain.  Otherwise, (rmap_head->val & ~1) points to a struct
 * pte_list_desc containing more mappings.
 */
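/*
 * Illustration of how the encoding evolves as sptes are added by
 * pte_list_add() below:
 *
 *   no sptes:    rmap_head->val == 0
 *   one spte:    rmap_head->val == (unsigned long)spte        (bit 0 clear)
 *   many sptes:  rmap_head->val == (unsigned long)desc | 1    (bit 0 set),
 *                where desc is a pte_list_desc holding up to PTE_LIST_EXT
 *                sptes and chaining to further descriptors via desc->more.
 */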

/*
 * Returns the number of pointers in the rmap chain, not counting the new one.
 */
static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
                        struct kvm_rmap_head *rmap_head)
{
        struct pte_list_desc *desc;
        int count = 0;

        if (!rmap_head->val) {
                rmap_printk("%p %llx 0->1\n", spte, *spte);
                rmap_head->val = (unsigned long)spte;
        } else if (!(rmap_head->val & 1)) {
                rmap_printk("%p %llx 1->many\n", spte, *spte);
                desc = kvm_mmu_memory_cache_alloc(cache);
                desc->sptes[0] = (u64 *)rmap_head->val;
                desc->sptes[1] = spte;
                desc->spte_count = 2;
                rmap_head->val = (unsigned long)desc | 1;
                ++count;
        } else {
                rmap_printk("%p %llx many->many\n", spte, *spte);
                desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
                while (desc->spte_count == PTE_LIST_EXT) {
                        count += PTE_LIST_EXT;
                        if (!desc->more) {
                                desc->more = kvm_mmu_memory_cache_alloc(cache);
                                desc = desc->more;
                                desc->spte_count = 0;
                                break;
                        }
                        desc = desc->more;
                }
                count += desc->spte_count;
                desc->sptes[desc->spte_count++] = spte;
        }
        return count;
}

static void
pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
                           struct pte_list_desc *desc, int i,
                           struct pte_list_desc *prev_desc)
{
        int j = desc->spte_count - 1;

        desc->sptes[i] = desc->sptes[j];
        desc->sptes[j] = NULL;
        desc->spte_count--;
        if (desc->spte_count)
                return;
        if (!prev_desc && !desc->more)
                rmap_head->val = 0;
        else
                if (prev_desc)
                        prev_desc->more = desc->more;
                else
                        rmap_head->val = (unsigned long)desc->more | 1;
        mmu_free_pte_list_desc(desc);
}

static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
{
        struct pte_list_desc *desc;
        struct pte_list_desc *prev_desc;
        int i;

        if (!rmap_head->val) {
                pr_err("%s: %p 0->BUG\n", __func__, spte);
                BUG();
        } else if (!(rmap_head->val & 1)) {
                rmap_printk("%p 1->0\n", spte);
                if ((u64 *)rmap_head->val != spte) {
                        pr_err("%s: %p 1->BUG\n", __func__, spte);
                        BUG();
                }
                rmap_head->val = 0;
        } else {
                rmap_printk("%p many->many\n", spte);
                desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
                prev_desc = NULL;
                while (desc) {
                        for (i = 0; i < desc->spte_count; ++i) {
                                if (desc->sptes[i] == spte) {
                                        pte_list_desc_remove_entry(rmap_head,
                                                        desc, i, prev_desc);
                                        return;
                                }
                        }
                        prev_desc = desc;
                        desc = desc->more;
                }
                pr_err("%s: %p many->many\n", __func__, spte);
                BUG();
        }
}

static void kvm_zap_one_rmap_spte(struct kvm *kvm,
                                  struct kvm_rmap_head *rmap_head, u64 *sptep)
{
        mmu_spte_clear_track_bits(kvm, sptep);
        pte_list_remove(sptep, rmap_head);
}

/* Return true if at least one SPTE was zapped, false otherwise */
static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
                                   struct kvm_rmap_head *rmap_head)
{
        struct pte_list_desc *desc, *next;
        int i;

        if (!rmap_head->val)
                return false;

        if (!(rmap_head->val & 1)) {
                mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val);
                goto out;
        }

        desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);

        for (; desc; desc = next) {
                for (i = 0; i < desc->spte_count; i++)
                        mmu_spte_clear_track_bits(kvm, desc->sptes[i]);
                next = desc->more;
                mmu_free_pte_list_desc(desc);
        }
out:
        /* rmap_head is meaningless now, remember to reset it */
        rmap_head->val = 0;
        return true;
}

unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
{
        struct pte_list_desc *desc;
        unsigned int count = 0;

        if (!rmap_head->val)
                return 0;
        else if (!(rmap_head->val & 1))
                return 1;

        desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);

        while (desc) {
                count += desc->spte_count;
                desc = desc->more;
        }

        return count;
}

static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
                                         const struct kvm_memory_slot *slot)
{
        unsigned long idx;

        idx = gfn_to_index(gfn, slot->base_gfn, level);
        return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
}

static bool rmap_can_add(struct kvm_vcpu *vcpu)
{
        struct kvm_mmu_memory_cache *mc;

        mc = &vcpu->arch.mmu_pte_list_desc_cache;
        return kvm_mmu_memory_cache_nr_free_objects(mc);
}

static void rmap_remove(struct kvm *kvm, u64 *spte)
{
        struct kvm_memslots *slots;
        struct kvm_memory_slot *slot;
        struct kvm_mmu_page *sp;
        gfn_t gfn;
        struct kvm_rmap_head *rmap_head;

        sp = sptep_to_sp(spte);
        gfn = kvm_mmu_page_get_gfn(sp, spte_index(spte));

        /*
         * Unlike rmap_add, rmap_remove does not run in the context of a vCPU
         * so we have to determine which memslots to use based on context
         * information in sp->role.
         */
        slots = kvm_memslots_for_spte_role(kvm, sp->role);

        slot = __gfn_to_memslot(slots, gfn);
        rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);

        pte_list_remove(spte, rmap_head);
}

/*
 * Used by the following functions to iterate through the sptes linked by a
 * rmap.  All fields are private and not assumed to be used outside.
 */
struct rmap_iterator {
        /* private fields */
        struct pte_list_desc *desc;     /* holds the sptep if not NULL */
        int pos;                        /* index of the sptep */
};

/*
 * Iteration must be started by this function.  This should also be used after
 * removing/dropping sptes from the rmap link because in such cases the
 * information in the iterator may not be valid.
 *
 * Returns sptep if found, NULL otherwise.
 */
static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
                           struct rmap_iterator *iter)
{
        u64 *sptep;

        if (!rmap_head->val)
                return NULL;

        if (!(rmap_head->val & 1)) {
                iter->desc = NULL;
                sptep = (u64 *)rmap_head->val;
                goto out;
        }

        iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
        iter->pos = 0;
        sptep = iter->desc->sptes[iter->pos];
out:
        BUG_ON(!is_shadow_present_pte(*sptep));
        return sptep;
}

/*
 * Must be used with a valid iterator: e.g. after rmap_get_first().
 *
 * Returns sptep if found, NULL otherwise.
 */
static u64 *rmap_get_next(struct rmap_iterator *iter)
{
        u64 *sptep;

        if (iter->desc) {
                if (iter->pos < PTE_LIST_EXT - 1) {
                        ++iter->pos;
                        sptep = iter->desc->sptes[iter->pos];
                        if (sptep)
                                goto out;
                }

                iter->desc = iter->desc->more;

                if (iter->desc) {
                        iter->pos = 0;
                        /* desc->sptes[0] cannot be NULL */
                        sptep = iter->desc->sptes[iter->pos];
                        goto out;
                }
        }

        return NULL;
out:
        BUG_ON(!is_shadow_present_pte(*sptep));
        return sptep;
}

#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_)			\
        for (_spte_ = rmap_get_first(_rmap_head_, _iter_);		\
             _spte_; _spte_ = rmap_get_next(_iter_))

static void drop_spte(struct kvm *kvm, u64 *sptep)
{
        u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep);

        if (is_shadow_present_pte(old_spte))
                rmap_remove(kvm, sptep);
}

static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush)
{
        struct kvm_mmu_page *sp;

        sp = sptep_to_sp(sptep);
        WARN_ON(sp->role.level == PG_LEVEL_4K);

        drop_spte(kvm, sptep);

        if (flush)
                kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
                        KVM_PAGES_PER_HPAGE(sp->role.level));
}

/*
 * Write-protect the specified @sptep; @pt_protect indicates whether the
 * write-protection is caused by protecting a shadow page table.
 *
 * Note: write protection differs between dirty logging and spte
 * protection:
 * - for dirty logging, the spte can be set to writable at any time if
 *   its dirty bitmap is properly set.
 * - for spte protection, the spte can be writable only after unsync-ing
 *   the shadow page.
 *
 * Return true if the TLB needs to be flushed.
 */
static bool spte_write_protect(u64 *sptep, bool pt_protect)
{
        u64 spte = *sptep;

        if (!is_writable_pte(spte) &&
            !(pt_protect && is_mmu_writable_spte(spte)))
                return false;

        rmap_printk("spte %p %llx\n", sptep, *sptep);

        if (pt_protect)
                spte &= ~shadow_mmu_writable_mask;
        spte = spte & ~PT_WRITABLE_MASK;

        return mmu_spte_update(sptep, spte);
}

static bool rmap_write_protect(struct kvm_rmap_head *rmap_head,
                               bool pt_protect)
{
        u64 *sptep;
        struct rmap_iterator iter;
        bool flush = false;

        for_each_rmap_spte(rmap_head, &iter, sptep)
                flush |= spte_write_protect(sptep, pt_protect);

        return flush;
}

static bool spte_clear_dirty(u64 *sptep)
{
        u64 spte = *sptep;

        rmap_printk("spte %p %llx\n", sptep, *sptep);

        MMU_WARN_ON(!spte_ad_enabled(spte));
        spte &= ~shadow_dirty_mask;
        return mmu_spte_update(sptep, spte);
}

static bool spte_wrprot_for_clear_dirty(u64 *sptep)
{
        bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
                                               (unsigned long *)sptep);
        if (was_writable && !spte_ad_enabled(*sptep))
                kvm_set_pfn_dirty(spte_to_pfn(*sptep));

        return was_writable;
}

/*
 * Gets the GFN ready for another round of dirty logging by clearing the
 *	- D bit on ad-enabled SPTEs, and
 *	- W bit on ad-disabled SPTEs.
 * Returns true iff any D or W bits were cleared.
 */
static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
                               const struct kvm_memory_slot *slot)
{
        u64 *sptep;
        struct rmap_iterator iter;
        bool flush = false;

        for_each_rmap_spte(rmap_head, &iter, sptep)
                if (spte_ad_need_write_protect(*sptep))
                        flush |= spte_wrprot_for_clear_dirty(sptep);
                else
                        flush |= spte_clear_dirty(sptep);

        return flush;
}

/**
 * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
 * @kvm: kvm instance
 * @slot: slot to protect
 * @gfn_offset: start of the BITS_PER_LONG pages we care about
 * @mask: indicates which pages we should protect
 *
 * Used when we do not need to care about huge page mappings.
 */
static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
                                     struct kvm_memory_slot *slot,
                                     gfn_t gfn_offset, unsigned long mask)
{
        struct kvm_rmap_head *rmap_head;

        if (is_tdp_mmu_enabled(kvm))
                kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
                                slot->base_gfn + gfn_offset, mask, true);

        if (!kvm_memslots_have_rmaps(kvm))
                return;

        while (mask) {
                rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
                                        PG_LEVEL_4K, slot);
                rmap_write_protect(rmap_head, false);

                /* clear the first set bit */
                mask &= mask - 1;
        }
}

/**
 * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
 * protect the page if the D-bit isn't supported.
 * @kvm: kvm instance
 * @slot: slot to clear D-bit
 * @gfn_offset: start of the BITS_PER_LONG pages we care about
 * @mask: indicates which pages we should clear D-bit
 *
 * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
 */
static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
                                          struct kvm_memory_slot *slot,
                                          gfn_t gfn_offset, unsigned long mask)
{
        struct kvm_rmap_head *rmap_head;

        if (is_tdp_mmu_enabled(kvm))
                kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
                                slot->base_gfn + gfn_offset, mask, false);

        if (!kvm_memslots_have_rmaps(kvm))
                return;

        while (mask) {
                rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
                                        PG_LEVEL_4K, slot);
                __rmap_clear_dirty(kvm, rmap_head, slot);

                /* clear the first set bit */
                mask &= mask - 1;
        }
}

/**
 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
 * PT level pages.
 *
 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
 * enable dirty logging for them.
 *
 * We need to care about huge page mappings: e.g. during dirty logging we may
 * have such mappings.
 */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
                                struct kvm_memory_slot *slot,
                                gfn_t gfn_offset, unsigned long mask)
{
        /*
         * Huge pages are NOT write protected when we start dirty logging in
         * initially-all-set mode; must write protect them here so that they
         * are split to 4K on the first write.
         *
         * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn
         * of memslot has no such restriction, so the range can cross two large
         * pages.
         */
        if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
                gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
                gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);

                if (READ_ONCE(eager_page_split))
                        kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K);

                kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);

                /* Cross two large pages? */
                if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) !=
                    ALIGN(end << PAGE_SHIFT, PMD_SIZE))
                        kvm_mmu_slot_gfn_write_protect(kvm, slot, end,
                                                       PG_LEVEL_2M);
        }

        /* Now handle 4K PTEs.  */
        if (kvm_x86_ops.cpu_dirty_log_size)
                kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
        else
                kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}

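/*
 * Worked example for the mask handling above: gfn_offset is aligned to
 * BITS_PER_LONG (64), and bit n of @mask corresponds to gfn
 * slot->base_gfn + gfn_offset + n.  E.g. with mask == 0x3, only the first
 * two gfns of the 64-gfn window are affected; __ffs(mask) and __fls(mask)
 * therefore bound the huge-page write-protection range in the
 * initially-all-set path, and the "mask &= mask - 1" loops above clear one
 * bit (one 4K gfn) per iteration.
 */
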
int kvm_cpu_dirty_log_size(void)
{
        return kvm_x86_ops.cpu_dirty_log_size;
}

bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
                                    struct kvm_memory_slot *slot, u64 gfn,
                                    int min_level)
{
        struct kvm_rmap_head *rmap_head;
        int i;
        bool write_protected = false;

        if (kvm_memslots_have_rmaps(kvm)) {
                for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
                        rmap_head = gfn_to_rmap(gfn, i, slot);
                        write_protected |= rmap_write_protect(rmap_head, true);
                }
        }

        if (is_tdp_mmu_enabled(kvm))
                write_protected |=
                        kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level);

        return write_protected;
}

static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn)
{
        struct kvm_memory_slot *slot;

        slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
        return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K);
}

static bool __kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
                           const struct kvm_memory_slot *slot)
{
        return kvm_zap_all_rmap_sptes(kvm, rmap_head);
}

static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
                         struct kvm_memory_slot *slot, gfn_t gfn, int level,
                         pte_t unused)
{
        return __kvm_zap_rmap(kvm, rmap_head, slot);
}

static bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
                             struct kvm_memory_slot *slot, gfn_t gfn, int level,
                             pte_t pte)
{
        u64 *sptep;
        struct rmap_iterator iter;
        bool need_flush = false;
        u64 new_spte;
        kvm_pfn_t new_pfn;

        WARN_ON(pte_huge(pte));
        new_pfn = pte_pfn(pte);

restart:
        for_each_rmap_spte(rmap_head, &iter, sptep) {
                rmap_printk("spte %p %llx gfn %llx (%d)\n",
                            sptep, *sptep, gfn, level);

                need_flush = true;

                if (pte_write(pte)) {
                        kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
                        goto restart;
                } else {
                        new_spte = kvm_mmu_changed_pte_notifier_make_spte(
                                        *sptep, new_pfn);

                        mmu_spte_clear_track_bits(kvm, sptep);
                        mmu_spte_set(sptep, new_spte);
                }
        }

        if (need_flush && kvm_available_flush_tlb_with_range()) {
                kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
                return false;
        }

        return need_flush;
}

struct slot_rmap_walk_iterator {
        /* input fields. */
        const struct kvm_memory_slot *slot;
        gfn_t start_gfn;
        gfn_t end_gfn;
        int start_level;
        int end_level;

        /* output fields. */
        gfn_t gfn;
        struct kvm_rmap_head *rmap;
        int level;

        /* private field. */
        struct kvm_rmap_head *end_rmap;
};

static void
rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
{
        iterator->level = level;
        iterator->gfn = iterator->start_gfn;
        iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot);
        iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot);
}

static void
slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
                    const struct kvm_memory_slot *slot, int start_level,
                    int end_level, gfn_t start_gfn, gfn_t end_gfn)
{
        iterator->slot = slot;
        iterator->start_level = start_level;
        iterator->end_level = end_level;
        iterator->start_gfn = start_gfn;
        iterator->end_gfn = end_gfn;

        rmap_walk_init_level(iterator, iterator->start_level);
}

static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
{
        return !!iterator->rmap;
}

static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
{
        while (++iterator->rmap <= iterator->end_rmap) {
                iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));

                if (iterator->rmap->val)
                        return;
        }

        if (++iterator->level > iterator->end_level) {
                iterator->rmap = NULL;
                return;
        }

        rmap_walk_init_level(iterator, iterator->level);
}

#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_,	\
           _start_gfn, _end_gfn, _iter_)				\
        for (slot_rmap_walk_init(_iter_, _slot_, _start_level_,	\
                                 _end_level_, _start_gfn, _end_gfn);	\
             slot_rmap_walk_okay(_iter_);				\
             slot_rmap_walk_next(_iter_))

typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
                               struct kvm_memory_slot *slot, gfn_t gfn,
                               int level, pte_t pte);

static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
                                                 struct kvm_gfn_range *range,
                                                 rmap_handler_t handler)
{
        struct slot_rmap_walk_iterator iterator;
        bool ret = false;

        for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
                                 range->start, range->end - 1, &iterator)
                ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
                               iterator.level, range->pte);

        return ret;
}

bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
        bool flush = false;

        if (kvm_memslots_have_rmaps(kvm))
                flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap);

        if (is_tdp_mmu_enabled(kvm))
                flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);

        return flush;
}

bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
        bool flush = false;

        if (kvm_memslots_have_rmaps(kvm))
                flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmap);

        if (is_tdp_mmu_enabled(kvm))
                flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);

        return flush;
}

static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
                         struct kvm_memory_slot *slot, gfn_t gfn, int level,
                         pte_t unused)
{
        u64 *sptep;
        struct rmap_iterator iter;
        int young = 0;

        for_each_rmap_spte(rmap_head, &iter, sptep)
                young |= mmu_spte_age(sptep);

        return young;
}

static bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
                              struct kvm_memory_slot *slot, gfn_t gfn,
                              int level, pte_t unused)
{
        u64 *sptep;
        struct rmap_iterator iter;

        for_each_rmap_spte(rmap_head, &iter, sptep)
                if (is_accessed_spte(*sptep))
                        return true;
        return false;
}

#define RMAP_RECYCLE_THRESHOLD 1000

static void __rmap_add(struct kvm *kvm,
                       struct kvm_mmu_memory_cache *cache,
                       const struct kvm_memory_slot *slot,
                       u64 *spte, gfn_t gfn, unsigned int access)
{
        struct kvm_mmu_page *sp;
        struct kvm_rmap_head *rmap_head;
        int rmap_count;

        sp = sptep_to_sp(spte);
        kvm_mmu_page_set_translation(sp, spte_index(spte), gfn, access);
        kvm_update_page_stats(kvm, sp->role.level, 1);

        rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
        rmap_count = pte_list_add(cache, spte, rmap_head);

        if (rmap_count > RMAP_RECYCLE_THRESHOLD) {
                kvm_zap_all_rmap_sptes(kvm, rmap_head);
                kvm_flush_remote_tlbs_with_address(
                                kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
        }
}

static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot,
                     u64 *spte, gfn_t gfn, unsigned int access)
{
        struct kvm_mmu_memory_cache *cache = &vcpu->arch.mmu_pte_list_desc_cache;

        __rmap_add(vcpu->kvm, cache, slot, spte, gfn, access);
}

bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
        bool young = false;

        if (kvm_memslots_have_rmaps(kvm))
                young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap);

        if (is_tdp_mmu_enabled(kvm))
                young |= kvm_tdp_mmu_age_gfn_range(kvm, range);

        return young;
}

bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
        bool young = false;

        if (kvm_memslots_have_rmaps(kvm))
                young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap);

        if (is_tdp_mmu_enabled(kvm))
                young |= kvm_tdp_mmu_test_age_gfn(kvm, range);

        return young;
}

d6c69ee9 1640#ifdef MMU_DEBUG
47ad8e68 1641static int is_empty_shadow_page(u64 *spt)
6aa8b732 1642{
139bdb2d
AK
1643 u64 *pos;
1644 u64 *end;
1645
47ad8e68 1646 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
3c915510 1647 if (is_shadow_present_pte(*pos)) {
b8688d51 1648 printk(KERN_ERR "%s: %p %llx\n", __func__,
139bdb2d 1649 pos, *pos);
6aa8b732 1650 return 0;
139bdb2d 1651 }
6aa8b732
AK
1652 return 1;
1653}
d6c69ee9 1654#endif
6aa8b732 1655
45221ab6
DH
1656/*
1657 * This value is the sum of all of the kvm instances'
1658 * kvm->arch.n_used_mmu_pages values. We need a global,
1659 * aggregate version in order to make the slab shrinker
1660 * faster.
1661 */
d5aaad6f 1662static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
45221ab6
DH
1663{
1664 kvm->arch.n_used_mmu_pages += nr;
1665 percpu_counter_add(&kvm_total_used_mmu_pages, nr);
1666}
1667
87654643 1668static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp)
260746c0 1669{
fa4a2c08 1670 MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
7775834a 1671 hlist_del(&sp->hash_link);
bd4c86ea
XG
1672 list_del(&sp->link);
1673 free_page((unsigned long)sp->spt);
834be0d8 1674 if (!sp->role.direct)
6a97575d 1675 free_page((unsigned long)sp->shadowed_translation);
e8ad9a70 1676 kmem_cache_free(mmu_page_header_cache, sp);
260746c0
AK
1677}
1678
cea0f0e7
AK
1679static unsigned kvm_page_table_hashfn(gfn_t gfn)
1680{
114df303 1681 return hash_64(gfn, KVM_MMU_HASH_SHIFT);
cea0f0e7
AK
1682}
1683
2ff9039a 1684static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache,
4db35314 1685 struct kvm_mmu_page *sp, u64 *parent_pte)
cea0f0e7 1686{
cea0f0e7
AK
1687 if (!parent_pte)
1688 return;
cea0f0e7 1689
2ff9039a 1690 pte_list_add(cache, parent_pte, &sp->parent_ptes);
cea0f0e7
AK
1691}
1692
4db35314 1693static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
cea0f0e7
AK
1694 u64 *parent_pte)
1695{
3c2e1037 1696 pte_list_remove(parent_pte, &sp->parent_ptes);
cea0f0e7
AK
1697}
1698
bcdd9a93
XG
1699static void drop_parent_pte(struct kvm_mmu_page *sp,
1700 u64 *parent_pte)
1701{
1702 mmu_page_remove_parent_pte(sp, parent_pte);
1df9f2dc 1703 mmu_spte_clear_no_track(parent_pte);
bcdd9a93
XG
1704}
1705
67052b35 1706static void mark_unsync(u64 *spte);
1047df1f 1707static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
0074ff63 1708{
74c4e63a
TY
1709 u64 *sptep;
1710 struct rmap_iterator iter;
1711
1712 for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
1713 mark_unsync(sptep);
1714 }
0074ff63
MT
1715}
1716
67052b35 1717static void mark_unsync(u64 *spte)
0074ff63 1718{
67052b35 1719 struct kvm_mmu_page *sp;
0074ff63 1720
57354682 1721 sp = sptep_to_sp(spte);
79e48cec 1722 if (__test_and_set_bit(spte_index(spte), sp->unsync_child_bitmap))
0074ff63 1723 return;
1047df1f 1724 if (sp->unsync_children++)
0074ff63 1725 return;
1047df1f 1726 kvm_mmu_mark_parents_unsync(sp);
0074ff63
MT
1727}
1728
e8bc217a 1729static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
a4a8e6f7 1730 struct kvm_mmu_page *sp)
e8bc217a 1731{
c3e5e415 1732 return -1;
e8bc217a
MT
1733}
1734
60c8aec6
MT
1735#define KVM_PAGE_ARRAY_NR 16
1736
1737struct kvm_mmu_pages {
1738 struct mmu_page_and_offset {
1739 struct kvm_mmu_page *sp;
1740 unsigned int idx;
1741 } page[KVM_PAGE_ARRAY_NR];
1742 unsigned int nr;
1743};
1744
cded19f3
HE
1745static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
1746 int idx)
4731d4c7 1747{
60c8aec6 1748 int i;
4731d4c7 1749
60c8aec6
MT
1750 if (sp->unsync)
1751 for (i = 0; i < pvec->nr; i++)
1752 if (pvec->page[i].sp == sp)
1753 return 0;
1754
1755 pvec->page[pvec->nr].sp = sp;
1756 pvec->page[pvec->nr].idx = idx;
1757 pvec->nr++;
1758 return (pvec->nr == KVM_PAGE_ARRAY_NR);
1759}
1760
fd951457
TY
1761static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
1762{
1763 --sp->unsync_children;
1764 WARN_ON((int)sp->unsync_children < 0);
1765 __clear_bit(idx, sp->unsync_child_bitmap);
1766}
1767
60c8aec6
MT
1768static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
1769 struct kvm_mmu_pages *pvec)
1770{
1771 int i, ret, nr_unsync_leaf = 0;
4731d4c7 1772
37178b8b 1773 for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
7a8f1a74 1774 struct kvm_mmu_page *child;
4731d4c7
MT
1775 u64 ent = sp->spt[i];
1776
fd951457
TY
1777 if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
1778 clear_unsync_child_bit(sp, i);
1779 continue;
1780 }
7a8f1a74 1781
2ca3129e 1782 child = to_shadow_page(ent & SPTE_BASE_ADDR_MASK);
7a8f1a74
XG
1783
1784 if (child->unsync_children) {
1785 if (mmu_pages_add(pvec, child, i))
1786 return -ENOSPC;
1787
1788 ret = __mmu_unsync_walk(child, pvec);
fd951457
TY
1789 if (!ret) {
1790 clear_unsync_child_bit(sp, i);
1791 continue;
1792 } else if (ret > 0) {
7a8f1a74 1793 nr_unsync_leaf += ret;
fd951457 1794 } else
7a8f1a74
XG
1795 return ret;
1796 } else if (child->unsync) {
1797 nr_unsync_leaf++;
1798 if (mmu_pages_add(pvec, child, i))
1799 return -ENOSPC;
1800 } else
fd951457 1801 clear_unsync_child_bit(sp, i);
4731d4c7
MT
1802 }
1803
60c8aec6
MT
1804 return nr_unsync_leaf;
1805}
1806
e23d3fef
XG
1807#define INVALID_INDEX (-1)
1808
60c8aec6
MT
1809static int mmu_unsync_walk(struct kvm_mmu_page *sp,
1810 struct kvm_mmu_pages *pvec)
1811{
0a47cd85 1812 pvec->nr = 0;
60c8aec6
MT
1813 if (!sp->unsync_children)
1814 return 0;
1815
e23d3fef 1816 mmu_pages_add(pvec, sp, INVALID_INDEX);
60c8aec6 1817 return __mmu_unsync_walk(sp, pvec);
4731d4c7
MT
1818}
1819
4731d4c7
MT
1820static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1821{
1822 WARN_ON(!sp->unsync);
5e1b3ddb 1823 trace_kvm_mmu_sync_page(sp);
4731d4c7
MT
1824 sp->unsync = 0;
1825 --kvm->stat.mmu_unsync;
1826}
1827
83cdb568
SC
1828static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1829 struct list_head *invalid_list);
7775834a
XG
1830static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1831 struct list_head *invalid_list);
4731d4c7 1832
767d8d8d
LJ
1833static bool sp_has_gptes(struct kvm_mmu_page *sp)
1834{
1835 if (sp->role.direct)
1836 return false;
1837
84e5ffd0
LJ
1838 if (sp->role.passthrough)
1839 return false;
1840
767d8d8d
LJ
1841 return true;
1842}
1843
ac101b7c
SC
1844#define for_each_valid_sp(_kvm, _sp, _list) \
1845 hlist_for_each_entry(_sp, _list, hash_link) \
fac026da 1846 if (is_obsolete_sp((_kvm), (_sp))) { \
f3414bc7 1847 } else
1044b030 1848
767d8d8d 1849#define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn) \
ac101b7c
SC
1850 for_each_valid_sp(_kvm, _sp, \
1851 &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \
767d8d8d 1852 if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else
7ae680eb 1853
8d5678a7 1854static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
479a1efc 1855 struct list_head *invalid_list)
4731d4c7 1856{
c3e5e415
LJ
1857 int ret = vcpu->arch.mmu->sync_page(vcpu, sp);
1858
8d5678a7 1859 if (ret < 0)
d98ba053 1860 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
8d5678a7 1861 return ret;
4731d4c7
MT
1862}
1863
a2113634
SC
1864static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
1865 struct list_head *invalid_list,
1866 bool remote_flush)
1867{
cfd32acf 1868 if (!remote_flush && list_empty(invalid_list))
a2113634
SC
1869 return false;
1870
1871 if (!list_empty(invalid_list))
1872 kvm_mmu_commit_zap_page(kvm, invalid_list);
1873 else
1874 kvm_flush_remote_tlbs(kvm);
1875 return true;
1876}
1877
002c5f73
SC
1878static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
1879{
a955cad8
SC
1880 if (sp->role.invalid)
1881 return true;
1882
1883 /* TDP MMU pages do not use the MMU generation. */
1884 return !sp->tdp_mmu_page &&
fac026da 1885 unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
002c5f73
SC
1886}
1887
60c8aec6 1888struct mmu_page_path {
2a7266a8
YZ
1889 struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
1890 unsigned int idx[PT64_ROOT_MAX_LEVEL];
4731d4c7
MT
1891};
1892
60c8aec6 1893#define for_each_sp(pvec, sp, parents, i) \
0a47cd85 1894 for (i = mmu_pages_first(&pvec, &parents); \
60c8aec6
MT
1895 i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
1896 i = mmu_pages_next(&pvec, &parents, i))
1897
cded19f3
HE
1898static int mmu_pages_next(struct kvm_mmu_pages *pvec,
1899 struct mmu_page_path *parents,
1900 int i)
60c8aec6
MT
1901{
1902 int n;
1903
1904 for (n = i+1; n < pvec->nr; n++) {
1905 struct kvm_mmu_page *sp = pvec->page[n].sp;
0a47cd85
PB
1906 unsigned idx = pvec->page[n].idx;
1907 int level = sp->role.level;
60c8aec6 1908
0a47cd85 1909 parents->idx[level-1] = idx;
3bae0459 1910 if (level == PG_LEVEL_4K)
0a47cd85 1911 break;
60c8aec6 1912
0a47cd85 1913 parents->parent[level-2] = sp;
60c8aec6
MT
1914 }
1915
1916 return n;
1917}
1918
0a47cd85
PB
1919static int mmu_pages_first(struct kvm_mmu_pages *pvec,
1920 struct mmu_page_path *parents)
1921{
1922 struct kvm_mmu_page *sp;
1923 int level;
1924
1925 if (pvec->nr == 0)
1926 return 0;
1927
e23d3fef
XG
1928 WARN_ON(pvec->page[0].idx != INVALID_INDEX);
1929
0a47cd85
PB
1930 sp = pvec->page[0].sp;
1931 level = sp->role.level;
3bae0459 1932 WARN_ON(level == PG_LEVEL_4K);
0a47cd85
PB
1933
1934 parents->parent[level-2] = sp;
1935
1936 /* Also set up a sentinel. Further entries in pvec are all
1937 * children of sp, so this element is never overwritten.
1938 */
1939 parents->parent[level-1] = NULL;
1940 return mmu_pages_next(pvec, parents, 0);
1941}
1942
cded19f3 1943static void mmu_pages_clear_parents(struct mmu_page_path *parents)
4731d4c7 1944{
60c8aec6
MT
1945 struct kvm_mmu_page *sp;
1946 unsigned int level = 0;
1947
1948 do {
1949 unsigned int idx = parents->idx[level];
60c8aec6
MT
1950 sp = parents->parent[level];
1951 if (!sp)
1952 return;
1953
e23d3fef 1954 WARN_ON(idx == INVALID_INDEX);
fd951457 1955 clear_unsync_child_bit(sp, idx);
60c8aec6 1956 level++;
0a47cd85 1957 } while (!sp->unsync_children);
60c8aec6 1958}
4731d4c7 1959
65855ed8
LJ
1960static int mmu_sync_children(struct kvm_vcpu *vcpu,
1961 struct kvm_mmu_page *parent, bool can_yield)
60c8aec6
MT
1962{
1963 int i;
1964 struct kvm_mmu_page *sp;
1965 struct mmu_page_path parents;
1966 struct kvm_mmu_pages pages;
d98ba053 1967 LIST_HEAD(invalid_list);
50c9e6f3 1968 bool flush = false;
60c8aec6 1969
60c8aec6 1970 while (mmu_unsync_walk(parent, &pages)) {
2f84569f 1971 bool protected = false;
b1a36821
MT
1972
1973 for_each_sp(pages, sp, parents, i)
cf48f9e2 1974 protected |= kvm_vcpu_write_protect_gfn(vcpu, sp->gfn);
b1a36821 1975
50c9e6f3 1976 if (protected) {
5591c069 1977 kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true);
50c9e6f3
PB
1978 flush = false;
1979 }
b1a36821 1980
60c8aec6 1981 for_each_sp(pages, sp, parents, i) {
479a1efc 1982 kvm_unlink_unsync_page(vcpu->kvm, sp);
8d5678a7 1983 flush |= kvm_sync_page(vcpu, sp, &invalid_list) > 0;
60c8aec6
MT
1984 mmu_pages_clear_parents(&parents);
1985 }
531810ca 1986 if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
c3e5e415 1987 kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
65855ed8
LJ
1988 if (!can_yield) {
1989 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
1990 return -EINTR;
1991 }
1992
531810ca 1993 cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
50c9e6f3
PB
1994 flush = false;
1995 }
60c8aec6 1996 }
50c9e6f3 1997
c3e5e415 1998 kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
65855ed8 1999 return 0;
4731d4c7
MT
2000}
2001
a30f47cb
XG
2002static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
2003{
e5691a81 2004 atomic_set(&sp->write_flooding_count, 0);
a30f47cb
XG
2005}
2006
2007static void clear_sp_write_flooding_count(u64 *spte)
2008{
57354682 2009 __clear_sp_write_flooding_count(sptep_to_sp(spte));
a30f47cb
XG
2010}
2011
cbd858b1
DM
2012/*
2013 * The vCPU is required when finding indirect shadow pages; the shadow
2014 * page may already exist and syncing it needs the vCPU pointer in
2015 * order to read guest page tables. Direct shadow pages are never
2016 * unsync, thus @vcpu can be NULL if @role.direct is true.
2017 */
3cc736b3
DM
2018static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm,
2019 struct kvm_vcpu *vcpu,
94c81364
DM
2020 gfn_t gfn,
2021 struct hlist_head *sp_list,
2022 union kvm_mmu_page_role role)
cea0f0e7 2023{
9f1a122f 2024 struct kvm_mmu_page *sp;
8d5678a7 2025 int ret;
f3414bc7 2026 int collisions = 0;
2a74003a 2027 LIST_HEAD(invalid_list);
cea0f0e7 2028
3cc736b3 2029 for_each_valid_sp(kvm, sp, sp_list) {
f3414bc7
DM
2030 if (sp->gfn != gfn) {
2031 collisions++;
2032 continue;
2033 }
2034
ddc16abb
SC
2035 if (sp->role.word != role.word) {
2036 /*
2037 * If the guest is creating an upper-level page, zap
2038 * unsync pages for the same gfn. While it's possible
2039 * the guest is using recursive page tables, in all
2040 * likelihood the guest has stopped using the unsync
2041 * page and is installing a completely unrelated page.
2042 * Unsync pages must not be left as is, because the new
2043 * upper-level page will be write-protected.
2044 */
2e65e842 2045 if (role.level > PG_LEVEL_4K && sp->unsync)
3cc736b3 2046 kvm_mmu_prepare_zap_page(kvm, sp,
ddc16abb 2047 &invalid_list);
7ae680eb 2048 continue;
ddc16abb 2049 }
4731d4c7 2050
bb924ca6
DM
2051 /* unsync and write-flooding only apply to indirect SPs. */
2052 if (sp->role.direct)
94c81364 2053 goto out;
fb58a9c3 2054
2a74003a 2055 if (sp->unsync) {
cbd858b1
DM
2056 if (KVM_BUG_ON(!vcpu, kvm))
2057 break;
2058
07dc4f35 2059 /*
479a1efc 2060 * The page is good, but is stale. kvm_sync_page does
07dc4f35
SC
2061 * get the latest guest state, but (unlike mmu_unsync_children)
2062 * it doesn't write-protect the page or mark it synchronized!
2063 * This way the validity of the mapping is ensured, but the
2064 * overhead of write protection is not incurred until the
2065 * guest invalidates the TLB mapping. This allows multiple
2066 * SPs for a single gfn to be unsync.
2067 *
2068 * If the sync fails, the page is zapped. If so, break
2069 * in order to rebuild it.
2a74003a 2070 */
8d5678a7
HW
2071 ret = kvm_sync_page(vcpu, sp, &invalid_list);
2072 if (ret < 0)
2a74003a
PB
2073 break;
2074
2075 WARN_ON(!list_empty(&invalid_list));
8d5678a7 2076 if (ret > 0)
3cc736b3 2077 kvm_flush_remote_tlbs(kvm);
2a74003a 2078 }
e02aa901 2079
a30f47cb 2080 __clear_sp_write_flooding_count(sp);
fb58a9c3 2081
f3414bc7 2082 goto out;
7ae680eb 2083 }
47005792 2084
94c81364 2085 sp = NULL;
3cc736b3 2086 ++kvm->stat.mmu_cache_miss;
47005792 2087
94c81364 2088out:
3cc736b3 2089 kvm_mmu_commit_zap_page(kvm, &invalid_list);
94c81364 2090
3cc736b3
DM
2091 if (collisions > kvm->stat.max_mmu_page_hash_collisions)
2092 kvm->stat.max_mmu_page_hash_collisions = collisions;
94c81364
DM
2093 return sp;
2094}
2095
2f8b1b53
DM
2096/* Caches used when allocating a new shadow page. */
2097struct shadow_page_caches {
2098 struct kvm_mmu_memory_cache *page_header_cache;
2099 struct kvm_mmu_memory_cache *shadow_page_cache;
6a97575d 2100 struct kvm_mmu_memory_cache *shadowed_info_cache;
2f8b1b53
DM
2101};
2102
336081fb 2103static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
2f8b1b53 2104 struct shadow_page_caches *caches,
94c81364
DM
2105 gfn_t gfn,
2106 struct hlist_head *sp_list,
2107 union kvm_mmu_page_role role)
2108{
c306aec8
DM
2109 struct kvm_mmu_page *sp;
2110
2f8b1b53
DM
2111 sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache);
2112 sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache);
c306aec8 2113 if (!role.direct)
6a97575d 2114 sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);
c306aec8
DM
2115
2116 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
2117
2118 /*
2119 * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
2120 * depends on valid pages being added to the head of the list. See
2121 * comments in kvm_zap_obsolete_pages().
2122 */
336081fb
DM
2123 sp->mmu_valid_gen = kvm->arch.mmu_valid_gen;
2124 list_add(&sp->link, &kvm->arch.active_mmu_pages);
2125 kvm_mod_used_mmu_pages(kvm, +1);
47005792 2126
4db35314
AK
2127 sp->gfn = gfn;
2128 sp->role = role;
ac101b7c 2129 hlist_add_head(&sp->hash_link, sp_list);
be911771 2130 if (sp_has_gptes(sp))
336081fb 2131 account_shadowed(kvm, sp);
ddc16abb 2132
94c81364
DM
2133 return sp;
2134}
2135
cbd858b1 2136/* Note, @vcpu may be NULL if @role.direct is true; see kvm_mmu_find_shadow_page. */
3cc736b3
DM
2137static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm,
2138 struct kvm_vcpu *vcpu,
2f8b1b53
DM
2139 struct shadow_page_caches *caches,
2140 gfn_t gfn,
2141 union kvm_mmu_page_role role)
94c81364
DM
2142{
2143 struct hlist_head *sp_list;
2144 struct kvm_mmu_page *sp;
2145 bool created = false;
2146
3cc736b3 2147 sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
94c81364 2148
3cc736b3 2149 sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role);
94c81364
DM
2150 if (!sp) {
2151 created = true;
3cc736b3 2152 sp = kvm_mmu_alloc_shadow_page(kvm, caches, gfn, sp_list, role);
94c81364
DM
2153 }
2154
2155 trace_kvm_mmu_get_page(sp, created);
4db35314 2156 return sp;
cea0f0e7
AK
2157}
2158
2f8b1b53
DM
2159static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu,
2160 gfn_t gfn,
2161 union kvm_mmu_page_role role)
2162{
2163 struct shadow_page_caches caches = {
2164 .page_header_cache = &vcpu->arch.mmu_page_header_cache,
2165 .shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache,
6a97575d 2166 .shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache,
2f8b1b53
DM
2167 };
2168
3cc736b3 2169 return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role);
2f8b1b53
DM
2170}
2171
39944ab9
SC
2172static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct,
2173 unsigned int access)
2e65e842
DM
2174{
2175 struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep);
2176 union kvm_mmu_page_role role;
2177
2178 role = parent_sp->role;
2179 role.level--;
2180 role.access = access;
2181 role.direct = direct;
2182 role.passthrough = 0;
2183
2184 /*
2185 * If the guest has 4-byte PTEs then that means it's using 32-bit,
2186 * 2-level, non-PAE paging. KVM shadows such guests with PAE paging
2187 * (i.e. 8-byte PTEs). The difference in PTE size means that KVM must
2188 * shadow each guest page table with multiple shadow page tables, which
2189 * requires extra bookkeeping in the role.
2190 *
2191 * Specifically, to shadow the guest's page directory (which covers a
2192 * 4GiB address space), KVM uses 4 PAE page directories, each mapping
2193 * 1GiB of the address space. @role.quadrant encodes which quarter of
2194 * the address space each maps.
2195 *
2196 * To shadow the guest's page tables (which each map a 4MiB region), KVM
2197 * uses 2 PAE page tables, each mapping a 2MiB region. For these,
2198 * @role.quadrant encodes which half of the region they map.
2199 *
39944ab9
SC
2200 * Concretely, a 4-byte PDE consumes bits 31:22, while an 8-byte PDE
2201 * consumes bits 29:21. To consume bits 31:30, KVM uses 4 shadow
2202 * PDPTEs; those 4 PAE page directories are pre-allocated and their
2203 * quadrant is assigned in mmu_alloc_root(). A 4-byte PTE consumes
2204 * bits 21:12, while an 8-byte PTE consumes bits 20:12. To consume
2205 * bit 21 in the PTE (the child here), KVM propagates that bit to the
2206 * quadrant, i.e. sets quadrant to '0' or '1'. The parent 8-byte PDE
2207 * covers bit 21 (see above), thus the quadrant is calculated from the
2208 * _least_ significant bit of the PDE index.
2e65e842
DM
2209 */
2210 if (role.has_4_byte_gpte) {
2211 WARN_ON_ONCE(role.level != PG_LEVEL_4K);
79e48cec 2212 role.quadrant = spte_index(sptep) & 1;
2e65e842
DM
2213 }
2214
2215 return role;
2216}
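
/*
 * Illustrative worked example of the quadrant encoding described above
 * (not part of the upstream file; the guest address is arbitrary).
 * For a 32-bit non-PAE guest touching GVA 0x00e05000:
 *
 *	guest PDE index = gva >> 22          = 3	(4-byte PDEs, 4MiB each)
 *	guest PTE index = (gva >> 12) & 1023 = 517
 *
 * KVM shadows this with PAE paging:
 *
 *	bits 31:30 = 0  -> pre-allocated PAE page directory #0 (quadrant 0)
 *	shadow PDE index = (gva >> 21) & 511 = 7	(8-byte PDEs, 2MiB each)
 *	child PT quadrant = spte_index(pde_sptep) & 1 = 1
 *
 * Quadrant 1 means the child shadow PT covers guest PTE indices 512..1023,
 * and indeed 517 = 1 * 512 + 5, where 5 = (gva >> 12) & 511 is the shadow
 * PTE index.
 */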
2217
2218static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu,
2219 u64 *sptep, gfn_t gfn,
2220 bool direct, unsigned int access)
2221{
2222 union kvm_mmu_page_role role;
2223
0cd8dc73
PB
2224 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
2225 return ERR_PTR(-EEXIST);
2226
2e65e842 2227 role = kvm_mmu_child_role(sptep, direct, access);
87654643 2228 return kvm_mmu_get_shadow_page(vcpu, gfn, role);
2e65e842
DM
2229}
2230
7eb77e9f
JS
2231static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
2232 struct kvm_vcpu *vcpu, hpa_t root,
2233 u64 addr)
2d11123a
AK
2234{
2235 iterator->addr = addr;
7eb77e9f 2236 iterator->shadow_addr = root;
a972e29c 2237 iterator->level = vcpu->arch.mmu->root_role.level;
81407ca5 2238
12ec33a7 2239 if (iterator->level >= PT64_ROOT_4LEVEL &&
4d25502a 2240 vcpu->arch.mmu->cpu_role.base.level < PT64_ROOT_4LEVEL &&
347a0d0d 2241 !vcpu->arch.mmu->root_role.direct)
12ec33a7 2242 iterator->level = PT32E_ROOT_LEVEL;
81407ca5 2243
2d11123a 2244 if (iterator->level == PT32E_ROOT_LEVEL) {
7eb77e9f
JS
2245 /*
2246 * prev_root is currently only used for 64-bit hosts. So only
2247 * the active root_hpa is valid here.
2248 */
b9e5603c 2249 BUG_ON(root != vcpu->arch.mmu->root.hpa);
7eb77e9f 2250
2d11123a 2251 iterator->shadow_addr
44dd3ffa 2252 = vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
2ca3129e 2253 iterator->shadow_addr &= SPTE_BASE_ADDR_MASK;
2d11123a
AK
2254 --iterator->level;
2255 if (!iterator->shadow_addr)
2256 iterator->level = 0;
2257 }
2258}
2259
7eb77e9f
JS
2260static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
2261 struct kvm_vcpu *vcpu, u64 addr)
2262{
b9e5603c 2263 shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root.hpa,
7eb77e9f
JS
2264 addr);
2265}
2266
2d11123a
AK
2267static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
2268{
3bae0459 2269 if (iterator->level < PG_LEVEL_4K)
2d11123a 2270 return false;
4d88954d 2271
2ca3129e 2272 iterator->index = SPTE_INDEX(iterator->addr, iterator->level);
2d11123a
AK
2273 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
2274 return true;
2275}
2276
c2a2ac2b
XG
2277static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
2278 u64 spte)
2d11123a 2279{
3e44dce4 2280 if (!is_shadow_present_pte(spte) || is_last_spte(spte, iterator->level)) {
052331be
XG
2281 iterator->level = 0;
2282 return;
2283 }
2284
2ca3129e 2285 iterator->shadow_addr = spte & SPTE_BASE_ADDR_MASK;
2d11123a
AK
2286 --iterator->level;
2287}
2288
c2a2ac2b
XG
2289static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
2290{
bb606a9b 2291 __shadow_walk_next(iterator, *iterator->sptep);
c2a2ac2b
XG
2292}
2293
0cd8dc73
PB
2294static void __link_shadow_page(struct kvm *kvm,
2295 struct kvm_mmu_memory_cache *cache, u64 *sptep,
03787394 2296 struct kvm_mmu_page *sp, bool flush)
cc4674d0
BG
2297{
2298 u64 spte;
2299
2300 BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
2301
0cd8dc73
PB
2302 /*
2303 * If an SPTE is present already, it must be a leaf and therefore
03787394
PB
2304 * a large one. Drop it, and flush the TLB if needed, before
2305 * installing sp.
0cd8dc73
PB
2306 */
2307 if (is_shadow_present_pte(*sptep))
03787394 2308 drop_large_spte(kvm, sptep, flush);
0cd8dc73 2309
cc4674d0
BG
2310 spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp));
2311
1df9f2dc 2312 mmu_spte_set(sptep, spte);
98bba238 2313
2ff9039a 2314 mmu_page_add_parent_pte(cache, sp, sptep);
98bba238
TY
2315
2316 if (sp->unsync_children || sp->unsync)
2317 mark_unsync(sptep);
32ef26a3
AK
2318}
2319
2ff9039a
DM
2320static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
2321 struct kvm_mmu_page *sp)
2322{
03787394 2323 __link_shadow_page(vcpu->kvm, &vcpu->arch.mmu_pte_list_desc_cache, sptep, sp, true);
2ff9039a
DM
2324}
2325
a357bd22
AK
2326static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2327 unsigned direct_access)
2328{
2329 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
2330 struct kvm_mmu_page *child;
2331
2332 /*
2333 * For a direct sp, if the guest pte's dirty bit
2334 * changed from clean to dirty, it will corrupt the
2335 * sp's access: writes would be allowed through a read-only sp,
2336 * so we should update the spte at this point to get
2337 * a new sp with the correct access.
2338 */
2ca3129e 2339 child = to_shadow_page(*sptep & SPTE_BASE_ADDR_MASK);
a357bd22
AK
2340 if (child->role.access == direct_access)
2341 return;
2342
bcdd9a93 2343 drop_parent_pte(child, sptep);
c3134ce2 2344 kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1);
a357bd22
AK
2345 }
2346}
2347
2de4085c
BG
2348/* Returns the number of zapped non-leaf child shadow pages. */
2349static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
2350 u64 *spte, struct list_head *invalid_list)
38e3b2b2
XG
2351{
2352 u64 pte;
2353 struct kvm_mmu_page *child;
2354
2355 pte = *spte;
2356 if (is_shadow_present_pte(pte)) {
505aef8f 2357 if (is_last_spte(pte, sp->role.level)) {
c3707958 2358 drop_spte(kvm, spte);
505aef8f 2359 } else {
2ca3129e 2360 child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK);
bcdd9a93 2361 drop_parent_pte(child, spte);
2de4085c
BG
2362
2363 /*
2364 * Recursively zap nested TDP SPs, parentless SPs are
2365 * unlikely to be used again in the near future. This
2366 * avoids retaining a large number of stale nested SPs.
2367 */
2368 if (tdp_enabled && invalid_list &&
2369 child->role.guest_mode && !child->parent_ptes.val)
2370 return kvm_mmu_prepare_zap_page(kvm, child,
2371 invalid_list);
38e3b2b2 2372 }
ace569e0 2373 } else if (is_mmio_spte(pte)) {
ce88decf 2374 mmu_spte_clear_no_track(spte);
ace569e0 2375 }
2de4085c 2376 return 0;
38e3b2b2
XG
2377}
2378
2de4085c
BG
2379static int kvm_mmu_page_unlink_children(struct kvm *kvm,
2380 struct kvm_mmu_page *sp,
2381 struct list_head *invalid_list)
a436036b 2382{
2de4085c 2383 int zapped = 0;
697fe2e2 2384 unsigned i;
697fe2e2 2385
2ca3129e 2386 for (i = 0; i < SPTE_ENT_PER_PAGE; ++i)
2de4085c
BG
2387 zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list);
2388
2389 return zapped;
a436036b
AK
2390}
2391
61827671 2392static void kvm_mmu_unlink_parents(struct kvm_mmu_page *sp)
a436036b 2393{
1e3f42f0
TY
2394 u64 *sptep;
2395 struct rmap_iterator iter;
a436036b 2396
018aabb5 2397 while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
1e3f42f0 2398 drop_parent_pte(sp, sptep);
31aa2b44
AK
2399}
2400
60c8aec6 2401static int mmu_zap_unsync_children(struct kvm *kvm,
7775834a
XG
2402 struct kvm_mmu_page *parent,
2403 struct list_head *invalid_list)
4731d4c7 2404{
60c8aec6
MT
2405 int i, zapped = 0;
2406 struct mmu_page_path parents;
2407 struct kvm_mmu_pages pages;
4731d4c7 2408
3bae0459 2409 if (parent->role.level == PG_LEVEL_4K)
4731d4c7 2410 return 0;
60c8aec6 2411
60c8aec6
MT
2412 while (mmu_unsync_walk(parent, &pages)) {
2413 struct kvm_mmu_page *sp;
2414
2415 for_each_sp(pages, sp, parents, i) {
7775834a 2416 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
60c8aec6 2417 mmu_pages_clear_parents(&parents);
77662e00 2418 zapped++;
60c8aec6 2419 }
60c8aec6
MT
2420 }
2421
2422 return zapped;
4731d4c7
MT
2423}
2424
83cdb568
SC
2425static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
2426 struct kvm_mmu_page *sp,
2427 struct list_head *invalid_list,
2428 int *nr_zapped)
31aa2b44 2429{
527d5cd7 2430 bool list_unstable, zapped_root = false;
f691fe1d 2431
7775834a 2432 trace_kvm_mmu_prepare_zap_page(sp);
31aa2b44 2433 ++kvm->stat.mmu_shadow_zapped;
83cdb568 2434 *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
2de4085c 2435 *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
61827671 2436 kvm_mmu_unlink_parents(sp);
5304b8d3 2437
83cdb568
SC
2438 /* Zapping children means active_mmu_pages has become unstable. */
2439 list_unstable = *nr_zapped;
2440
767d8d8d 2441 if (!sp->role.invalid && sp_has_gptes(sp))
3ed1a478 2442 unaccount_shadowed(kvm, sp);
5304b8d3 2443
4731d4c7
MT
2444 if (sp->unsync)
2445 kvm_unlink_unsync_page(kvm, sp);
4db35314 2446 if (!sp->root_count) {
54a4f023 2447 /* Count self */
83cdb568 2448 (*nr_zapped)++;
f95eec9b
SC
2449
2450 /*
2451 * Already invalid pages (previously active roots) are not on
2452 * the active page list. See list_del() in the "else" case of
2453 * !sp->root_count.
2454 */
2455 if (sp->role.invalid)
2456 list_add(&sp->link, invalid_list);
2457 else
2458 list_move(&sp->link, invalid_list);
aa6bd187 2459 kvm_mod_used_mmu_pages(kvm, -1);
2e53d63a 2460 } else {
f95eec9b
SC
2461 /*
2462 * Remove the active root from the active page list, the root
2463 * will be explicitly freed when the root_count hits zero.
2464 */
2465 list_del(&sp->link);
05988d72 2466
10605204
SC
2467 /*
2468 * Obsolete pages cannot be used on any vCPUs, see the comment
2469 * in kvm_mmu_zap_all_fast(). Note, is_obsolete_sp() also
2470 * treats invalid shadow pages as being obsolete.
2471 */
527d5cd7 2472 zapped_root = !is_obsolete_sp(kvm, sp);
2e53d63a 2473 }
7775834a 2474
b8e8c830
PB
2475 if (sp->lpage_disallowed)
2476 unaccount_huge_nx_page(kvm, sp);
2477
7775834a 2478 sp->role.invalid = 1;
527d5cd7
SC
2479
2480 /*
2481 * Make the request to free obsolete roots after marking the root
2482 * invalid, otherwise other vCPUs may not see it as invalid.
2483 */
2484 if (zapped_root)
2485 kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
83cdb568
SC
2486 return list_unstable;
2487}
2488
2489static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2490 struct list_head *invalid_list)
2491{
2492 int nr_zapped;
2493
2494 __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
2495 return nr_zapped;
a436036b
AK
2496}
2497
7775834a
XG
2498static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2499 struct list_head *invalid_list)
2500{
945315b9 2501 struct kvm_mmu_page *sp, *nsp;
7775834a
XG
2502
2503 if (list_empty(invalid_list))
2504 return;
2505
c142786c 2506 /*
9753f529
LT
2507 * We need to make sure everyone sees our modifications to
2508 * the page tables and sees changes to vcpu->mode here. The barrier
2509 * in kvm_flush_remote_tlbs() achieves this. This pairs
2510 * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
2511 *
2512 * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
2513 * guest mode and/or lockless shadow page table walks.
c142786c
AK
2514 */
2515 kvm_flush_remote_tlbs(kvm);
c2a2ac2b 2516
945315b9 2517 list_for_each_entry_safe(sp, nsp, invalid_list, link) {
7775834a 2518 WARN_ON(!sp->role.invalid || sp->root_count);
87654643 2519 kvm_mmu_free_shadow_page(sp);
945315b9 2520 }
7775834a
XG
2521}
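
/*
 * Sketch of the intended two-phase zap pattern (illustrative only, mirrors
 * callers later in this file such as kvm_mmu_unprotect_page()): pages are
 * unlinked onto a local list under mmu_lock, and a single remote TLB flush
 * happens before any of them is actually freed.
 *
 *	LIST_HEAD(invalid_list);
 *
 *	write_lock(&kvm->mmu_lock);
 *	kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
 *	kvm_mmu_commit_zap_page(kvm, &invalid_list);
 *	write_unlock(&kvm->mmu_lock);
 */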
2522
6b82ef2c
SC
2523static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm,
2524 unsigned long nr_to_zap)
5da59607 2525{
6b82ef2c
SC
2526 unsigned long total_zapped = 0;
2527 struct kvm_mmu_page *sp, *tmp;
ba7888dd 2528 LIST_HEAD(invalid_list);
6b82ef2c
SC
2529 bool unstable;
2530 int nr_zapped;
5da59607
TY
2531
2532 if (list_empty(&kvm->arch.active_mmu_pages))
ba7888dd
SC
2533 return 0;
2534
6b82ef2c 2535restart:
8fc51726 2536 list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) {
6b82ef2c
SC
2537 /*
2538 * Don't zap active root pages, the page itself can't be freed
2539 * and zapping it will just force vCPUs to realloc and reload.
2540 */
2541 if (sp->root_count)
2542 continue;
2543
2544 unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list,
2545 &nr_zapped);
2546 total_zapped += nr_zapped;
2547 if (total_zapped >= nr_to_zap)
ba7888dd
SC
2548 break;
2549
6b82ef2c
SC
2550 if (unstable)
2551 goto restart;
ba7888dd 2552 }
5da59607 2553
6b82ef2c
SC
2554 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2555
2556 kvm->stat.mmu_recycled += total_zapped;
2557 return total_zapped;
2558}
2559
afe8d7e6
SC
2560static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
2561{
2562 if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
2563 return kvm->arch.n_max_mmu_pages -
2564 kvm->arch.n_used_mmu_pages;
2565
2566 return 0;
5da59607
TY
2567}
2568
ba7888dd
SC
2569static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
2570{
6b82ef2c 2571 unsigned long avail = kvm_mmu_available_pages(vcpu->kvm);
ba7888dd 2572
6b82ef2c 2573 if (likely(avail >= KVM_MIN_FREE_MMU_PAGES))
ba7888dd
SC
2574 return 0;
2575
6b82ef2c 2576 kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
ba7888dd 2577
6e6ec584
SC
2578 /*
2579 * Note, this check is intentionally soft, it only guarantees that one
2580 * page is available, while the caller may end up allocating as many as
2581 * four pages, e.g. for PAE roots or for 5-level paging. Temporarily
2582 * exceeding the (arbitrary by default) limit will not harm the host,
c4342633 2583 * being too aggressive may unnecessarily kill the guest, and getting an
6e6ec584
SC
2584 * exact count is far more trouble than it's worth, especially in the
2585 * page fault paths.
2586 */
ba7888dd
SC
2587 if (!kvm_mmu_available_pages(vcpu->kvm))
2588 return -ENOSPC;
2589 return 0;
2590}
2591
82ce2c96
IE
2592/*
2593 * Changing the number of mmu pages allocated to the VM.
49d5ca26 2594 * Note: if goal_nr_mmu_pages is too small, you will get a deadlock.
82ce2c96 2595 */
bc8a3d89 2596void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
82ce2c96 2597{
531810ca 2598 write_lock(&kvm->mmu_lock);
b34cb590 2599
49d5ca26 2600 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
6b82ef2c
SC
2601 kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
2602 goal_nr_mmu_pages);
82ce2c96 2603
49d5ca26 2604 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
82ce2c96 2605 }
82ce2c96 2606
49d5ca26 2607 kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
b34cb590 2608
531810ca 2609 write_unlock(&kvm->mmu_lock);
82ce2c96
IE
2610}
2611
1cb3f3ae 2612int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
a436036b 2613{
4db35314 2614 struct kvm_mmu_page *sp;
d98ba053 2615 LIST_HEAD(invalid_list);
a436036b
AK
2616 int r;
2617
9ad17b10 2618 pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
a436036b 2619 r = 0;
531810ca 2620 write_lock(&kvm->mmu_lock);
767d8d8d 2621 for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
9ad17b10 2622 pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
7ae680eb
XG
2623 sp->role.word);
2624 r = 1;
f41d335a 2625 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
7ae680eb 2626 }
d98ba053 2627 kvm_mmu_commit_zap_page(kvm, &invalid_list);
531810ca 2628 write_unlock(&kvm->mmu_lock);
1cb3f3ae 2629
a436036b 2630 return r;
cea0f0e7 2631}
96ad91ae
SC
2632
2633static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2634{
2635 gpa_t gpa;
2636 int r;
2637
347a0d0d 2638 if (vcpu->arch.mmu->root_role.direct)
96ad91ae
SC
2639 return 0;
2640
2641 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
2642
2643 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
2644
2645 return r;
2646}
cea0f0e7 2647
4d78d0b3 2648static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
9cf5cf5a
XG
2649{
2650 trace_kvm_mmu_unsync_page(sp);
4d78d0b3 2651 ++kvm->stat.mmu_unsync;
9cf5cf5a
XG
2652 sp->unsync = 1;
2653
2654 kvm_mmu_mark_parents_unsync(sp);
9cf5cf5a
XG
2655}
2656
0337f585
SC
2657/*
2658 * Attempt to unsync any shadow pages that can be reached by the specified gfn,
2659 * KVM is creating a writable mapping for said gfn. Returns 0 if all pages
2660 * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must
2661 * be write-protected.
2662 */
8283e36a 2663int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
2839180c 2664 gfn_t gfn, bool can_unsync, bool prefetch)
4731d4c7 2665{
5c520e90 2666 struct kvm_mmu_page *sp;
ce25681d 2667 bool locked = false;
4731d4c7 2668
0337f585
SC
2669 /*
2670 * Force write-protection if the page is being tracked. Note, the page
2671 * track machinery is used to write-protect upper-level shadow pages,
2672 * i.e. this guards the role.level == 4K assertion below!
2673 */
4d78d0b3 2674 if (kvm_slot_page_track_is_active(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE))
0337f585 2675 return -EPERM;
9cf5cf5a 2676
0337f585
SC
2677 /*
2678 * The page is not write-tracked, mark existing shadow pages unsync
2679 * unless KVM is synchronizing an unsync SP (can_unsync = false). In
2680 * that case, KVM must complete emulation of the guest TLB flush before
2681 * allowing shadow pages to become unsync (writable by the guest).
2682 */
767d8d8d 2683 for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
36a2e677 2684 if (!can_unsync)
0337f585 2685 return -EPERM;
36a2e677 2686
5c520e90
XG
2687 if (sp->unsync)
2688 continue;
9cf5cf5a 2689
2839180c 2690 if (prefetch)
f1c4a88c
LJ
2691 return -EEXIST;
2692
ce25681d
SC
2693 /*
2694 * TDP MMU page faults require an additional spinlock as they
2695 * run with mmu_lock held for read, not write, and the unsync
2696 * logic is not thread-safe. Take the spinlock regardless of
2697 * the MMU type to avoid extra conditionals/parameters, there's
2698 * no meaningful penalty if mmu_lock is held for write.
2699 */
2700 if (!locked) {
2701 locked = true;
4d78d0b3 2702 spin_lock(&kvm->arch.mmu_unsync_pages_lock);
ce25681d
SC
2703
2704 /*
2705 * Recheck after taking the spinlock, a different vCPU
2706 * may have since marked the page unsync. A false
2707 * positive on the unprotected check above is not
2708 * possible as clearing sp->unsync _must_ hold mmu_lock
2709 * for write, i.e. unsync cannot transition from 0->1
2710 * while this CPU holds mmu_lock for read (or write).
2711 */
2712 if (READ_ONCE(sp->unsync))
2713 continue;
2714 }
2715
3bae0459 2716 WARN_ON(sp->role.level != PG_LEVEL_4K);
4d78d0b3 2717 kvm_unsync_page(kvm, sp);
4731d4c7 2718 }
ce25681d 2719 if (locked)
4d78d0b3 2720 spin_unlock(&kvm->arch.mmu_unsync_pages_lock);
3d0c27ad 2721
578e1c4d
JS
2722 /*
2723 * We need to ensure that the marking of unsync pages is visible
2724 * before the SPTE is updated to allow writes because
2725 * kvm_mmu_sync_roots() checks the unsync flags without holding
2726 * the MMU lock and so can race with this. If the SPTE was updated
2727 * before the page had been marked as unsync-ed, something like the
2728 * following could happen:
2729 *
2730 * CPU 1 CPU 2
2731 * ---------------------------------------------------------------------
2732 * 1.2 Host updates SPTE
2733 * to be writable
2734 * 2.1 Guest writes a GPTE for GVA X.
2735 * (GPTE being in the guest page table shadowed
2736 * by the SP from CPU 1.)
2737 * This reads SPTE during the page table walk.
2738 * Since SPTE.W is read as 1, there is no
2739 * fault.
2740 *
2741 * 2.2 Guest issues TLB flush.
2742 * That causes a VM Exit.
2743 *
0337f585
SC
2744 * 2.3 Walking of unsync pages sees sp->unsync is
2745 * false and skips the page.
578e1c4d
JS
2746 *
2747 * 2.4 Guest accesses GVA X.
2748 * Since the mapping in the SP was not updated,
2749 * the old mapping for GVA X incorrectly
2750 * gets used.
2751 * 1.1 Host marks SP
2752 * as unsync
2753 * (sp->unsync = true)
2754 *
2755 * The write barrier below ensures that 1.1 happens before 1.2 and thus
264d3dc1
LJ
2756 * the situation in 2.4 does not arise. It pairs with the read barrier
2757 * in is_unsync_root(), placed between 2.1's load of SPTE.W and 2.3.
578e1c4d
JS
2758 */
2759 smp_wmb();
2760
0337f585 2761 return 0;
4731d4c7
MT
2762}
2763
8a9f566a
DM
2764static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
2765 u64 *sptep, unsigned int pte_access, gfn_t gfn,
a12f4381 2766 kvm_pfn_t pfn, struct kvm_page_fault *fault)
1e73f9dd 2767{
d786c778 2768 struct kvm_mmu_page *sp = sptep_to_sp(sptep);
eb5cd7ff 2769 int level = sp->role.level;
1e73f9dd 2770 int was_rmapped = 0;
c4371c2a 2771 int ret = RET_PF_FIXED;
c2a4eadf 2772 bool flush = false;
ad67e480 2773 bool wrprot;
d786c778 2774 u64 spte;
1e73f9dd 2775
a12f4381
PB
2776 /* Prefetching always gets a writable pfn. */
2777 bool host_writable = !fault || fault->map_writable;
2839180c 2778 bool prefetch = !fault || fault->prefetch;
a12f4381 2779 bool write_fault = fault && fault->write;
1e73f9dd 2780
f7616203
XG
2781 pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
2782 *sptep, write_fault, gfn);
1e73f9dd 2783
a54aa15c 2784 if (unlikely(is_noslot_pfn(pfn))) {
1075d41e 2785 vcpu->stat.pf_mmio_spte_created++;
a54aa15c
SC
2786 mark_mmio_spte(vcpu, sptep, gfn, pte_access);
2787 return RET_PF_EMULATE;
2788 }
2789
afd28fe1 2790 if (is_shadow_present_pte(*sptep)) {
1e73f9dd
MT
2791 /*
2792 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
2793 * the parent of the now unreachable PTE.
2794 */
3bae0459 2795 if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) {
1e73f9dd 2796 struct kvm_mmu_page *child;
d555c333 2797 u64 pte = *sptep;
1e73f9dd 2798
2ca3129e 2799 child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK);
bcdd9a93 2800 drop_parent_pte(child, sptep);
c2a4eadf 2801 flush = true;
d555c333 2802 } else if (pfn != spte_to_pfn(*sptep)) {
9ad17b10 2803 pgprintk("hfn old %llx new %llx\n",
d555c333 2804 spte_to_pfn(*sptep), pfn);
c3707958 2805 drop_spte(vcpu->kvm, sptep);
c2a4eadf 2806 flush = true;
6bed6b9e
JR
2807 } else
2808 was_rmapped = 1;
1e73f9dd 2809 }
852e3c19 2810
2839180c 2811 wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch,
7158bee4 2812 true, host_writable, &spte);
d786c778
PB
2813
2814 if (*sptep == spte) {
2815 ret = RET_PF_SPURIOUS;
2816 } else {
d786c778 2817 flush |= mmu_spte_update(sptep, spte);
5959ff4a 2818 trace_kvm_mmu_set_spte(level, gfn, sptep);
d786c778
PB
2819 }
2820
ad67e480 2821 if (wrprot) {
1e73f9dd 2822 if (write_fault)
9b8ebbdb 2823 ret = RET_PF_EMULATE;
a378b4e6 2824 }
c3134ce2 2825
d786c778 2826 if (flush)
c3134ce2
LT
2827 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
2828 KVM_PAGES_PER_HPAGE(level));
1e73f9dd 2829
d555c333 2830 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
05da4558 2831
4293ddb7 2832 if (!was_rmapped) {
d786c778 2833 WARN_ON_ONCE(ret == RET_PF_SPURIOUS);
6a97575d
DM
2834 rmap_add(vcpu, slot, sptep, gfn, pte_access);
2835 } else {
2836 /* Already rmapped but the pte_access bits may have changed. */
79e48cec 2837 kvm_mmu_page_set_access(sp, spte_index(sptep), pte_access);
1c4f1fd6 2838 }
cb9aaa30 2839
9b8ebbdb 2840 return ret;
1c4f1fd6
AK
2841}
2842
957ed9ef
XG
2843static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2844 struct kvm_mmu_page *sp,
2845 u64 *start, u64 *end)
2846{
2847 struct page *pages[PTE_PREFETCH_NUM];
d9ef13c2 2848 struct kvm_memory_slot *slot;
0a2b64c5 2849 unsigned int access = sp->role.access;
957ed9ef
XG
2850 int i, ret;
2851 gfn_t gfn;
2852
79e48cec 2853 gfn = kvm_mmu_page_get_gfn(sp, spte_index(start));
d9ef13c2
PB
2854 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
2855 if (!slot)
957ed9ef
XG
2856 return -1;
2857
d9ef13c2 2858 ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
957ed9ef
XG
2859 if (ret <= 0)
2860 return -1;
2861
43fdcda9 2862 for (i = 0; i < ret; i++, gfn++, start++) {
8a9f566a 2863 mmu_set_spte(vcpu, slot, start, access, gfn,
a12f4381 2864 page_to_pfn(pages[i]), NULL);
43fdcda9
JS
2865 put_page(pages[i]);
2866 }
957ed9ef
XG
2867
2868 return 0;
2869}
2870
2871static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
2872 struct kvm_mmu_page *sp, u64 *sptep)
2873{
2874 u64 *spte, *start = NULL;
2875 int i;
2876
2877 WARN_ON(!sp->role.direct);
2878
79e48cec 2879 i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1);
957ed9ef
XG
2880 spte = sp->spt + i;
2881
2882 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
c3707958 2883 if (is_shadow_present_pte(*spte) || spte == sptep) {
957ed9ef
XG
2884 if (!start)
2885 continue;
2886 if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
c6cecc4b 2887 return;
957ed9ef
XG
2888 start = NULL;
2889 } else if (!start)
2890 start = spte;
2891 }
c6cecc4b
SC
2892 if (start)
2893 direct_pte_prefetch_many(vcpu, sp, start, spte);
957ed9ef
XG
2894}
2895
2896static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
2897{
2898 struct kvm_mmu_page *sp;
2899
57354682 2900 sp = sptep_to_sp(sptep);
ac8d57e5 2901
957ed9ef 2902 /*
ac8d57e5
PF
2903 * Without accessed bits, there's no way to distinguish between
2904 * actually accessed translations and prefetched, so disable pte
2905 * prefetch if accessed bits aren't available.
957ed9ef 2906 */
ac8d57e5 2907 if (sp_ad_disabled(sp))
957ed9ef
XG
2908 return;
2909
3bae0459 2910 if (sp->role.level > PG_LEVEL_4K)
957ed9ef
XG
2911 return;
2912
4a42d848
DS
2913 /*
2914 * If addresses are being invalidated, skip prefetching to avoid
2915 * accidentally prefetching those addresses.
2916 */
20ec3ebd 2917 if (unlikely(vcpu->kvm->mmu_invalidate_in_progress))
4a42d848
DS
2918 return;
2919
957ed9ef
XG
2920 __direct_pte_prefetch(vcpu, sp, sptep);
2921}
2922
65e3b446
SC
2923/*
2924 * Lookup the mapping level for @gfn in the current mm.
2925 *
2926 * WARNING! Use of host_pfn_mapping_level() requires the caller and the end
2927 * consumer to be tied into KVM's handlers for MMU notifier events!
2928 *
2929 * There are several ways to safely use this helper:
2930 *
20ec3ebd 2931 * - Check mmu_invalidate_retry_hva() after grabbing the mapping level, before
65e3b446
SC
2932 * consuming it. In this case, mmu_lock doesn't need to be held during the
2933 * lookup, but it does need to be held while checking the MMU notifier.
2934 *
2935 * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation
2936 * event for the hva. This can be done by explicit checking the MMU notifier
2937 * or by ensuring that KVM already has a valid mapping that covers the hva.
2938 *
2939 * - Do not use the result to install new mappings, e.g. use the host mapping
2940 * level only to decide whether or not to zap an entry. In this case, it's
2941 * not required to hold mmu_lock (though it's highly likely the caller will
2942 * want to hold mmu_lock anyway, e.g. to modify SPTEs).
2943 *
2944 * Note! The lookup can still race with modifications to host page tables, but
2945 * the above "rules" ensure KVM will not _consume_ the result of the walk if a
2946 * race with the primary MMU occurs.
2947 */
a8ac499b 2948static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
8ca6f063 2949 const struct kvm_memory_slot *slot)
db543216 2950{
284dc493 2951 int level = PG_LEVEL_4K;
db543216 2952 unsigned long hva;
44187235 2953 unsigned long flags;
44187235
MZ
2954 pgd_t pgd;
2955 p4d_t p4d;
2956 pud_t pud;
2957 pmd_t pmd;
db543216 2958
293e306e
SC
2959 /*
2960 * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
2961 * is not solely for performance, it's also necessary to avoid the
2962 * "writable" check in __gfn_to_hva_many(), which will always fail on
2963 * read-only memslots due to gfn_to_hva() assuming writes. Earlier
2964 * page fault steps have already verified the guest isn't writing a
2965 * read-only memslot.
2966 */
db543216
SC
2967 hva = __gfn_to_hva_memslot(slot, gfn);
2968
44187235 2969 /*
65e3b446
SC
2970 * Disable IRQs to prevent concurrent tear down of host page tables,
2971 * e.g. if the primary MMU promotes a P*D to a huge page and then frees
2972 * the original page table.
44187235
MZ
2973 */
2974 local_irq_save(flags);
2975
65e3b446
SC
2976 /*
2977 * Read each entry once. As above, a non-leaf entry can be promoted to
2978 * a huge page _during_ this walk. Re-reading the entry could send the
2979 * walk into the weeds, e.g. p*d_large() returns false (sees the old
2980 * value) and then p*d_offset() walks into the target huge page instead
2981 * of the old page table (sees the new value).
2982 */
44187235
MZ
2983 pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
2984 if (pgd_none(pgd))
2985 goto out;
2986
2987 p4d = READ_ONCE(*p4d_offset(&pgd, hva));
2988 if (p4d_none(p4d) || !p4d_present(p4d))
2989 goto out;
db543216 2990
44187235
MZ
2991 pud = READ_ONCE(*pud_offset(&p4d, hva));
2992 if (pud_none(pud) || !pud_present(pud))
2993 goto out;
2994
2995 if (pud_large(pud)) {
2996 level = PG_LEVEL_1G;
2997 goto out;
2998 }
2999
3000 pmd = READ_ONCE(*pmd_offset(&pud, hva));
3001 if (pmd_none(pmd) || !pmd_present(pmd))
3002 goto out;
3003
3004 if (pmd_large(pmd))
3005 level = PG_LEVEL_2M;
3006
3007out:
3008 local_irq_restore(flags);
db543216
SC
3009 return level;
3010}
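
/*
 * Sketch of usage pattern #1 from the "rules" comment above (illustrative
 * only, not upstream code): the caller snapshots mmu_seq from
 * kvm->mmu_invalidate_seq before the lookup and re-checks the notifier
 * sequence under mmu_lock before consuming the result.
 *
 *	level = host_pfn_mapping_level(kvm, gfn, slot);
 *
 *	write_lock(&kvm->mmu_lock);
 *	if (mmu_invalidate_retry_hva(kvm, mmu_seq, hva))
 *		goto retry;
 *
 * If an invalidation raced with the lookup, the result is discarded and
 * the fault is retried instead of being consumed.
 */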
3011
8ca6f063
BG
3012int kvm_mmu_max_mapping_level(struct kvm *kvm,
3013 const struct kvm_memory_slot *slot, gfn_t gfn,
a8ac499b 3014 int max_level)
1b6d9d9e
SC
3015{
3016 struct kvm_lpage_info *linfo;
ec607a56 3017 int host_level;
1b6d9d9e
SC
3018
3019 max_level = min(max_level, max_huge_page_level);
3020 for ( ; max_level > PG_LEVEL_4K; max_level--) {
3021 linfo = lpage_info_slot(gfn, slot, max_level);
3022 if (!linfo->disallow_lpage)
3023 break;
3024 }
3025
3026 if (max_level == PG_LEVEL_4K)
3027 return PG_LEVEL_4K;
3028
a8ac499b 3029 host_level = host_pfn_mapping_level(kvm, gfn, slot);
ec607a56 3030 return min(host_level, max_level);
1b6d9d9e
SC
3031}
3032
73a3c659 3033void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
0885904d 3034{
e710c5f6 3035 struct kvm_memory_slot *slot = fault->slot;
17eff019
SC
3036 kvm_pfn_t mask;
3037
73a3c659 3038 fault->huge_page_disallowed = fault->exec && fault->nx_huge_page_workaround_enabled;
3cf06612 3039
73a3c659
PB
3040 if (unlikely(fault->max_level == PG_LEVEL_4K))
3041 return;
17eff019 3042
5d49f08c 3043 if (is_error_noslot_pfn(fault->pfn))
73a3c659 3044 return;
17eff019 3045
e710c5f6 3046 if (kvm_slot_dirty_track_enabled(slot))
73a3c659 3047 return;
293e306e 3048
3cf06612
SC
3049 /*
3050 * Enforce the iTLB multihit workaround after capturing the requested
3051 * level, which will be used to do precise, accurate accounting.
3052 */
73a3c659 3053 fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
a8ac499b 3054 fault->gfn, fault->max_level);
73a3c659
PB
3055 if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
3056 return;
0885904d
SC
3057
3058 /*
20ec3ebd 3059 * mmu_invalidate_retry() was successful and mmu_lock is held, so
17eff019 3060 * the pmd can't be split from under us.
0885904d 3061 */
73a3c659
PB
3062 fault->goal_level = fault->req_level;
3063 mask = KVM_PAGES_PER_HPAGE(fault->goal_level) - 1;
3064 VM_BUG_ON((fault->gfn & mask) != (fault->pfn & mask));
3065 fault->pfn &= ~mask;
0885904d
SC
3066}
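
/*
 * Numeric illustration of the alignment above (not upstream code): with a
 * 2MiB goal level, KVM_PAGES_PER_HPAGE() is 512, so mask = 0x1ff and
 * fault->pfn is rounded down to the 2MiB-aligned base of the host huge
 * page; the low 9 bits of the gfn then select the 4KiB page within that
 * huge mapping.
 */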
3067
536f0e6a 3068void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level)
b8e8c830 3069{
536f0e6a
PB
3070 if (cur_level > PG_LEVEL_4K &&
3071 cur_level == fault->goal_level &&
b8e8c830
PB
3072 is_shadow_present_pte(spte) &&
3073 !is_large_pte(spte)) {
3074 /*
3075 * A small SPTE exists for this pfn, but FNAME(fetch)
3076 * and __direct_map would like to create a large PTE
3077 * instead: just force them to go down another level,
3078 * patching the next 9 bits of the address back into
3079 * pfn for them.
3080 */
536f0e6a
PB
3081 u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) -
3082 KVM_PAGES_PER_HPAGE(cur_level - 1);
3083 fault->pfn |= fault->gfn & page_mask;
3084 fault->goal_level--;
b8e8c830
PB
3085 }
3086}
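
/*
 * Continuing the numeric illustration (not upstream code): when cur_level
 * is PG_LEVEL_2M, page_mask = 512 - 1 = 0x1ff, so the 9 gfn bits that the
 * rejected 2MiB mapping would have covered are folded back into fault->pfn
 * before descending to the 4KiB level, keeping pfn and gfn consistent for
 * the smaller mapping.
 */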
3087
43b74355 3088static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
140754bc 3089{
3fcf2d1b 3090 struct kvm_shadow_walk_iterator it;
140754bc 3091 struct kvm_mmu_page *sp;
73a3c659 3092 int ret;
43b74355 3093 gfn_t base_gfn = fault->gfn;
6aa8b732 3094
73a3c659 3095 kvm_mmu_hugepage_adjust(vcpu, fault);
4cd071d1 3096
f0066d94 3097 trace_kvm_mmu_spte_requested(fault);
43b74355 3098 for_each_shadow_entry(vcpu, fault->addr, it) {
b8e8c830
PB
3099 /*
3100 * We cannot overwrite existing page tables with an NX
3101 * large page, as the leaf could be executable.
3102 */
73a3c659 3103 if (fault->nx_huge_page_workaround_enabled)
536f0e6a 3104 disallowed_hugepage_adjust(fault, *it.sptep, it.level);
b8e8c830 3105
43b74355 3106 base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
73a3c659 3107 if (it.level == fault->goal_level)
9f652d21 3108 break;
6aa8b732 3109
2e65e842 3110 sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL);
0cd8dc73
PB
3111 if (sp == ERR_PTR(-EEXIST))
3112 continue;
03fffc54
SC
3113
3114 link_shadow_page(vcpu, it.sptep, sp);
73a3c659
PB
3115 if (fault->is_tdp && fault->huge_page_disallowed &&
3116 fault->req_level >= it.level)
03fffc54 3117 account_huge_nx_page(vcpu->kvm, sp);
9f652d21 3118 }
3fcf2d1b 3119
b1a429fb
SC
3120 if (WARN_ON_ONCE(it.level != fault->goal_level))
3121 return -EFAULT;
3122
8a9f566a 3123 ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL,
a12f4381 3124 base_gfn, fault->pfn, fault);
12703759
SC
3125 if (ret == RET_PF_SPURIOUS)
3126 return ret;
3127
3fcf2d1b 3128 direct_pte_prefetch(vcpu, it.sptep);
3fcf2d1b 3129 return ret;
6aa8b732
AK
3130}
3131
77db5cbd 3132static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
bf998156 3133{
585a8b9b 3134 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
bf998156
HY
3135}
3136
ba049e93 3137static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
bf998156 3138{
4d8b81ab
XG
3139 /*
3140 * Do not cache the mmio info caused by writing the readonly gfn
3141 * into the spte otherwise read access on readonly gfn also can
3142 * caused mmio page fault and treat it as mmio access.
4d8b81ab
XG
3143 */
3144 if (pfn == KVM_PFN_ERR_RO_FAULT)
9b8ebbdb 3145 return RET_PF_EMULATE;
4d8b81ab 3146
e6c1502b 3147 if (pfn == KVM_PFN_ERR_HWPOISON) {
54bf36aa 3148 kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
9b8ebbdb 3149 return RET_PF_RETRY;
d7c55201 3150 }
edba23e5 3151
2c151b25 3152 return -EFAULT;
bf998156
HY
3153}
3154
5276c616
SC
3155static int handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
3156 unsigned int access)
d7c55201 3157{
d7c55201 3158 /* The pfn is invalid, report the error! */
5276c616
SC
3159 if (unlikely(is_error_pfn(fault->pfn)))
3160 return kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn);
d7c55201 3161
e710c5f6 3162 if (unlikely(!fault->slot)) {
3a13f4fe
PB
3163 gva_t gva = fault->is_tdp ? 0 : fault->addr;
3164
3165 vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
4af77151 3166 access & shadow_mmio_access_mask);
30ab5901
SC
3167 /*
3168 * If MMIO caching is disabled, emulate immediately without
3169 * touching the shadow page tables as attempting to install an
86931ff7
SC
3170 * MMIO SPTE will just be an expensive nop. Do not cache MMIO
3171 * whose gfn is greater than host.MAXPHYADDR, any guest that
3172 * generates such gfns is running nested and is being tricked
3173 * by L0 userspace (you can observe gfn > L1.MAXPHYADDR if
3174 * and only if L1's MAXPHYADDR is inaccurate with respect to
3175 * the hardware's).
30ab5901 3176 */
8b9e74bf 3177 if (unlikely(!enable_mmio_caching) ||
5276c616
SC
3178 unlikely(fault->gfn > kvm_mmu_max_gfn()))
3179 return RET_PF_EMULATE;
30ab5901 3180 }
d7c55201 3181
5276c616 3182 return RET_PF_CONTINUE;
d7c55201
XG
3183}
3184
3c8ad5a6 3185static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
c7ba5b48 3186{
1c118b82 3187 /*
5c64aba5
SC
3188 * Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only
3189 * reach the common page fault handler if the SPTE has an invalid MMIO
3190 * generation number. Refreshing the MMIO generation needs to go down
3191 * the slow path. Note, EPT Misconfigs do NOT set the PRESENT flag!
1c118b82 3192 */
3c8ad5a6 3193 if (fault->rsvd)
1c118b82
XG
3194 return false;
3195
c7ba5b48 3196 /*
f160c7b7 3197 * #PF can be fast if:
f160c7b7 3198 *
54275f74
SC
3199 * 1. The shadow page table entry is not present and A/D bits are
3200 * disabled _by KVM_, which could mean that the fault is potentially
3201 * caused by access tracking (if enabled). If A/D bits are enabled
3202 * by KVM, but disabled by L1 for L2, KVM is forced to disable A/D
3203 * bits for L2 and employ access tracking, but the fast page fault
3204 * mechanism only supports direct MMUs.
3205 * 2. The shadow page table entry is present, the access is a write,
3206 * and no reserved bits are set (MMIO SPTEs cannot be "fixed"), i.e.
3207 * the fault was caused by a write-protection violation. If the
3208 * SPTE is MMU-writable (determined later), the fault can be fixed
3209 * by setting the Writable bit, which can be done out of mmu_lock.
c7ba5b48 3210 */
5c64aba5
SC
3211 if (!fault->present)
3212 return !kvm_ad_enabled();
3213
3214 /*
3215 * Note, instruction fetches and writes are mutually exclusive, ignore
3216 * the "exec" flag.
3217 */
3218 return fault->write;
c7ba5b48
XG
3219}
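
/*
 * Recap of the checks above (descriptive only, no functional content):
 *
 *	fault->rsvd			-> never fast (MMIO generation refresh)
 *	!fault->present			-> fast only if !kvm_ad_enabled(),
 *					   i.e. possibly access tracking
 *	fault->present && fault->write	-> fast (write-protection fix-up)
 *	fault->present && !fault->write	-> not fast
 */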
3220
97dceba2
JS
3221/*
3222 * Returns true if the SPTE was fixed successfully. Otherwise,
3223 * someone else modified the SPTE from its original value.
3224 */
c7ba5b48 3225static bool
e710c5f6 3226fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
d3e328f2 3227 u64 *sptep, u64 old_spte, u64 new_spte)
c7ba5b48 3228{
9b51a630
KH
3229 /*
3230 * Theoretically we could also set dirty bit (and flush TLB) here in
3231 * order to eliminate unnecessary PML logging. See comments in
3232 * set_spte. But fast_page_fault is very unlikely to happen with PML
3233 * enabled, so we do not do this. This might result in the same GPA
3235 * being logged in the PML buffer again when the write really happens,
3236 * and eventually being passed to mark_page_dirty twice. But that does
3237 * no harm. This also avoids the TLB flush needed after setting the
3238 * dirty bit, so non-PML cases won't be impacted.
3238 *
3239 * Compare with set_spte where instead shadow_dirty_mask is set.
3240 */
2db2f46f 3241 if (!try_cmpxchg64(sptep, &old_spte, new_spte))
97dceba2
JS
3242 return false;
3243
e710c5f6
DM
3244 if (is_writable_pte(new_spte) && !is_writable_pte(old_spte))
3245 mark_page_dirty_in_slot(vcpu->kvm, fault->slot, fault->gfn);
c7ba5b48
XG
3246
3247 return true;
3248}
3249
3c8ad5a6 3250static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte)
d3e328f2 3251{
3c8ad5a6 3252 if (fault->exec)
d3e328f2
JS
3253 return is_executable_pte(spte);
3254
3c8ad5a6 3255 if (fault->write)
d3e328f2
JS
3256 return is_writable_pte(spte);
3257
3258 /* Fault was on Read access */
3259 return spte & PT_PRESENT_MASK;
3260}
3261
6e8eb206
DM
3262/*
3263 * Returns the last level spte pointer of the shadow page walk for the given
3264 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
3265 * walk could be performed, returns NULL and *spte does not contain valid data.
3266 *
3267 * Contract:
3268 * - Must be called between walk_shadow_page_lockless_{begin,end}.
3269 * - The returned sptep must not be used after walk_shadow_page_lockless_end.
3270 */
3271static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte)
3272{
3273 struct kvm_shadow_walk_iterator iterator;
3274 u64 old_spte;
3275 u64 *sptep = NULL;
3276
3277 for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) {
3278 sptep = iterator.sptep;
3279 *spte = old_spte;
6e8eb206
DM
3280 }
3281
3282 return sptep;
3283}
3284
c7ba5b48 3285/*
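 * Try to fix the page fault without taking mmu_lock. Only faults that can be
 * resolved by flipping bits in an existing, shadow-present leaf SPTE are
 * handled here: spurious faults, access-track restoration, and
 * write-protection faults on MMU-writable SPTEs.
 *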
c4371c2a 3286 * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
c7ba5b48 3287 */
3c8ad5a6 3288static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
c7ba5b48 3289{
92a476cb 3290 struct kvm_mmu_page *sp;
c4371c2a 3291 int ret = RET_PF_INVALID;
c7ba5b48 3292 u64 spte = 0ull;
6e8eb206 3293 u64 *sptep = NULL;
97dceba2 3294 uint retry_count = 0;
c7ba5b48 3295
3c8ad5a6 3296 if (!page_fault_can_be_fast(fault))
c4371c2a 3297 return ret;
c7ba5b48
XG
3298
3299 walk_shadow_page_lockless_begin(vcpu);
c7ba5b48 3300
97dceba2 3301 do {
d3e328f2 3302 u64 new_spte;
c7ba5b48 3303
6e8eb206 3304 if (is_tdp_mmu(vcpu->arch.mmu))
3c8ad5a6 3305 sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
6e8eb206 3306 else
3c8ad5a6 3307 sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
d162f30a 3308
ec89e643
SC
3309 if (!is_shadow_present_pte(spte))
3310 break;
3311
6e8eb206 3312 sp = sptep_to_sp(sptep);
97dceba2
JS
3313 if (!is_last_spte(spte, sp->role.level))
3314 break;
c7ba5b48 3315
97dceba2 3316 /*
f160c7b7
JS
3317 * Check whether the memory access that caused the fault would
3318 * still cause it if it were to be performed right now. If not,
3319 * then this is a spurious fault caused by a lazily flushed TLB,
3320 * or some other CPU has already fixed the PTE after the
3321 * current CPU took the fault.
97dceba2
JS
3322 *
3323 * Need not check the access of upper level table entries since
3324 * they are always ACC_ALL.
3325 */
3c8ad5a6 3326 if (is_access_allowed(fault, spte)) {
c4371c2a 3327 ret = RET_PF_SPURIOUS;
d3e328f2
JS
3328 break;
3329 }
f160c7b7 3330
d3e328f2
JS
3331 new_spte = spte;
3332
54275f74
SC
3333 /*
3334 * KVM only supports fixing page faults outside of MMU lock for
3335 * direct MMUs, nested MMUs are always indirect, and KVM always
3336 * uses A/D bits for non-nested MMUs. Thus, if A/D bits are
3337 * enabled, the SPTE can't be an access-tracked SPTE.
3338 */
3339 if (unlikely(!kvm_ad_enabled()) && is_access_track_spte(spte))
d3e328f2
JS
3340 new_spte = restore_acc_track_spte(new_spte);
3341
3342 /*
54275f74
SC
3343 * To keep things simple, only SPTEs that are MMU-writable can
3344 * be made fully writable outside of mmu_lock, e.g. only SPTEs
3345 * that were write-protected for dirty-logging or access
3346 * tracking are handled here. Don't bother checking if the
3347 * SPTE is writable to prioritize running with A/D bits enabled.
3348 * The is_access_allowed() check above handles the common case
3349 * of the fault being spurious, and the SPTE is known to be
3350 * shadow-present, i.e. except for access tracking restoration
3351 * making the new SPTE writable, the check is wasteful.
d3e328f2 3352 */
706c9c55 3353 if (fault->write && is_mmu_writable_spte(spte)) {
d3e328f2 3354 new_spte |= PT_WRITABLE_MASK;
f160c7b7
JS
3355
3356 /*
10c30de0
JS
3357 * Do not fix write-permission on the large spte when
3358 * dirty logging is enabled. Since we only dirty the
3359 * first page into the dirty-bitmap in
d3e328f2
JS
3360 * fast_pf_fix_direct_spte(), other pages are missed
3361 * if its slot has dirty logging enabled.
3362 *
3363 * Instead, we let the slow page fault path create a
3364 * normal spte to fix the access.
f160c7b7 3365 */
10c30de0
JS
3366 if (sp->role.level > PG_LEVEL_4K &&
3367 kvm_slot_dirty_track_enabled(fault->slot))
f160c7b7 3368 break;
97dceba2 3369 }
c7ba5b48 3370
f160c7b7 3371 /* Verify that the fault can be handled in the fast path */
d3e328f2 3372 if (new_spte == spte ||
3c8ad5a6 3373 !is_access_allowed(fault, new_spte))
97dceba2
JS
3374 break;
3375
3376 /*
3377 * Currently, fast page fault only works for direct mapping
3378 * since the gfn is not stable for indirect shadow page. See
3ecad8c2 3379 * Documentation/virt/kvm/locking.rst to get more detail.
97dceba2 3380 */
e710c5f6 3381 if (fast_pf_fix_direct_spte(vcpu, fault, sptep, spte, new_spte)) {
c4371c2a 3382 ret = RET_PF_FIXED;
97dceba2 3383 break;
c4371c2a 3384 }
97dceba2
JS
3385
3386 if (++retry_count > 4) {
3387 printk_once(KERN_WARNING
3388 "kvm: Fast #PF retrying more than 4 times.\n");
3389 break;
3390 }
3391
97dceba2 3392 } while (true);
c126d94f 3393
f0066d94 3394 trace_fast_page_fault(vcpu, fault, sptep, spte, ret);
c7ba5b48
XG
3395 walk_shadow_page_lockless_end(vcpu);
3396
1075d41e
SC
3397 if (ret != RET_PF_INVALID)
3398 vcpu->stat.pf_fast++;
3399
c4371c2a 3400 return ret;
c7ba5b48
XG
3401}
3402
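/*
 * Drop a reference on the root pointed at by *root_hpa and mark it invalid.
 * TDP MMU roots are put directly; shadow MMU roots are queued on
 * @invalid_list once their root_count drops to zero and their role is
 * marked invalid.
 */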
74b566e6
JS
3403static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
3404 struct list_head *invalid_list)
17ac10ad 3405{
4db35314 3406 struct kvm_mmu_page *sp;
17ac10ad 3407
74b566e6 3408 if (!VALID_PAGE(*root_hpa))
7b53aa56 3409 return;
35af577a 3410
2ca3129e 3411 sp = to_shadow_page(*root_hpa & SPTE_BASE_ADDR_MASK);
9191b8f0
PB
3412 if (WARN_ON(!sp))
3413 return;
02c00b3a 3414
2bdb3d84 3415 if (is_tdp_mmu_page(sp))
6103bc07 3416 kvm_tdp_mmu_put_root(kvm, sp, false);
76eb54e7
BG
3417 else if (!--sp->root_count && sp->role.invalid)
3418 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
17ac10ad 3419
74b566e6
JS
3420 *root_hpa = INVALID_PAGE;
3421}
3422
08fb59d8 3423/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
0c1c92f1 3424void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
6a82cd1c 3425 ulong roots_to_free)
74b566e6
JS
3426{
3427 int i;
3428 LIST_HEAD(invalid_list);
594bef79 3429 bool free_active_root;
74b566e6 3430
b94742c9 3431 BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
74b566e6 3432
08fb59d8 3433 /* Before acquiring the MMU lock, see if we need to do any real work. */
594bef79
PB
3434 free_active_root = (roots_to_free & KVM_MMU_ROOT_CURRENT)
3435 && VALID_PAGE(mmu->root.hpa);
3436
3437 if (!free_active_root) {
b94742c9
JS
3438 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3439 if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
3440 VALID_PAGE(mmu->prev_roots[i].hpa))
3441 break;
3442
3443 if (i == KVM_MMU_NUM_PREV_ROOTS)
3444 return;
3445 }
35af577a 3446
531810ca 3447 write_lock(&kvm->mmu_lock);
17ac10ad 3448
b94742c9
JS
3449 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3450 if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
4d710de9 3451 mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa,
b94742c9 3452 &invalid_list);
7c390d35 3453
08fb59d8 3454 if (free_active_root) {
594bef79 3455 if (to_shadow_page(mmu->root.hpa)) {
b9e5603c 3456 mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
04d45551 3457 } else if (mmu->pae_root) {
c834e5e4
SC
3458 for (i = 0; i < 4; ++i) {
3459 if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
3460 continue;
3461
3462 mmu_free_root_page(kvm, &mmu->pae_root[i],
3463 &invalid_list);
3464 mmu->pae_root[i] = INVALID_PAE_ROOT;
3465 }
08fb59d8 3466 }
b9e5603c
PB
3467 mmu->root.hpa = INVALID_PAGE;
3468 mmu->root.pgd = 0;
17ac10ad 3469 }
74b566e6 3470
4d710de9 3471 kvm_mmu_commit_zap_page(kvm, &invalid_list);
531810ca 3472 write_unlock(&kvm->mmu_lock);
17ac10ad 3473}
74b566e6 3474EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
17ac10ad 3475
0c1c92f1 3476void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
25b62c62
SC
3477{
3478 unsigned long roots_to_free = 0;
3479 hpa_t root_hpa;
3480 int i;
3481
3482 /*
3483 * This should not be called while L2 is active, L2 can't invalidate
3484 * _only_ its own roots, e.g. INVVPID unconditionally exits.
3485 */
7a458f0e 3486 WARN_ON_ONCE(mmu->root_role.guest_mode);
25b62c62
SC
3487
3488 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
3489 root_hpa = mmu->prev_roots[i].hpa;
3490 if (!VALID_PAGE(root_hpa))
3491 continue;
3492
3493 if (!to_shadow_page(root_hpa) ||
3494 to_shadow_page(root_hpa)->role.guest_mode)
3495 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
3496 }
3497
0c1c92f1 3498 kvm_mmu_free_roots(kvm, mmu, roots_to_free);
25b62c62
SC
3499}
3500EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots);
3501
3502
8986ecc0
MT
3503static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
3504{
3505 int ret = 0;
3506
995decb6 3507 if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
a8eeb04a 3508 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
8986ecc0
MT
3509 ret = 1;
3510 }
3511
3512 return ret;
3513}
3514
2e65e842 3515static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant,
86938ab6 3516 u8 level)
651dd37a 3517{
2e65e842 3518 union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
651dd37a 3519 struct kvm_mmu_page *sp;
8123f265 3520
2e65e842 3521 role.level = level;
7f497775 3522 role.quadrant = quadrant;
2e65e842 3523
7f497775
DM
3524 WARN_ON_ONCE(quadrant && !role.has_4_byte_gpte);
3525 WARN_ON_ONCE(role.direct && role.has_4_byte_gpte);
2e65e842 3526
87654643 3527 sp = kvm_mmu_get_shadow_page(vcpu, gfn, role);
8123f265
SC
3528 ++sp->root_count;
3529
8123f265
SC
3530 return __pa(sp->spt);
3531}
3532
3533static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
3534{
b37233c9 3535 struct kvm_mmu *mmu = vcpu->arch.mmu;
a972e29c 3536 u8 shadow_root_level = mmu->root_role.level;
8123f265 3537 hpa_t root;
7ebaf15e 3538 unsigned i;
4a38162e
PB
3539 int r;
3540
3541 write_lock(&vcpu->kvm->mmu_lock);
3542 r = make_mmu_pages_available(vcpu);
3543 if (r < 0)
3544 goto out_unlock;
651dd37a 3545
897218ff 3546 if (is_tdp_mmu_enabled(vcpu->kvm)) {
02c00b3a 3547 root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
b9e5603c 3548 mmu->root.hpa = root;
02c00b3a 3549 } else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
86938ab6 3550 root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level);
b9e5603c 3551 mmu->root.hpa = root;
8123f265 3552 } else if (shadow_root_level == PT32E_ROOT_LEVEL) {
4a38162e
PB
3553 if (WARN_ON_ONCE(!mmu->pae_root)) {
3554 r = -EIO;
3555 goto out_unlock;
3556 }
73ad1606 3557
651dd37a 3558 for (i = 0; i < 4; ++i) {
c834e5e4 3559 WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
651dd37a 3560
7f497775 3561 root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), 0,
2e65e842 3562 PT32_ROOT_LEVEL);
17e368d9 3563 mmu->pae_root[i] = root | PT_PRESENT_MASK |
d2263de1 3564 shadow_me_value;
651dd37a 3565 }
b9e5603c 3566 mmu->root.hpa = __pa(mmu->pae_root);
73ad1606
SC
3567 } else {
3568 WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level);
4a38162e
PB
3569 r = -EIO;
3570 goto out_unlock;
73ad1606 3571 }
3651c7fc 3572
b9e5603c
PB
3573 /* root.pgd is ignored for direct MMUs. */
3574 mmu->root.pgd = 0;
4a38162e
PB
3575out_unlock:
3576 write_unlock(&vcpu->kvm->mmu_lock);
3577 return r;
651dd37a
JR
3578}
3579
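/*
 * Allocate the rmap and write-tracking metadata for all memslots when the
 * VM's first shadow root is created. This is a nop if the metadata already
 * exists, e.g. everything is allocated upfront when TDP is disabled.
 */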
1e76a3ce
DS
3580static int mmu_first_shadow_root_alloc(struct kvm *kvm)
3581{
3582 struct kvm_memslots *slots;
3583 struct kvm_memory_slot *slot;
a54d8066 3584 int r = 0, i, bkt;
1e76a3ce
DS
3585
3586 /*
3587 * Check if this is the first shadow root being allocated before
3588 * taking the lock.
3589 */
3590 if (kvm_shadow_root_allocated(kvm))
3591 return 0;
3592
3593 mutex_lock(&kvm->slots_arch_lock);
3594
3595 /* Recheck, under the lock, whether this is the first shadow root. */
3596 if (kvm_shadow_root_allocated(kvm))
3597 goto out_unlock;
3598
3599 /*
3600 * Check if anything actually needs to be allocated, e.g. all metadata
3601 * will be allocated upfront if TDP is disabled.
3602 */
3603 if (kvm_memslots_have_rmaps(kvm) &&
3604 kvm_page_track_write_tracking_enabled(kvm))
3605 goto out_success;
3606
3607 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
3608 slots = __kvm_memslots(kvm, i);
a54d8066 3609 kvm_for_each_memslot(slot, bkt, slots) {
1e76a3ce
DS
3610 /*
3611 * Both of these functions are no-ops if the target is
3612 * already allocated, so unconditionally calling both
3613 * is safe. Intentionally do NOT free allocations on
3614 * failure to avoid having to track which allocations
3615 * were made now versus when the memslot was created.
3616 * The metadata is guaranteed to be freed when the slot
3617 * is freed, and will be kept/used if userspace retries
3618 * KVM_RUN instead of killing the VM.
3619 */
3620 r = memslot_rmap_alloc(slot, slot->npages);
3621 if (r)
3622 goto out_unlock;
3623 r = kvm_page_track_write_tracking_alloc(slot);
3624 if (r)
3625 goto out_unlock;
3626 }
3627 }
3628
3629 /*
3630 * Ensure that shadow_root_allocated becomes true strictly after
3631 * all the related pointers are set.
3632 */
3633out_success:
3634 smp_store_release(&kvm->arch.shadow_root_allocated, true);
3635
3636out_unlock:
3637 mutex_unlock(&kvm->slots_arch_lock);
3638 return r;
3639}
3640
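/*
 * Set up the root(s) for a shadowing MMU: shadow the guest's root page
 * table directly for long mode guests, or build the special pae_root (plus
 * pml4/pml5 wrappers when the host's shadow paging is deeper than the
 * guest's) for 32-bit and PAE guests.
 */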
651dd37a 3641static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
17ac10ad 3642{
b37233c9 3643 struct kvm_mmu *mmu = vcpu->arch.mmu;
6e0918ae 3644 u64 pdptrs[4], pm_mask;
be01e8e2 3645 gfn_t root_gfn, root_pgd;
7f497775 3646 int quadrant, i, r;
8123f265 3647 hpa_t root;
3bb65a22 3648
b37233c9 3649 root_pgd = mmu->get_guest_pgd(vcpu);
be01e8e2 3650 root_gfn = root_pgd >> PAGE_SHIFT;
17ac10ad 3651
651dd37a
JR
3652 if (mmu_check_root(vcpu, root_gfn))
3653 return 1;
3654
4a38162e
PB
3655 /*
3656 * On SVM, reading PDPTRs might access guest memory, which might fault
3657 * and thus might sleep. Grab the PDPTRs before acquiring mmu_lock.
3658 */
4d25502a 3659 if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
6e0918ae
SC
3660 for (i = 0; i < 4; ++i) {
3661 pdptrs[i] = mmu->get_pdptr(vcpu, i);
3662 if (!(pdptrs[i] & PT_PRESENT_MASK))
3663 continue;
3664
3665 if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT))
3666 return 1;
3667 }
3668 }
3669
1e76a3ce 3670 r = mmu_first_shadow_root_alloc(vcpu->kvm);
d501f747
BG
3671 if (r)
3672 return r;
3673
4a38162e
PB
3674 write_lock(&vcpu->kvm->mmu_lock);
3675 r = make_mmu_pages_available(vcpu);
3676 if (r < 0)
3677 goto out_unlock;
3678
651dd37a
JR
3679 /*
3680 * Do we shadow a long mode page table? If so we need to
3681 * write-protect the guests page table root.
3682 */
4d25502a 3683 if (mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
8123f265 3684 root = mmu_alloc_root(vcpu, root_gfn, 0,
86938ab6 3685 mmu->root_role.level);
b9e5603c 3686 mmu->root.hpa = root;
be01e8e2 3687 goto set_root_pgd;
17ac10ad 3688 }
f87f9288 3689
4a38162e
PB
3690 if (WARN_ON_ONCE(!mmu->pae_root)) {
3691 r = -EIO;
3692 goto out_unlock;
3693 }
73ad1606 3694
651dd37a
JR
3695 /*
3696 * We shadow a 32 bit page table. This may be a legacy 2-level
81407ca5
JR
3697 * or a PAE 3-level page table. In either case we need to be aware that
3698 * the shadow page table may be a PAE or a long mode page table.
651dd37a 3699 */
e54f1ff2 3700 pm_mask = PT_PRESENT_MASK | shadow_me_value;
a972e29c 3701 if (mmu->root_role.level >= PT64_ROOT_4LEVEL) {
81407ca5
JR
3702 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
3703
03ca4589 3704 if (WARN_ON_ONCE(!mmu->pml4_root)) {
4a38162e
PB
3705 r = -EIO;
3706 goto out_unlock;
3707 }
03ca4589 3708 mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
cb0f722a 3709
a972e29c 3710 if (mmu->root_role.level == PT64_ROOT_5LEVEL) {
cb0f722a
WH
3711 if (WARN_ON_ONCE(!mmu->pml5_root)) {
3712 r = -EIO;
3713 goto out_unlock;
3714 }
3715 mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask;
3716 }
04d45551
SC
3717 }
3718
17ac10ad 3719 for (i = 0; i < 4; ++i) {
c834e5e4 3720 WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
6e6ec584 3721
4d25502a 3722 if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
6e0918ae 3723 if (!(pdptrs[i] & PT_PRESENT_MASK)) {
c834e5e4 3724 mmu->pae_root[i] = INVALID_PAE_ROOT;
417726a3
AK
3725 continue;
3726 }
6e0918ae 3727 root_gfn = pdptrs[i] >> PAGE_SHIFT;
5a7388c2 3728 }
8facbbff 3729
7f497775
DM
3730 /*
3731 * If shadowing 32-bit non-PAE page tables, each PAE page
3732 * directory maps one quarter of the guest's non-PAE page
3733 * directory. Otherwise each PAE page directory shadows one guest
3734 * PAE page directory, so the quadrant should be 0.
3735 */
3736 quadrant = (mmu->cpu_role.base.level == PT32_ROOT_LEVEL) ? i : 0;
3737
3738 root = mmu_alloc_root(vcpu, root_gfn, quadrant, PT32_ROOT_LEVEL);
b37233c9 3739 mmu->pae_root[i] = root | pm_mask;
17ac10ad 3740 }
81407ca5 3741
a972e29c 3742 if (mmu->root_role.level == PT64_ROOT_5LEVEL)
b9e5603c 3743 mmu->root.hpa = __pa(mmu->pml5_root);
a972e29c 3744 else if (mmu->root_role.level == PT64_ROOT_4LEVEL)
b9e5603c 3745 mmu->root.hpa = __pa(mmu->pml4_root);
ba0a194f 3746 else
b9e5603c 3747 mmu->root.hpa = __pa(mmu->pae_root);
81407ca5 3748
be01e8e2 3749set_root_pgd:
b9e5603c 3750 mmu->root.pgd = root_pgd;
4a38162e
PB
3751out_unlock:
3752 write_unlock(&vcpu->kvm->mmu_lock);
ad7dc69a 3753
c6c937d6 3754 return r;
17ac10ad
AK
3755}
3756
748e52b9
SC
3757static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
3758{
3759 struct kvm_mmu *mmu = vcpu->arch.mmu;
a972e29c 3760 bool need_pml5 = mmu->root_role.level > PT64_ROOT_4LEVEL;
cb0f722a
WH
3761 u64 *pml5_root = NULL;
3762 u64 *pml4_root = NULL;
3763 u64 *pae_root;
81407ca5
JR
3764
3765 /*
748e52b9
SC
3766 * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
3767 * tables are allocated and initialized at root creation as there is no
3768 * equivalent level in the guest's NPT to shadow. Allocate the tables
3769 * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
81407ca5 3770 */
347a0d0d
PB
3771 if (mmu->root_role.direct ||
3772 mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL ||
a972e29c 3773 mmu->root_role.level < PT64_ROOT_4LEVEL)
748e52b9 3774 return 0;
81407ca5 3775
a717a780
SC
3776 /*
3777 * NPT, the only paging mode that uses this horror, uses a fixed number
3778 * of levels for the shadow page tables, e.g. all MMUs are 4-level or
3779 * all MMUs are 5-level. Thus, this can safely require that pml5_root
3780 * is allocated if the other roots are valid and pml5 is needed, as any
3781 * prior MMU would also have required pml5.
3782 */
3783 if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root))
748e52b9 3784 return 0;
81407ca5 3785
748e52b9
SC
3786 /*
3787 * The special roots should always be allocated in concert. Yell and
3788 * bail if KVM ends up in a state where only one of the roots is valid.
3789 */
cb0f722a 3790 if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root ||
a717a780 3791 (need_pml5 && mmu->pml5_root)))
748e52b9 3792 return -EIO;
81407ca5 3793
4a98623d
SC
3794 /*
3795 * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and
3796 * doesn't need to be decrypted.
3797 */
748e52b9
SC
3798 pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3799 if (!pae_root)
3800 return -ENOMEM;
81407ca5 3801
cb0f722a 3802#ifdef CONFIG_X86_64
03ca4589 3803 pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
cb0f722a
WH
3804 if (!pml4_root)
3805 goto err_pml4;
3806
a717a780 3807 if (need_pml5) {
cb0f722a
WH
3808 pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3809 if (!pml5_root)
3810 goto err_pml5;
81407ca5 3811 }
cb0f722a 3812#endif
81407ca5 3813
748e52b9 3814 mmu->pae_root = pae_root;
03ca4589 3815 mmu->pml4_root = pml4_root;
cb0f722a 3816 mmu->pml5_root = pml5_root;
ad7dc69a 3817
8986ecc0 3818 return 0;
cb0f722a
WH
3819
3820#ifdef CONFIG_X86_64
3821err_pml5:
3822 free_page((unsigned long)pml4_root);
3823err_pml4:
3824 free_page((unsigned long)pae_root);
3825 return -ENOMEM;
3826#endif
17ac10ad
AK
3827}
3828
264d3dc1
LJ
3829static bool is_unsync_root(hpa_t root)
3830{
3831 struct kvm_mmu_page *sp;
3832
61b05a9f
LJ
3833 if (!VALID_PAGE(root))
3834 return false;
3835
264d3dc1
LJ
3836 /*
3837 * The read barrier orders the CPU's read of SPTE.W during the page table
3838 * walk before the reads of sp->unsync/sp->unsync_children here.
3839 *
3840 * Even if another CPU was marking the SP as unsync-ed simultaneously,
3841 * any guest page table changes are not guaranteed to be visible anyway
3842 * until this VCPU issues a TLB flush strictly after those changes are
3843 * made. We only need to ensure that the other CPU sets these flags
3844 * before any actual changes to the page tables are made. The comments
3845 * in mmu_try_to_unsync_pages() describe what could go wrong if this
3846 * requirement isn't satisfied.
3847 */
3848 smp_rmb();
3849 sp = to_shadow_page(root);
5d6a3221
SC
3850
3851 /*
3852 * PAE roots (somewhat arbitrarily) aren't backed by shadow pages; the
3853 * PDPTEs for a given PAE root need to be synchronized individually.
3854 */
3855 if (WARN_ON_ONCE(!sp))
3856 return false;
3857
264d3dc1
LJ
3858 if (sp->unsync || sp->unsync_children)
3859 return true;
3860
3861 return false;
3862}
3863
578e1c4d 3864void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
0ba73cda
MT
3865{
3866 int i;
3867 struct kvm_mmu_page *sp;
3868
347a0d0d 3869 if (vcpu->arch.mmu->root_role.direct)
81407ca5
JR
3870 return;
3871
b9e5603c 3872 if (!VALID_PAGE(vcpu->arch.mmu->root.hpa))
0ba73cda 3873 return;
6903074c 3874
56f17dd3 3875 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
578e1c4d 3876
4d25502a 3877 if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
b9e5603c 3878 hpa_t root = vcpu->arch.mmu->root.hpa;
e47c4aee 3879 sp = to_shadow_page(root);
578e1c4d 3880
264d3dc1 3881 if (!is_unsync_root(root))
578e1c4d
JS
3882 return;
3883
531810ca 3884 write_lock(&vcpu->kvm->mmu_lock);
65855ed8 3885 mmu_sync_children(vcpu, sp, true);
531810ca 3886 write_unlock(&vcpu->kvm->mmu_lock);
0ba73cda
MT
3887 return;
3888 }
578e1c4d 3889
531810ca 3890 write_lock(&vcpu->kvm->mmu_lock);
578e1c4d 3891
0ba73cda 3892 for (i = 0; i < 4; ++i) {
44dd3ffa 3893 hpa_t root = vcpu->arch.mmu->pae_root[i];
0ba73cda 3894
c834e5e4 3895 if (IS_VALID_PAE_ROOT(root)) {
2ca3129e 3896 root &= SPTE_BASE_ADDR_MASK;
e47c4aee 3897 sp = to_shadow_page(root);
65855ed8 3898 mmu_sync_children(vcpu, sp, true);
0ba73cda
MT
3899 }
3900 }
0ba73cda 3901
531810ca 3902 write_unlock(&vcpu->kvm->mmu_lock);
0ba73cda
MT
3903}
3904
61b05a9f
LJ
3905void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu)
3906{
3907 unsigned long roots_to_free = 0;
3908 int i;
3909
3910 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3911 if (is_unsync_root(vcpu->arch.mmu->prev_roots[i].hpa))
3912 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
3913
3914 /* sync prev_roots by simply freeing them */
0c1c92f1 3915 kvm_mmu_free_roots(vcpu->kvm, vcpu->arch.mmu, roots_to_free);
61b05a9f
LJ
3916}
3917
1f5a21ee 3918static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
5b22bbe7 3919 gpa_t vaddr, u64 access,
1f5a21ee 3920 struct x86_exception *exception)
6aa8b732 3921{
ab9ae313
AK
3922 if (exception)
3923 exception->error_code = 0;
c59a0f57 3924 return kvm_translate_gpa(vcpu, mmu, vaddr, access, exception);
6539e738
JR
3925}
3926
ded58749 3927static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
ce88decf 3928{
9034e6e8
PB
3929 /*
3930 * A nested guest cannot use the MMIO cache if it is using nested
3931 * page tables, because cr2 is a nGPA while the cache stores GPAs.
3932 */
3933 if (mmu_is_nested(vcpu))
3934 return false;
3935
ce88decf
XG
3936 if (direct)
3937 return vcpu_match_mmio_gpa(vcpu, addr);
3938
3939 return vcpu_match_mmio_gva(vcpu, addr);
3940}
3941
95fb5b02
BG
3942/*
3943 * Return the level of the lowest level SPTE added to sptes.
3944 * That SPTE may be non-present.
c5c8c7c5
DM
3945 *
3946 * Must be called between walk_shadow_page_lockless_{begin,end}.
95fb5b02 3947 */
39b4d43e 3948static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level)
ce88decf
XG
3949{
3950 struct kvm_shadow_walk_iterator iterator;
2aa07893 3951 int leaf = -1;
95fb5b02 3952 u64 spte;
ce88decf 3953
39b4d43e
SC
3954 for (shadow_walk_init(&iterator, vcpu, addr),
3955 *root_level = iterator.level;
47ab8751
XG
3956 shadow_walk_okay(&iterator);
3957 __shadow_walk_next(&iterator, spte)) {
95fb5b02 3958 leaf = iterator.level;
47ab8751
XG
3959 spte = mmu_spte_get_lockless(iterator.sptep);
3960
dde81f94 3961 sptes[leaf] = spte;
95fb5b02
BG
3962 }
3963
95fb5b02
BG
3964 return leaf;
3965}
3966
9aa41879 3967/* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */
95fb5b02
BG
3968static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
3969{
dde81f94 3970 u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
95fb5b02 3971 struct rsvd_bits_validate *rsvd_check;
39b4d43e 3972 int root, leaf, level;
95fb5b02
BG
3973 bool reserved = false;
3974
c5c8c7c5
DM
3975 walk_shadow_page_lockless_begin(vcpu);
3976
63c0cac9 3977 if (is_tdp_mmu(vcpu->arch.mmu))
39b4d43e 3978 leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root);
95fb5b02 3979 else
39b4d43e 3980 leaf = get_walk(vcpu, addr, sptes, &root);
95fb5b02 3981
c5c8c7c5
DM
3982 walk_shadow_page_lockless_end(vcpu);
3983
2aa07893
SC
3984 if (unlikely(leaf < 0)) {
3985 *sptep = 0ull;
3986 return reserved;
3987 }
3988
9aa41879
SC
3989 *sptep = sptes[leaf];
3990
3991 /*
3992 * Skip reserved bits checks on the terminal leaf if it's not a valid
3993 * SPTE. Note, this also (intentionally) skips MMIO SPTEs, which, by
3994 * design, always have reserved bits set. The purpose of the checks is
3995 * to detect reserved bits on non-MMIO SPTEs, i.e. buggy SPTEs.
3996 */
3997 if (!is_shadow_present_pte(sptes[leaf]))
3998 leaf++;
95fb5b02
BG
3999
4000 rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
4001
9aa41879 4002 for (level = root; level >= leaf; level--)
961f8445 4003 reserved |= is_rsvd_spte(rsvd_check, sptes[level], level);
47ab8751 4004
47ab8751 4005 if (reserved) {
bb4cdf3a 4006 pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n",
47ab8751 4007 __func__, addr);
95fb5b02 4008 for (level = root; level >= leaf; level--)
bb4cdf3a
SC
4009 pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx",
4010 sptes[level], level,
961f8445 4011 get_rsvd_bits(rsvd_check, sptes[level], level));
47ab8751 4012 }
ddce6208 4013
47ab8751 4014 return reserved;
ce88decf
XG
4015}
4016
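/*
 * Handle a fault on what should be an MMIO SPTE: emulate the access if a
 * valid MMIO SPTE (or cached MMIO info) is found, return RET_PF_INVALID if
 * the MMIO generation is stale, and let the CPU retry if the SPTE has been
 * zapped by another CPU.
 */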
e08d26f0 4017static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
ce88decf
XG
4018{
4019 u64 spte;
47ab8751 4020 bool reserved;
ce88decf 4021
ded58749 4022 if (mmio_info_in_cache(vcpu, addr, direct))
9b8ebbdb 4023 return RET_PF_EMULATE;
ce88decf 4024
95fb5b02 4025 reserved = get_mmio_spte(vcpu, addr, &spte);
450869d6 4026 if (WARN_ON(reserved))
9b8ebbdb 4027 return -EINVAL;
ce88decf
XG
4028
4029 if (is_mmio_spte(spte)) {
4030 gfn_t gfn = get_mmio_spte_gfn(spte);
0a2b64c5 4031 unsigned int access = get_mmio_spte_access(spte);
ce88decf 4032
54bf36aa 4033 if (!check_mmio_spte(vcpu, spte))
9b8ebbdb 4034 return RET_PF_INVALID;
f8f55942 4035
ce88decf
XG
4036 if (direct)
4037 addr = 0;
4f022648
XG
4038
4039 trace_handle_mmio_page_fault(addr, gfn, access);
ce88decf 4040 vcpu_cache_mmio_info(vcpu, addr, gfn, access);
9b8ebbdb 4041 return RET_PF_EMULATE;
ce88decf
XG
4042 }
4043
ce88decf
XG
4044 /*
4045 * If the page table is zapped by other cpus, let CPU fault again on
4046 * the address.
4047 */
9b8ebbdb 4048 return RET_PF_RETRY;
ce88decf 4049}
ce88decf 4050
3d0c27ad 4051static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
b8a5d551 4052 struct kvm_page_fault *fault)
3d0c27ad 4053{
b8a5d551 4054 if (unlikely(fault->rsvd))
3d0c27ad
XG
4055 return false;
4056
b8a5d551 4057 if (!fault->present || !fault->write)
3d0c27ad
XG
4058 return false;
4059
4060 /*
4061 * The guest is writing a page that is write-tracked, which cannot
4062 * be fixed by the page fault handler.
4063 */
9d395a0a 4064 if (kvm_slot_page_track_is_active(vcpu->kvm, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE))
3d0c27ad
XG
4065 return true;
4066
4067 return false;
4068}
4069
e5691a81
XG
4070static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
4071{
4072 struct kvm_shadow_walk_iterator iterator;
4073 u64 spte;
4074
e5691a81 4075 walk_shadow_page_lockless_begin(vcpu);
3e44dce4 4076 for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
e5691a81 4077 clear_sp_write_flooding_count(iterator.sptep);
e5691a81
XG
4078 walk_shadow_page_lockless_end(vcpu);
4079}
4080
6f3c1fc5
LZ
4081static u32 alloc_apf_token(struct kvm_vcpu *vcpu)
4082{
4083 /* make sure the token value is not 0 */
4084 u32 id = vcpu->arch.apf.id;
4085
4086 if (id << 12 == 0)
4087 vcpu->arch.apf.id = 1;
4088
4089 return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
4090}
4091
e8c22266
VK
4092static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
4093 gfn_t gfn)
af585b92
GN
4094{
4095 struct kvm_arch_async_pf arch;
fb67e14f 4096
6f3c1fc5 4097 arch.token = alloc_apf_token(vcpu);
af585b92 4098 arch.gfn = gfn;
347a0d0d 4099 arch.direct_map = vcpu->arch.mmu->root_role.direct;
d8dd54e0 4100 arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);
af585b92 4101
9f1a8526
SC
4102 return kvm_setup_async_pf(vcpu, cr2_or_gpa,
4103 kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
af585b92
GN
4104}
4105
8a009d5b
SC
4106void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
4107{
4108 int r;
4109
4110 if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) ||
4111 work->wakeup_all)
4112 return;
4113
4114 r = kvm_mmu_reload(vcpu);
4115 if (unlikely(r))
4116 return;
4117
4118 if (!vcpu->arch.mmu->root_role.direct &&
4119 work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
4120 return;
4121
4122 kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
4123}
4124
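/*
 * Resolve the faulting gfn to a host pfn, going through the async page
 * fault machinery when the pfn cannot be produced without waiting. Returns
 * RET_PF_CONTINUE if the fault can proceed, or RET_PF_RETRY/RET_PF_EMULATE
 * to cut the fault handling short.
 */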
5276c616 4125static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
af585b92 4126{
e710c5f6 4127 struct kvm_memory_slot *slot = fault->slot;
af585b92
GN
4128 bool async;
4129
e0c37868
SC
4130 /*
4131 * Retry the page fault if the gfn hit a memslot that is being deleted
4132 * or moved. This ensures any existing SPTEs for the old memslot will
4133 * be zapped before KVM inserts a new MMIO SPTE for the gfn.
4134 */
4135 if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
5276c616 4136 return RET_PF_RETRY;
e0c37868 4137
9cc13d60
ML
4138 if (!kvm_is_visible_memslot(slot)) {
4139 /* Don't expose private memslots to L2. */
4140 if (is_guest_mode(vcpu)) {
e710c5f6 4141 fault->slot = NULL;
3647cd04
PB
4142 fault->pfn = KVM_PFN_NOSLOT;
4143 fault->map_writable = false;
5276c616 4144 return RET_PF_CONTINUE;
9cc13d60
ML
4145 }
4146 /*
4147 * If the APIC access page exists but is disabled, go directly
4148 * to emulation without caching the MMIO access or creating a
4149 * MMIO SPTE. That way the cache doesn't need to be purged
4150 * when the AVIC is re-enabled.
4151 */
4152 if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
5276c616
SC
4153 !kvm_apicv_activated(vcpu->kvm))
4154 return RET_PF_EMULATE;
3a2936de
JM
4155 }
4156
3520469d 4157 async = false;
3647cd04
PB
4158 fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async,
4159 fault->write, &fault->map_writable,
4160 &fault->hva);
af585b92 4161 if (!async)
5276c616 4162 return RET_PF_CONTINUE; /* *pfn has correct page already */
af585b92 4163
2839180c 4164 if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) {
3647cd04
PB
4165 trace_kvm_try_async_get_page(fault->addr, fault->gfn);
4166 if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) {
1685c0f3 4167 trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn);
af585b92 4168 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
5276c616
SC
4169 return RET_PF_RETRY;
4170 } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) {
4171 return RET_PF_RETRY;
4172 }
af585b92
GN
4173 }
4174
3647cd04
PB
4175 fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, NULL,
4176 fault->write, &fault->map_writable,
4177 &fault->hva);
5276c616 4178 return RET_PF_CONTINUE;
af585b92
GN
4179}
4180
a955cad8
SC
4181/*
4182 * Returns true if the page fault is stale and needs to be retried, i.e. if the
4183 * root was invalidated by a memslot update or a relevant mmu_notifier fired.
4184 */
4185static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
4186 struct kvm_page_fault *fault, int mmu_seq)
4187{
b9e5603c 4188 struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa);
18c841e1
SC
4189
4190 /* Special roots, e.g. pae_root, are not backed by shadow pages. */
4191 if (sp && is_obsolete_sp(vcpu->kvm, sp))
4192 return true;
4193
4194 /*
4195 * Roots without an associated shadow page are considered invalid if
4196 * there is a pending request to free obsolete roots. The request is
4197 * only a hint that the current root _may_ be obsolete and needs to be
4198 * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a
4199 * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs
4200 * to reload even if no vCPU is actively using the root.
4201 */
527d5cd7 4202 if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
a955cad8
SC
4203 return true;
4204
4205 return fault->slot &&
20ec3ebd 4206 mmu_invalidate_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
a955cad8
SC
4207}
4208
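/*
 * Common page fault handler for direct MMUs (TDP and nonpaging): try the
 * lockless fast path, resolve the pfn, and then map it under mmu_lock via
 * either the TDP MMU or __direct_map().
 */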
4326e57e 4209static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
6aa8b732 4210{
63c0cac9 4211 bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu);
6aa8b732 4212
0f90e1c1 4213 unsigned long mmu_seq;
83f06fa7 4214 int r;
ce88decf 4215
3c8ad5a6 4216 fault->gfn = fault->addr >> PAGE_SHIFT;
e710c5f6
DM
4217 fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn);
4218
b8a5d551 4219 if (page_fault_handle_page_track(vcpu, fault))
9b8ebbdb 4220 return RET_PF_EMULATE;
ce88decf 4221
3c8ad5a6 4222 r = fast_page_fault(vcpu, fault);
6e8eb206
DM
4223 if (r != RET_PF_INVALID)
4224 return r;
83291445 4225
378f5cd6 4226 r = mmu_topup_memory_caches(vcpu, false);
e2dec939
AK
4227 if (r)
4228 return r;
714b93da 4229
20ec3ebd 4230 mmu_seq = vcpu->kvm->mmu_invalidate_seq;
367fd790
SC
4231 smp_rmb();
4232
5276c616
SC
4233 r = kvm_faultin_pfn(vcpu, fault);
4234 if (r != RET_PF_CONTINUE)
8f32d5e5 4235 return r;
367fd790 4236
5276c616
SC
4237 r = handle_abnormal_pfn(vcpu, fault, ACC_ALL);
4238 if (r != RET_PF_CONTINUE)
367fd790 4239 return r;
6aa8b732 4240
367fd790 4241 r = RET_PF_RETRY;
a2855afc 4242
0b873fd7 4243 if (is_tdp_mmu_fault)
a2855afc
BG
4244 read_lock(&vcpu->kvm->mmu_lock);
4245 else
4246 write_lock(&vcpu->kvm->mmu_lock);
4247
a955cad8 4248 if (is_page_fault_stale(vcpu, fault, mmu_seq))
367fd790 4249 goto out_unlock;
a955cad8 4250
7bd7ded6
SC
4251 r = make_mmu_pages_available(vcpu);
4252 if (r)
367fd790 4253 goto out_unlock;
bb18842e 4254
0b873fd7 4255 if (is_tdp_mmu_fault)
2f6305dd 4256 r = kvm_tdp_mmu_map(vcpu, fault);
bb18842e 4257 else
43b74355 4258 r = __direct_map(vcpu, fault);
0f90e1c1 4259
367fd790 4260out_unlock:
0b873fd7 4261 if (is_tdp_mmu_fault)
a2855afc
BG
4262 read_unlock(&vcpu->kvm->mmu_lock);
4263 else
4264 write_unlock(&vcpu->kvm->mmu_lock);
3647cd04 4265 kvm_release_pfn_clean(fault->pfn);
367fd790 4266 return r;
6aa8b732
AK
4267}
4268
c501040a
PB
4269static int nonpaging_page_fault(struct kvm_vcpu *vcpu,
4270 struct kvm_page_fault *fault)
0f90e1c1 4271{
4326e57e 4272 pgprintk("%s: gva %lx error %x\n", __func__, fault->addr, fault->error_code);
0f90e1c1
SC
4273
4274 /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */
4326e57e
PB
4275 fault->max_level = PG_LEVEL_2M;
4276 return direct_page_fault(vcpu, fault);
0f90e1c1
SC
4277}
4278
1261bfa3 4279int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
d0006530 4280 u64 fault_address, char *insn, int insn_len)
1261bfa3
WL
4281{
4282 int r = 1;
9ce372b3 4283 u32 flags = vcpu->arch.apf.host_apf_flags;
1261bfa3 4284
736c291c
SC
4285#ifndef CONFIG_X86_64
4286 /* A 64-bit CR2 should be impossible on 32-bit KVM. */
4287 if (WARN_ON_ONCE(fault_address >> 32))
4288 return -EFAULT;
4289#endif
4290
c595ceee 4291 vcpu->arch.l1tf_flush_l1d = true;
9ce372b3 4292 if (!flags) {
1261bfa3
WL
4293 trace_kvm_page_fault(fault_address, error_code);
4294
d0006530 4295 if (kvm_event_needs_reinjection(vcpu))
1261bfa3
WL
4296 kvm_mmu_unprotect_page_virt(vcpu, fault_address);
4297 r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
4298 insn_len);
9ce372b3 4299 } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
68fd66f1 4300 vcpu->arch.apf.host_apf_flags = 0;
1261bfa3 4301 local_irq_disable();
6bca69ad 4302 kvm_async_pf_task_wait_schedule(fault_address);
1261bfa3 4303 local_irq_enable();
9ce372b3
VK
4304 } else {
4305 WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags);
1261bfa3 4306 }
9ce372b3 4307
1261bfa3
WL
4308 return r;
4309}
4310EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
4311
c501040a 4312int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
fb72d167 4313{
d5e90a69
SC
4314 /*
4315 * If the guest's MTRRs may be used to compute the "real" memtype,
4316 * restrict the mapping level to ensure KVM uses a consistent memtype
4317 * across the entire mapping. If the host MTRRs are ignored by TDP
4318 * (shadow_memtype_mask is non-zero), and the VM has non-coherent DMA
4319 * (DMA doesn't snoop CPU caches), KVM's ABI is to honor the memtype
4320 * from the guest's MTRRs so that guest accesses to memory that is
4321 * DMA'd aren't cached against the guest's wishes.
4322 *
4323 * Note, KVM may still ultimately ignore guest MTRRs for certain PFNs,
4324 * e.g. KVM will force UC memtype for host MMIO.
4325 */
4326 if (shadow_memtype_mask && kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
4327 for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) {
4328 int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
4329 gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1);
4326e57e 4330
d5e90a69
SC
4331 if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
4332 break;
4333 }
fd136902 4334 }
852e3c19 4335
4326e57e 4336 return direct_page_fault(vcpu, fault);
fb72d167
JR
4337}
4338
84a16226 4339static void nonpaging_init_context(struct kvm_mmu *context)
6aa8b732 4340{
6aa8b732 4341 context->page_fault = nonpaging_page_fault;
6aa8b732 4342 context->gva_to_gpa = nonpaging_gva_to_gpa;
e8bc217a 4343 context->sync_page = nonpaging_sync_page;
5efac074 4344 context->invlpg = NULL;
6aa8b732
AK
4345}
4346
be01e8e2 4347static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
0be44352
SC
4348 union kvm_mmu_page_role role)
4349{
be01e8e2 4350 return (role.direct || pgd == root->pgd) &&
5499ea73 4351 VALID_PAGE(root->hpa) &&
e47c4aee 4352 role.word == to_shadow_page(root->hpa)->role.word;
0be44352
SC
4353}
4354
b94742c9 4355/*
5499ea73
PB
4356 * Find out if a previously cached root matching the new pgd/role is available,
4357 * and insert the current root as the MRU in the cache.
4358 * If a matching root is found, it is assigned to kvm_mmu->root and
4359 * true is returned.
4360 * If no match is found, kvm_mmu->root is left invalid, the LRU root is
4361 * evicted to make room for the current root, and false is returned.
b94742c9 4362 */
5499ea73
PB
4363static bool cached_root_find_and_keep_current(struct kvm *kvm, struct kvm_mmu *mmu,
4364 gpa_t new_pgd,
4365 union kvm_mmu_page_role new_role)
b94742c9
JS
4366{
4367 uint i;
b94742c9 4368
b9e5603c 4369 if (is_root_usable(&mmu->root, new_pgd, new_role))
0be44352
SC
4370 return true;
4371
b94742c9 4372 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5499ea73
PB
4373 /*
4374 * The swaps end up rotating the cache like this:
4375 * C 0 1 2 3 (on entry to the function)
4376 * 0 C 1 2 3
4377 * 1 C 0 2 3
4378 * 2 C 0 1 3
4379 * 3 C 0 1 2 (on exit from the loop)
4380 */
b9e5603c 4381 swap(mmu->root, mmu->prev_roots[i]);
b9e5603c 4382 if (is_root_usable(&mmu->root, new_pgd, new_role))
5499ea73 4383 return true;
b94742c9
JS
4384 }
4385
5499ea73
PB
4386 kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
4387 return false;
b94742c9
JS
4388}
4389
5499ea73
PB
4390/*
4391 * Find out if a previously cached root matching the new pgd/role is available.
4392 * On entry, mmu->root is invalid.
4393 * If a matching root is found, it is assigned to kvm_mmu->root, the LRU entry
4394 * of the cache becomes invalid, and true is returned.
4395 * If no match is found, kvm_mmu->root is left invalid and false is returned.
4396 */
4397static bool cached_root_find_without_current(struct kvm *kvm, struct kvm_mmu *mmu,
4398 gpa_t new_pgd,
4399 union kvm_mmu_page_role new_role)
6aa8b732 4400{
5499ea73
PB
4401 uint i;
4402
4403 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
4404 if (is_root_usable(&mmu->prev_roots[i], new_pgd, new_role))
4405 goto hit;
7c390d35 4406
5499ea73
PB
4407 return false;
4408
4409hit:
4410 swap(mmu->root, mmu->prev_roots[i]);
4411 /* Bubble up the remaining roots. */
4412 for (; i < KVM_MMU_NUM_PREV_ROOTS - 1; i++)
4413 mmu->prev_roots[i] = mmu->prev_roots[i + 1];
4414 mmu->prev_roots[i].hpa = INVALID_PAGE;
4415 return true;
4416}
4417
4418static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu,
4419 gpa_t new_pgd, union kvm_mmu_page_role new_role)
4420{
7c390d35 4421 /*
5499ea73 4422 * For now, limit the caching to 64-bit hosts+VMs in order to avoid
7c390d35
JS
4423 * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
4424 * later if necessary.
4425 */
5499ea73
PB
4426 if (VALID_PAGE(mmu->root.hpa) && !to_shadow_page(mmu->root.hpa))
4427 kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
7c390d35 4428
5499ea73
PB
4429 if (VALID_PAGE(mmu->root.hpa))
4430 return cached_root_find_and_keep_current(kvm, mmu, new_pgd, new_role);
4431 else
4432 return cached_root_find_without_current(kvm, mmu, new_pgd, new_role);
6aa8b732
AK
4433}
4434
d2e5f333 4435void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
6aa8b732 4436{
0c1c92f1 4437 struct kvm_mmu *mmu = vcpu->arch.mmu;
7a458f0e 4438 union kvm_mmu_page_role new_role = mmu->root_role;
0c1c92f1 4439
5499ea73
PB
4440 if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role)) {
4441 /* kvm_mmu_ensure_valid_pgd will set up a new root. */
b869855b
SC
4442 return;
4443 }
4444
4445 /*
4446 * It's possible that the cached previous root page is obsolete because
4447 * of a change in the MMU generation number. However, changing the
527d5cd7
SC
4448 * generation number is accompanied by KVM_REQ_MMU_FREE_OBSOLETE_ROOTS,
4449 * which will free the root set here and allocate a new one.
b869855b
SC
4450 */
4451 kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
4452
b5129100 4453 if (force_flush_and_sync_on_reuse) {
b869855b
SC
4454 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
4455 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
b5129100 4456 }
b869855b
SC
4457
4458 /*
4459 * The last MMIO access's GVA and GPA are cached in the VCPU. When
4460 * switching to a new CR3, that GVA->GPA mapping may no longer be
4461 * valid. So clear any cached MMIO info even when we don't need to sync
4462 * the shadow page tables.
4463 */
4464 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
4465
daa5b6c1
BG
4466 /*
4467 * If this is a direct root page, it doesn't have a write flooding
4468 * count. Otherwise, clear the write flooding count.
4469 */
4470 if (!new_role.direct)
4471 __clear_sp_write_flooding_count(
b9e5603c 4472 to_shadow_page(vcpu->arch.mmu->root.hpa));
6aa8b732 4473}
be01e8e2 4474EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
0aab33e4 4475
5777ed34
JR
4476static unsigned long get_cr3(struct kvm_vcpu *vcpu)
4477{
9f8fe504 4478 return kvm_read_cr3(vcpu);
5777ed34
JR
4479}
4480
54bf36aa 4481static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
c3e5e415 4482 unsigned int access)
ce88decf
XG
4483{
4484 if (unlikely(is_mmio_spte(*sptep))) {
4485 if (gfn != get_mmio_spte_gfn(*sptep)) {
4486 mmu_spte_clear_no_track(sptep);
4487 return true;
4488 }
4489
54bf36aa 4490 mark_mmio_spte(vcpu, sptep, gfn, access);
ce88decf
XG
4491 return true;
4492 }
4493
4494 return false;
4495}
4496
37406aaa
NHE
4497#define PTTYPE_EPT 18 /* arbitrary */
4498#define PTTYPE PTTYPE_EPT
4499#include "paging_tmpl.h"
4500#undef PTTYPE
4501
6aa8b732
AK
4502#define PTTYPE 64
4503#include "paging_tmpl.h"
4504#undef PTTYPE
4505
4506#define PTTYPE 32
4507#include "paging_tmpl.h"
4508#undef PTTYPE
4509
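/*
 * Compute the reserved-bit masks for each level of a legacy (non-EPT)
 * paging mode, based on the physical-address width, NX, GB-page, PSE and
 * AMD/Intel quirks passed in by the caller.
 */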
6dc98b86 4510static void
b705a277 4511__reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
5b7f575c 4512 u64 pa_bits_rsvd, int level, bool nx, bool gbpages,
6fec2144 4513 bool pse, bool amd)
82725b20 4514{
5f7dde7b 4515 u64 gbpages_bit_rsvd = 0;
a0c0feb5 4516 u64 nonleaf_bit8_rsvd = 0;
5b7f575c 4517 u64 high_bits_rsvd;
82725b20 4518
a0a64f50 4519 rsvd_check->bad_mt_xwr = 0;
25d92081 4520
6dc98b86 4521 if (!gbpages)
5f7dde7b 4522 gbpages_bit_rsvd = rsvd_bits(7, 7);
a0c0feb5 4523
5b7f575c
SC
4524 if (level == PT32E_ROOT_LEVEL)
4525 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 62);
4526 else
4527 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
4528
4529 /* Note, NX doesn't exist in PDPTEs, this is handled below. */
4530 if (!nx)
4531 high_bits_rsvd |= rsvd_bits(63, 63);
4532
a0c0feb5
PB
4533 /*
4534 * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
4535 * leaf entries) on AMD CPUs only.
4536 */
6fec2144 4537 if (amd)
a0c0feb5
PB
4538 nonleaf_bit8_rsvd = rsvd_bits(8, 8);
4539
6dc98b86 4540 switch (level) {
82725b20
DE
4541 case PT32_ROOT_LEVEL:
4542 /* no rsvd bits for 2 level 4K page table entries */
a0a64f50
XG
4543 rsvd_check->rsvd_bits_mask[0][1] = 0;
4544 rsvd_check->rsvd_bits_mask[0][0] = 0;
4545 rsvd_check->rsvd_bits_mask[1][0] =
4546 rsvd_check->rsvd_bits_mask[0][0];
f815bce8 4547
6dc98b86 4548 if (!pse) {
a0a64f50 4549 rsvd_check->rsvd_bits_mask[1][1] = 0;
f815bce8
XG
4550 break;
4551 }
4552
82725b20
DE
4553 if (is_cpuid_PSE36())
4554 /* 36bits PSE 4MB page */
a0a64f50 4555 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
82725b20
DE
4556 else
4557 /* 32 bits PSE 4MB page */
a0a64f50 4558 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
82725b20
DE
4559 break;
4560 case PT32E_ROOT_LEVEL:
5b7f575c
SC
4561 rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) |
4562 high_bits_rsvd |
4563 rsvd_bits(5, 8) |
4564 rsvd_bits(1, 2); /* PDPTE */
4565 rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; /* PDE */
4566 rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* PTE */
4567 rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
4568 rsvd_bits(13, 20); /* large page */
a0a64f50
XG
4569 rsvd_check->rsvd_bits_mask[1][0] =
4570 rsvd_check->rsvd_bits_mask[0][0];
82725b20 4571 break;
855feb67 4572 case PT64_ROOT_5LEVEL:
5b7f575c
SC
4573 rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd |
4574 nonleaf_bit8_rsvd |
4575 rsvd_bits(7, 7);
855feb67
YZ
4576 rsvd_check->rsvd_bits_mask[1][4] =
4577 rsvd_check->rsvd_bits_mask[0][4];
df561f66 4578 fallthrough;
2a7266a8 4579 case PT64_ROOT_4LEVEL:
5b7f575c
SC
4580 rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd |
4581 nonleaf_bit8_rsvd |
4582 rsvd_bits(7, 7);
4583 rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd |
4584 gbpages_bit_rsvd;
4585 rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;
4586 rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
a0a64f50
XG
4587 rsvd_check->rsvd_bits_mask[1][3] =
4588 rsvd_check->rsvd_bits_mask[0][3];
5b7f575c
SC
4589 rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd |
4590 gbpages_bit_rsvd |
4591 rsvd_bits(13, 29);
4592 rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
4593 rsvd_bits(13, 20); /* large page */
a0a64f50
XG
4594 rsvd_check->rsvd_bits_mask[1][0] =
4595 rsvd_check->rsvd_bits_mask[0][0];
82725b20
DE
4596 break;
4597 }
4598}
4599
27de9250
SC
4600static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu)
4601{
4602 /*
4603 * If TDP is enabled, let the guest use GBPAGES if they're supported in
4604 * hardware. The hardware page walker doesn't let KVM disable GBPAGES,
4605 * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA
4606 * walk for performance and complexity reasons. Not to mention KVM
4607 * _can't_ solve the problem because GVA->GPA walks aren't visible to
4608 * KVM once a TDP translation is installed. Mimic hardware behavior so
4609 * that KVM's is at least consistent, i.e. doesn't randomly inject #PF.
4610 */
4611 return tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) :
4612 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES);
4613}
4614
c919e881
KH
4615static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4616 struct kvm_mmu *context)
6dc98b86 4617{
b705a277 4618 __reset_rsvds_bits_mask(&context->guest_rsvd_check,
5b7f575c 4619 vcpu->arch.reserved_gpa_bits,
4d25502a 4620 context->cpu_role.base.level, is_efer_nx(context),
27de9250 4621 guest_can_use_gbpages(vcpu),
4e9c0d80 4622 is_cr4_pse(context),
23493d0a 4623 guest_cpuid_is_amd_or_hygon(vcpu));
6dc98b86
XG
4624}
4625
81b8eebb
XG
4626static void
4627__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
84ea5c09 4628 u64 pa_bits_rsvd, bool execonly, int huge_page_level)
25d92081 4629{
5b7f575c 4630 u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
84ea5c09 4631 u64 large_1g_rsvd = 0, large_2m_rsvd = 0;
951f9fd7 4632 u64 bad_mt_xwr;
25d92081 4633
84ea5c09
LJ
4634 if (huge_page_level < PG_LEVEL_1G)
4635 large_1g_rsvd = rsvd_bits(7, 7);
4636 if (huge_page_level < PG_LEVEL_2M)
4637 large_2m_rsvd = rsvd_bits(7, 7);
4638
5b7f575c
SC
4639 rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7);
4640 rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7);
84ea5c09
LJ
4641 rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd;
4642 rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd;
5b7f575c 4643 rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
25d92081
YZ
4644
4645 /* large page */
855feb67 4646 rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
a0a64f50 4647 rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
84ea5c09
LJ
4648 rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd;
4649 rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd;
a0a64f50 4650 rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
25d92081 4651
951f9fd7
PB
4652 bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */
4653 bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */
4654 bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */
4655 bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */
4656 bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */
4657 if (!execonly) {
4658 /* bits 0..2 must not be 100 unless VMX capabilities allow it */
4659 bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
25d92081 4660 }
951f9fd7 4661 rsvd_check->bad_mt_xwr = bad_mt_xwr;
25d92081
YZ
4662}
4663
81b8eebb 4664static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
84ea5c09 4665 struct kvm_mmu *context, bool execonly, int huge_page_level)
81b8eebb
XG
4666{
4667 __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
84ea5c09
LJ
4668 vcpu->arch.reserved_gpa_bits, execonly,
4669 huge_page_level);
81b8eebb
XG
4670}
4671
6f8e65a6
SC
4672static inline u64 reserved_hpa_bits(void)
4673{
4674 return rsvd_bits(shadow_phys_bits, 63);
4675}
4676
c258b62b
XG
4677/*
4678 * The page table on the host is the shadow page table for the page
4679 * table in the guest or an AMD nested guest; its MMU features
4680 * completely follow the features in the guest.
4681 */
16be1d12
SC
4682static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4683 struct kvm_mmu *context)
c258b62b 4684{
8c985b2d
SC
4685 /* @amd adds a check on bit of SPTEs, which KVM shouldn't use anyways. */
4686 bool is_amd = true;
4687 /* KVM doesn't use 2-level page tables for the shadow MMU. */
4688 bool is_pse = false;
ea2800dd
BS
4689 struct rsvd_bits_validate *shadow_zero_check;
4690 int i;
5f0b8199 4691
a972e29c 4692 WARN_ON_ONCE(context->root_role.level < PT32E_ROOT_LEVEL);
8c985b2d 4693
ea2800dd 4694 shadow_zero_check = &context->shadow_zero_check;
b705a277 4695 __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
a972e29c 4696 context->root_role.level,
7a458f0e 4697 context->root_role.efer_nx,
27de9250 4698 guest_can_use_gbpages(vcpu), is_pse, is_amd);
ea2800dd
BS
4699
4700 if (!shadow_me_mask)
4701 return;
4702
a972e29c 4703 for (i = context->root_role.level; --i >= 0;) {
e54f1ff2
KH
4704 /*
4705 * So far shadow_me_value is a constant during KVM's life
4706 * time. Bits in shadow_me_value are allowed to be set.
4707 * Bits in shadow_me_mask but not in shadow_me_value are
4708 * not allowed to be set.
4709 */
4710 shadow_zero_check->rsvd_bits_mask[0][i] |= shadow_me_mask;
4711 shadow_zero_check->rsvd_bits_mask[1][i] |= shadow_me_mask;
4712 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_value;
4713 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_value;
ea2800dd
BS
4714 }
4715
c258b62b 4716}
c258b62b 4717
6fec2144
PB
4718static inline bool boot_cpu_is_amd(void)
4719{
4720 WARN_ON_ONCE(!tdp_enabled);
4721 return shadow_x_mask == 0;
4722}
4723
c258b62b
XG
4724/*
4725 * The direct page table on the host uses as many MMU features as
4726 * possible; however, KVM currently does not do execution-protection.
4727 */
4728static void
e8f6e738 4729reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
c258b62b 4730{
ea2800dd
BS
4731 struct rsvd_bits_validate *shadow_zero_check;
4732 int i;
4733
4734 shadow_zero_check = &context->shadow_zero_check;
4735
6fec2144 4736 if (boot_cpu_is_amd())
b705a277 4737 __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
6c6ab524 4738 context->root_role.level, true,
b8291adc 4739 boot_cpu_has(X86_FEATURE_GBPAGES),
8c985b2d 4740 false, true);
c258b62b 4741 else
ea2800dd 4742 __reset_rsvds_bits_mask_ept(shadow_zero_check,
84ea5c09
LJ
4743 reserved_hpa_bits(), false,
4744 max_huge_page_level);
c258b62b 4745
ea2800dd
BS
4746 if (!shadow_me_mask)
4747 return;
4748
a972e29c 4749 for (i = context->root_role.level; --i >= 0;) {
ea2800dd
BS
4750 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4751 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4752 }
c258b62b
XG
4753}
4754
4755/*
4756 * As the comments in reset_shadow_zero_bits_mask(), except this
4757 * is the shadow page table for an Intel nested guest.
4758 */
4759static void
e8f6e738 4760reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly)
c258b62b
XG
4761{
4762 __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
84ea5c09
LJ
4763 reserved_hpa_bits(), execonly,
4764 max_huge_page_level);
c258b62b
XG
4765}
4766
09f037aa
PB
4767#define BYTE_MASK(access) \
4768 ((1 & (access) ? 2 : 0) | \
4769 (2 & (access) ? 4 : 0) | \
4770 (3 & (access) ? 8 : 0) | \
4771 (4 & (access) ? 16 : 0) | \
4772 (5 & (access) ? 32 : 0) | \
4773 (6 & (access) ? 64 : 0) | \
4774 (7 & (access) ? 128 : 0))
4775
4776
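/*
 * Precompute, for every combination of page fault error code bits and page
 * U/W/X permissions, whether the access faults, so that permission_fault()
 * can use a table lookup instead of re-deriving the SMEP/SMAP/NX/WP rules.
 */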
c596f147 4777static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
97d64b78 4778{
09f037aa
PB
4779 unsigned byte;
4780
4781 const u8 x = BYTE_MASK(ACC_EXEC_MASK);
4782 const u8 w = BYTE_MASK(ACC_WRITE_MASK);
4783 const u8 u = BYTE_MASK(ACC_USER_MASK);
4784
c596f147
SC
4785 bool cr4_smep = is_cr4_smep(mmu);
4786 bool cr4_smap = is_cr4_smap(mmu);
4787 bool cr0_wp = is_cr0_wp(mmu);
90599c28 4788 bool efer_nx = is_efer_nx(mmu);
97d64b78 4789
97d64b78 4790 for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
09f037aa
PB
4791 unsigned pfec = byte << 1;
4792
97ec8c06 4793 /*
09f037aa
PB
4794 * Each "*f" variable has a 1 bit for each UWX value
4795 * that causes a fault with the given PFEC.
97ec8c06 4796 */
97d64b78 4797
09f037aa 4798 /* Faults from writes to non-writable pages */
a6a6d3b1 4799 u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0;
09f037aa 4800 /* Faults from user mode accesses to supervisor pages */
a6a6d3b1 4801 u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0;
09f037aa 4802 /* Faults from fetches of non-executable pages */
a6a6d3b1 4803 u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0;
09f037aa
PB
4804 /* Faults from kernel mode fetches of user pages */
4805 u8 smepf = 0;
4806 /* Faults from kernel mode accesses of user pages */
4807 u8 smapf = 0;
4808
4809 if (!ept) {
4810 /* Faults from kernel mode accesses to user pages */
4811 u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
4812
4813 /* Not really needed: !nx will cause pte.nx to fault */
90599c28 4814 if (!efer_nx)
09f037aa
PB
4815 ff = 0;
4816
4817 /* Allow supervisor writes if !cr0.wp */
4818 if (!cr0_wp)
4819 wf = (pfec & PFERR_USER_MASK) ? wf : 0;
4820
4821 /* Disallow supervisor fetches of user code if cr4.smep */
4822 if (cr4_smep)
4823 smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
4824
4825 /*
4826 * SMAP: kernel-mode data accesses from user-mode
4827 * mappings should fault. A fault is considered
4828 * a SMAP violation if all of the following
39337ad1 4829 * conditions are true:
09f037aa
PB
4830 * - X86_CR4_SMAP is set in CR4
4831 * - A user page is accessed
4832 * - The access is not a fetch
4f4aa80e
LJ
4833 * - The access is supervisor mode
4834 * - If implicit supervisor access or X86_EFLAGS_AC is clear
09f037aa 4835 *
94b4a2f1
LJ
4836 * Here, we cover the first four conditions.
4837 * The fifth is computed dynamically in permission_fault();
09f037aa
PB
4838 * PFERR_RSVD_MASK bit will be set in PFEC if the access is
4839 * *not* subject to SMAP restrictions.
4840 */
4841 if (cr4_smap)
4842 smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
97d64b78 4843 }
09f037aa
PB
4844
4845 mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
97d64b78
AK
4846 }
4847}
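/*
 * Rough usage sketch (permission_fault() is the authoritative consumer):
 * a fault's PFEC, shifted right by one to drop the Present bit, selects a
 * byte from mmu->permissions[], and the bit matching the page's UWX
 * protections is tested; a set bit means the access faults under the
 * current CR0/CR4/EFER configuration.
 */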
4848
2d344105
HH
4849/*
4850* PKU is an additional mechanism by which paging controls access to
4851* user-mode addresses based on the value in the PKRU register. Protection
4852* key violations are reported through a bit in the page fault error code.
4853* Unlike other bits of the error code, the PK bit is not known at the
4854* call site of e.g. gva_to_gpa; it must be computed directly in
4855* permission_fault based on two bits of PKRU, on some machine state (CR4,
4856* CR0, EFER, CPL), and on other bits of the error code and the page tables.
4857*
4858* In particular the following conditions come from the error code, the
4859* page tables and the machine state:
4860* - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
4861* - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
4862* - PK is always zero if U=0 in the page tables
4863* - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
4864*
4865* The PKRU bitmask caches the result of these four conditions. The error
4866* code (minus the P bit) and the page table's U bit form an index into the
4867* PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed
4868* with the two bits of the PKRU register corresponding to the protection key.
4869* For the first three conditions above the bits will be 00, thus masking
4870* away both AD and WD. For all reads, or if the last condition holds,
4871* only WD will be masked away.
4872*/
2e4c0661 4873static void update_pkru_bitmask(struct kvm_mmu *mmu)
2d344105
HH
4874{
4875 unsigned bit;
4876 bool wp;
4877
a3ca5281
CQ
4878 mmu->pkru_mask = 0;
4879
4880 if (!is_cr4_pke(mmu))
2d344105 4881 return;
2d344105 4882
2e4c0661 4883 wp = is_cr0_wp(mmu);
2d344105
HH
4884
4885 for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
4886 unsigned pfec, pkey_bits;
4887 bool check_pkey, check_write, ff, uf, wf, pte_user;
4888
4889 pfec = bit << 1;
4890 ff = pfec & PFERR_FETCH_MASK;
4891 uf = pfec & PFERR_USER_MASK;
4892 wf = pfec & PFERR_WRITE_MASK;
4893
4894 /* PFEC.RSVD is replaced by ACC_USER_MASK. */
4895 pte_user = pfec & PFERR_RSVD_MASK;
4896
4897 /*
4898 * Only need to check the access which is not an
4899 * instruction fetch and is to a user page.
4900 */
4901 check_pkey = (!ff && pte_user);
4902 /*
4903 * write access is controlled by PKRU if it is a
4904 * user access or CR0.WP = 1.
4905 */
4906 check_write = check_pkey && wf && (uf || wp);
4907
4908 /* PKRU.AD stops both read and write access. */
4909 pkey_bits = !!check_pkey;
4910 /* PKRU.WD stops write access. */
4911 pkey_bits |= (!!check_write) << 1;
4912
4913 mmu->pkru_mask |= (pkey_bits & 3) << pfec;
4914 }
4915}
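/*
 * Example: with CR4.PKE=1 and CR0.WP=1, the index for a supervisor write
 * to a user page (W set, U clear, repurposed RSVD bit set) gets
 * pkey_bits == 3, so either PKRU.AD or PKRU.WD of the page's protection
 * key can deny the access; the corresponding read (W clear) gets
 * pkey_bits == 1, so only PKRU.AD applies.
 */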
4916
533f9a4b
SC
4917static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu,
4918 struct kvm_mmu *mmu)
6fd01b71 4919{
533f9a4b
SC
4920 if (!is_cr0_pg(mmu))
4921 return;
6bb69c9b 4922
c919e881 4923 reset_guest_rsvds_bits_mask(vcpu, mmu);
533f9a4b
SC
4924 update_permission_bitmask(mmu, false);
4925 update_pkru_bitmask(mmu);
6fd01b71
AK
4926}
4927
fe660f72 4928static void paging64_init_context(struct kvm_mmu *context)
6aa8b732 4929{
6aa8b732 4930 context->page_fault = paging64_page_fault;
6aa8b732 4931 context->gva_to_gpa = paging64_gva_to_gpa;
e8bc217a 4932 context->sync_page = paging64_sync_page;
a7052897 4933 context->invlpg = paging64_invlpg;
6aa8b732
AK
4934}
4935
84a16226 4936static void paging32_init_context(struct kvm_mmu *context)
6aa8b732 4937{
6aa8b732 4938 context->page_fault = paging32_page_fault;
6aa8b732 4939 context->gva_to_gpa = paging32_gva_to_gpa;
e8bc217a 4940 context->sync_page = paging32_sync_page;
a7052897 4941 context->invlpg = paging32_invlpg;
6aa8b732
AK
4942}
4943
7a7ae829 4944static union kvm_cpu_role
e5ed0fb0
PB
4945kvm_calc_cpu_role(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)
4946{
7a7ae829 4947 union kvm_cpu_role role = {0};
e5ed0fb0
PB
4948
4949 role.base.access = ACC_ALL;
4950 role.base.smm = is_smm(vcpu);
4951 role.base.guest_mode = is_guest_mode(vcpu);
4952 role.ext.valid = 1;
4953
4954 if (!____is_cr0_pg(regs)) {
4955 role.base.direct = 1;
4956 return role;
4957 }
4958
4959 role.base.efer_nx = ____is_efer_nx(regs);
4960 role.base.cr0_wp = ____is_cr0_wp(regs);
4961 role.base.smep_andnot_wp = ____is_cr4_smep(regs) && !____is_cr0_wp(regs);
4962 role.base.smap_andnot_wp = ____is_cr4_smap(regs) && !____is_cr0_wp(regs);
4963 role.base.has_4_byte_gpte = !____is_cr4_pae(regs);
60f3cb60
PB
4964
4965 if (____is_efer_lma(regs))
4966 role.base.level = ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL
4967 : PT64_ROOT_4LEVEL;
4968 else if (____is_cr4_pae(regs))
4969 role.base.level = PT32E_ROOT_LEVEL;
4970 else
4971 role.base.level = PT32_ROOT_LEVEL;
e5ed0fb0 4972
e5ed0fb0
PB
4973 role.ext.cr4_smep = ____is_cr4_smep(regs);
4974 role.ext.cr4_smap = ____is_cr4_smap(regs);
4975 role.ext.cr4_pse = ____is_cr4_pse(regs);
4976
4977 /* PKEY and LA57 are active iff long mode is active. */
4978 role.ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
4979 role.ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
4980 role.ext.efer_lma = ____is_efer_lma(regs);
4981 return role;
4982}
4983
d468d94b
SC
4984static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
4985{
746700d2
WH
4986 /* tdp_root_level is architecture forced level, use it if nonzero */
4987 if (tdp_root_level)
4988 return tdp_root_level;
4989
d468d94b 4990 /* Use 5-level TDP if and only if it's useful/necessary. */
83013059 4991 if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
d468d94b
SC
4992 return 4;
4993
83013059 4994 return max_tdp_level;
d468d94b
SC
4995}
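/*
 * Example: on hardware with max_tdp_level == 5, a vCPU whose CPUID reports
 * MAXPHYADDR <= 48 still gets a 4-level TDP tree, as four levels already
 * cover its guest-physical address space; a nonzero tdp_root_level
 * bypasses this heuristic entirely.
 */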
4996
7a458f0e 4997static union kvm_mmu_page_role
8626c120 4998kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
7a7ae829 4999 union kvm_cpu_role cpu_role)
9fa72119 5000{
7a458f0e 5001 union kvm_mmu_page_role role = {0};
9fa72119 5002
7a458f0e
PB
5003 role.access = ACC_ALL;
5004 role.cr0_wp = true;
5005 role.efer_nx = true;
5006 role.smm = cpu_role.base.smm;
5007 role.guest_mode = cpu_role.base.guest_mode;
54275f74 5008 role.ad_disabled = !kvm_ad_enabled();
7a458f0e
PB
5009 role.level = kvm_mmu_get_tdp_level(vcpu);
5010 role.direct = true;
5011 role.has_4_byte_gpte = false;
9fa72119
JS
5012
5013 return role;
5014}
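/*
 * Note: cr0_wp and efer_nx are set unconditionally above because TDP page
 * tables translate guest-physical addresses and never consume the guest's
 * CR0.WP or EFER.NX; keeping them fixed avoids spurious root role changes
 * (and root reloads) when the guest toggles those bits.
 */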
5015
39e7e2bf 5016static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
a7f1de9b 5017 union kvm_cpu_role cpu_role)
fb72d167 5018{
8c008659 5019 struct kvm_mmu *context = &vcpu->arch.root_mmu;
7a458f0e 5020 union kvm_mmu_page_role root_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_role);
fb72d167 5021
e5ed0fb0 5022 if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
7a458f0e 5023 root_role.word == context->root_role.word)
7dcd5755
VK
5024 return;
5025
e5ed0fb0 5026 context->cpu_role.as_u64 = cpu_role.as_u64;
7a458f0e 5027 context->root_role.word = root_role.word;
7a02674d 5028 context->page_fault = kvm_tdp_page_fault;
e8bc217a 5029 context->sync_page = nonpaging_sync_page;
5efac074 5030 context->invlpg = NULL;
d8dd54e0 5031 context->get_guest_pgd = get_cr3;
e4e517b4 5032 context->get_pdptr = kvm_pdptr_read;
cb659db8 5033 context->inject_page_fault = kvm_inject_page_fault;
fb72d167 5034
36f26787 5035 if (!is_cr0_pg(context))
fb72d167 5036 context->gva_to_gpa = nonpaging_gva_to_gpa;
36f26787 5037 else if (is_cr4_pae(context))
4d6931c3 5038 context->gva_to_gpa = paging64_gva_to_gpa;
f4bd6f73 5039 else
4d6931c3 5040 context->gva_to_gpa = paging32_gva_to_gpa;
fb72d167 5041
533f9a4b 5042 reset_guest_paging_metadata(vcpu, context);
e8f6e738 5043 reset_tdp_shadow_zero_bits_mask(context);
fb72d167
JR
5044}
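/*
 * If neither the cpu_role nor the root_role changed, the early return
 * above leaves the context untouched, so the permission bitmaps,
 * reserved-bit masks and any cached roots keyed on those roles remain
 * valid without being recomputed.
 */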
5045
8c008659 5046static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
7a7ae829 5047 union kvm_cpu_role cpu_role,
7a458f0e 5048 union kvm_mmu_page_role root_role)
9fa72119 5049{
e5ed0fb0 5050 if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
7a458f0e 5051 root_role.word == context->root_role.word)
18db1b17 5052 return;
a770f6f2 5053
e5ed0fb0 5054 context->cpu_role.as_u64 = cpu_role.as_u64;
7a458f0e 5055 context->root_role.word = root_role.word;
18db1b17 5056
36f26787 5057 if (!is_cr0_pg(context))
84a16226 5058 nonpaging_init_context(context);
36f26787 5059 else if (is_cr4_pae(context))
fe660f72 5060 paging64_init_context(context);
6aa8b732 5061 else
84a16226 5062 paging32_init_context(context);
a770f6f2 5063
533f9a4b 5064 reset_guest_paging_metadata(vcpu, context);
c258b62b 5065 reset_shadow_zero_bits_mask(vcpu, context);
52fde8df 5066}
0f04a2ac 5067
594e91a1 5068static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
a7f1de9b 5069 union kvm_cpu_role cpu_role)
0f04a2ac 5070{
8c008659 5071 struct kvm_mmu *context = &vcpu->arch.root_mmu;
56b321f9 5072 union kvm_mmu_page_role root_role;
0f04a2ac 5073
56b321f9 5074 root_role = cpu_role.base;
0f04a2ac 5075
56b321f9
PB
5076 /* KVM uses PAE paging whenever the guest isn't using 64-bit paging. */
5077 root_role.level = max_t(u32, root_role.level, PT32E_ROOT_LEVEL);
59505b55 5078
56b321f9
PB
5079 /*
5080 * KVM forces EFER.NX=1 when TDP is disabled, reflect it in the MMU role.
5081 * KVM uses NX when TDP is disabled to handle a variety of scenarios,
5082 * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
5083 * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
5084 * The iTLB multi-hit workaround can be toggled at any time, so assume
5085 * NX can be used by any non-nested shadow MMU to avoid having to reset
5086 * MMU contexts.
5087 */
5088 root_role.efer_nx = true;
5089
5090 shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
59505b55
SC
5091}
5092
dbc4739b
SC
5093void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
5094 unsigned long cr4, u64 efer, gpa_t nested_cr3)
0f04a2ac 5095{
8c008659 5096 struct kvm_mmu *context = &vcpu->arch.guest_mmu;
594e91a1
SC
5097 struct kvm_mmu_role_regs regs = {
5098 .cr0 = cr0,
28f091bc 5099 .cr4 = cr4 & ~X86_CR4_PKE,
594e91a1
SC
5100 .efer = efer,
5101 };
7a7ae829 5102 union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
56b321f9
PB
5103 union kvm_mmu_page_role root_role;
5104
5105 /* NPT requires CR0.PG=1. */
5106 WARN_ON_ONCE(cpu_role.base.direct);
5107
5108 root_role = cpu_role.base;
5109 root_role.level = kvm_mmu_get_tdp_level(vcpu);
84e5ffd0
LJ
5110 if (root_role.level == PT64_ROOT_5LEVEL &&
5111 cpu_role.base.level == PT64_ROOT_4LEVEL)
5112 root_role.passthrough = 1;
a506fdd2 5113
7a458f0e 5114 shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
d2e5f333 5115 kvm_mmu_new_pgd(vcpu, nested_cr3);
0f04a2ac
VK
5116}
5117EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
52fde8df 5118
7a7ae829 5119static union kvm_cpu_role
a336282d 5120kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
bb1fcc70 5121 bool execonly, u8 level)
9fa72119 5122{
7a7ae829 5123 union kvm_cpu_role role = {0};
14c07ad8 5124
daed87b8
PB
5125 /*
5126 * KVM does not support SMM transfer monitors, and consequently does not
5127 * support the "entry to SMM" control either. role.base.smm is always 0.
5128 */
5129 WARN_ON_ONCE(is_smm(vcpu));
bb1fcc70 5130 role.base.level = level;
bb3b394d 5131 role.base.has_4_byte_gpte = false;
a336282d
VK
5132 role.base.direct = false;
5133 role.base.ad_disabled = !accessed_dirty;
5134 role.base.guest_mode = true;
5135 role.base.access = ACC_ALL;
9fa72119 5136
cd6767c3 5137 role.ext.word = 0;
a336282d 5138 role.ext.execonly = execonly;
cd6767c3 5139 role.ext.valid = 1;
9fa72119
JS
5140
5141 return role;
5142}
5143
ae1e2d10 5144void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
cc022ae1
LJ
5145 int huge_page_level, bool accessed_dirty,
5146 gpa_t new_eptp)
155a97a3 5147{
8c008659 5148 struct kvm_mmu *context = &vcpu->arch.guest_mmu;
bb1fcc70 5149 u8 level = vmx_eptp_page_walk_level(new_eptp);
7a7ae829 5150 union kvm_cpu_role new_mode =
a336282d 5151 kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
bb1fcc70 5152 execonly, level);
a336282d 5153
e5ed0fb0
PB
5154 if (new_mode.as_u64 != context->cpu_role.as_u64) {
5155 /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */
5156 context->cpu_role.as_u64 = new_mode.as_u64;
7a458f0e 5157 context->root_role.word = new_mode.base.word;
3cffc89d 5158
3cffc89d
PB
5159 context->page_fault = ept_page_fault;
5160 context->gva_to_gpa = ept_gva_to_gpa;
5161 context->sync_page = ept_sync_page;
5162 context->invlpg = ept_invlpg;
347a0d0d 5163
3cffc89d
PB
5164 update_permission_bitmask(context, true);
5165 context->pkru_mask = 0;
5166 reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
5167 reset_ept_shadow_zero_bits_mask(context, execonly);
5168 }
3dc773e7 5169
d2e5f333 5170 kvm_mmu_new_pgd(vcpu, new_eptp);
155a97a3
NHE
5171}
5172EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
5173
39e7e2bf 5174static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
a7f1de9b 5175 union kvm_cpu_role cpu_role)
52fde8df 5176{
8c008659 5177 struct kvm_mmu *context = &vcpu->arch.root_mmu;
ad896af0 5178
a7f1de9b 5179 kvm_init_shadow_mmu(vcpu, cpu_role);
929d1cfa 5180
d8dd54e0 5181 context->get_guest_pgd = get_cr3;
ad896af0
PB
5182 context->get_pdptr = kvm_pdptr_read;
5183 context->inject_page_fault = kvm_inject_page_fault;
6aa8b732
AK
5184}
5185
39e7e2bf 5186static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu,
a7f1de9b 5187 union kvm_cpu_role new_mode)
02f59dc9
JR
5188{
5189 struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
5190
e5ed0fb0 5191 if (new_mode.as_u64 == g_context->cpu_role.as_u64)
bf627a92
VK
5192 return;
5193
e5ed0fb0 5194 g_context->cpu_role.as_u64 = new_mode.as_u64;
d8dd54e0 5195 g_context->get_guest_pgd = get_cr3;
e4e517b4 5196 g_context->get_pdptr = kvm_pdptr_read;
02f59dc9
JR
5197 g_context->inject_page_fault = kvm_inject_page_fault;
5198
5efac074
PB
5199 /*
5200 * L2 page tables are never shadowed, so there is no need to sync
5201 * SPTEs.
5202 */
5203 g_context->invlpg = NULL;
5204
02f59dc9 5205 /*
44dd3ffa 5206 * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
0af2593b
DM
5207 * L1's nested page tables (e.g. EPT12). The nested translation
5208 * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
5209 * L2's page tables as the first level of translation and L1's
5210 * nested page tables as the second level of translation. Basically
5211 * the gva_to_gpa functions between mmu and nested_mmu are swapped.
02f59dc9 5212 */
fa4b5588 5213 if (!is_paging(vcpu))
1f5a21ee 5214 g_context->gva_to_gpa = nonpaging_gva_to_gpa;
fa4b5588 5215 else if (is_long_mode(vcpu))
1f5a21ee 5216 g_context->gva_to_gpa = paging64_gva_to_gpa;
fa4b5588 5217 else if (is_pae(vcpu))
1f5a21ee 5218 g_context->gva_to_gpa = paging64_gva_to_gpa;
fa4b5588 5219 else
1f5a21ee 5220 g_context->gva_to_gpa = paging32_gva_to_gpa;
02f59dc9 5221
533f9a4b 5222 reset_guest_paging_metadata(vcpu, g_context);
02f59dc9
JR
5223}
5224
c9060662 5225void kvm_init_mmu(struct kvm_vcpu *vcpu)
fb72d167 5226{
39e7e2bf 5227 struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
a7f1de9b 5228 union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
39e7e2bf 5229
02f59dc9 5230 if (mmu_is_nested(vcpu))
a7f1de9b 5231 init_kvm_nested_mmu(vcpu, cpu_role);
02f59dc9 5232 else if (tdp_enabled)
a7f1de9b 5233 init_kvm_tdp_mmu(vcpu, cpu_role);
fb72d167 5234 else
a7f1de9b 5235 init_kvm_softmmu(vcpu, cpu_role);
fb72d167 5236}
1c53da3f 5237EXPORT_SYMBOL_GPL(kvm_init_mmu);
fb72d167 5238
49c6f875
SC
5239void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
5240{
5241 /*
5242 * Invalidate all MMU roles to force them to reinitialize as CPUID
5243 * information is factored into reserved bit calculations.
feb627e8
VK
5244 *
5245 * Correctly handling multiple vCPU models (with respect to paging and
5246 * physical address properties) in a single VM would require tracking
5247 * all relevant CPUID information in kvm_mmu_page_role. That is very
5248 * undesirable as it would increase the memory requirements for
5249 * gfn_track (see struct kvm_mmu_page_role comments). For now that
5250 * problem is swept under the rug; KVM's CPUID API is horrific and
5251 * it's all but impossible to solve it without introducing a new API.
49c6f875 5252 */
7a458f0e
PB
5253 vcpu->arch.root_mmu.root_role.word = 0;
5254 vcpu->arch.guest_mmu.root_role.word = 0;
5255 vcpu->arch.nested_mmu.root_role.word = 0;
e5ed0fb0
PB
5256 vcpu->arch.root_mmu.cpu_role.ext.valid = 0;
5257 vcpu->arch.guest_mmu.cpu_role.ext.valid = 0;
5258 vcpu->arch.nested_mmu.cpu_role.ext.valid = 0;
49c6f875 5259 kvm_mmu_reset_context(vcpu);
63f5a190
SC
5260
5261 /*
feb627e8
VK
5262 * Changing guest CPUID after KVM_RUN is forbidden, see the comment in
5263 * kvm_arch_vcpu_ioctl().
63f5a190 5264 */
feb627e8 5265 KVM_BUG_ON(vcpu->arch.last_vmentry_cpu != -1, vcpu->kvm);
49c6f875
SC
5266}
5267
8a3c1a33 5268void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
6aa8b732 5269{
95f93af4 5270 kvm_mmu_unload(vcpu);
c9060662 5271 kvm_init_mmu(vcpu);
17c3ba9d 5272}
8668a3c4 5273EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
17c3ba9d
AK
5274
5275int kvm_mmu_load(struct kvm_vcpu *vcpu)
6aa8b732 5276{
714b93da
AK
5277 int r;
5278
347a0d0d 5279 r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->root_role.direct);
17c3ba9d
AK
5280 if (r)
5281 goto out;
748e52b9 5282 r = mmu_alloc_special_roots(vcpu);
17c3ba9d
AK
5283 if (r)
5284 goto out;
347a0d0d 5285 if (vcpu->arch.mmu->root_role.direct)
6e6ec584
SC
5286 r = mmu_alloc_direct_roots(vcpu);
5287 else
5288 r = mmu_alloc_shadow_roots(vcpu);
8986ecc0
MT
5289 if (r)
5290 goto out;
a91f387b
SC
5291
5292 kvm_mmu_sync_roots(vcpu);
5293
727a7e27 5294 kvm_mmu_load_pgd(vcpu);
db01416b
SC
5295
5296 /*
5297 * Flush any TLB entries for the new root, the provenance of the root
5298 * is unknown. Even if KVM ensures there are no stale TLB entries
5299 * for a freed root, in theory another hypervisor could have left
5300 * stale entries. Flushing on alloc also allows KVM to skip the TLB
5301 * flush when freeing a root (see kvm_tdp_mmu_put_root()).
5302 */
e27bc044 5303 static_call(kvm_x86_flush_tlb_current)(vcpu);
714b93da
AK
5304out:
5305 return r;
6aa8b732 5306}
17c3ba9d
AK
5307
5308void kvm_mmu_unload(struct kvm_vcpu *vcpu)
5309{
0c1c92f1
PB
5310 struct kvm *kvm = vcpu->kvm;
5311
5312 kvm_mmu_free_roots(kvm, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
b9e5603c 5313 WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root.hpa));
0c1c92f1 5314 kvm_mmu_free_roots(kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
b9e5603c 5315 WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa));
6d58f275 5316 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
17c3ba9d 5317}
6aa8b732 5318
527d5cd7
SC
5319static bool is_obsolete_root(struct kvm *kvm, hpa_t root_hpa)
5320{
5321 struct kvm_mmu_page *sp;
5322
5323 if (!VALID_PAGE(root_hpa))
5324 return false;
5325
5326 /*
5327 * When freeing obsolete roots, treat roots as obsolete if they don't
5328 * have an associated shadow page. This does mean KVM will get false
5329 * positives and free roots that don't strictly need to be freed, but
5330 * such false positives are relatively rare:
5331 *
5332 * (a) only PAE paging and nested NPT have roots without shadow pages
5333 * (b) remote reloads due to a memslot update obsoletes _all_ roots
5334 * (c) KVM doesn't track previous roots for PAE paging, and the guest
5335 * is unlikely to zap an in-use PGD.
5336 */
5337 sp = to_shadow_page(root_hpa);
5338 return !sp || is_obsolete_sp(kvm, sp);
5339}
5340
5341static void __kvm_mmu_free_obsolete_roots(struct kvm *kvm, struct kvm_mmu *mmu)
5342{
5343 unsigned long roots_to_free = 0;
5344 int i;
5345
5346 if (is_obsolete_root(kvm, mmu->root.hpa))
5347 roots_to_free |= KVM_MMU_ROOT_CURRENT;
5348
5349 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
cf4a8693 5350 if (is_obsolete_root(kvm, mmu->prev_roots[i].hpa))
527d5cd7
SC
5351 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
5352 }
5353
5354 if (roots_to_free)
5355 kvm_mmu_free_roots(kvm, mmu, roots_to_free);
5356}
5357
5358void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu)
5359{
5360 __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu);
5361 __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu);
5362}
5363
79539cec
AK
5364static bool need_remote_flush(u64 old, u64 new)
5365{
5366 if (!is_shadow_present_pte(old))
5367 return false;
5368 if (!is_shadow_present_pte(new))
5369 return true;
2ca3129e 5370 if ((old ^ new) & SPTE_BASE_ADDR_MASK)
79539cec 5371 return true;
53166229
GN
5372 old ^= shadow_nx_mask;
5373 new ^= shadow_nx_mask;
2ca3129e 5374 return (old & ~new & SPTE_PERM_MASK) != 0;
79539cec
AK
5375}
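/*
 * Note: the XOR with shadow_nx_mask inverts NX so that it behaves like the
 * other permission bits (set == permission granted); the final test then
 * flags exactly the cases where a permission present in the old SPTE is
 * missing from the new one, i.e. when remote TLBs may still hold a stale,
 * more permissive translation.
 */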
5376
889e5cbc 5377static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
0e0fee5c 5378 int *bytes)
da4a00f0 5379{
0e0fee5c 5380 u64 gentry = 0;
889e5cbc 5381 int r;
72016f3a 5382
72016f3a
AK
5383 /*
5384 * Assume that the pte write on a page table of the same type
49b26e26
XG
5385 * as the current vcpu paging mode since we update the sptes only
5386 * when they have the same mode.
72016f3a 5387 */
889e5cbc 5388 if (is_pae(vcpu) && *bytes == 4) {
72016f3a 5389 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
889e5cbc
XG
5390 *gpa &= ~(gpa_t)7;
5391 *bytes = 8;
08e850c6
AK
5392 }
5393
0e0fee5c
JS
5394 if (*bytes == 4 || *bytes == 8) {
5395 r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
5396 if (r)
5397 gentry = 0;
72016f3a
AK
5398 }
5399
889e5cbc
XG
5400 return gentry;
5401}
5402
5403/*
5404 * If we're seeing too many writes to a page, it may no longer be a page table,
5405 * or we may be forking, in which case it is better to unmap the page.
5406 */
a138fe75 5407static bool detect_write_flooding(struct kvm_mmu_page *sp)
889e5cbc 5408{
a30f47cb
XG
5409 /*
5410 * Skip write-flooding detected for the sp whose level is 1, because
5411 * it can become unsync, then the guest page is not write-protected.
5412 */
3bae0459 5413 if (sp->role.level == PG_LEVEL_4K)
a30f47cb 5414 return false;
3246af0e 5415
e5691a81
XG
5416 atomic_inc(&sp->write_flooding_count);
5417 return atomic_read(&sp->write_flooding_count) >= 3;
889e5cbc
XG
5418}
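/*
 * In practice: three or more emulated writes to the same upper-level
 * shadow page, without the page being used for a translation in between
 * (which resets write_flooding_count elsewhere), cause the caller to zap
 * the page rather than keep fixing up its sptes.
 */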
5419
5420/*
5421 * Misaligned accesses are too much trouble to fix up; also, they usually
5422 * indicate a page is not used as a page table.
5423 */
5424static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
5425 int bytes)
5426{
5427 unsigned offset, pte_size, misaligned;
5428
5429 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
5430 gpa, bytes, sp->role.word);
5431
5432 offset = offset_in_page(gpa);
bb3b394d 5433 pte_size = sp->role.has_4_byte_gpte ? 4 : 8;
5d9ca30e
XG
5434
5435 /*
5436 * Sometimes the guest OS writes only the last byte to update status
5437 * bits; for example, Linux's clear_bit() uses an andb instruction.
5438 */
5439 if (!(offset & (pte_size - 1)) && bytes == 1)
5440 return false;
5441
889e5cbc
XG
5442 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
5443 misaligned |= bytes < 4;
5444
5445 return misaligned;
5446}
5447
5448static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
5449{
5450 unsigned page_offset, quadrant;
5451 u64 *spte;
5452 int level;
5453
5454 page_offset = offset_in_page(gpa);
5455 level = sp->role.level;
5456 *nspte = 1;
bb3b394d 5457 if (sp->role.has_4_byte_gpte) {
889e5cbc
XG
5458 page_offset <<= 1; /* 32->64 */
5459 /*
5460 * A 32-bit pde maps 4MB while the shadow pdes map
5461 * only 2MB. So we need to double the offset again
5462 * and zap two pdes instead of one.
5463 */
5464 if (level == PT32_ROOT_LEVEL) {
5465 page_offset &= ~7; /* kill rounding error */
5466 page_offset <<= 1;
5467 *nspte = 2;
5468 }
5469 quadrant = page_offset >> PAGE_SHIFT;
5470 page_offset &= ~PAGE_MASK;
5471 if (quadrant != sp->role.quadrant)
5472 return NULL;
5473 }
5474
5475 spte = &sp->spt[page_offset / sizeof(*spte)];
5476 return spte;
5477}
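/*
 * Worked example (4-byte guest PTEs, non-root level): a write at offset
 * 0x804 within the guest page table scales to page_offset 0x1008, i.e.
 * quadrant 1, spte index 1. Only the shadow page whose role.quadrant is 1
 * returns an spte here; every other shadow of the same gfn returns NULL.
 */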
5478
13d268ca 5479static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
d126363d
JS
5480 const u8 *new, int bytes,
5481 struct kvm_page_track_notifier_node *node)
889e5cbc
XG
5482{
5483 gfn_t gfn = gpa >> PAGE_SHIFT;
889e5cbc 5484 struct kvm_mmu_page *sp;
889e5cbc
XG
5485 LIST_HEAD(invalid_list);
5486 u64 entry, gentry, *spte;
5487 int npte;
06152b2d 5488 bool flush = false;
889e5cbc
XG
5489
5490 /*
5491 * If we don't have indirect shadow pages, it means no page is
5492 * write-protected, so we can exit simply.
5493 */
6aa7de05 5494 if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
889e5cbc
XG
5495 return;
5496
889e5cbc
XG
5497 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
5498
531810ca 5499 write_lock(&vcpu->kvm->mmu_lock);
0e0fee5c
JS
5500
5501 gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
5502
889e5cbc 5503 ++vcpu->kvm->stat.mmu_pte_write;
889e5cbc 5504
767d8d8d 5505 for_each_gfn_valid_sp_with_gptes(vcpu->kvm, sp, gfn) {
a30f47cb 5506 if (detect_write_misaligned(sp, gpa, bytes) ||
a138fe75 5507 detect_write_flooding(sp)) {
b8c67b7a 5508 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
4cee5764 5509 ++vcpu->kvm->stat.mmu_flooded;
0e7bc4b9
AK
5510 continue;
5511 }
889e5cbc
XG
5512
5513 spte = get_written_sptes(sp, gpa, &npte);
5514 if (!spte)
5515 continue;
5516
ac1b714e 5517 while (npte--) {
79539cec 5518 entry = *spte;
2de4085c 5519 mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
c5e2184d
SC
5520 if (gentry && sp->role.level != PG_LEVEL_4K)
5521 ++vcpu->kvm->stat.mmu_pde_zapped;
9bb4f6b1 5522 if (need_remote_flush(entry, *spte))
06152b2d 5523 flush = true;
ac1b714e 5524 ++spte;
9b7a0325 5525 }
9b7a0325 5526 }
06152b2d 5527 kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
531810ca 5528 write_unlock(&vcpu->kvm->mmu_lock);
da4a00f0
AK
5529}
5530
1075d41e 5531int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
dc25e89e 5532 void *insn, int insn_len)
3067714c 5533{
92daa48b 5534 int r, emulation_type = EMULTYPE_PF;
347a0d0d 5535 bool direct = vcpu->arch.mmu->root_role.direct;
3067714c 5536
b9e5603c 5537 if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
ddce6208
SC
5538 return RET_PF_RETRY;
5539
9b8ebbdb 5540 r = RET_PF_INVALID;
e9ee956e 5541 if (unlikely(error_code & PFERR_RSVD_MASK)) {
736c291c 5542 r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
472faffa 5543 if (r == RET_PF_EMULATE)
e9ee956e 5544 goto emulate;
e9ee956e 5545 }
3067714c 5546
9b8ebbdb 5547 if (r == RET_PF_INVALID) {
7a02674d
SC
5548 r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
5549 lower_32_bits(error_code), false);
19025e7b 5550 if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
7b367bc9 5551 return -EIO;
9b8ebbdb
PB
5552 }
5553
3067714c 5554 if (r < 0)
e9ee956e 5555 return r;
83a2ba4c
SC
5556 if (r != RET_PF_EMULATE)
5557 return 1;
3067714c 5558
14727754
TL
5559 /*
5560 * Before emulating the instruction, check if the error code
5561 * was due to a RO violation while translating the guest page.
5562 * This can occur when using nested virtualization with nested
5563 * paging in both guests. If true, we simply unprotect the page
5564 * and resume the guest.
14727754 5565 */
347a0d0d 5566 if (vcpu->arch.mmu->root_role.direct &&
eebed243 5567 (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
736c291c 5568 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
14727754
TL
5569 return 1;
5570 }
5571
472faffa
SC
5572 /*
5573 * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
5574 * optimistically try to just unprotect the page and let the processor
5575 * re-execute the instruction that caused the page fault. Do not allow
5576 * retrying MMIO emulation, as it's not only pointless but could also
5577 * cause us to enter an infinite loop because the processor will keep
6c3dfeb6
SC
5578 * faulting on the non-existent MMIO address. Retrying an instruction
5579 * from a nested guest is also pointless and dangerous as we are only
5580 * explicitly shadowing L1's page tables, i.e. unprotecting something
5581 * for L1 isn't going to magically fix whatever issue caused L2 to fail.
472faffa 5582 */
736c291c 5583 if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
92daa48b 5584 emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
e9ee956e 5585emulate:
736c291c 5586 return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
60fc3d02 5587 insn_len);
3067714c
AK
5588}
5589EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
5590
5efac074
PB
5591void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
5592 gva_t gva, hpa_t root_hpa)
a7052897 5593{
b94742c9 5594 int i;
7eb77e9f 5595
5efac074
PB
5596 /* It's actually a GPA for vcpu->arch.guest_mmu. */
5597 if (mmu != &vcpu->arch.guest_mmu) {
5598 /* INVLPG on a non-canonical address is a NOP according to the SDM. */
5599 if (is_noncanonical_address(gva, vcpu))
5600 return;
5601
e27bc044 5602 static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
5efac074
PB
5603 }
5604
5605 if (!mmu->invlpg)
faff8758
JS
5606 return;
5607
5efac074 5608 if (root_hpa == INVALID_PAGE) {
b9e5603c 5609 mmu->invlpg(vcpu, gva, mmu->root.hpa);
956bf353 5610
5efac074
PB
5611 /*
5612 * INVLPG is required to invalidate any global mappings for the VA,
5613 * irrespective of PCID. Determining whether any of the prev_root
5614 * mappings of the VA is marked global would take roughly as much
5615 * work as just syncing it blindly, so we might as well always
5616 * sync it.
5617 *
5618 * Mappings not reachable via the current cr3 or the prev_roots will be
5619 * synced when switching to that cr3, so nothing needs to be done here
5620 * for them.
5621 */
5622 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5623 if (VALID_PAGE(mmu->prev_roots[i].hpa))
5624 mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5625 } else {
5626 mmu->invlpg(vcpu, gva, root_hpa);
5627 }
5628}
956bf353 5629
5efac074
PB
5630void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
5631{
05b29633 5632 kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE);
a7052897
MT
5633 ++vcpu->stat.invlpg;
5634}
5635EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
5636
5efac074 5637
eb4b248e
JS
5638void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
5639{
44dd3ffa 5640 struct kvm_mmu *mmu = vcpu->arch.mmu;
faff8758 5641 bool tlb_flush = false;
b94742c9 5642 uint i;
eb4b248e
JS
5643
5644 if (pcid == kvm_get_active_pcid(vcpu)) {
9f46c187
PB
5645 if (mmu->invlpg)
5646 mmu->invlpg(vcpu, gva, mmu->root.hpa);
faff8758 5647 tlb_flush = true;
eb4b248e
JS
5648 }
5649
b94742c9
JS
5650 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5651 if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
be01e8e2 5652 pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) {
9f46c187
PB
5653 if (mmu->invlpg)
5654 mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
b94742c9
JS
5655 tlb_flush = true;
5656 }
956bf353 5657 }
ade61e28 5658
faff8758 5659 if (tlb_flush)
e27bc044 5660 static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
faff8758 5661
eb4b248e
JS
5662 ++vcpu->stat.invlpg;
5663
5664 /*
b94742c9
JS
5665 * Mappings not reachable via the current cr3 or the prev_roots will be
5666 * synced when switching to that cr3, so nothing needs to be done here
5667 * for them.
eb4b248e
JS
5668 */
5669}
eb4b248e 5670
746700d2
WH
5671void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
5672 int tdp_max_root_level, int tdp_huge_page_level)
18552672 5673{
bde77235 5674 tdp_enabled = enable_tdp;
746700d2 5675 tdp_root_level = tdp_forced_root_level;
83013059 5676 max_tdp_level = tdp_max_root_level;
703c335d
SC
5677
5678 /*
1d92d2e8 5679 * max_huge_page_level reflects KVM's MMU capabilities irrespective
703c335d
SC
5680 * of kernel support, e.g. KVM may be capable of using 1GB pages when
5681 * the kernel is not. But, KVM never creates a page size greater than
5682 * what is used by the kernel for any given HVA, i.e. the kernel's
5683 * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust().
5684 */
5685 if (tdp_enabled)
1d92d2e8 5686 max_huge_page_level = tdp_huge_page_level;
703c335d 5687 else if (boot_cpu_has(X86_FEATURE_GBPAGES))
1d92d2e8 5688 max_huge_page_level = PG_LEVEL_1G;
703c335d 5689 else
1d92d2e8 5690 max_huge_page_level = PG_LEVEL_2M;
18552672 5691}
bde77235 5692EXPORT_SYMBOL_GPL(kvm_configure_mmu);
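/*
 * Example: with TDP enabled and tdp_huge_page_level == PG_LEVEL_1G the MMU
 * may create 1GB mappings; with TDP disabled the limit instead follows the
 * host CPU's GBPAGES support, per the logic above, and the final mapping
 * size is still capped by kvm_mmu_hugepage_adjust() at fault time.
 */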
85875a13
SC
5693
5694/* The return value indicates if tlb flush on all vcpus is needed. */
269e9552
HM
5695typedef bool (*slot_level_handler) (struct kvm *kvm,
5696 struct kvm_rmap_head *rmap_head,
5697 const struct kvm_memory_slot *slot);
85875a13
SC
5698
5699/* The caller should hold mmu-lock before calling this function. */
5700static __always_inline bool
269e9552 5701slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot,
85875a13 5702 slot_level_handler fn, int start_level, int end_level,
1a61b7db
SC
5703 gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield,
5704 bool flush)
85875a13
SC
5705{
5706 struct slot_rmap_walk_iterator iterator;
85875a13
SC
5707
5708 for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
5709 end_gfn, &iterator) {
5710 if (iterator.rmap)
0a234f5d 5711 flush |= fn(kvm, iterator.rmap, memslot);
85875a13 5712
531810ca 5713 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
302695a5 5714 if (flush && flush_on_yield) {
f285c633
BG
5715 kvm_flush_remote_tlbs_with_address(kvm,
5716 start_gfn,
5717 iterator.gfn - start_gfn + 1);
85875a13
SC
5718 flush = false;
5719 }
531810ca 5720 cond_resched_rwlock_write(&kvm->mmu_lock);
85875a13
SC
5721 }
5722 }
5723
85875a13
SC
5724 return flush;
5725}
5726
5727static __always_inline bool
269e9552 5728slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot,
85875a13 5729 slot_level_handler fn, int start_level, int end_level,
302695a5 5730 bool flush_on_yield)
85875a13
SC
5731{
5732 return slot_handle_level_range(kvm, memslot, fn, start_level,
5733 end_level, memslot->base_gfn,
5734 memslot->base_gfn + memslot->npages - 1,
1a61b7db 5735 flush_on_yield, false);
85875a13
SC
5736}
5737
85875a13 5738static __always_inline bool
610265ea
DM
5739slot_handle_level_4k(struct kvm *kvm, const struct kvm_memory_slot *memslot,
5740 slot_level_handler fn, bool flush_on_yield)
85875a13 5741{
3bae0459 5742 return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
302695a5 5743 PG_LEVEL_4K, flush_on_yield);
85875a13
SC
5744}
5745
1cfff4d9 5746static void free_mmu_pages(struct kvm_mmu *mmu)
6aa8b732 5747{
4a98623d
SC
5748 if (!tdp_enabled && mmu->pae_root)
5749 set_memory_encrypted((unsigned long)mmu->pae_root, 1);
1cfff4d9 5750 free_page((unsigned long)mmu->pae_root);
03ca4589 5751 free_page((unsigned long)mmu->pml4_root);
cb0f722a 5752 free_page((unsigned long)mmu->pml5_root);
6aa8b732
AK
5753}
5754
04d28e37 5755static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
6aa8b732 5756{
17ac10ad 5757 struct page *page;
6aa8b732
AK
5758 int i;
5759
b9e5603c
PB
5760 mmu->root.hpa = INVALID_PAGE;
5761 mmu->root.pgd = 0;
04d28e37
SC
5762 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5763 mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5764
27f4fca2
LJ
5765 /* vcpu->arch.guest_mmu isn't used when !tdp_enabled. */
5766 if (!tdp_enabled && mmu == &vcpu->arch.guest_mmu)
5767 return 0;
5768
17ac10ad 5769 /*
b6b80c78
SC
5770 * When using PAE paging, the four PDPTEs are treated as 'root' pages,
5771 * while the PDP table is a per-vCPU construct that's allocated at MMU
5772 * creation. When emulating 32-bit mode, cr3 is only 32 bits even on
5773 * x86_64. Therefore we need to allocate the PDP table in the first
04d45551
SC
5774 * 4GB of memory, which happens to fit the DMA32 zone. TDP paging
5775 * generally doesn't use PAE paging and can skip allocating the PDP
5776 * table. The main exception, handled here, is SVM's 32-bit NPT. The
5777 * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
84432316 5778 * KVM; that horror is handled on-demand by mmu_alloc_special_roots().
17ac10ad 5779 */
d468d94b 5780 if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
b6b80c78
SC
5781 return 0;
5782
254272ce 5783 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
17ac10ad 5784 if (!page)
d7fa6ab2
WY
5785 return -ENOMEM;
5786
1cfff4d9 5787 mmu->pae_root = page_address(page);
4a98623d
SC
5788
5789 /*
5790 * CR3 is only 32 bits when PAE paging is used, thus it's impossible to
5791 * get the CPU to treat the PDPTEs as encrypted. Decrypt the page so
5792 * that KVM's writes and the CPU's reads get along. Note, this is
5793 * only necessary when using shadow paging, as 64-bit NPT can get at
5794 * the C-bit even when shadowing 32-bit NPT, and SME isn't supported
5795 * by 32-bit kernels (when KVM itself uses 32-bit NPT).
5796 */
5797 if (!tdp_enabled)
5798 set_memory_decrypted((unsigned long)mmu->pae_root, 1);
5799 else
e54f1ff2 5800 WARN_ON_ONCE(shadow_me_value);
4a98623d 5801
17ac10ad 5802 for (i = 0; i < 4; ++i)
c834e5e4 5803 mmu->pae_root[i] = INVALID_PAE_ROOT;
17ac10ad 5804
6aa8b732 5805 return 0;
6aa8b732
AK
5806}
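/*
 * Layout note: when pae_root is allocated, its four entries start out as
 * INVALID_PAE_ROOT and later serve as the PDPTE-style roots for 32-bit and
 * PAE shadow paging (and 32-bit NPT); the __GFP_DMA32 allocation above is
 * what keeps the table below 4GB, as the 32-bit CR3 requires.
 */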
5807
8018c27b 5808int kvm_mmu_create(struct kvm_vcpu *vcpu)
6aa8b732 5809{
1cfff4d9 5810 int ret;
b94742c9 5811
5962bfb7 5812 vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache;
5f6078f9
SC
5813 vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO;
5814
5962bfb7 5815 vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
5f6078f9 5816 vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
5962bfb7 5817
96880883
SC
5818 vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
5819
44dd3ffa
VK
5820 vcpu->arch.mmu = &vcpu->arch.root_mmu;
5821 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
6aa8b732 5822
04d28e37 5823 ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu);
1cfff4d9
JP
5824 if (ret)
5825 return ret;
5826
04d28e37 5827 ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu);
1cfff4d9
JP
5828 if (ret)
5829 goto fail_allocate_root;
5830
5831 return ret;
5832 fail_allocate_root:
5833 free_mmu_pages(&vcpu->arch.guest_mmu);
5834 return ret;
6aa8b732
AK
5835}
5836
fbb158cb 5837#define BATCH_ZAP_PAGES 10
002c5f73
SC
5838static void kvm_zap_obsolete_pages(struct kvm *kvm)
5839{
5840 struct kvm_mmu_page *sp, *node;
fbb158cb 5841 int nr_zapped, batch = 0;
b28cb0cd 5842 bool unstable;
002c5f73
SC
5843
5844restart:
5845 list_for_each_entry_safe_reverse(sp, node,
5846 &kvm->arch.active_mmu_pages, link) {
5847 /*
5848 * No obsolete valid page exists before a newly created page
5849 * since active_mmu_pages is a FIFO list.
5850 */
5851 if (!is_obsolete_sp(kvm, sp))
5852 break;
5853
5854 /*
f95eec9b
SC
5855 * Invalid pages should never land back on the list of active
5856 * pages. Skip the bogus page, otherwise we'll get stuck in an
5857 * infinite loop if the page gets put back on the list (again).
002c5f73 5858 */
f95eec9b 5859 if (WARN_ON(sp->role.invalid))
002c5f73
SC
5860 continue;
5861
4506ecf4
SC
5862 /*
5863 * No need to flush the TLB since we're only zapping shadow
5864 * pages with an obsolete generation number and all vCPUS have
5865 * loaded a new root, i.e. the shadow pages being zapped cannot
5866 * be in active use by the guest.
5867 */
fbb158cb 5868 if (batch >= BATCH_ZAP_PAGES &&
531810ca 5869 cond_resched_rwlock_write(&kvm->mmu_lock)) {
fbb158cb 5870 batch = 0;
002c5f73
SC
5871 goto restart;
5872 }
5873
b28cb0cd
SC
5874 unstable = __kvm_mmu_prepare_zap_page(kvm, sp,
5875 &kvm->arch.zapped_obsolete_pages, &nr_zapped);
5876 batch += nr_zapped;
5877
5878 if (unstable)
002c5f73
SC
5879 goto restart;
5880 }
5881
4506ecf4 5882 /*
7ae5840e
SC
5883 * Kick all vCPUs (via remote TLB flush) before freeing the page tables
5884 * to ensure KVM is not in the middle of a lockless shadow page table
5885 * walk, which may reference the pages. The remote TLB flush itself is
5886 * not required and is simply a convenient way to kick vCPUs as needed.
5887 * KVM performs a local TLB flush when allocating a new root (see
5888 * kvm_mmu_load()), and the reload in the caller ensure no vCPUs are
5889 * running with an obsolete MMU.
4506ecf4 5890 */
10605204 5891 kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
002c5f73
SC
5892}
5893
5894/*
5895 * Fast invalidate all shadow pages and use lock-break technique
5896 * to zap obsolete pages.
5897 *
5898 * It's required when memslot is being deleted or VM is being
5899 * destroyed, in these cases, we should ensure that KVM MMU does
5900 * not use any resource of the being-deleted slot or all slots
5901 * after calling the function.
5902 */
5903static void kvm_mmu_zap_all_fast(struct kvm *kvm)
5904{
ca333add
SC
5905 lockdep_assert_held(&kvm->slots_lock);
5906
531810ca 5907 write_lock(&kvm->mmu_lock);
14a3c4f4 5908 trace_kvm_mmu_zap_all_fast(kvm);
ca333add
SC
5909
5910 /*
5911 * Toggle mmu_valid_gen between '0' and '1'. Because slots_lock is
5912 * held for the entire duration of zapping obsolete pages, it's
5913 * impossible for there to be multiple invalid generations associated
5914 * with *valid* shadow pages at any given time, i.e. there is exactly
5915 * one valid generation and (at most) one invalid generation.
5916 */
5917 kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
002c5f73 5918
2f6f66cc
SC
5919 /*
5920 * In order to ensure all vCPUs drop their soon-to-be invalid roots,
5921 * invalidating TDP MMU roots must be done while holding mmu_lock for
5922 * write and in the same critical section as making the reload request,
5923 * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield.
b7cccd39
BG
5924 */
5925 if (is_tdp_mmu_enabled(kvm))
5926 kvm_tdp_mmu_invalidate_all_roots(kvm);
5927
4506ecf4
SC
5928 /*
5929 * Notify all vcpus to reload their shadow page tables and flush their
5930 * TLBs. All vcpus will then switch to the new shadow page tables with
5931 * the new mmu_valid_gen.
5932 *
5933 * Note: this must be done under the protection of mmu_lock; otherwise
5934 * a vcpu could purge a shadow page but miss the TLB flush.
5935 */
527d5cd7 5936 kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
4506ecf4 5937
002c5f73 5938 kvm_zap_obsolete_pages(kvm);
faaf05b0 5939
531810ca 5940 write_unlock(&kvm->mmu_lock);
4c6654bd 5941
f28e9c7f
SC
5942 /*
5943 * Zap the invalidated TDP MMU roots, all SPTEs must be dropped before
5944 * returning to the caller, e.g. if the zap is in response to a memslot
5945 * deletion, mmu_notifier callbacks will be unable to reach the SPTEs
5946 * associated with the deleted memslot once the update completes, and
5947 * Deferring the zap until the final reference to the root is put would
5948 * lead to use-after-free.
5949 */
22b94c4b 5950 if (is_tdp_mmu_enabled(kvm))
4c6654bd 5951 kvm_tdp_mmu_zap_invalidated_roots(kvm);
002c5f73
SC
5952}
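/*
 * Ordering recap for the fast zap: toggle mmu_valid_gen, invalidate the
 * TDP MMU roots and request KVM_REQ_MMU_FREE_OBSOLETE_ROOTS all under
 * mmu_lock, then zap the now-obsolete shadow pages (yielding as needed),
 * and finally reap the invalidated TDP MMU roots after dropping the lock.
 */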
5953
10605204
SC
5954static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
5955{
5956 return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
5957}
5958
b5f5fdca 5959static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
d126363d
JS
5960 struct kvm_memory_slot *slot,
5961 struct kvm_page_track_notifier_node *node)
b5f5fdca 5962{
002c5f73 5963 kvm_mmu_zap_all_fast(kvm);
1bad2b2a
XG
5964}
5965
a1a39128 5966int kvm_mmu_init_vm(struct kvm *kvm)
1bad2b2a 5967{
13d268ca 5968 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
a1a39128 5969 int r;
1bad2b2a 5970
a1a39128
PB
5971 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
5972 INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
5973 INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
ce25681d
SC
5974 spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
5975
a1a39128
PB
5976 r = kvm_mmu_init_tdp_mmu(kvm);
5977 if (r < 0)
5978 return r;
fe5db27d 5979
13d268ca 5980 node->track_write = kvm_mmu_pte_write;
b5f5fdca 5981 node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
13d268ca 5982 kvm_page_track_register_notifier(kvm, node);
ada51a9d
DM
5983
5984 kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache;
5985 kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO;
5986
5987 kvm->arch.split_shadow_page_cache.gfp_zero = __GFP_ZERO;
5988
5989 kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache;
5990 kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO;
5991
a1a39128 5992 return 0;
1bad2b2a
XG
5993}
5994
ada51a9d
DM
5995static void mmu_free_vm_memory_caches(struct kvm *kvm)
5996{
5997 kvm_mmu_free_memory_cache(&kvm->arch.split_desc_cache);
5998 kvm_mmu_free_memory_cache(&kvm->arch.split_page_header_cache);
5999 kvm_mmu_free_memory_cache(&kvm->arch.split_shadow_page_cache);
6000}
6001
13d268ca 6002void kvm_mmu_uninit_vm(struct kvm *kvm)
1bad2b2a 6003{
13d268ca 6004 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
1bad2b2a 6005
13d268ca 6006 kvm_page_track_unregister_notifier(kvm, node);
fe5db27d
BG
6007
6008 kvm_mmu_uninit_tdp_mmu(kvm);
ada51a9d
DM
6009
6010 mmu_free_vm_memory_caches(kvm);
1bad2b2a
XG
6011}
6012
2833eda0 6013static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
21fa3246
SC
6014{
6015 const struct kvm_memory_slot *memslot;
6016 struct kvm_memslots *slots;
f4209439 6017 struct kvm_memslot_iter iter;
21fa3246
SC
6018 bool flush = false;
6019 gfn_t start, end;
f4209439 6020 int i;
21fa3246
SC
6021
6022 if (!kvm_memslots_have_rmaps(kvm))
6023 return flush;
6024
6025 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
6026 slots = __kvm_memslots(kvm, i);
f4209439
MS
6027
6028 kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) {
6029 memslot = iter.slot;
21fa3246
SC
6030 start = max(gfn_start, memslot->base_gfn);
6031 end = min(gfn_end, memslot->base_gfn + memslot->npages);
f4209439 6032 if (WARN_ON_ONCE(start >= end))
21fa3246
SC
6033 continue;
6034
f8480721 6035 flush = slot_handle_level_range(kvm, memslot, __kvm_zap_rmap,
21fa3246
SC
6036 PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
6037 start, end - 1, true, flush);
6038 }
6039 }
6040
6041 return flush;
6042}
6043
88f58535
ML
6044/*
6045 * Invalidate (zap) SPTEs that cover GFNs from gfn_start up to, but not
6046 * including, gfn_end.
6047 */
efdfe536
XG
6048void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
6049{
21fa3246 6050 bool flush;
9da0e4d5 6051 int i;
efdfe536 6052
f4209439
MS
6053 if (WARN_ON_ONCE(gfn_end <= gfn_start))
6054 return;
6055
5a324c24
SC
6056 write_lock(&kvm->mmu_lock);
6057
20ec3ebd 6058 kvm_mmu_invalidate_begin(kvm, gfn_start, gfn_end);
edb298c6 6059
2833eda0 6060 flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
efdfe536 6061
897218ff 6062 if (is_tdp_mmu_enabled(kvm)) {
6103bc07 6063 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
f47e5bbb
SC
6064 flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start,
6065 gfn_end, true, flush);
6103bc07 6066 }
5a324c24
SC
6067
6068 if (flush)
bc3b3c10
SC
6069 kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
6070 gfn_end - gfn_start);
5a324c24 6071
20ec3ebd 6072 kvm_mmu_invalidate_end(kvm, gfn_start, gfn_end);
edb298c6 6073
5a324c24 6074 write_unlock(&kvm->mmu_lock);
efdfe536
XG
6075}
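/*
 * Callers of kvm_zap_gfn_range() get both MMUs handled: the rmap-based
 * shadow MMU via kvm_rmap_zap_gfn_range() and the TDP MMU via
 * kvm_tdp_mmu_zap_leafs(), with a single ranged remote TLB flush at the
 * end if either of them zapped anything.
 */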
6076
018aabb5 6077static bool slot_rmap_write_protect(struct kvm *kvm,
0a234f5d 6078 struct kvm_rmap_head *rmap_head,
269e9552 6079 const struct kvm_memory_slot *slot)
d77aa73c 6080{
1346bbb6 6081 return rmap_write_protect(rmap_head, false);
d77aa73c
XG
6082}
6083
1c91cad4 6084void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
269e9552 6085 const struct kvm_memory_slot *memslot,
3c9bd400 6086 int start_level)
6aa8b732 6087{
e2209710 6088 bool flush = false;
6aa8b732 6089
e2209710
BG
6090 if (kvm_memslots_have_rmaps(kvm)) {
6091 write_lock(&kvm->mmu_lock);
6092 flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
6093 start_level, KVM_MAX_HUGEPAGE_LEVEL,
6094 false);
6095 write_unlock(&kvm->mmu_lock);
6096 }
198c74f4 6097
24ae4cfa
BG
6098 if (is_tdp_mmu_enabled(kvm)) {
6099 read_lock(&kvm->mmu_lock);
6100 flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
6101 read_unlock(&kvm->mmu_lock);
6102 }
6103
198c74f4 6104 /*
6ff94f27
DM
6105 * Flush TLBs if any SPTEs had to be write-protected to ensure that
6106 * guest writes are reflected in the dirty bitmap before the memslot
6107 * update completes, i.e. before enabling dirty logging is visible to
6108 * userspace.
6109 *
6110 * Perform the TLB flush outside the mmu_lock to reduce the amount of
6111 * time the lock is held. However, this does mean that another CPU can
6112 * now grab mmu_lock and encounter a write-protected SPTE while CPUs
6113 * still have a writable mapping for the associated GFN in their TLB.
6114 *
6115 * This is safe but requires KVM to be careful when making decisions
6116 * based on the write-protection status of an SPTE. Specifically, KVM
6117 * also write-protects SPTEs to monitor changes to guest page tables
6118 * during shadow paging, and must guarantee no CPUs can write to those
6119 * pages before the lock is dropped. As mentioned in the previous
6120 * paragraph, a write-protected SPTE is no guarantee that a CPU cannot
6121 * perform writes. So to determine if a TLB flush is truly required, KVM
6122 * will clear a separate software-only bit (MMU-writable) and skip the
6123 * flush if-and-only-if this bit was already clear.
6124 *
02844ac1 6125 * See is_writable_pte() for more details.
198c74f4 6126 */
d91ffee9 6127 if (flush)
7f42aa76 6128 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
6aa8b732 6129}
37a7d8b0 6130
ada51a9d
DM
6131static inline bool need_topup(struct kvm_mmu_memory_cache *cache, int min)
6132{
6133 return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
6134}
6135
6136static bool need_topup_split_caches_or_resched(struct kvm *kvm)
6137{
6138 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
6139 return true;
6140
6141 /*
6142 * In the worst case, SPLIT_DESC_CACHE_MIN_NR_OBJECTS descriptors are needed
6143 * to split a single huge page. Calculating how many are actually needed
6144 * is possible but not worth the complexity.
6145 */
6146 return need_topup(&kvm->arch.split_desc_cache, SPLIT_DESC_CACHE_MIN_NR_OBJECTS) ||
6147 need_topup(&kvm->arch.split_page_header_cache, 1) ||
6148 need_topup(&kvm->arch.split_shadow_page_cache, 1);
6149}
6150
6151static int topup_split_caches(struct kvm *kvm)
6152{
b9b71f43
SC
6153 /*
6154 * Allocating rmap list entries when splitting huge pages for nested
dfd4eb44 6155 * MMUs is uncommon as KVM needs to use a list if and only if there is
b9b71f43 6156 * more than one rmap entry for a gfn, i.e. requires an L1 gfn to be
dfd4eb44
SC
6157 * aliased by multiple L2 gfns and/or from multiple nested roots with
6158 * different roles. Aliasing gfns when using TDP is atypical for VMMs;
6159 * a few gfns are often aliased during boot, e.g. when remapping BIOS,
6160 * but aliasing rarely occurs post-boot or for many gfns. If there is
6161 * only one rmap entry, rmap->val points directly at that one entry and
6162 * doesn't need to allocate a list. Buffer the cache by the default
6163 * capacity so that KVM doesn't have to drop mmu_lock to topup if KVM
b9b71f43
SC
6164 * encounters an aliased gfn or two.
6165 */
6166 const int capacity = SPLIT_DESC_CACHE_MIN_NR_OBJECTS +
6167 KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE;
ada51a9d
DM
6168 int r;
6169
6170 lockdep_assert_held(&kvm->slots_lock);
6171
b9b71f43 6172 r = __kvm_mmu_topup_memory_cache(&kvm->arch.split_desc_cache, capacity,
ada51a9d
DM
6173 SPLIT_DESC_CACHE_MIN_NR_OBJECTS);
6174 if (r)
6175 return r;
6176
6177 r = kvm_mmu_topup_memory_cache(&kvm->arch.split_page_header_cache, 1);
6178 if (r)
6179 return r;
6180
6181 return kvm_mmu_topup_memory_cache(&kvm->arch.split_shadow_page_cache, 1);
6182}
6183
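/*
 * Sizing note: the split caches are deliberately small (one shadow page,
 * one page header, and a modest buffer of rmap descriptors) because each
 * huge page split consumes at most that much before the rmap walk
 * re-checks whether a topup or a resched is needed.
 */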
6184static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *huge_sptep)
6185{
6186 struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep);
6187 struct shadow_page_caches caches = {};
6188 union kvm_mmu_page_role role;
6189 unsigned int access;
6190 gfn_t gfn;
6191
79e48cec
SC
6192 gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep));
6193 access = kvm_mmu_page_get_access(huge_sp, spte_index(huge_sptep));
ada51a9d
DM
6194
6195 /*
6196 * Note, huge page splitting always uses direct shadow pages, regardless
6197 * of whether the huge page itself is mapped by a direct or indirect
6198 * shadow page, since the huge page region itself is being directly
6199 * mapped with smaller pages.
6200 */
6201 role = kvm_mmu_child_role(huge_sptep, /*direct=*/true, access);
6202
6203 /* Direct SPs do not require a shadowed_info_cache. */
6204 caches.page_header_cache = &kvm->arch.split_page_header_cache;
6205 caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache;
6206
6207 /* Safe to pass NULL for vCPU since requesting a direct SP. */
6208 return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role);
6209}
6210
6211static void shadow_mmu_split_huge_page(struct kvm *kvm,
6212 const struct kvm_memory_slot *slot,
6213 u64 *huge_sptep)
6214
6215{
6216 struct kvm_mmu_memory_cache *cache = &kvm->arch.split_desc_cache;
6217 u64 huge_spte = READ_ONCE(*huge_sptep);
6218 struct kvm_mmu_page *sp;
03787394 6219 bool flush = false;
ada51a9d
DM
6220 u64 *sptep, spte;
6221 gfn_t gfn;
6222 int index;
6223
6224 sp = shadow_mmu_get_sp_for_split(kvm, huge_sptep);
6225
6226 for (index = 0; index < SPTE_ENT_PER_PAGE; index++) {
6227 sptep = &sp->spt[index];
6228 gfn = kvm_mmu_page_get_gfn(sp, index);
6229
6230 /*
6231 * The SP may already have populated SPTEs, e.g. if this huge
6232 * page is aliased by multiple sptes with the same access
6233 * permissions. These entries are guaranteed to map the same
6234 * gfn-to-pfn translation since the SP is direct, so no need to
6235 * modify them.
6236 *
03787394
PB
6237 * However, if a given SPTE points to a lower level page table,
6238 * that lower level page table may only be partially populated.
6239 * Installing such SPTEs would effectively unmap a portion of the
6240 * huge page. Unmapping guest memory always requires a TLB flush
6241 * since a subsequent operation on the unmapped regions would
6242 * fail to detect the need to flush.
ada51a9d 6243 */
03787394
PB
6244 if (is_shadow_present_pte(*sptep)) {
6245 flush |= !is_last_spte(*sptep, sp->role.level);
ada51a9d 6246 continue;
03787394 6247 }
ada51a9d
DM
6248
6249 spte = make_huge_page_split_spte(kvm, huge_spte, sp->role, index);
6250 mmu_spte_set(sptep, spte);
6251 __rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access);
6252 }
6253
03787394 6254 __link_shadow_page(kvm, cache, huge_sptep, sp, flush);
ada51a9d
DM
6255}
6256
6257static int shadow_mmu_try_split_huge_page(struct kvm *kvm,
6258 const struct kvm_memory_slot *slot,
6259 u64 *huge_sptep)
6260{
6261 struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep);
6262 int level, r = 0;
6263 gfn_t gfn;
6264 u64 spte;
6265
6266 /* Grab information for the tracepoint before dropping the MMU lock. */
79e48cec 6267 gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep));
ada51a9d
DM
6268 level = huge_sp->role.level;
6269 spte = *huge_sptep;
6270
6271 if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES) {
6272 r = -ENOSPC;
6273 goto out;
6274 }
6275
6276 if (need_topup_split_caches_or_resched(kvm)) {
6277 write_unlock(&kvm->mmu_lock);
6278 cond_resched();
6279 /*
6280 * If the topup succeeds, return -EAGAIN to indicate that the
6281 * rmap iterator should be restarted because the MMU lock was
6282 * dropped.
6283 */
6284 r = topup_split_caches(kvm) ?: -EAGAIN;
6285 write_lock(&kvm->mmu_lock);
6286 goto out;
6287 }
6288
6289 shadow_mmu_split_huge_page(kvm, slot, huge_sptep);
6290
6291out:
6292 trace_kvm_mmu_split_huge_page(gfn, spte, level, r);
6293 return r;
6294}
6295
6296static bool shadow_mmu_try_split_huge_pages(struct kvm *kvm,
6297 struct kvm_rmap_head *rmap_head,
6298 const struct kvm_memory_slot *slot)
6299{
6300 struct rmap_iterator iter;
6301 struct kvm_mmu_page *sp;
6302 u64 *huge_sptep;
6303 int r;
6304
6305restart:
6306 for_each_rmap_spte(rmap_head, &iter, huge_sptep) {
6307 sp = sptep_to_sp(huge_sptep);
6308
6309 /* TDP MMU is enabled, so rmap only contains nested MMU SPs. */
6310 if (WARN_ON_ONCE(!sp->role.guest_mode))
6311 continue;
6312
6313 /* The rmaps should never contain non-leaf SPTEs. */
6314 if (WARN_ON_ONCE(!is_large_pte(*huge_sptep)))
6315 continue;
6316
6317 /* SPs with level >PG_LEVEL_4K should never by unsync. */
6318 if (WARN_ON_ONCE(sp->unsync))
6319 continue;
6320
6321 /* Don't bother splitting huge pages on invalid SPs. */
6322 if (sp->role.invalid)
6323 continue;
6324
6325 r = shadow_mmu_try_split_huge_page(kvm, slot, huge_sptep);
6326
6327 /*
6328 * The split succeeded or needs to be retried because the MMU
6329 * lock was dropped. Either way, restart the iterator to get it
6330 * back into a consistent state.
6331 */
6332 if (!r || r == -EAGAIN)
6333 goto restart;
6334
6335 /* The split failed and shouldn't be retried (e.g. -ENOMEM). */
6336 break;
6337 }
6338
6339 return false;
6340}
6341
6342static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm,
6343 const struct kvm_memory_slot *slot,
6344 gfn_t start, gfn_t end,
6345 int target_level)
6346{
6347 int level;
6348
6349 /*
6350 * Split huge pages starting with KVM_MAX_HUGEPAGE_LEVEL and working
6351 * down to the target level. This ensures pages are recursively split
6352 * all the way to the target level. There's no need to split pages
6353 * already at the target level.
6354 */
6355 for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--) {
6356 slot_handle_level_range(kvm, slot, shadow_mmu_try_split_huge_pages,
6357 level, level, start, end - 1, true, false);
6358 }
6359}
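/*
 * Editor's sketch (not part of the file): with the usual x86 values
 * (PG_LEVEL_4K == 1, PG_LEVEL_2M == 2, PG_LEVEL_1G == 3, and
 * KVM_MAX_HUGEPAGE_LEVEL assumed to be PG_LEVEL_1G), a call with
 * target_level == PG_LEVEL_4K walks the loop above twice:
 *
 *	level == 3: split any 1G mappings in [start, end) into 2M mappings
 *	level == 2: split any 2M mappings, including the ones just created,
 *		    into 4K mappings
 *
 * which is how pages end up recursively split all the way down to the
 * target level.
 */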
6360
cb00a70b
DM
6361/* Must be called with the mmu_lock held in write-mode. */
6362void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
6363 const struct kvm_memory_slot *memslot,
6364 u64 start, u64 end,
6365 int target_level)
6366{
ada51a9d
DM
6367 if (!is_tdp_mmu_enabled(kvm))
6368 return;
6369
6370 if (kvm_memslots_have_rmaps(kvm))
6371 kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
6372
6373 kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, false);
cb00a70b
DM
6374
6375 /*
6376 * A TLB flush is unnecessary at this point for the same reasons as in
6377 * kvm_mmu_slot_try_split_huge_pages().
6378 */
6379}
6380
a3fe5dbd 6381void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
cb00a70b
DM
6382 const struct kvm_memory_slot *memslot,
6383 int target_level)
a3fe5dbd
DM
6384{
6385 u64 start = memslot->base_gfn;
6386 u64 end = start + memslot->npages;
6387
ada51a9d
DM
6388 if (!is_tdp_mmu_enabled(kvm))
6389 return;
6390
6391 if (kvm_memslots_have_rmaps(kvm)) {
6392 write_lock(&kvm->mmu_lock);
6393 kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
6394 write_unlock(&kvm->mmu_lock);
a3fe5dbd
DM
6395 }
6396
ada51a9d
DM
6397 read_lock(&kvm->mmu_lock);
6398 kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);
6399 read_unlock(&kvm->mmu_lock);
6400
a3fe5dbd
DM
6401 /*
6402 * No TLB flush is necessary here. KVM will flush TLBs after
6403 * write-protecting and/or clearing dirty on the newly split SPTEs to
6404 * ensure that guest writes are reflected in the dirty log before the
6405 * ioctl to enable dirty logging on this memslot completes. Since the
6406 * split SPTEs retain the write and dirty bits of the huge SPTE, it is
6407 * safe for KVM to decide if a TLB flush is necessary based on the split
6408 * SPTEs.
6409 */
6410}
6411
3ea3b7fa 6412static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
0a234f5d 6413 struct kvm_rmap_head *rmap_head,
269e9552 6414 const struct kvm_memory_slot *slot)
3ea3b7fa
WL
6415{
6416 u64 *sptep;
6417 struct rmap_iterator iter;
6418 int need_tlb_flush = 0;
3ea3b7fa
WL
6419 struct kvm_mmu_page *sp;
6420
0d536790 6421restart:
018aabb5 6422 for_each_rmap_spte(rmap_head, &iter, sptep) {
57354682 6423 sp = sptep_to_sp(sptep);
3ea3b7fa
WL
6424
6425 /*
decf6333
XG
6426 * We cannot create huge page mappings for indirect shadow pages,
6427 * which are found on the last rmap (level = 1) when not using
6428 * TDP; such shadow pages are synced with the page table in
6429 * the guest, and the guest page table uses 4K page size
6430 * mappings if the indirect sp has level = 1.
3ea3b7fa 6431 */
5d49f08c 6432 if (sp->role.direct &&
9eba50f8 6433 sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
a8ac499b 6434 PG_LEVEL_NUM)) {
9202aee8 6435 kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
40ef75a7
LT
6436
6437 if (kvm_available_flush_tlb_with_range())
6438 kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
6439 KVM_PAGES_PER_HPAGE(sp->role.level));
6440 else
6441 need_tlb_flush = 1;
6442
0d536790
XG
6443 goto restart;
6444 }
3ea3b7fa
WL
6445 }
6446
6447 return need_tlb_flush;
6448}
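/*
 * Editor's sketch (not part of the file): a concrete instance of the zap
 * condition above. Suppose dirty logging was just disabled on the slot and a
 * direct SP has sp->role.level == PG_LEVEL_4K while kvm_mmu_max_mapping_level()
 * now reports PG_LEVEL_2M for sp->gfn:
 *
 *	1 (PG_LEVEL_4K) < 2 (PG_LEVEL_2M)  ->  kvm_zap_one_rmap_spte()
 *
 * The next guest access faults the gfn back in, this time as a 2M mapping.
 * Indirect SPs are skipped because they mirror the guest's own 4K page
 * tables and cannot be collapsed.
 */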
6449
20d49186
DM
6450static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
6451 const struct kvm_memory_slot *slot)
6452{
6453 /*
6454 * Note, use KVM_MAX_HUGEPAGE_LEVEL - 1 since there's no need to zap
6455 * pages that are already mapped at the maximum hugepage level.
6456 */
6457 if (slot_handle_level(kvm, slot, kvm_mmu_zap_collapsible_spte,
6458 PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true))
6459 kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
6460}
6461
3ea3b7fa 6462void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
269e9552 6463 const struct kvm_memory_slot *slot)
3ea3b7fa 6464{
e2209710
BG
6465 if (kvm_memslots_have_rmaps(kvm)) {
6466 write_lock(&kvm->mmu_lock);
20d49186 6467 kvm_rmap_zap_collapsible_sptes(kvm, slot);
e2209710
BG
6468 write_unlock(&kvm->mmu_lock);
6469 }
2db6f772
BG
6470
6471 if (is_tdp_mmu_enabled(kvm)) {
2db6f772 6472 read_lock(&kvm->mmu_lock);
4b85c921 6473 kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
2db6f772
BG
6474 read_unlock(&kvm->mmu_lock);
6475 }
3ea3b7fa
WL
6476}
6477
b3594ffb 6478void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
6c9dd6d2 6479 const struct kvm_memory_slot *memslot)
b3594ffb
SC
6480{
6481 /*
7f42aa76 6482 * All current use cases for flushing the TLBs for a specific memslot
302695a5 6483 * are related to dirty logging, and many do the TLB flush out of mmu_lock.
7f42aa76
SC
6484 * The interaction between the various operations on the memslot must be
6485 * serialized by slots_lock to ensure the TLB flush from one operation
6486 * is observed by any other operation on the same memslot.
b3594ffb
SC
6487 */
6488 lockdep_assert_held(&kvm->slots_lock);
cec37648
SC
6489 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
6490 memslot->npages);
b3594ffb
SC
6491}
6492
f4b4b180 6493void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
269e9552 6494 const struct kvm_memory_slot *memslot)
f4b4b180 6495{
e2209710 6496 bool flush = false;
f4b4b180 6497
e2209710
BG
6498 if (kvm_memslots_have_rmaps(kvm)) {
6499 write_lock(&kvm->mmu_lock);
610265ea
DM
6500 /*
6501 * Clear dirty bits only on 4k SPTEs since the legacy MMU only
6502 * supports dirty logging at a 4k granularity.
6503 */
6504 flush = slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false);
e2209710
BG
6505 write_unlock(&kvm->mmu_lock);
6506 }
f4b4b180 6507
24ae4cfa
BG
6508 if (is_tdp_mmu_enabled(kvm)) {
6509 read_lock(&kvm->mmu_lock);
6510 flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
6511 read_unlock(&kvm->mmu_lock);
6512 }
6513
f4b4b180
KH
6514 /*
6515 * It's also safe to flush TLBs outside of mmu_lock here, as currently this
6516 * function is only used for dirty logging, in which case flushing the TLB
6517 * outside of mmu_lock also guarantees that no dirty pages will be lost in
6518 * the dirty_bitmap.
6519 */
6520 if (flush)
7f42aa76 6521 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
f4b4b180 6522}
f4b4b180 6523
92f58b5c 6524void kvm_mmu_zap_all(struct kvm *kvm)
5304b8d3
XG
6525{
6526 struct kvm_mmu_page *sp, *node;
7390de1e 6527 LIST_HEAD(invalid_list);
83cdb568 6528 int ign;
5304b8d3 6529
531810ca 6530 write_lock(&kvm->mmu_lock);
5304b8d3 6531restart:
8a674adc 6532 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
f95eec9b 6533 if (WARN_ON(sp->role.invalid))
4771450c 6534 continue;
92f58b5c 6535 if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
5304b8d3 6536 goto restart;
531810ca 6537 if (cond_resched_rwlock_write(&kvm->mmu_lock))
5304b8d3
XG
6538 goto restart;
6539 }
6540
4771450c 6541 kvm_mmu_commit_zap_page(kvm, &invalid_list);
faaf05b0 6542
897218ff 6543 if (is_tdp_mmu_enabled(kvm))
faaf05b0
BG
6544 kvm_tdp_mmu_zap_all(kvm);
6545
531810ca 6546 write_unlock(&kvm->mmu_lock);
5304b8d3
XG
6547}
6548
15248258 6549void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
f8f55942 6550{
164bf7e5 6551 WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
e1359e2b 6552
164bf7e5 6553 gen &= MMIO_SPTE_GEN_MASK;
e1359e2b 6554
f8f55942 6555 /*
e1359e2b
SC
6556 * Generation numbers are incremented in multiples of the number of
6557 * address spaces in order to provide unique generations across all
6558 * address spaces. Strip what is effectively the address space
6559 * modifier prior to checking for a wrap of the MMIO generation so
6560 * that a wrap in any address space is detected.
6561 */
6562 gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
6563
f8f55942 6564 /*
e1359e2b 6565 * The very rare case: if the MMIO generation number has wrapped,
f8f55942 6566 * zap all shadow pages.
f8f55942 6567 */
e1359e2b 6568 if (unlikely(gen == 0)) {
ae0f5499 6569 kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
92f58b5c 6570 kvm_mmu_zap_all_fast(kvm);
7a2e8aaf 6571 }
f8f55942
XG
6572}
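/*
 * Editor's sketch (not part of the file): a worked example of the masking
 * above, assuming KVM_ADDRESS_SPACE_NUM == 2 (the common x86 configuration
 * with SMM). Memslot generations then advance in steps of 2, with bit 0
 * acting as the address-space modifier:
 *
 *	gen (masked)	gen & ~1	wrapped?
 *	    0x2		  0x2		no
 *	    0x3		  0x2		no
 *	    0x1		  0x0		yes -> kvm_mmu_zap_all_fast()
 *
 * Stripping the modifier first means a wrap is detected regardless of which
 * address space the generation update came from.
 */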
6573
70534a73
DC
6574static unsigned long
6575mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
3ee16c81
IE
6576{
6577 struct kvm *kvm;
1495f230 6578 int nr_to_scan = sc->nr_to_scan;
70534a73 6579 unsigned long freed = 0;
3ee16c81 6580
0d9ce162 6581 mutex_lock(&kvm_lock);
3ee16c81
IE
6582
6583 list_for_each_entry(kvm, &vm_list, vm_list) {
3d56cbdf 6584 int idx;
d98ba053 6585 LIST_HEAD(invalid_list);
3ee16c81 6586
35f2d16b
TY
6587 /*
6588 * Never scan more than sc->nr_to_scan VM instances.
6589 * In practice this condition is never hit, since we do not try
6590 * to shrink more than one VM and it is very unlikely to see
6591 * !n_used_mmu_pages so many times.
6592 */
6593 if (!nr_to_scan--)
6594 break;
19526396
GN
6595 /*
6596 * n_used_mmu_pages is accessed without holding kvm->mmu_lock
6597 * here. We may skip a VM instance erroneously, but we do not
6598 * want to shrink a VM that only started to populate its MMU
6599 * anyway.
6600 */
10605204
SC
6601 if (!kvm->arch.n_used_mmu_pages &&
6602 !kvm_has_zapped_obsolete_pages(kvm))
19526396 6603 continue;
19526396 6604
f656ce01 6605 idx = srcu_read_lock(&kvm->srcu);
531810ca 6606 write_lock(&kvm->mmu_lock);
3ee16c81 6607
10605204
SC
6608 if (kvm_has_zapped_obsolete_pages(kvm)) {
6609 kvm_mmu_commit_zap_page(kvm,
6610 &kvm->arch.zapped_obsolete_pages);
6611 goto unlock;
6612 }
6613
ebdb292d 6614 freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
19526396 6615
10605204 6616unlock:
531810ca 6617 write_unlock(&kvm->mmu_lock);
f656ce01 6618 srcu_read_unlock(&kvm->srcu, idx);
19526396 6619
70534a73
DC
6620 /*
6621 * unfair on small ones
6622 * per-vm shrinkers cry out
6623 * sadness comes quickly
6624 */
19526396
GN
6625 list_move_tail(&kvm->vm_list, &vm_list);
6626 break;
3ee16c81 6627 }
3ee16c81 6628
0d9ce162 6629 mutex_unlock(&kvm_lock);
70534a73 6630 return freed;
70534a73
DC
6631}
6632
6633static unsigned long
6634mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
6635{
45221ab6 6636 return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
3ee16c81
IE
6637}
6638
6639static struct shrinker mmu_shrinker = {
70534a73
DC
6640 .count_objects = mmu_shrink_count,
6641 .scan_objects = mmu_shrink_scan,
3ee16c81
IE
6642 .seeks = DEFAULT_SEEKS * 10,
6643};
6644
2ddfd20e 6645static void mmu_destroy_caches(void)
b5a33a75 6646{
c1bd743e
TH
6647 kmem_cache_destroy(pte_list_desc_cache);
6648 kmem_cache_destroy(mmu_page_header_cache);
b5a33a75
AK
6649}
6650
b8e8c830
PB
6651static bool get_nx_auto_mode(void)
6652{
6653 /* Return true when the CPU has the bug and mitigations are ON. */
6654 return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
6655}
6656
6657static void __set_nx_huge_pages(bool val)
6658{
6659 nx_huge_pages = itlb_multihit_kvm_mitigation = val;
6660}
6661
6662static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
6663{
6664 bool old_val = nx_huge_pages;
6665 bool new_val;
6666
6667 /* In "auto" mode, deploy the workaround only if the CPU has the bug. */
6668 if (sysfs_streq(val, "off"))
6669 new_val = 0;
6670 else if (sysfs_streq(val, "force"))
6671 new_val = 1;
6672 else if (sysfs_streq(val, "auto"))
6673 new_val = get_nx_auto_mode();
6674 else if (strtobool(val, &new_val) < 0)
6675 return -EINVAL;
6676
6677 __set_nx_huge_pages(new_val);
6678
6679 if (new_val != old_val) {
6680 struct kvm *kvm;
b8e8c830
PB
6681
6682 mutex_lock(&kvm_lock);
6683
6684 list_for_each_entry(kvm, &vm_list, vm_list) {
ed69a6cb 6685 mutex_lock(&kvm->slots_lock);
b8e8c830 6686 kvm_mmu_zap_all_fast(kvm);
ed69a6cb 6687 mutex_unlock(&kvm->slots_lock);
1aa9b957
JS
6688
6689 wake_up_process(kvm->arch.nx_lpage_recovery_thread);
b8e8c830
PB
6690 }
6691 mutex_unlock(&kvm_lock);
6692 }
6693
6694 return 0;
6695}
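/*
 * Editor's note (not part of the file): the parameter above accepts the
 * strings "off", "force" and "auto" in addition to the boolean spellings
 * understood by strtobool(). As a usage sketch, the knob is typically
 * exposed under the kvm module's parameter directory (path assumed, verify
 * on the target system):
 *
 *	echo auto  > /sys/module/kvm/parameters/nx_huge_pages
 *	echo force > /sys/module/kvm/parameters/nx_huge_pages
 *
 * Any change of the effective value zaps all shadow pages in every VM and
 * wakes each VM's NX recovery thread, as done in the loop above.
 */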
6696
1d0e8480
SC
6697/*
6698 * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as
6699 * its default value of -1 is technically undefined behavior for a boolean.
c3e0c8c2
SC
6700 * Forward the module init call to SPTE code so that it too can handle module
6701 * params that need to be resolved/snapshotted.
1d0e8480 6702 */
982bae43 6703void __init kvm_mmu_x86_module_init(void)
b5a33a75 6704{
b8e8c830
PB
6705 if (nx_huge_pages == -1)
6706 __set_nx_huge_pages(get_nx_auto_mode());
c3e0c8c2
SC
6707
6708 kvm_mmu_spte_module_init();
1d0e8480
SC
6709}
6710
6711/*
6712 * The bulk of the MMU initialization is deferred until the vendor module is
6713 * loaded, as many of the masks/values may be modified by VMX or SVM, i.e. need
6714 * to be reset when a potentially different vendor module is loaded.
6715 */
6716int kvm_mmu_vendor_module_init(void)
6717{
6718 int ret = -ENOMEM;
b8e8c830 6719
36d9594d
VK
6720 /*
6721 * MMU roles use union aliasing which is, generally speaking,
6722 * undefined behavior. However, we supposedly know how compilers behave
6723 * and the current status quo is unlikely to change. The guards below are
6724 * supposed to let us know if the assumption becomes false.
6725 */
6726 BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
6727 BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
7a7ae829 6728 BUILD_BUG_ON(sizeof(union kvm_cpu_role) != sizeof(u64));
36d9594d 6729
28a1f3ac 6730 kvm_mmu_reset_all_pte_masks();
f160c7b7 6731
53c07b18
XG
6732 pte_list_desc_cache = kmem_cache_create("pte_list_desc",
6733 sizeof(struct pte_list_desc),
46bea48a 6734 0, SLAB_ACCOUNT, NULL);
53c07b18 6735 if (!pte_list_desc_cache)
ab271bd4 6736 goto out;
b5a33a75 6737
d3d25b04
AK
6738 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
6739 sizeof(struct kvm_mmu_page),
46bea48a 6740 0, SLAB_ACCOUNT, NULL);
d3d25b04 6741 if (!mmu_page_header_cache)
ab271bd4 6742 goto out;
d3d25b04 6743
908c7f19 6744 if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
ab271bd4 6745 goto out;
45bf21a8 6746
e33c267a 6747 ret = register_shrinker(&mmu_shrinker, "x86-mmu");
ab271bd4
AB
6748 if (ret)
6749 goto out;
3ee16c81 6750
b5a33a75
AK
6751 return 0;
6752
ab271bd4 6753out:
3ee16c81 6754 mmu_destroy_caches();
ab271bd4 6755 return ret;
b5a33a75
AK
6756}
6757
c42fffe3
XG
6758void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
6759{
95f93af4 6760 kvm_mmu_unload(vcpu);
1cfff4d9
JP
6761 free_mmu_pages(&vcpu->arch.root_mmu);
6762 free_mmu_pages(&vcpu->arch.guest_mmu);
c42fffe3 6763 mmu_free_memory_caches(vcpu);
b034cf01
XG
6764}
6765
1d0e8480 6766void kvm_mmu_vendor_module_exit(void)
b034cf01
XG
6767{
6768 mmu_destroy_caches();
6769 percpu_counter_destroy(&kvm_total_used_mmu_pages);
6770 unregister_shrinker(&mmu_shrinker);
c42fffe3 6771}
1aa9b957 6772
f47491d7
SC
6773/*
6774 * Calculate the effective recovery period, accounting for '0' meaning "let KVM
6775 * select a halving time of 1 hour". Returns true if recovery is enabled.
6776 */
6777static bool calc_nx_huge_pages_recovery_period(uint *period)
6778{
6779 /*
6780 * Use READ_ONCE to get the params, as this may be called outside of the
6781 * param setters, e.g. by the kthread to compute its next timeout.
6782 */
6783 bool enabled = READ_ONCE(nx_huge_pages);
6784 uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
6785
6786 if (!enabled || !ratio)
6787 return false;
6788
6789 *period = READ_ONCE(nx_huge_pages_recovery_period_ms);
6790 if (!*period) {
6791 /* Make sure the period is not less than one second. */
6792 ratio = min(ratio, 3600u);
6793 *period = 60 * 60 * 1000 / ratio;
6794 }
6795 return true;
6796}
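/*
 * Editor's sketch (not part of the file): worked examples of the fallback
 * period computed above when nx_huge_pages_recovery_period_ms == 0:
 *
 *	ratio = 60   -> period = 60 * 60 * 1000 / 60 = 60000 ms (1 minute)
 *	ratio = 10   -> period = 60 * 60 * 1000 / 10 = 360000 ms (6 minutes)
 *	ratio = 4000 -> clamped to 3600, period = 1000 ms, i.e. the period
 *			never drops below one second.
 */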
6797
4dfe4f40 6798static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp)
1aa9b957 6799{
4dfe4f40
JS
6800 bool was_recovery_enabled, is_recovery_enabled;
6801 uint old_period, new_period;
1aa9b957
JS
6802 int err;
6803
f47491d7 6804 was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period);
4dfe4f40 6805
1aa9b957
JS
6806 err = param_set_uint(val, kp);
6807 if (err)
6808 return err;
6809
f47491d7 6810 is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period);
4dfe4f40 6811
f47491d7 6812 if (is_recovery_enabled &&
4dfe4f40 6813 (!was_recovery_enabled || old_period > new_period)) {
1aa9b957
JS
6814 struct kvm *kvm;
6815
6816 mutex_lock(&kvm_lock);
6817
6818 list_for_each_entry(kvm, &vm_list, vm_list)
6819 wake_up_process(kvm->arch.nx_lpage_recovery_thread);
6820
6821 mutex_unlock(&kvm_lock);
6822 }
6823
6824 return err;
6825}
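/*
 * Editor's sketch (not part of the file): the wake-up condition above only
 * fires when the new settings could require an *earlier* recovery pass than
 * the one the sleeping worker is currently waiting for, e.g.:
 *
 *	old: disabled,		new: ratio 60 (1h / 60 = 1 min)	-> wake
 *	old: period 60 min,	new: period 10 min		-> wake
 *	old: period 10 min,	new: period 60 min		-> no wake;
 *	     the worker simply recomputes a later timeout the next time it runs.
 */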
6826
6827static void kvm_recover_nx_lpages(struct kvm *kvm)
6828{
ade74e14 6829 unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
1aa9b957
JS
6830 int rcu_idx;
6831 struct kvm_mmu_page *sp;
6832 unsigned int ratio;
6833 LIST_HEAD(invalid_list);
048f4980 6834 bool flush = false;
1aa9b957
JS
6835 ulong to_zap;
6836
6837 rcu_idx = srcu_read_lock(&kvm->srcu);
531810ca 6838 write_lock(&kvm->mmu_lock);
1aa9b957 6839
bb95dfb9
SC
6840 /*
6841 * Zapping TDP MMU shadow pages, including the remote TLB flush, must
6842 * be done under RCU protection, because the pages are freed via an RCU
6843 * callback.
6844 */
6845 rcu_read_lock();
6846
1aa9b957 6847 ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
ade74e14 6848 to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
7d919c7a
SC
6849 for ( ; to_zap; --to_zap) {
6850 if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages))
6851 break;
6852
1aa9b957
JS
6853 /*
6854 * We use a separate list instead of just using active_mmu_pages
6855 * because the number of lpage_disallowed pages is expected to
6856 * be relatively small compared to the total.
6857 */
6858 sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
6859 struct kvm_mmu_page,
6860 lpage_disallowed_link);
6861 WARN_ON_ONCE(!sp->lpage_disallowed);
897218ff 6862 if (is_tdp_mmu_page(sp)) {
315f02c6 6863 flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
8d1a182e 6864 } else {
29cf0f50
BG
6865 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
6866 WARN_ON_ONCE(sp->lpage_disallowed);
6867 }
1aa9b957 6868
531810ca 6869 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
048f4980 6870 kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
bb95dfb9
SC
6871 rcu_read_unlock();
6872
531810ca 6873 cond_resched_rwlock_write(&kvm->mmu_lock);
048f4980 6874 flush = false;
bb95dfb9
SC
6875
6876 rcu_read_lock();
1aa9b957
JS
6877 }
6878 }
048f4980 6879 kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
1aa9b957 6880
bb95dfb9
SC
6881 rcu_read_unlock();
6882
531810ca 6883 write_unlock(&kvm->mmu_lock);
1aa9b957
JS
6884 srcu_read_unlock(&kvm->srcu, rcu_idx);
6885}
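/*
 * Editor's sketch (not part of the file): the batch size computed above is
 * DIV_ROUND_UP(nx_lpage_splits, ratio), e.g. with a ratio of 60:
 *
 *	nx_lpage_splits = 1000 -> to_zap = DIV_ROUND_UP(1000, 60) = 17
 *	nx_lpage_splits = 30   -> to_zap = 1
 *	ratio == 0             -> to_zap = 0 (recovery effectively disabled)
 *
 * so roughly 1/ratio of the NX-disallowed huge pages are reclaimed on each
 * pass of the recovery worker.
 */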
6886
6887static long get_nx_lpage_recovery_timeout(u64 start_time)
6888{
f47491d7
SC
6889 bool enabled;
6890 uint period;
4dfe4f40 6891
f47491d7 6892 enabled = calc_nx_huge_pages_recovery_period(&period);
4dfe4f40 6893
f47491d7
SC
6894 return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
6895 : MAX_SCHEDULE_TIMEOUT;
1aa9b957
JS
6896}
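/*
 * Editor's sketch (not part of the file): worked examples of the timeout
 * returned above for the recovery worker:
 *
 *	recovery enabled, period 60000 ms, 10 s already elapsed since
 *	start_time	-> roughly msecs_to_jiffies(50000) jiffies of sleep
 *
 *	recovery disabled (nx_huge_pages off or ratio 0)
 *			-> MAX_SCHEDULE_TIMEOUT, i.e. the worker sleeps until
 *			   it is explicitly woken by a parameter change.
 */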
6897
6898static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
6899{
6900 u64 start_time;
6901 long remaining_time;
6902
6903 while (true) {
6904 start_time = get_jiffies_64();
6905 remaining_time = get_nx_lpage_recovery_timeout(start_time);
6906
6907 set_current_state(TASK_INTERRUPTIBLE);
6908 while (!kthread_should_stop() && remaining_time > 0) {
6909 schedule_timeout(remaining_time);
6910 remaining_time = get_nx_lpage_recovery_timeout(start_time);
6911 set_current_state(TASK_INTERRUPTIBLE);
6912 }
6913
6914 set_current_state(TASK_RUNNING);
6915
6916 if (kthread_should_stop())
6917 return 0;
6918
6919 kvm_recover_nx_lpages(kvm);
6920 }
6921}
6922
6923int kvm_mmu_post_init_vm(struct kvm *kvm)
6924{
6925 int err;
6926
6927 err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
6928 "kvm-nx-lpage-recovery",
6929 &kvm->arch.nx_lpage_recovery_thread);
6930 if (!err)
6931 kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
6932
6933 return err;
6934}
6935
6936void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
6937{
6938 if (kvm->arch.nx_lpage_recovery_thread)
6939 kthread_stop(kvm->arch.nx_lpage_recovery_thread);
6940}