arch/x86/kvm/mmu/tdp_mmu.c
fe5db27d 1// SPDX-License-Identifier: GPL-2.0
8d20bd63 2#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
fe5db27d 3
02c00b3a
BG
4#include "mmu.h"
5#include "mmu_internal.h"
bb18842e 6#include "mmutrace.h"
2f2fad08 7#include "tdp_iter.h"
fe5db27d 8#include "tdp_mmu.h"
02c00b3a 9#include "spte.h"
fe5db27d 10
9a77daac 11#include <asm/cmpxchg.h>
33dd3574
BG
12#include <trace/events/kvm.h>
13
fe5db27d 14/* Initializes the TDP MMU for the VM, if enabled. */
0df9dab8 15void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
fe5db27d 16{
02c00b3a 17 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
9a77daac 18 spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
fe5db27d
BG
19}
20
226b8c8f
SC
21/* Arbitrarily returns true so that this may be used in if statements. */
22static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
6103bc07
BG
23 bool shared)
24{
25 if (shared)
26 lockdep_assert_held_read(&kvm->mmu_lock);
27 else
28 lockdep_assert_held_write(&kvm->mmu_lock);
226b8c8f
SC
29
30 return true;
6103bc07
BG
31}
32
fe5db27d
BG
33void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
34{
edbdb43f
SC
35 /*
36 * Invalidate all roots, which besides the obvious, schedules all roots
37 * for zapping and thus puts the TDP MMU's reference to each root, i.e.
38 * ultimately frees all roots.
39 */
40 kvm_tdp_mmu_invalidate_all_roots(kvm);
0df9dab8 41 kvm_tdp_mmu_zap_invalidated_roots(kvm);
22b94c4b 42
d25ceb92 43 WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
02c00b3a 44 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
7cca2d0b
BG
45
46 /*
47 * Ensure that all the outstanding RCU callbacks to free shadow pages
0df9dab8
SC
48 * can run before the VM is torn down. Putting the last reference to
49 * zapped roots will create new callbacks.
7cca2d0b
BG
50 */
51 rcu_barrier();
02c00b3a
BG
52}
53
2bdb3d84 54static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
a889ea54 55{
2bdb3d84
BG
56 free_page((unsigned long)sp->spt);
57 kmem_cache_free(mmu_page_header_cache, sp);
a889ea54
BG
58}
59
c0e64238
BG
60/*
61 * This is called through call_rcu in order to free TDP page table memory
62 * safely with respect to other kernel threads that may be operating on
63 * the memory.
64 * By only accessing TDP MMU page table memory in an RCU read critical
65 * section, and freeing it after a grace period, lockless access to that
66 * memory won't use it after it is freed.
67 */
68static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
a889ea54 69{
c0e64238
BG
70 struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
71 rcu_head);
a889ea54 72
c0e64238
BG
73 tdp_mmu_free_sp(sp);
74}
a889ea54 75
6103bc07
BG
76void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
77 bool shared)
2bdb3d84 78{
6103bc07 79 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
a889ea54 80
11cccf5c 81 if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
2bdb3d84
BG
82 return;
83
db01416b 84 /*
edbdb43f
SC
85 * The TDP MMU itself holds a reference to each root until the root is
86 * explicitly invalidated, i.e. the final reference should never be
87 * put for a valid root.
db01416b 88 */
edbdb43f 89 KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);
2bdb3d84 90
8351779c
PB
91 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
92 list_del_rcu(&root->link);
93 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
c0e64238 94 call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
a889ea54
BG
95}
96
cfc10997 97/*
d62007ed
SC
98 * Returns the next root after @prev_root (or the first root if @prev_root is
99 * NULL). A reference to the returned root is acquired, and the reference to
100 * @prev_root is released (the caller obviously must hold a reference to
101 * @prev_root if it's non-NULL).
102 *
103 * If @only_valid is true, invalid roots are skipped.
104 *
105 * Returns NULL if the end of tdp_mmu_roots was reached.
cfc10997
BG
106 */
107static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
6103bc07 108 struct kvm_mmu_page *prev_root,
d62007ed 109 bool shared, bool only_valid)
a889ea54
BG
110{
111 struct kvm_mmu_page *next_root;
112
c0e64238
BG
113 rcu_read_lock();
114
cfc10997 115 if (prev_root)
c0e64238
BG
116 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
117 &prev_root->link,
118 typeof(*prev_root), link);
cfc10997 119 else
c0e64238
BG
120 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
121 typeof(*next_root), link);
a889ea54 122
04dc4e6c 123 while (next_root) {
d62007ed 124 if ((!only_valid || !next_root->role.invalid) &&
ad6d6b94 125 kvm_tdp_mmu_get_root(next_root))
04dc4e6c
SC
126 break;
127
c0e64238
BG
128 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
129 &next_root->link, typeof(*next_root), link);
04dc4e6c 130 }
fb101293 131
c0e64238 132 rcu_read_unlock();
a889ea54 133
cfc10997 134 if (prev_root)
6103bc07 135 kvm_tdp_mmu_put_root(kvm, prev_root, shared);
a889ea54 136
a889ea54
BG
137 return next_root;
138}
139
140/*
141 * Note: this iterator gets and puts references to the roots it iterates over.
142 * This makes it safe to release the MMU lock and yield within the loop, but
143 * if exiting the loop early, the caller must drop the reference to the most
144 * recent root. (Unless keeping a live reference is desirable.)
6103bc07
BG
145 *
146 * If shared is set, this function is operating under the MMU lock in read
147 * mode. In the unlikely event that this thread must free a root, the lock
148 * will be temporarily dropped and reacquired in write mode.
a889ea54 149 */
d62007ed
SC
150#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
151 for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid); \
152 _root; \
153 _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid)) \
614f6970
PB
154 if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) && \
155 kvm_mmu_page_as_id(_root) != _as_id) { \
a3f15bda 156 } else
a889ea54 157
d62007ed
SC
158#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
159 __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
160
0df9dab8
SC
161#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _shared) \
162 for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, false); \
441a5dfc 163 _root; \
0df9dab8
SC
164 _root = tdp_mmu_next_root(_kvm, _root, _shared, false)) \
165 if (!kvm_lockdep_assert_mmu_lock_held(_kvm, _shared)) { \
441a5dfc 166 } else
d62007ed 167
226b8c8f
SC
168/*
169 * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
170 * the implication being that any flow that holds mmu_lock for read is
171 * inherently yield-friendly and should use the yield-safe variant above.
172 * Holding mmu_lock for write obviates the need for RCU protection as the list
173 * is guaranteed to be stable.
174 */
175#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
176 list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
177 if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
178 kvm_mmu_page_as_id(_root) != _as_id) { \
a3f15bda 179 } else
02c00b3a 180
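/*
 * Allocate a TDP MMU shadow page, drawing both the struct kvm_mmu_page and
 * the backing page table page from the vCPU's MMU memory caches.
 */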
a82070b6 181static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
02c00b3a
BG
182{
183 struct kvm_mmu_page *sp;
184
185 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
186 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
a82070b6
DM
187
188 return sp;
189}
190
c10743a1
SC
191static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
192 gfn_t gfn, union kvm_mmu_page_role role)
a82070b6 193{
55c510e2 194 INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
428e9216 195
02c00b3a
BG
196 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
197
a3aca4de 198 sp->role = role;
02c00b3a 199 sp->gfn = gfn;
c10743a1 200 sp->ptep = sptep;
02c00b3a
BG
201 sp->tdp_mmu_page = true;
202
33dd3574 203 trace_kvm_mmu_get_page(sp, true);
02c00b3a
BG
204}
205
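/*
 * Initialize a child shadow page for the level below the iterator's current
 * position, inheriting the parent's role with the level decremented.
 */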
a82070b6
DM
206static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
207 struct tdp_iter *iter)
02c00b3a 208{
a3aca4de 209 struct kvm_mmu_page *parent_sp;
02c00b3a 210 union kvm_mmu_page_role role;
a3aca4de
DM
211
212 parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
213
214 role = parent_sp->role;
215 role.level--;
216
c10743a1 217 tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
a3aca4de
DM
218}
219
220hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
221{
7a458f0e 222 union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
02c00b3a
BG
223 struct kvm *kvm = vcpu->kvm;
224 struct kvm_mmu_page *root;
225
6e6ec584 226 lockdep_assert_held_write(&kvm->mmu_lock);
02c00b3a 227
04dc4e6c
SC
228 /*
229 * Check for an existing root before allocating a new one. Note, the
230 * role check prevents consuming an invalid root.
231 */
a3f15bda 232 for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
fb101293 233 if (root->role.word == role.word &&
ad6d6b94 234 kvm_tdp_mmu_get_root(root))
6e6ec584 235 goto out;
02c00b3a
BG
236 }
237
a82070b6 238 root = tdp_mmu_alloc_sp(vcpu);
c10743a1 239 tdp_mmu_init_sp(root, NULL, 0, role);
a82070b6 240
edbdb43f
SC
241 /*
242 * TDP MMU roots are kept until they are explicitly invalidated, either
243 * by a memslot update or by the destruction of the VM. Initialize the
244 * refcount to two; one reference for the vCPU, and one reference for
245 * the TDP MMU itself, which is held until the root is invalidated and
0df9dab8 246 * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
edbdb43f
SC
247 */
248 refcount_set(&root->tdp_mmu_root_count, 2);
02c00b3a 249
c0e64238
BG
250 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
251 list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
252 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
02c00b3a 253
6e6ec584 254out:
02c00b3a 255 return __pa(root->spt);
fe5db27d 256}
2f2fad08
BG
257
258static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
9a77daac
BG
259 u64 old_spte, u64 new_spte, int level,
260 bool shared);
2f2fad08 261
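/*
 * Accounting helpers: keep the secondary page table statistics and the
 * TDP MMU's page count in sync as shadow pages are added and removed.
 */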
43a063ca
YA
262static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
263{
264 kvm_account_pgtable_pages((void *)sp->spt, +1);
d25ceb92 265 atomic64_inc(&kvm->arch.tdp_mmu_pages);
43a063ca
YA
266}
267
268static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
269{
270 kvm_account_pgtable_pages((void *)sp->spt, -1);
d25ceb92 271 atomic64_dec(&kvm->arch.tdp_mmu_pages);
43a063ca
YA
272}
273
a9442f59 274/**
c298a30c 275 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
a9442f59
BG
276 *
277 * @kvm: kvm instance
278 * @sp: the page to be removed
9a77daac
BG
279 * @shared: This operation may not be running under the exclusive use of
280 * the MMU lock and the operation must synchronize with other
281 * threads that might be adding or removing pages.
a9442f59 282 */
c298a30c
DM
283static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
284 bool shared)
a9442f59 285{
43a063ca 286 tdp_unaccount_mmu_page(kvm, sp);
d25ceb92
SC
287
288 if (!sp->nx_huge_page_disallowed)
289 return;
290
9a77daac
BG
291 if (shared)
292 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
293 else
294 lockdep_assert_held_write(&kvm->mmu_lock);
a9442f59 295
d25ceb92
SC
296 sp->nx_huge_page_disallowed = false;
297 untrack_possible_nx_huge_page(kvm, sp);
9a77daac
BG
298
299 if (shared)
300 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
a9442f59
BG
301}
302
a066e61f 303/**
0f53dfa3 304 * handle_removed_pt() - handle a page table removed from the TDP structure
a066e61f
BG
305 *
306 * @kvm: kvm instance
307 * @pt: the page removed from the paging structure
9a77daac
BG
308 * @shared: This operation may not be running under the exclusive use
309 * of the MMU lock and the operation must synchronize with other
310 * threads that might be modifying SPTEs.
a066e61f
BG
311 *
312 * Given a page table that has been removed from the TDP paging structure,
313 * iterates through the page table to clear SPTEs and free child page tables.
70fb3e41
BG
314 *
315 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
316 * protection. Since this thread removed it from the paging structure,
317 * this thread will be responsible for ensuring the page is freed. Hence the
318 * early rcu_dereferences in the function.
a066e61f 319 */
0f53dfa3 320static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
a066e61f 321{
70fb3e41 322 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
a066e61f 323 int level = sp->role.level;
e25f0e0c 324 gfn_t base_gfn = sp->gfn;
a066e61f
BG
325 int i;
326
327 trace_kvm_mmu_prepare_zap_page(sp);
328
c298a30c 329 tdp_mmu_unlink_sp(kvm, sp, shared);
a066e61f 330
2ca3129e 331 for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
ba3a6120 332 tdp_ptep_t sptep = pt + i;
574c3c55 333 gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
ba3a6120 334 u64 old_spte;
9a77daac
BG
335
336 if (shared) {
e25f0e0c
BG
337 /*
338 * Set the SPTE to a nonpresent value that other
339 * threads will not overwrite. If the SPTE was
340 * already marked as removed then another thread
341 * handling a page fault could overwrite it, so keep
342 * setting the SPTE until it is changed from some other
343 * value to the removed SPTE value.
344 */
345 for (;;) {
ba3a6120
SC
346 old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
347 if (!is_removed_spte(old_spte))
e25f0e0c
BG
348 break;
349 cpu_relax();
350 }
9a77daac 351 } else {
8df9f1af
SC
352 /*
353 * If the SPTE is not MMU-present, there is no backing
354 * page associated with the SPTE and so no side effects
355 * that need to be recorded, and exclusive ownership of
356 * mmu_lock ensures the SPTE can't be made present.
357 * Note, zapping MMIO SPTEs is also unnecessary as they
358 * are guarded by the memslots generation, not by being
359 * unreachable.
360 */
ba3a6120
SC
361 old_spte = kvm_tdp_mmu_read_spte(sptep);
362 if (!is_shadow_present_pte(old_spte))
8df9f1af 363 continue;
e25f0e0c
BG
364
365 /*
ba3a6120
SC
366 * Use the common helper instead of a raw WRITE_ONCE as
367 * the SPTE needs to be updated atomically if it can be
368 * modified by a different vCPU outside of mmu_lock.
369 * Even though the parent SPTE is !PRESENT, the TLB
370 * hasn't yet been flushed, and both Intel and AMD
371 * document that A/D assists can use upper-level PxE
372 * entries that are cached in the TLB, i.e. the CPU can
373 * still access the page and mark it dirty.
374 *
375 * No retry is needed in the atomic update path as the
376 * sole concern is dropping a Dirty bit, i.e. no other
377 * task can zap/remove the SPTE as mmu_lock is held for
378 * write. Marking the SPTE as a removed SPTE is not
379 * strictly necessary for the same reason, but using
380 * the removed SPTE value keeps the shared/exclusive
381 * paths consistent and allows the handle_changed_spte()
382 * call below to hardcode the new value to REMOVED_SPTE.
383 *
384 * Note, even though dropping a Dirty bit is the only
385 * scenario where a non-atomic update could result in a
386 * functional bug, simply checking the Dirty bit isn't
387 * sufficient as a fast page fault could read the upper
388 * level SPTE before it is zapped, and then make this
389 * target SPTE writable, resume the guest, and set the
390 * Dirty bit between reading the SPTE above and writing
391 * it here.
e25f0e0c 392 */
ba3a6120
SC
393 old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
394 REMOVED_SPTE, level);
9a77daac 395 }
e25f0e0c 396 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
ba3a6120 397 old_spte, REMOVED_SPTE, level, shared);
a066e61f
BG
398 }
399
7cca2d0b 400 call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
a066e61f
BG
401}
402
2f2fad08 403/**
40fa907e 404 * handle_changed_spte - handle bookkeeping associated with an SPTE change
2f2fad08
BG
405 * @kvm: kvm instance
406 * @as_id: the address space of the paging structure the SPTE was a part of
407 * @gfn: the base GFN that was mapped by the SPTE
408 * @old_spte: The value of the SPTE before the change
409 * @new_spte: The value of the SPTE after the change
410 * @level: the level of the PT the SPTE is part of in the paging structure
9a77daac
BG
411 * @shared: This operation may not be running under the exclusive use of
412 * the MMU lock and the operation must synchronize with other
413 * threads that might be modifying SPTEs.
2f2fad08 414 *
1f997345
VS
415 * Handle bookkeeping that might result from the modification of a SPTE. Note,
416 * dirty logging updates are handled in common code, not here (see make_spte()
417 * and fast_pf_fix_direct_spte()).
2f2fad08 418 */
40fa907e
VS
419static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
420 u64 old_spte, u64 new_spte, int level,
421 bool shared)
2f2fad08
BG
422{
423 bool was_present = is_shadow_present_pte(old_spte);
424 bool is_present = is_shadow_present_pte(new_spte);
425 bool was_leaf = was_present && is_last_spte(old_spte, level);
426 bool is_leaf = is_present && is_last_spte(new_spte, level);
427 bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
2f2fad08 428
20ba462d
SC
429 WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
430 WARN_ON_ONCE(level < PG_LEVEL_4K);
431 WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
2f2fad08
BG
432
433 /*
434 * If this warning were to trigger it would indicate that there was a
435 * missing MMU notifier or a race with some notifier handler.
436 * A present, leaf SPTE should never be directly replaced with another
d9f6e12f 437 * present leaf SPTE pointing to a different PFN. A notifier handler
2f2fad08
BG
438 * should be zapping the SPTE before the main MM's page table is
439 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
440 * thread before replacement.
441 */
442 if (was_leaf && is_leaf && pfn_changed) {
443 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
444 "SPTE with another present leaf SPTE mapping a\n"
445 "different PFN!\n"
446 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
447 as_id, gfn, old_spte, new_spte, level);
448
449 /*
450 * Crash the host to prevent error propagation and guest data
d9f6e12f 451 * corruption.
2f2fad08
BG
452 */
453 BUG();
454 }
455
456 if (old_spte == new_spte)
457 return;
458
b9a98c34
BG
459 trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
460
115111ef
DM
461 if (is_leaf)
462 check_spte_writable_invariants(new_spte);
463
2f2fad08
BG
464 /*
465 * The only times a SPTE should be changed from a non-present to
466 * non-present state is when an MMIO entry is installed/modified/
467 * removed. In that case, there is nothing to do here.
468 */
469 if (!was_present && !is_present) {
470 /*
08f07c80
BG
471 * If this change does not involve a MMIO SPTE or removed SPTE,
472 * it is unexpected. Log the change, though it should not
473 * impact the guest since both the former and current SPTEs
474 * are nonpresent.
2f2fad08 475 */
20ba462d
SC
476 if (WARN_ON_ONCE(!is_mmio_spte(old_spte) &&
477 !is_mmio_spte(new_spte) &&
478 !is_removed_spte(new_spte)))
2f2fad08
BG
479 pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
480 "should not be replaced with another,\n"
481 "different nonpresent SPTE, unless one or both\n"
08f07c80
BG
482 "are MMIO SPTEs, or the new SPTE is\n"
483 "a temporary removed SPTE.\n"
2f2fad08
BG
484 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
485 as_id, gfn, old_spte, new_spte, level);
486 return;
487 }
488
71f51d2c
MZ
489 if (is_leaf != was_leaf)
490 kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
2f2fad08
BG
491
492 if (was_leaf && is_dirty_spte(old_spte) &&
64bb2769 493 (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
2f2fad08
BG
494 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
495
496 /*
497 * Recursively handle child PTs if the change removed a subtree from
c8e5a0d0
SC
498 * the paging structure. Note the WARN on the PFN changing without the
499 * SPTE being converted to a hugepage (leaf) or being zapped. Shadow
500 * pages are kernel allocations and should never be migrated.
2f2fad08 501 */
c8e5a0d0
SC
502 if (was_present && !was_leaf &&
503 (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
0f53dfa3 504 handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
2f2fad08 505
40fa907e
VS
506 if (was_leaf && is_accessed_spte(old_spte) &&
507 (!is_present || !is_accessed_spte(new_spte) || pfn_changed))
508 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
2f2fad08 509}
faaf05b0 510
9a77daac 511/*
6ccf4438
PB
512 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
513 * and handle the associated bookkeeping. Do not mark the page dirty
24ae4cfa 514 * in KVM's dirty bitmaps.
9a77daac 515 *
3255530a
DM
516 * If setting the SPTE fails because it has changed, iter->old_spte will be
517 * refreshed to the current value of the spte.
518 *
9a77daac
BG
519 * @kvm: kvm instance
520 * @iter: a tdp_iter instance currently on the SPTE that should be set
521 * @new_spte: The value the SPTE should be set to
3e72c791
DM
522 * Return:
523 * * 0 - If the SPTE was set.
524 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
525 * no side-effects other than setting iter->old_spte to the last
526 * known value of the spte.
9a77daac 527 */
3e72c791
DM
528static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
529 struct tdp_iter *iter,
530 u64 new_spte)
9a77daac 531{
3255530a 532 u64 *sptep = rcu_dereference(iter->sptep);
3255530a 533
08f07c80 534 /*
396fd74d
SC
535 * The caller is responsible for ensuring the old SPTE is not a REMOVED
536 * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE,
537 * and pre-checking before inserting a new SPTE is advantageous as it
538 * avoids unnecessary work.
08f07c80 539 */
396fd74d
SC
540 WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
541
542 lockdep_assert_held_read(&kvm->mmu_lock);
08f07c80 543
6e8eb206
DM
544 /*
545 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
12ced095
UB
546 * does not hold the mmu_lock. On failure, i.e. if a different logical
547 * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with
548 * the current value, so the caller operates on fresh data, e.g. if it
549 * retries tdp_mmu_set_spte_atomic().
6e8eb206 550 */
aee98a68 551 if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
3e72c791 552 return -EBUSY;
9a77daac 553
40fa907e
VS
554 handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
555 new_spte, iter->level, true);
9a77daac 556
3e72c791 557 return 0;
9a77daac
BG
558}
559
3e72c791
DM
560static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
561 struct tdp_iter *iter)
08f07c80 562{
3e72c791
DM
563 int ret;
564
08f07c80
BG
565 /*
566 * Freeze the SPTE by setting it to a special,
567 * non-present value. This will stop other threads from
568 * immediately installing a present entry in its place
569 * before the TLBs are flushed.
570 */
3e72c791
DM
571 ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
572 if (ret)
573 return ret;
08f07c80 574
4ad980ae 575 kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level);
08f07c80
BG
576
577 /*
ba3a6120
SC
578 * No other thread can overwrite the removed SPTE as they must either
579 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
580 * overwrite the special removed SPTE value. No bookkeeping is needed
581 * here since the SPTE is going from non-present to non-present. Use
582 * the raw write helper to avoid an unnecessary check on volatile bits.
08f07c80 583 */
ba3a6120 584 __kvm_tdp_mmu_write_spte(iter->sptep, 0);
08f07c80 585
3e72c791 586 return 0;
08f07c80
BG
587}
588
9a77daac 589
fe43fa2f 590/*
0b7cc254 591 * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
626808d1
SC
592 * @kvm: KVM instance
593 * @as_id: Address space ID, i.e. regular vs. SMM
594 * @sptep: Pointer to the SPTE
595 * @old_spte: The current value of the SPTE
596 * @new_spte: The new value that will be set for the SPTE
597 * @gfn: The base GFN that was (or will be) mapped by the SPTE
598 * @level: The level _containing_ the SPTE (its parent PT's level)
ba3a6120
SC
599 *
600 * Returns the old SPTE value, which _may_ be different than @old_spte if the
601 * SPTE had volatile bits.
fe43fa2f 602 */
0b7cc254
VS
603static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
604 u64 old_spte, u64 new_spte, gfn_t gfn, int level)
faaf05b0 605{
531810ca 606 lockdep_assert_held_write(&kvm->mmu_lock);
3a9a4aa5 607
08f07c80 608 /*
966da62a 609 * No thread should be using this function to set SPTEs to or from the
08f07c80
BG
610 * temporary removed SPTE value.
611 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
612 * should be used. If operating under the MMU lock in write mode, the
613 * use of the removed SPTE should not be necessary.
614 */
20ba462d 615 WARN_ON_ONCE(is_removed_spte(old_spte) || is_removed_spte(new_spte));
08f07c80 616
ba3a6120 617 old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
626808d1 618
40fa907e 619 handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
ba3a6120 620 return old_spte;
626808d1
SC
621}
622
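/*
 * Iterator-based wrapper around tdp_mmu_set_spte(); refreshes iter->old_spte
 * with the value returned by the write so the caller sees fresh data.
 */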
0b7cc254
VS
623static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
624 u64 new_spte)
626808d1
SC
625{
626 WARN_ON_ONCE(iter->yielded);
0b7cc254
VS
627 iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
628 iter->old_spte, new_spte,
629 iter->gfn, iter->level);
f8e14497 630}
faaf05b0 631
faaf05b0 632#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
77aa6075 633 for_each_tdp_pte(_iter, _root, _start, _end)
faaf05b0 634
f8e14497
BG
635#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
636 tdp_root_for_each_pte(_iter, _root, _start, _end) \
637 if (!is_shadow_present_pte(_iter.old_spte) || \
638 !is_last_spte(_iter.old_spte, _iter.level)) \
639 continue; \
640 else
641
bb18842e 642#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
c5f2d564 643 for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end)
bb18842e 644
e28a436c
BG
645/*
646 * Yield if the MMU lock is contended or this thread needs to return control
647 * to the scheduler.
648 *
e139a34e
BG
649 * If this function should yield and flush is set, it will perform a remote
650 * TLB flush before yielding.
651 *
3a0f64de
SC
652 * If this function yields, iter->yielded is set and the caller must skip to
653 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
654 * over the paging structures to allow the iterator to continue its traversal
655 * from the paging structure root.
e28a436c 656 *
3a0f64de 657 * Returns true if this function yielded.
e28a436c 658 */
3a0f64de
SC
659static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
660 struct tdp_iter *iter,
661 bool flush, bool shared)
a6a0b05d 662{
20ba462d 663 WARN_ON_ONCE(iter->yielded);
3a0f64de 664
ed5e484b
BG
665 /* Ensure forward progress has been made before yielding. */
666 if (iter->next_last_level_gfn == iter->yielded_gfn)
667 return false;
668
531810ca 669 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
e139a34e
BG
670 if (flush)
671 kvm_flush_remote_tlbs(kvm);
672
bd296779
SC
673 rcu_read_unlock();
674
6103bc07
BG
675 if (shared)
676 cond_resched_rwlock_read(&kvm->mmu_lock);
677 else
678 cond_resched_rwlock_write(&kvm->mmu_lock);
679
7cca2d0b 680 rcu_read_lock();
ed5e484b 681
20ba462d 682 WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);
ed5e484b 683
3a0f64de 684 iter->yielded = true;
a6a0b05d 685 }
e28a436c 686
3a0f64de 687 return iter->yielded;
a6a0b05d
BG
688}
689
86931ff7 690static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
e2b5b21d
SC
691{
692 /*
86931ff7
SC
693 * Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with
694 * a gpa range that would exceed the max gfn, and KVM does not create
695 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
696 * the slow emulation path every time.
e2b5b21d 697 */
86931ff7 698 return kvm_mmu_max_gfn() + 1;
e2b5b21d
SC
699}
700
1b6043e8
SC
701static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
702 bool shared, int zap_level)
e2b5b21d 703{
e2b5b21d
SC
704 struct tdp_iter iter;
705
86931ff7 706 gfn_t end = tdp_mmu_max_gfn_exclusive();
e2b5b21d
SC
707 gfn_t start = 0;
708
1b6043e8
SC
709 for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
710retry:
711 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
712 continue;
713
714 if (!is_shadow_present_pte(iter.old_spte))
715 continue;
716
717 if (iter.level > zap_level)
718 continue;
719
720 if (!shared)
0b7cc254 721 tdp_mmu_iter_set_spte(kvm, &iter, 0);
1b6043e8
SC
722 else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
723 goto retry;
724 }
725}
726
727static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
728 bool shared)
729{
730
8351779c
PB
731 /*
732 * The root must have an elevated refcount so that it's reachable via
733 * mmu_notifier callbacks, which allows this path to yield and drop
734 * mmu_lock. When handling an unmap/release mmu_notifier command, KVM
735 * must drop all references to relevant pages prior to completing the
736 * callback. Dropping mmu_lock with an unreachable root would result
737 * in zapping SPTEs after a relevant mmu_notifier callback completes
738 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
739 * dirty accessed bits to the SPTE's associated struct page.
740 */
741 WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
742
e2b5b21d
SC
743 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
744
745 rcu_read_lock();
746
747 /*
1b6043e8
SC
748 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
749 * split the zap into two passes. On the first pass, zap at the 1gb
750 * level, and then zap top-level SPs on the second pass. "1gb" is not
751 * arbitrary, as KVM must be able to zap a 1gb shadow page without
752 * inducing a stall to allow in-place replacement with a 1gb hugepage.
753 *
754 * Because zapping a SP recurses on its children, stepping down to
755 * PG_LEVEL_4K in the iterator itself is unnecessary.
e2b5b21d 756 */
1b6043e8
SC
757 __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
758 __tdp_mmu_zap_root(kvm, root, shared, root->role.level);
e2b5b21d
SC
759
760 rcu_read_unlock();
761}
762
c10743a1
SC
763bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
764{
765 u64 old_spte;
766
767 /*
768 * This helper intentionally doesn't allow zapping a root shadow page,
769 * which doesn't have a parent page table and thus no associated entry.
770 */
771 if (WARN_ON_ONCE(!sp->ptep))
772 return false;
773
c10743a1 774 old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
bb95dfb9 775 if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
c10743a1 776 return false;
c10743a1 777
0b7cc254
VS
778 tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
779 sp->gfn, sp->role.level + 1);
c10743a1 780
c10743a1
SC
781 return true;
782}
783
faaf05b0 784/*
063afacd
BG
785 * If can_yield is true, will release the MMU lock and reschedule if the
786 * scheduler needs the CPU or there is contention on the MMU lock. If this
787 * function cannot yield, it will not release the MMU lock or reschedule and
788 * the caller must ensure it does not supply too large a GFN range, or the
6103bc07 789 * operation can cause a soft lockup.
faaf05b0 790 */
f47e5bbb
SC
791static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
792 gfn_t start, gfn_t end, bool can_yield, bool flush)
faaf05b0
BG
793{
794 struct tdp_iter iter;
faaf05b0 795
86931ff7 796 end = min(end, tdp_mmu_max_gfn_exclusive());
524a1e4e 797
acbda82a 798 lockdep_assert_held_write(&kvm->mmu_lock);
6103bc07 799
7cca2d0b
BG
800 rcu_read_lock();
801
f47e5bbb 802 for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
1af4a960 803 if (can_yield &&
acbda82a 804 tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
a835429c 805 flush = false;
1af4a960
BG
806 continue;
807 }
808
f47e5bbb 809 if (!is_shadow_present_pte(iter.old_spte) ||
faaf05b0
BG
810 !is_last_spte(iter.old_spte, iter.level))
811 continue;
812
0b7cc254 813 tdp_mmu_iter_set_spte(kvm, &iter, 0);
acbda82a 814 flush = true;
faaf05b0 815 }
7cca2d0b 816
fcb93eb6
PB
817 rcu_read_unlock();
818
f47e5bbb
SC
819 /*
820 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
821 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
822 */
823 return flush;
faaf05b0
BG
824}
825
826/*
7edc3a68
KH
827 * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
828 * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
829 * more SPTEs were zapped since the MMU lock was last acquired.
faaf05b0 830 */
441a5dfc 831bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
faaf05b0
BG
832{
833 struct kvm_mmu_page *root;
faaf05b0 834
0df9dab8 835 for_each_tdp_mmu_root_yield_safe(kvm, root, false)
50107e8b 836 flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
faaf05b0 837
faaf05b0
BG
838 return flush;
839}
840
841void kvm_tdp_mmu_zap_all(struct kvm *kvm)
842{
e2b5b21d 843 struct kvm_mmu_page *root;
2b9663d8 844
77c8cd6b 845 /*
22b94c4b
PB
846 * Zap all roots, including invalid roots, as all SPTEs must be dropped
847 * before returning to the caller. Zap directly even if the root is
848 * also being zapped by a worker. Walking zapped top-level SPTEs isn't
849 * all that expensive and mmu_lock is already held, which means the
850 * worker has yielded, i.e. flushing the work instead of zapping here
851 * isn't guaranteed to be any faster.
852 *
77c8cd6b
SC
853 * A TLB flush is unnecessary, KVM zaps everything if and only if the VM
854 * is being destroyed or the userspace VMM has exited. In both cases,
855 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
856 */
0df9dab8 857 for_each_tdp_mmu_root_yield_safe(kvm, root, false)
441a5dfc 858 tdp_mmu_zap_root(kvm, root, false);
faaf05b0 859}
bb18842e 860
4c6654bd 861/*
f28e9c7f 862 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
22b94c4b 863 * zap" completes.
4c6654bd
BG
864 */
865void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
866{
0df9dab8
SC
867 struct kvm_mmu_page *root;
868
869 read_lock(&kvm->mmu_lock);
870
871 for_each_tdp_mmu_root_yield_safe(kvm, root, true) {
872 if (!root->tdp_mmu_scheduled_root_to_zap)
873 continue;
874
875 root->tdp_mmu_scheduled_root_to_zap = false;
876 KVM_BUG_ON(!root->role.invalid, kvm);
877
878 /*
879 * A TLB flush is not necessary as KVM performs a local TLB
880 * flush when allocating a new root (see kvm_mmu_load()), and
881 * when migrating a vCPU to a different pCPU. Note, the local
882 * TLB flush on reuse also invalidates paging-structure-cache
883 * entries, i.e. TLB entries for intermediate paging structures,
884 * that may be zapped, as such entries are associated with the
885 * ASID on both VMX and SVM.
886 */
887 tdp_mmu_zap_root(kvm, root, true);
888
889 /*
890 * The reference needs to be put *after* zapping the root, as
891 * the root must be reachable by mmu_notifiers while it's being
892 * zapped.
893 */
894 kvm_tdp_mmu_put_root(kvm, root, true);
895 }
896
897 read_unlock(&kvm->mmu_lock);
faaf05b0 898}
bb18842e 899
b7cccd39 900/*
f28e9c7f 901 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
22b94c4b 902 * is about to be zapped, e.g. in response to a memslots update. The actual
0df9dab8
SC
903 * zapping is done separately so that it happens with mmu_lock held for read,
904 * whereas invalidating roots must be done with mmu_lock held for write (unless
905 * the VM is being destroyed).
4c6654bd 906 *
0df9dab8 907 * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
edbdb43f 908 * See kvm_tdp_mmu_get_vcpu_root_hpa().
b7cccd39
BG
909 */
910void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
911{
912 struct kvm_mmu_page *root;
913
edbdb43f
SC
914 /*
915 * mmu_lock must be held for write to ensure that a root doesn't become
916 * invalid while there are active readers (invalidating a root while
917 * there are active readers may or may not be problematic in practice,
918 * but it's uncharted territory and not supported).
919 *
920 * Waive the assertion if there are no users of @kvm, i.e. the VM is
921 * being destroyed after all references have been put, or if no vCPUs
922 * have been created (which means there are no roots), i.e. the VM is
923 * being destroyed in an error path of KVM_CREATE_VM.
924 */
925 if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
926 refcount_read(&kvm->users_count) && kvm->created_vcpus)
927 lockdep_assert_held_write(&kvm->mmu_lock);
928
929 /*
930 * As above, mmu_lock isn't held when destroying the VM! There can't
931 * be other references to @kvm, i.e. nothing else can invalidate roots
0df9dab8 932 * or get/put references to roots.
edbdb43f 933 */
0df9dab8
SC
934 list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
935 /*
936 * Note, invalid roots can outlive a memslot update! Invalid
937 * roots must be *zapped* before the memslot update completes,
938 * but a different task can acquire a reference and keep the
940 * root alive after it's been zapped.
940 */
edbdb43f 941 if (!root->role.invalid) {
0df9dab8 942 root->tdp_mmu_scheduled_root_to_zap = true;
4c6654bd 943 root->role.invalid = true;
22b94c4b 944 }
f28e9c7f 945 }
b7cccd39
BG
946}
947
bb18842e
BG
948/*
949 * Installs a last-level SPTE to handle a TDP page fault.
950 * (NPT/EPT violation/misconfiguration)
951 */
cdc47767
PB
952static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
953 struct kvm_page_fault *fault,
954 struct tdp_iter *iter)
bb18842e 955{
c435d4b7 956 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
bb18842e 957 u64 new_spte;
57a3e96d 958 int ret = RET_PF_FIXED;
ad67e480 959 bool wrprot = false;
bb18842e 960
50a9ac25
SC
961 if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
962 return RET_PF_RETRY;
963
e710c5f6 964 if (unlikely(!fault->slot))
bb18842e 965 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
9a77daac 966 else
53597858 967 wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
2839180c 968 fault->pfn, iter->old_spte, fault->prefetch, true,
7158bee4 969 fault->map_writable, &new_spte);
bb18842e
BG
970
971 if (new_spte == iter->old_spte)
972 ret = RET_PF_SPURIOUS;
3e72c791 973 else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
9a77daac 974 return RET_PF_RETRY;
bb95dfb9
SC
975 else if (is_shadow_present_pte(iter->old_spte) &&
976 !is_last_spte(iter->old_spte, iter->level))
1e203847 977 kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);
bb18842e
BG
978
979 /*
980 * If the page fault was caused by a write but the page is write
981 * protected, emulation is needed. If the emulation was skipped,
982 * the vCPU would have the same fault again.
983 */
ad67e480 984 if (wrprot) {
cdc47767 985 if (fault->write)
bb18842e 986 ret = RET_PF_EMULATE;
bb18842e
BG
987 }
988
989 /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
9a77daac 990 if (unlikely(is_mmio_spte(new_spte))) {
1075d41e 991 vcpu->stat.pf_mmio_spte_created++;
9a77daac
BG
992 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
993 new_spte);
bb18842e 994 ret = RET_PF_EMULATE;
3849e092 995 } else {
9a77daac
BG
996 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
997 rcu_dereference(iter->sptep));
3849e092 998 }
bb18842e 999
bb18842e
BG
1000 return ret;
1001}
1002
7b7e1ab6 1003/*
cb00a70b
DM
1004 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
1005 * provided page table.
7b7e1ab6
DM
1006 *
1007 * @kvm: kvm instance
1008 * @iter: a tdp_iter instance currently on the SPTE that should be set
1009 * @sp: The new TDP page table to install.
cb00a70b 1010 * @shared: This operation is running under the MMU lock in read mode.
7b7e1ab6
DM
1011 *
1012 * Returns: 0 if the new page table was installed. Non-0 if the page table
1013 * could not be installed (e.g. the atomic compare-exchange failed).
1014 */
cb00a70b 1015static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
61f94478 1016 struct kvm_mmu_page *sp, bool shared)
7b7e1ab6 1017{
54275f74 1018 u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
cb00a70b 1019 int ret = 0;
7b7e1ab6 1020
cb00a70b
DM
1021 if (shared) {
1022 ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
1023 if (ret)
1024 return ret;
1025 } else {
0b7cc254 1026 tdp_mmu_iter_set_spte(kvm, iter, spte);
cb00a70b 1027 }
7b7e1ab6 1028
43a063ca 1029 tdp_account_mmu_page(kvm, sp);
7b7e1ab6
DM
1030
1031 return 0;
1032}
1033
c4b33d28
DM
1034static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1035 struct kvm_mmu_page *sp, bool shared);
1036
bb18842e
BG
1037/*
1038 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1039 * page tables and SPTEs to translate the faulting guest physical address.
1040 */
2f6305dd 1041int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
bb18842e 1042{
bb18842e 1043 struct kvm_mmu *mmu = vcpu->arch.mmu;
61f94478 1044 struct kvm *kvm = vcpu->kvm;
bb18842e 1045 struct tdp_iter iter;
89c0fd49 1046 struct kvm_mmu_page *sp;
63d28a25 1047 int ret = RET_PF_RETRY;
bb18842e 1048
73a3c659 1049 kvm_mmu_hugepage_adjust(vcpu, fault);
bb18842e 1050
f0066d94 1051 trace_kvm_mmu_spte_requested(fault);
7cca2d0b
BG
1052
1053 rcu_read_lock();
1054
2f6305dd 1055 tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
63d28a25
PB
1056 int r;
1057
73a3c659 1058 if (fault->nx_huge_page_workaround_enabled)
536f0e6a 1059 disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
bb18842e 1060
c4b33d28
DM
1061 /*
1062 * If SPTE has been frozen by another thread, just give up and
1063 * retry, avoiding unnecessary page table allocation and free.
1064 */
1065 if (is_removed_spte(iter.old_spte))
63d28a25
PB
1066 goto retry;
1067
f5d16bb9 1068 if (iter.level == fault->goal_level)
80a3e4ae 1069 goto map_target_level;
f5d16bb9 1070
63d28a25
PB
1071 /* Step down into the lower level page table if it exists. */
1072 if (is_shadow_present_pte(iter.old_spte) &&
1073 !is_large_pte(iter.old_spte))
1074 continue;
bb18842e 1075
c4b33d28
DM
1076 /*
1077 * The SPTE is either non-present or points to a huge page that
1078 * needs to be split.
1079 */
1080 sp = tdp_mmu_alloc_sp(vcpu);
1081 tdp_mmu_init_child_sp(sp, &iter);
ff76d506 1082
c4b33d28 1083 sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
a82070b6 1084
c4b33d28 1085 if (is_shadow_present_pte(iter.old_spte))
63d28a25 1086 r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
c4b33d28 1087 else
63d28a25 1088 r = tdp_mmu_link_sp(kvm, &iter, sp, true);
61f94478 1089
63d28a25 1090 /*
80a3e4ae
SC
1091 * Force the guest to retry if installing an upper level SPTE
1092 * failed, e.g. because a different task modified the SPTE.
63d28a25
PB
1093 */
1094 if (r) {
c4b33d28 1095 tdp_mmu_free_sp(sp);
63d28a25 1096 goto retry;
c4b33d28 1097 }
61f94478 1098
c4b33d28
DM
1099 if (fault->huge_page_disallowed &&
1100 fault->req_level >= iter.level) {
1101 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
21a36ac6
SC
1102 if (sp->nx_huge_page_disallowed)
1103 track_possible_nx_huge_page(kvm, sp);
c4b33d28 1104 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
bb18842e
BG
1105 }
1106 }
1107
80a3e4ae
SC
1108 /*
1109 * The walk aborted before reaching the target level, e.g. because the
1110 * iterator detected an upper level SPTE was frozen during traversal.
1111 */
1112 WARN_ON_ONCE(iter.level == fault->goal_level);
1113 goto retry;
1114
1115map_target_level:
cdc47767 1116 ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
bb18842e 1117
63d28a25
PB
1118retry:
1119 rcu_read_unlock();
bb18842e
BG
1120 return ret;
1121}
063afacd 1122
3039bcc7
SC
1123bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1124 bool flush)
063afacd 1125{
50107e8b
SC
1126 struct kvm_mmu_page *root;
1127
441a5dfc 1128 __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false, false)
50107e8b
SC
1129 flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
1130 range->may_block, flush);
1131
1132 return flush;
063afacd
BG
1133}
1134
3039bcc7
SC
1135typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1136 struct kvm_gfn_range *range);
063afacd 1137
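/*
 * Invoke the handler on every leaf SPTE of every root in the range's address
 * space, returning true if any handler reported a change.
 */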
3039bcc7
SC
1138static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1139 struct kvm_gfn_range *range,
1140 tdp_handler_t handler)
063afacd 1141{
3039bcc7
SC
1142 struct kvm_mmu_page *root;
1143 struct tdp_iter iter;
1144 bool ret = false;
1145
e1eed584
SC
1146 /*
1147 * Don't support rescheduling, none of the MMU notifiers that funnel
1148 * into this helper allow blocking; it'd be dead, wasteful code.
1149 */
3039bcc7 1150 for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
a151acec
SC
1151 rcu_read_lock();
1152
3039bcc7
SC
1153 tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1154 ret |= handler(kvm, &iter, range);
3039bcc7 1155
a151acec
SC
1156 rcu_read_unlock();
1157 }
3039bcc7
SC
1158
1159 return ret;
063afacd 1160}
f8e14497
BG
1161
1162/*
1163 * Mark the SPTEs mapping GFNs in the range [start, end) unaccessed and return non-zero
1164 * if any of the GFNs in the range have been accessed.
7ee131e3
VS
1165 *
1166 * No need to mark the corresponding PFN as accessed as this call is coming
1167 * from the clear_young() or clear_flush_young() notifier, which uses the
1168 * return value to determine if the page has been accessed.
f8e14497 1169 */
3039bcc7
SC
1170static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1171 struct kvm_gfn_range *range)
f8e14497 1172{
7ee131e3 1173 u64 new_spte;
f8e14497 1174
3039bcc7
SC
1175 /* If we have a non-accessed entry we don't need to change the pte. */
1176 if (!is_accessed_spte(iter->old_spte))
1177 return false;
7cca2d0b 1178
7ee131e3
VS
1179 if (spte_ad_enabled(iter->old_spte)) {
1180 iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
1181 iter->old_spte,
1182 shadow_accessed_mask,
1183 iter->level);
1184 new_spte = iter->old_spte & ~shadow_accessed_mask;
3039bcc7 1185 } else {
f8e14497 1186 /*
3039bcc7
SC
1187 * Capture the dirty status of the page, so that it doesn't get
1188 * lost when the SPTE is marked for access tracking.
f8e14497 1189 */
7ee131e3
VS
1190 if (is_writable_pte(iter->old_spte))
1191 kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte));
f8e14497 1192
7ee131e3
VS
1193 new_spte = mark_spte_for_access_track(iter->old_spte);
1194 iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
1195 iter->old_spte, new_spte,
1196 iter->level);
f8e14497
BG
1197 }
1198
891f1159
VS
1199 trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
1200 iter->old_spte, new_spte);
3039bcc7 1201 return true;
f8e14497
BG
1202}
1203
3039bcc7 1204bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
f8e14497 1205{
3039bcc7 1206 return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
f8e14497
BG
1207}
1208
3039bcc7
SC
1209static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1210 struct kvm_gfn_range *range)
f8e14497 1211{
3039bcc7 1212 return is_accessed_spte(iter->old_spte);
f8e14497
BG
1213}
1214
3039bcc7 1215bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
f8e14497 1216{
3039bcc7 1217 return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
f8e14497 1218}
1d8dd6b3 1219
3039bcc7
SC
1220static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1221 struct kvm_gfn_range *range)
1d8dd6b3 1222{
1d8dd6b3 1223 u64 new_spte;
7cca2d0b 1224
3039bcc7 1225 /* Huge pages aren't expected to be modified without first being zapped. */
20ba462d 1226 WARN_ON_ONCE(pte_huge(range->arg.pte) || range->start + 1 != range->end);
1d8dd6b3 1227
3039bcc7
SC
1228 if (iter->level != PG_LEVEL_4K ||
1229 !is_shadow_present_pte(iter->old_spte))
1230 return false;
1d8dd6b3 1231
3039bcc7
SC
1232 /*
1233 * Note, when changing a read-only SPTE, it's not strictly necessary to
1234 * zero the SPTE before setting the new PFN, but doing so preserves the
1235 * invariant that the PFN of a present leaf SPTE can never change.
40fa907e 1236 * See handle_changed_spte().
3039bcc7 1237 */
0b7cc254 1238 tdp_mmu_iter_set_spte(kvm, iter, 0);
1d8dd6b3 1239
3e1efe2b 1240 if (!pte_write(range->arg.pte)) {
3039bcc7 1241 new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
3e1efe2b 1242 pte_pfn(range->arg.pte));
1d8dd6b3 1243
0b7cc254 1244 tdp_mmu_iter_set_spte(kvm, iter, new_spte);
1d8dd6b3
BG
1245 }
1246
3039bcc7 1247 return true;
1d8dd6b3
BG
1248}
1249
3039bcc7
SC
1250/*
1251 * Handle the changed_pte MMU notifier for the TDP MMU.
1252 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1253 * notifier.
1254 * Returns non-zero if a flush is needed before releasing the MMU lock.
1255 */
1256bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1d8dd6b3 1257{
93fa50f6
SC
1258 /*
1259 * No need to handle the remote TLB flush under RCU protection, the
1260 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
40fa907e 1261 * shadow page. See the WARN on pfn_changed in handle_changed_spte().
93fa50f6
SC
1262 */
1263 return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1d8dd6b3
BG
1264}
1265
a6a0b05d 1266/*
bedd9195
DM
1267 * Remove write access from all SPTEs at or above min_level that map GFNs
1268 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1269 * be flushed.
a6a0b05d
BG
1270 */
1271static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1272 gfn_t start, gfn_t end, int min_level)
1273{
1274 struct tdp_iter iter;
1275 u64 new_spte;
1276 bool spte_set = false;
1277
7cca2d0b
BG
1278 rcu_read_lock();
1279
a6a0b05d
BG
1280 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1281
77aa6075 1282 for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
24ae4cfa
BG
1283retry:
1284 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1af4a960
BG
1285 continue;
1286
a6a0b05d 1287 if (!is_shadow_present_pte(iter.old_spte) ||
0f99ee2c
BG
1288 !is_last_spte(iter.old_spte, iter.level) ||
1289 !(iter.old_spte & PT_WRITABLE_MASK))
a6a0b05d
BG
1290 continue;
1291
1292 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1293
3e72c791 1294 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
24ae4cfa 1295 goto retry;
3255530a 1296
a6a0b05d 1297 spte_set = true;
a6a0b05d 1298 }
7cca2d0b
BG
1299
1300 rcu_read_unlock();
a6a0b05d
BG
1301 return spte_set;
1302}
1303
1304/*
1305 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1306 * only affect leaf SPTEs down to min_level.
1307 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1308 */
269e9552
HM
1309bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1310 const struct kvm_memory_slot *slot, int min_level)
a6a0b05d
BG
1311{
1312 struct kvm_mmu_page *root;
a6a0b05d
BG
1313 bool spte_set = false;
1314
24ae4cfa 1315 lockdep_assert_held_read(&kvm->mmu_lock);
a6a0b05d 1316
d62007ed 1317 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
a6a0b05d
BG
1318 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1319 slot->base_gfn + slot->npages, min_level);
a6a0b05d
BG
1320
1321 return spte_set;
1322}
1323
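/*
 * Allocate a shadow page for eager huge page splitting. Unlike the fault
 * path there are no per-vCPU caches to draw from, so allocate the struct
 * and the page table page directly with the requested GFP flags.
 */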
a3fe5dbd
DM
1324static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
1325{
1326 struct kvm_mmu_page *sp;
1327
1328 gfp |= __GFP_ZERO;
1329
1330 sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
1331 if (!sp)
1332 return NULL;
1333
1334 sp->spt = (void *)__get_free_page(gfp);
1335 if (!sp->spt) {
1336 kmem_cache_free(mmu_page_header_cache, sp);
1337 return NULL;
1338 }
1339
1340 return sp;
1341}
1342
1343static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
cb00a70b
DM
1344 struct tdp_iter *iter,
1345 bool shared)
a3fe5dbd
DM
1346{
1347 struct kvm_mmu_page *sp;
1348
a3fe5dbd
DM
1349 /*
1350 * Since we are allocating while under the MMU lock we have to be
1351 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
1352 * reclaim and to avoid making any filesystem callbacks (which can end
1353 * up invoking KVM MMU notifiers, resulting in a deadlock).
1354 *
1355 * If this allocation fails we drop the lock and retry with reclaim
1356 * allowed.
1357 */
1358 sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
1359 if (sp)
1360 return sp;
1361
1362 rcu_read_unlock();
cb00a70b
DM
1363
1364 if (shared)
1365 read_unlock(&kvm->mmu_lock);
1366 else
1367 write_unlock(&kvm->mmu_lock);
a3fe5dbd
DM
1368
1369 iter->yielded = true;
1370 sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
1371
cb00a70b
DM
1372 if (shared)
1373 read_lock(&kvm->mmu_lock);
1374 else
1375 write_lock(&kvm->mmu_lock);
1376
a3fe5dbd
DM
1377 rcu_read_lock();
1378
1379 return sp;
1380}
1381
c4b33d28 1382/* Note, the caller is responsible for initializing @sp. */
cb00a70b
DM
1383static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1384 struct kvm_mmu_page *sp, bool shared)
a3fe5dbd
DM
1385{
1386 const u64 huge_spte = iter->old_spte;
1387 const int level = iter->level;
1388 int ret, i;
1389
a3fe5dbd
DM
1390 /*
1391 * No need for atomics when writing to sp->spt since the page table has
1392 * not been linked in yet and thus is not reachable from any other CPU.
1393 */
2ca3129e 1394 for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
47855da0 1395 sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i);
a3fe5dbd
DM
1396
1397 /*
1398 * Replace the huge spte with a pointer to the populated lower level
1399 * page table. Since we are making this change without a TLB flush vCPUs
1400 * will see a mix of the split mappings and the original huge mapping,
1401 * depending on what's currently in their TLB. This is fine from a
1402 * correctness standpoint since the translation will be the same either
1403 * way.
1404 */
61f94478 1405 ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
a3fe5dbd 1406 if (ret)
e0b728b1 1407 goto out;
a3fe5dbd
DM
1408
1409 /*
1410 * tdp_mmu_link_sp() will handle subtracting the huge page we
1411 * are overwriting from the page stats. But we have to manually update
1412 * the page stats with the new present child pages.
1413 */
2ca3129e 1414 kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);
a3fe5dbd 1415
e0b728b1
DM
1416out:
1417 trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1418 return ret;
a3fe5dbd
DM
1419}
1420
1421static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1422 struct kvm_mmu_page *root,
1423 gfn_t start, gfn_t end,
cb00a70b 1424 int target_level, bool shared)
a3fe5dbd
DM
1425{
1426 struct kvm_mmu_page *sp = NULL;
1427 struct tdp_iter iter;
1428 int ret = 0;
1429
1430 rcu_read_lock();
1431
1432 /*
1433 * Traverse the page table splitting all huge pages above the target
1434 * level into one lower level. For example, if we encounter a 1GB page
1435 * we split it into 512 2MB pages.
1436 *
1437 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1438 * to visit an SPTE before ever visiting its children, which means we
1439 * will correctly recursively split huge pages that are more than one
1440 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
1441 * and then splitting each of those to 512 4KB pages).
1442 */
1443 for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
1444retry:
cb00a70b 1445 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
a3fe5dbd
DM
1446 continue;
1447
1448 if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1449 continue;
1450
1451 if (!sp) {
cb00a70b 1452 sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
a3fe5dbd
DM
1453 if (!sp) {
1454 ret = -ENOMEM;
e0b728b1
DM
1455 trace_kvm_mmu_split_huge_page(iter.gfn,
1456 iter.old_spte,
1457 iter.level, ret);
a3fe5dbd
DM
1458 break;
1459 }
1460
1461 if (iter.yielded)
1462 continue;
1463 }
1464
c4b33d28
DM
1465 tdp_mmu_init_child_sp(sp, &iter);
1466
cb00a70b 1467 if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
a3fe5dbd
DM
1468 goto retry;
1469
1470 sp = NULL;
1471 }
1472
1473 rcu_read_unlock();
1474
1475 /*
1476 * It's possible to exit the loop having never used the last sp if, for
1477 * example, a vCPU doing HugePage NX splitting wins the race and
1478 * installs its own sp in place of the last sp we tried to split.
1479 */
1480 if (sp)
1481 tdp_mmu_free_sp(sp);
1482
a3fe5dbd
DM
1483 return ret;
1484}
1485
cb00a70b 1486
a3fe5dbd
DM
1487/*
1488 * Try to split all huge pages mapped by the TDP MMU down to the target level.
1489 */
1490void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1491 const struct kvm_memory_slot *slot,
1492 gfn_t start, gfn_t end,
cb00a70b 1493 int target_level, bool shared)
a3fe5dbd
DM
1494{
1495 struct kvm_mmu_page *root;
1496 int r = 0;
1497
cb00a70b 1498 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
a3fe5dbd 1499
7c554d8e 1500 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
cb00a70b 1501 r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
a3fe5dbd 1502 if (r) {
cb00a70b 1503 kvm_tdp_mmu_put_root(kvm, root, shared);
a3fe5dbd
DM
1504 break;
1505 }
1506 }
1507}
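
/*
 * As an illustrative (hypothetical) usage sketch: a dirty-logging path that
 * already holds mmu_lock for read could eagerly split an entire memslot down
 * to 4KB mappings before write-protecting it, e.g.:
 *
 *	read_lock(&kvm->mmu_lock);
 *	kvm_tdp_mmu_try_split_huge_pages(kvm, slot, slot->base_gfn,
 *					 slot->base_gfn + slot->npages,
 *					 PG_LEVEL_4K, true);
 *	read_unlock(&kvm->mmu_lock);
 */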
1508
a6a0b05d
BG
1509/*
1510 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1511 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1512 * If AD bits are not enabled, this will require clearing the writable bit on
1513 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1514 * be flushed.
1515 */
1516static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1517 gfn_t start, gfn_t end)
1518{
697c89be 1519 u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK;
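	/*
	 * Without A/D bits, dirty state is tracked via write protection:
	 * clearing PT_WRITABLE_MASK forces the next guest write to fault, and
	 * the write fault marks the page dirty again.
	 */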
a6a0b05d 1520 struct tdp_iter iter;
a6a0b05d
BG
1521 bool spte_set = false;
1522
7cca2d0b
BG
1523 rcu_read_lock();
1524
45a61ebb 1525 tdp_root_for_each_pte(iter, root, start, end) {
24ae4cfa 1526retry:
45a61ebb
DM
1527 if (!is_shadow_present_pte(iter.old_spte) ||
1528 !is_last_spte(iter.old_spte, iter.level))
1af4a960
BG
1529 continue;
1530
45a61ebb 1531 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
3354ef5a
SC
1532 continue;
1533
0fe6370e
SC
1534 KVM_MMU_WARN_ON(kvm_ad_enabled() &&
1535 spte_ad_need_write_protect(iter.old_spte));
5982a539 1536
697c89be
VS
1537 if (!(iter.old_spte & dbit))
1538 continue;
a6a0b05d 1539
697c89be 1540 if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
24ae4cfa 1541 goto retry;
3255530a 1542
a6a0b05d 1543 spte_set = true;
a6a0b05d 1544 }
7cca2d0b
BG
1545
1546 rcu_read_unlock();
a6a0b05d
BG
1547 return spte_set;
1548}
1549
1550/*
1551 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1552 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1553 * If AD bits are not enabled, this will require clearing the writable bit on
1554 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1555 * be flushed.
1556 */
269e9552
HM
1557bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1558 const struct kvm_memory_slot *slot)
a6a0b05d
BG
1559{
1560 struct kvm_mmu_page *root;
a6a0b05d
BG
1561 bool spte_set = false;
1562
24ae4cfa 1563 lockdep_assert_held_read(&kvm->mmu_lock);
a6a0b05d 1564
d62007ed 1565 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
a6a0b05d
BG
1566 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1567 slot->base_gfn + slot->npages);
a6a0b05d
BG
1568
1569 return spte_set;
1570}
1571
1572/*
1573 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1574 * set in mask, starting at gfn. The given memslot is expected to contain all
1575 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1576 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1577 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1578 */
1579static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1580 gfn_t gfn, unsigned long mask, bool wrprot)
1581{
697c89be
VS
1582 u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK :
1583 shadow_dirty_mask;
a6a0b05d 1584 struct tdp_iter iter;
a6a0b05d 1585
91303f80
LX
1586 lockdep_assert_held_write(&kvm->mmu_lock);
1587
7cca2d0b
BG
1588 rcu_read_lock();
1589
a6a0b05d
BG
1590 tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1591 gfn + BITS_PER_LONG) {
1592 if (!mask)
1593 break;
1594
0fe6370e
SC
1595 KVM_MMU_WARN_ON(kvm_ad_enabled() &&
1596 spte_ad_need_write_protect(iter.old_spte));
5982a539 1597
a6a0b05d
BG
1598 if (iter.level > PG_LEVEL_4K ||
1599 !(mask & (1UL << (iter.gfn - gfn))))
1600 continue;
1601
f1b3b06a
BG
1602 mask &= ~(1UL << (iter.gfn - gfn));
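		/*
		 * For example, with gfn == 0x1000 and bit 5 set in mask, the
		 * SPTE mapping gfn 0x1005 is the one processed here and bit 5
		 * is then cleared; once mask reaches zero the !mask check at
		 * the top of the loop ends the walk early.
		 */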
1603
697c89be
VS
1604 if (!(iter.old_spte & dbit))
1605 continue;
a6a0b05d 1606
89c313f2
VS
1607 iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
1608 iter.old_spte, dbit,
1609 iter.level);
1610
1e0f4298
VS
1611 trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
1612 iter.old_spte,
1613 iter.old_spte & ~dbit);
1614 kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte));
a6a0b05d 1615 }
7cca2d0b
BG
1616
1617 rcu_read_unlock();
a6a0b05d
BG
1618}
1619
1620/*
1621 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1622 * set in mask, starting at gfn. The given memslot is expected to contain all
1623 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1624 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1625 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1626 */
1627void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1628 struct kvm_memory_slot *slot,
1629 gfn_t gfn, unsigned long mask,
1630 bool wrprot)
1631{
1632 struct kvm_mmu_page *root;
a6a0b05d 1633
a3f15bda 1634 for_each_tdp_mmu_root(kvm, root, slot->as_id)
a6a0b05d 1635 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
a6a0b05d
BG
1636}
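
/*
 * A hypothetical caller re-arming dirty tracking for one word of a slot's
 * dirty bitmap (indexed by word i, with mmu_lock held for write as asserted
 * in clear_dirty_pt_masked()) might look roughly like:
 *
 *	unsigned long mask = slot->dirty_bitmap[i];
 *
 *	if (mask)
 *		kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
 *						  slot->base_gfn + i * BITS_PER_LONG,
 *						  mask, wrprot);
 */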
1637
4b85c921 1638static void zap_collapsible_spte_range(struct kvm *kvm,
14881998 1639 struct kvm_mmu_page *root,
4b85c921 1640 const struct kvm_memory_slot *slot)
14881998 1641{
9eba50f8
SC
1642 gfn_t start = slot->base_gfn;
1643 gfn_t end = start + slot->npages;
14881998 1644 struct tdp_iter iter;
5ba7c4c6 1645 int max_mapping_level;
14881998 1646
7cca2d0b
BG
1647 rcu_read_lock();
1648
85f44f8c
SC
1649 for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
1650retry:
4b85c921 1651 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1af4a960 1652 continue;
1af4a960 1653
85f44f8c
SC
1654 if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
1655 !is_shadow_present_pte(iter.old_spte))
14881998
BG
1656 continue;
1657
5ba7c4c6 1658 /*
85f44f8c
SC
 1659		 * Don't zap leaf SPTEs; if a leaf SPTE could be replaced with
 1660		 * a larger page size, then its parent would have been zapped
1661 * instead of stepping down.
5ba7c4c6 1662 */
85f44f8c 1663 if (is_last_spte(iter.old_spte, iter.level))
5ba7c4c6
BG
1664 continue;
1665
1666 /*
85f44f8c
SC
1667 * If iter.gfn resides outside of the slot, i.e. the page for
1668 * the current level overlaps but is not contained by the slot,
1669 * then the SPTE can't be made huge. More importantly, trying
1670 * to query that info from slot->arch.lpage_info will cause an
1671 * out-of-bounds access.
5ba7c4c6 1672 */
85f44f8c
SC
1673 if (iter.gfn < start || iter.gfn >= end)
1674 continue;
5ba7c4c6 1675
85f44f8c
SC
1676 max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
1677 iter.gfn, PG_LEVEL_NUM);
1678 if (max_mapping_level < iter.level)
1679 continue;
5ba7c4c6 1680
85f44f8c
SC
1681 /* Note, a successful atomic zap also does a remote TLB flush. */
1682 if (tdp_mmu_zap_spte_atomic(kvm, &iter))
1683 goto retry;
14881998
BG
1684 }
1685
7cca2d0b 1686 rcu_read_unlock();
14881998
BG
1687}
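
/*
 * For example, if dirty logging previously forced a 2MB-aligned, 2MB-sized
 * GFN range to be mapped with 512 4KB SPTEs, zap_collapsible_spte_range()
 * zaps the non-leaf SPTE pointing at that 4KB page table so the next fault
 * can re-install a single 2MB mapping.
 */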
1688
1689/*
85f44f8c
SC
1690 * Zap non-leaf SPTEs (and free their associated page tables) which could
1691 * be replaced by huge pages, for GFNs within the slot.
14881998 1692 */
4b85c921
SC
1693void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1694 const struct kvm_memory_slot *slot)
14881998
BG
1695{
1696 struct kvm_mmu_page *root;
14881998 1697
2db6f772 1698 lockdep_assert_held_read(&kvm->mmu_lock);
14881998 1699
d62007ed 1700 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
4b85c921 1701 zap_collapsible_spte_range(kvm, root, slot);
14881998 1702}
46044f72
BG
1703
1704/*
1705 * Removes write access on the last level SPTE mapping this GFN and unsets the
5fc3424f 1706 * MMU-writable bit to ensure future writes continue to be intercepted.
46044f72
BG
1707 * Returns true if an SPTE was set and a TLB flush is needed.
1708 */
1709static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
3ad93562 1710 gfn_t gfn, int min_level)
46044f72
BG
1711{
1712 struct tdp_iter iter;
1713 u64 new_spte;
1714 bool spte_set = false;
1715
3ad93562
KZ
1716 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1717
7cca2d0b
BG
1718 rcu_read_lock();
1719
77aa6075 1720 for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
3ad93562
KZ
1721 if (!is_shadow_present_pte(iter.old_spte) ||
1722 !is_last_spte(iter.old_spte, iter.level))
1723 continue;
1724
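		/*
		 * Clearing shadow_mmu_writable_mask along with the hardware
		 * writable bit should also prevent the lockless fast page
		 * fault path from restoring write access to this SPTE.
		 */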
46044f72 1725 new_spte = iter.old_spte &
5fc3424f 1726 ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
46044f72 1727
7c8a4742
DM
1728 if (new_spte == iter.old_spte)
1729 break;
1730
0b7cc254 1731 tdp_mmu_iter_set_spte(kvm, &iter, new_spte);
46044f72
BG
1732 spte_set = true;
1733 }
1734
7cca2d0b
BG
1735 rcu_read_unlock();
1736
46044f72
BG
1737 return spte_set;
1738}
1739
1740/*
1741 * Removes write access on the last level SPTE mapping this GFN and unsets the
5fc3424f 1742 * MMU-writable bit to ensure future writes continue to be intercepted.
46044f72
BG
1743 * Returns true if an SPTE was set and a TLB flush is needed.
1744 */
1745bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
3ad93562
KZ
1746 struct kvm_memory_slot *slot, gfn_t gfn,
1747 int min_level)
46044f72
BG
1748{
1749 struct kvm_mmu_page *root;
46044f72
BG
1750 bool spte_set = false;
1751
531810ca 1752 lockdep_assert_held_write(&kvm->mmu_lock);
a3f15bda 1753 for_each_tdp_mmu_root(kvm, root, slot->as_id)
3ad93562 1754 spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
a3f15bda 1755
46044f72
BG
1756 return spte_set;
1757}
1758
95fb5b02
BG
1759/*
1760 * Return the level of the lowest level SPTE added to sptes.
1761 * That SPTE may be non-present.
c5c8c7c5
DM
1762 *
1763 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
95fb5b02 1764 */
39b4d43e
SC
1765int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1766 int *root_level)
95fb5b02
BG
1767{
1768 struct tdp_iter iter;
1769 struct kvm_mmu *mmu = vcpu->arch.mmu;
95fb5b02 1770 gfn_t gfn = addr >> PAGE_SHIFT;
2aa07893 1771 int leaf = -1;
95fb5b02 1772
a972e29c 1773 *root_level = vcpu->arch.mmu->root_role.level;
95fb5b02
BG
1774
1775 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1776 leaf = iter.level;
dde81f94 1777 sptes[leaf] = iter.old_spte;
95fb5b02
BG
1778 }
1779
1780 return leaf;
1781}
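
/*
 * For example, with a 4-level root and a fully-mapped 4KB translation, the
 * walk above fills sptes[4] through sptes[1] and returns 1; if the walk stops
 * at a non-present 2MB-level entry it returns 2 with sptes[4..2] filled, and
 * -1 is returned only if no SPTE could be visited at all.
 */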
6e8eb206
DM
1782
1783/*
1784 * Returns the last level spte pointer of the shadow page walk for the given
 1785 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
1786 * walk could be performed, returns NULL and *spte does not contain valid data.
1787 *
1788 * Contract:
1789 * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1790 * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1791 *
1792 * WARNING: This function is only intended to be called during fast_page_fault.
1793 */
1794u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1795 u64 *spte)
1796{
1797 struct tdp_iter iter;
1798 struct kvm_mmu *mmu = vcpu->arch.mmu;
1799 gfn_t gfn = addr >> PAGE_SHIFT;
1800 tdp_ptep_t sptep = NULL;
1801
1802 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1803 *spte = iter.old_spte;
1804 sptep = iter.sptep;
1805 }
1806
1807 /*
1808 * Perform the rcu_dereference to get the raw spte pointer value since
1809 * we are passing it up to fast_page_fault, which is shared with the
1810 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1811 * annotation.
1812 *
1813 * This is safe since fast_page_fault obeys the contracts of this
1814 * function as well as all TDP MMU contracts around modifying SPTEs
1815 * outside of mmu_lock.
1816 */
1817 return rcu_dereference(sptep);
1818}
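
/*
 * A hypothetical lockless lookup following the contract above might look
 * roughly like:
 *
 *	u64 spte;
 *	u64 *sptep;
 *
 *	kvm_tdp_mmu_walk_lockless_begin();
 *	sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, addr, &spte);
 *	... inspect spte and, if appropriate, cmpxchg on sptep ...
 *	kvm_tdp_mmu_walk_lockless_end();
 */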