// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "mmu.h"
#include "mmu_internal.h"
#include "tdp_mmu.h"
#include "x86.h"
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
#include "page_track.h"
#include "spte.h"

#include <linux/kvm_host.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/highmem.h>
#include <linux/moduleparam.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/hugetlb.h>
#include <linux/compiler.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
#include <linux/hash.h>
#include <linux/kern_levels.h>
#include <linux/kstrtox.h>
#include <linux/kthread.h>

#include <asm/memtype.h>
#include <asm/cmpxchg.h>
#include <asm/set_memory.h>
extern bool itlb_multihit_kvm_mitigation;

static bool nx_hugepage_mitigation_hard_disabled;

int __read_mostly nx_huge_pages = -1;
static uint __read_mostly nx_huge_pages_recovery_period_ms;
#ifdef CONFIG_PREEMPT_RT
/* Recovery can cause latency spikes, disable it for PREEMPT_RT. */
static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
#else
static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
#endif

static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp);
static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp);

static const struct kernel_param_ops nx_huge_pages_ops = {
	.set = set_nx_huge_pages,
	.get = get_nx_huge_pages,
};

static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = {
	.set = set_nx_huge_pages_recovery_param,
	.get = param_get_uint,
};

module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
__MODULE_PARM_TYPE(nx_huge_pages, "bool");
module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_param_ops,
		&nx_huge_pages_recovery_ratio, 0644);
__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
module_param_cb(nx_huge_pages_recovery_period_ms, &nx_huge_pages_recovery_param_ops,
		&nx_huge_pages_recovery_period_ms, 0644);
__MODULE_PARM_TYPE(nx_huge_pages_recovery_period_ms, "uint");

static bool __read_mostly force_flush_and_sync_on_reuse;
module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
/*
 * When setting this variable to true it enables Two-Dimensional-Paging
 * where the hardware walks 2 page tables:
 * 1. the guest-virtual to guest-physical
 * 2. while doing 1. it walks guest-physical to host-physical
 * If the hardware supports that we don't need to do shadow paging.
 */
bool tdp_enabled = false;

static bool __ro_after_init tdp_mmu_allowed;

bool __read_mostly tdp_mmu_enabled = true;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444);

static int max_huge_page_level __read_mostly;
static int tdp_root_level __read_mostly;
static int max_tdp_level __read_mostly;

#define PTE_PREFETCH_NUM		8

#include <trace/events/kvm.h>

/* make pte_list_desc fit well in cache lines */
#define PTE_LIST_EXT 14
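/*
 * Sizing note (not from the original source): assuming 64-byte cache lines
 * and the layout of struct pte_list_desc below (an 8-byte ->more pointer,
 * two 32-bit counters, and 14 8-byte SPTE pointers), a descriptor occupies
 * exactly 128 bytes, i.e. two cache lines, and the metadata plus the first
 * six SPTE pointers fit in the first line.
 */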
/*
 * struct pte_list_desc is the core data structure used to implement a custom
 * list for tracking a set of related SPTEs, e.g. all the SPTEs that map a
 * given GFN when used in the context of rmaps.  Using a custom list allows KVM
 * to optimize for the common case where many GFNs will have at most a handful
 * of SPTEs pointing at them, i.e. allows packing multiple SPTEs into a small
 * memory footprint, which in turn improves runtime performance by exploiting
 * cache locality.
 *
 * A list is comprised of one or more pte_list_desc objects (descriptors).
 * Each individual descriptor stores up to PTE_LIST_EXT SPTEs.  If a descriptor
 * is full and a new SPTE needs to be added, a new descriptor is allocated and
 * becomes the head of the list.  This means that by definition, all tail
 * descriptors are full.
 *
 * Note, the meta data fields are deliberately placed at the start of the
 * structure to optimize the cacheline layout; accessing the descriptor will
 * touch only a single cacheline so long as @spte_count <= 6 (or if only the
 * descriptor's metadata is accessed).
 */
struct pte_list_desc {
	struct pte_list_desc *more;
	/* The number of PTEs stored in _this_ descriptor. */
	u32 spte_count;
	/* The number of PTEs stored in all tails of this descriptor. */
	u32 tail_count;
	u64 *sptes[PTE_LIST_EXT];
};
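/*
 * Illustrative example (not from the original source): an rmap chain holding
 * 20 SPTEs is stored as a head descriptor with spte_count == 6 and
 * tail_count == 14, whose ->more points at a single full tail descriptor
 * (spte_count == 14, tail_count == 0).
 */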
struct kvm_shadow_walk_iterator {
	u64 addr;
	hpa_t shadow_addr;
	u64 *sptep;
	int level;
	unsigned index;
};

#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)	\
	for (shadow_walk_init_using_root(&(_walker), (_vcpu),		\
					 (_root), (_addr));		\
	     shadow_walk_okay(&(_walker));				\
	     shadow_walk_next(&(_walker)))

#define for_each_shadow_entry(_vcpu, _addr, _walker)		\
	for (shadow_walk_init(&(_walker), _vcpu, _addr);	\
	     shadow_walk_okay(&(_walker));			\
	     shadow_walk_next(&(_walker)))

#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)	\
	for (shadow_walk_init(&(_walker), _vcpu, _addr);		\
	     shadow_walk_okay(&(_walker)) &&				\
		({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });	\
	     __shadow_walk_next(&(_walker), spte))

static struct kmem_cache *pte_list_desc_cache;
struct kmem_cache *mmu_page_header_cache;
static struct percpu_counter kvm_total_used_mmu_pages;

static void mmu_spte_set(u64 *sptep, u64 spte);

struct kvm_mmu_role_regs {
	const unsigned long cr0;
	const unsigned long cr4;
	const u64 efer;
};

#define CREATE_TRACE_POINTS
#include "mmutrace.h"

/*
 * Yes, lots of underscores.  They're a hint that you probably shouldn't be
 * reading from the role_regs.  Once the root_role is constructed, it becomes
 * the single source of truth for the MMU's state.
 */
#define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag)			\
static inline bool __maybe_unused					\
____is_##reg##_##name(const struct kvm_mmu_role_regs *regs)		\
{									\
	return !!(regs->reg & flag);					\
}
BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57);
BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX);
BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
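/*
 * For example, BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG) above expands
 * to a helper ____is_cr0_pg(regs) that returns !!(regs->cr0 & X86_CR0_PG).
 */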
/*
 * The MMU itself (with a valid role) is the single source of truth for the
 * MMU.  Do not use the regs used to build the MMU/role, nor the vCPU.  The
 * regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1,
 * and the vCPU may be incorrect/irrelevant.
 */
#define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name)				\
static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu)	\
{										\
	return !!(mmu->cpu_role. base_or_ext . reg##_##name);			\
}
BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pse);
BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smep);
BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smap);
BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pke);
BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, la57);
BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
BUILD_MMU_ROLE_ACCESSOR(ext,  efer, lma);

static inline bool is_cr0_pg(struct kvm_mmu *mmu)
{
	return mmu->cpu_role.base.level > 0;
}

static inline bool is_cr4_pae(struct kvm_mmu *mmu)
{
	return !mmu->cpu_role.base.has_4_byte_gpte;
}
static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_role_regs regs = {
		.cr0 = kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS),
		.cr4 = kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS),
		.efer = vcpu->arch.efer,
	};

	return regs;
}

static unsigned long get_guest_cr3(struct kvm_vcpu *vcpu)
{
	return kvm_read_cr3(vcpu);
}

static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu,
						  struct kvm_mmu *mmu)
{
	if (IS_ENABLED(CONFIG_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3)
		return kvm_read_cr3(vcpu);

	return mmu->get_guest_pgd(vcpu);
}

static inline bool kvm_available_flush_remote_tlbs_range(void)
{
	return kvm_x86_ops.flush_remote_tlbs_range;
}

int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
{
	if (!kvm_x86_ops.flush_remote_tlbs_range)
		return -EOPNOTSUPP;

	return static_call(kvm_x86_flush_remote_tlbs_range)(kvm, gfn, nr_pages);
}
static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index);

/* Flush the range of guest memory mapped by the given SPTE. */
static void kvm_flush_remote_tlbs_sptep(struct kvm *kvm, u64 *sptep)
{
	struct kvm_mmu_page *sp = sptep_to_sp(sptep);
	gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(sptep));

	kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level);
}

static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
			   unsigned int access)
{
	u64 spte = make_mmio_spte(vcpu, gfn, access);

	trace_mark_mmio_spte(sptep, gfn, spte);
	mmu_spte_set(sptep, spte);
}

static gfn_t get_mmio_spte_gfn(u64 spte)
{
	u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;

	gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)
	       & shadow_nonpresent_or_rsvd_mask;

	return gpa >> PAGE_SHIFT;
}

static unsigned get_mmio_spte_access(u64 spte)
{
	return spte & shadow_mmio_access_mask;
}
static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
{
	u64 kvm_gen, spte_gen, gen;

	gen = kvm_vcpu_memslots(vcpu)->generation;
	if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
		return false;

	kvm_gen = gen & MMIO_SPTE_GEN_MASK;
	spte_gen = get_mmio_spte_generation(spte);

	trace_check_mmio_spte(spte, kvm_gen, spte_gen);
	return likely(kvm_gen == spte_gen);
}

static int is_cpuid_PSE36(void)
{
	return 1;
}

#ifdef CONFIG_X86_64
static void __set_spte(u64 *sptep, u64 spte)
{
	WRITE_ONCE(*sptep, spte);
}

static void __update_clear_spte_fast(u64 *sptep, u64 spte)
{
	WRITE_ONCE(*sptep, spte);
}

static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
{
	return xchg(sptep, spte);
}

static u64 __get_spte_lockless(u64 *sptep)
{
	return READ_ONCE(*sptep);
}
#else
union split_spte {
	struct {
		u32 spte_low;
		u32 spte_high;
	};
	u64 spte;
};
static void count_spte_clear(u64 *sptep, u64 spte)
{
	struct kvm_mmu_page *sp = sptep_to_sp(sptep);

	if (is_shadow_present_pte(spte))
		return;

	/* Ensure the spte is completely set before we increase the count */
	smp_wmb();
	sp->clear_spte_count++;
}

static void __set_spte(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	ssptep->spte_high = sspte.spte_high;

	/*
	 * If we map the spte from nonpresent to present, we should store
	 * the high bits first, then set the present bit, so the CPU cannot
	 * fetch this spte while we are setting it.
	 */
	smp_wmb();

	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
}

static void __update_clear_spte_fast(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);

	/*
	 * If we map the spte from present to nonpresent, we should clear the
	 * present bit first to avoid the vCPU fetching the old high bits.
	 */
	smp_wmb();

	ssptep->spte_high = sspte.spte_high;
	count_spte_clear(sptep, spte);
}

static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte, orig;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	/* xchg acts as a barrier before the setting of the high bits */
	orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
	orig.spte_high = ssptep->spte_high;
	ssptep->spte_high = sspte.spte_high;
	count_spte_clear(sptep, spte);

	return orig.spte;
}
/*
 * The idea of using this lightweight way to read the spte on x86_32 hosts
 * comes from gup_get_pte (mm/gup.c).
 *
 * An spte tlb flush may be pending, because kvm_set_pte_rmap
 * coalesces them and we are running outside of the MMU lock.  Therefore
 * we need to protect against in-progress updates of the spte.
 *
 * Reading the spte while an update is in progress may get the old value
 * for the high part of the spte.  The race is fine for a present->non-present
 * change (because the high part of the spte is ignored for non-present spte),
 * but for a present->present change we must reread the spte.
 *
 * All such changes are done in two steps (present->non-present and
 * non-present->present), hence it is enough to count the number of
 * present->non-present updates: if it changed while reading the spte,
 * we might have hit the race.  This is done using clear_spte_count.
 */
static u64 __get_spte_lockless(u64 *sptep)
{
	struct kvm_mmu_page *sp = sptep_to_sp(sptep);
	union split_spte spte, *orig = (union split_spte *)sptep;
	int count;

retry:
	count = sp->clear_spte_count;
	smp_rmb();

	spte.spte_low = orig->spte_low;
	smp_rmb();

	spte.spte_high = orig->spte_high;
	smp_rmb();

	if (unlikely(spte.spte_low != orig->spte_low ||
		     count != sp->clear_spte_count))
		goto retry;

	return spte.spte;
}
#endif

/* Rules for using mmu_spte_set:
 * Set the sptep from nonpresent to present.
 * Note: the sptep being assigned *must* be either not present
 * or in a state where the hardware will not attempt to update
 * the spte.
 */
static void mmu_spte_set(u64 *sptep, u64 new_spte)
{
	WARN_ON_ONCE(is_shadow_present_pte(*sptep));
	__set_spte(sptep, new_spte);
}
/*
 * Update the SPTE (excluding the PFN), but do not track changes in its
 * accessed/dirty status.
 */
static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
{
	u64 old_spte = *sptep;

	WARN_ON_ONCE(!is_shadow_present_pte(new_spte));
	check_spte_writable_invariants(new_spte);

	if (!is_shadow_present_pte(old_spte)) {
		mmu_spte_set(sptep, new_spte);
		return old_spte;
	}

	if (!spte_has_volatile_bits(old_spte))
		__update_clear_spte_fast(sptep, new_spte);
	else
		old_spte = __update_clear_spte_slow(sptep, new_spte);

	WARN_ON_ONCE(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));

	return old_spte;
}

/* Rules for using mmu_spte_update:
 * Update the state bits; the mapped pfn is not changed.
 *
 * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote
 * TLBs must be flushed. Otherwise rmap_write_protect will find a read-only
 * spte, even though the writable spte might be cached on a CPU's TLB.
 *
 * Returns true if the TLB needs to be flushed
 */
static bool mmu_spte_update(u64 *sptep, u64 new_spte)
{
	bool flush = false;
	u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);

	if (!is_shadow_present_pte(old_spte))
		return false;

	/*
	 * Updating the spte outside of mmu_lock is safe, since
	 * we always update it atomically; see the comments in
	 * spte_has_volatile_bits().
	 */
	if (is_mmu_writable_spte(old_spte) &&
	    !is_writable_pte(new_spte))
		flush = true;

	/*
	 * Flush TLB when accessed/dirty states are changed in the page tables,
	 * to guarantee consistency between TLB and page tables.
	 */
	if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
		flush = true;
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
	}

	if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
		flush = true;
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
	}

	return flush;
}
/*
 * Rules for using mmu_spte_clear_track_bits:
 * It sets the sptep from present to nonpresent and tracks the
 * state bits; it is used to clear the last-level sptep.
 * Returns the old PTE.
 */
static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
{
	kvm_pfn_t pfn;
	u64 old_spte = *sptep;
	int level = sptep_to_sp(sptep)->role.level;
	struct page *page;

	if (!is_shadow_present_pte(old_spte) ||
	    !spte_has_volatile_bits(old_spte))
		__update_clear_spte_fast(sptep, 0ull);
	else
		old_spte = __update_clear_spte_slow(sptep, 0ull);

	if (!is_shadow_present_pte(old_spte))
		return old_spte;

	kvm_update_page_stats(kvm, level, -1);

	pfn = spte_to_pfn(old_spte);

	/*
	 * KVM doesn't hold a reference to any pages mapped into the guest, and
	 * instead uses the mmu_notifier to ensure that KVM unmaps any pages
	 * before they are reclaimed.  Sanity check that, if the pfn is backed
	 * by a refcounted page, the refcount is elevated.
	 */
	page = kvm_pfn_to_refcounted_page(pfn);
	WARN_ON_ONCE(page && !page_count(page));

	if (is_accessed_spte(old_spte))
		kvm_set_pfn_accessed(pfn);

	if (is_dirty_spte(old_spte))
		kvm_set_pfn_dirty(pfn);

	return old_spte;
}

/*
 * Rules for using mmu_spte_clear_no_track:
 * Directly clear the spte without caring about the state bits of the sptep;
 * it is used to set the upper level spte.
 */
static void mmu_spte_clear_no_track(u64 *sptep)
{
	__update_clear_spte_fast(sptep, 0ull);
}

static u64 mmu_spte_get_lockless(u64 *sptep)
{
	return __get_spte_lockless(sptep);
}

/* Returns the Accessed status of the PTE and resets it at the same time. */
static bool mmu_spte_age(u64 *sptep)
{
	u64 spte = mmu_spte_get_lockless(sptep);

	if (!is_accessed_spte(spte))
		return false;

	if (spte_ad_enabled(spte)) {
		clear_bit((ffs(shadow_accessed_mask) - 1),
			  (unsigned long *)sptep);
	} else {
		/*
		 * Capture the dirty status of the page, so that it doesn't get
		 * lost when the SPTE is marked for access tracking.
		 */
		if (is_writable_pte(spte))
			kvm_set_pfn_dirty(spte_to_pfn(spte));

		spte = mark_spte_for_access_track(spte);
		mmu_spte_update_no_track(sptep, spte);
	}

	return true;
}
static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu)
{
	return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct;
}

static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
{
	if (is_tdp_mmu_active(vcpu)) {
		kvm_tdp_mmu_walk_lockless_begin();
	} else {
		/*
		 * Prevent page table teardown by making any free-er wait during
		 * kvm_flush_remote_tlbs() IPI to all active vcpus.
		 */
		local_irq_disable();

		/*
		 * Make sure a following spte read is not reordered ahead of the
		 * write to vcpu->mode.
		 */
		smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
	}
}

static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
{
	if (is_tdp_mmu_active(vcpu)) {
		kvm_tdp_mmu_walk_lockless_end();
	} else {
		/*
		 * Make sure the write to vcpu->mode is not reordered in front of
		 * reads to sptes.  If it does, kvm_mmu_commit_zap_page() can see us
		 * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
		 */
		smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
		local_irq_enable();
	}
}
static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
{
	int r;

	/* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
				       1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
	if (r)
		return r;
	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
				       PT64_ROOT_MAX_LEVEL);
	if (r)
		return r;
	if (maybe_indirect) {
		r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache,
					       PT64_ROOT_MAX_LEVEL);
		if (r)
			return r;
	}
	return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
					  PT64_ROOT_MAX_LEVEL);
}

static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
}

static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
{
	kmem_cache_free(pte_list_desc_cache, pte_list_desc);
}

static bool sp_has_gptes(struct kvm_mmu_page *sp);
static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
{
	if (sp->role.passthrough)
		return sp->gfn;

	if (!sp->role.direct)
		return sp->shadowed_translation[index] >> PAGE_SHIFT;

	return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS));
}

/*
 * For leaf SPTEs, fetch the *guest* access permissions being shadowed.  Note
 * that the SPTE itself may have more constrained access permissions than what
 * the guest enforces.  For example, a guest may create an executable huge PTE
 * but KVM may disallow execution to mitigate iTLB multihit.
 */
static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
{
	if (sp_has_gptes(sp))
		return sp->shadowed_translation[index] & ACC_ALL;

	/*
	 * For direct MMUs (e.g. TDP or non-paging guests) or passthrough SPs,
	 * KVM is not shadowing any guest page tables, so the "guest access
	 * permissions" are just ACC_ALL.
	 *
	 * For direct SPs in indirect MMUs (shadow paging), i.e. when KVM
	 * is shadowing a guest huge page with small pages, the guest access
	 * permissions being shadowed are the access permissions of the huge
	 * page.
	 *
	 * In both cases, sp->role.access contains the correct access bits.
	 */
	return sp->role.access;
}
static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index,
					 gfn_t gfn, unsigned int access)
{
	if (sp_has_gptes(sp)) {
		sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access;
		return;
	}

	WARN_ONCE(access != kvm_mmu_page_get_access(sp, index),
		  "access mismatch under %s page %llx (expected %u, got %u)\n",
		  sp->role.passthrough ? "passthrough" : "direct",
		  sp->gfn, kvm_mmu_page_get_access(sp, index), access);

	WARN_ONCE(gfn != kvm_mmu_page_get_gfn(sp, index),
		  "gfn mismatch under %s page %llx (expected %llx, got %llx)\n",
		  sp->role.passthrough ? "passthrough" : "direct",
		  sp->gfn, kvm_mmu_page_get_gfn(sp, index), gfn);
}

static void kvm_mmu_page_set_access(struct kvm_mmu_page *sp, int index,
				    unsigned int access)
{
	gfn_t gfn = kvm_mmu_page_get_gfn(sp, index);

	kvm_mmu_page_set_translation(sp, index, gfn, access);
}
/*
 * Return the pointer to the large page information for a given gfn,
 * handling slots that are not large page aligned.
 */
static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
					      const struct kvm_memory_slot *slot,
					      int level)
{
	unsigned long idx;

	idx = gfn_to_index(gfn, slot->base_gfn, level);
	return &slot->arch.lpage_info[level - 2][idx];
}

/*
 * The most significant bit in disallow_lpage tracks whether or not memory
 * attributes are mixed, i.e. not identical for all gfns at the current level.
 * The lower order bits are used to refcount other cases where a hugepage is
 * disallowed, e.g. if KVM has shadowed a page table at the gfn.
 */
#define KVM_LPAGE_MIXED_FLAG	BIT(31)
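/*
 * Illustrative example (not from the original source): a disallow_lpage value
 * of (KVM_LPAGE_MIXED_FLAG | 2) means the memory attributes for the range are
 * mixed and two independent users (e.g. shadowed page tables) currently
 * forbid a hugepage; a hugepage may be used only when the whole field is 0.
 */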
static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
					    gfn_t gfn, int count)
{
	struct kvm_lpage_info *linfo;
	int old, i;

	for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
		linfo = lpage_info_slot(gfn, slot, i);

		old = linfo->disallow_lpage;
		linfo->disallow_lpage += count;
		WARN_ON_ONCE((old ^ linfo->disallow_lpage) & KVM_LPAGE_MIXED_FLAG);
	}
}

void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
{
	update_gfn_disallow_lpage_count(slot, gfn, 1);
}

void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
{
	update_gfn_disallow_lpage_count(slot, gfn, -1);
}
static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *slot;
	gfn_t gfn;

	kvm->arch.indirect_shadow_pages++;
	gfn = sp->gfn;
	slots = kvm_memslots_for_spte_role(kvm, sp->role);
	slot = __gfn_to_memslot(slots, gfn);

	/* the non-leaf shadow pages are kept read-only. */
	if (sp->role.level > PG_LEVEL_4K)
		return __kvm_write_track_add_gfn(kvm, slot, gfn);

	kvm_mmu_gfn_disallow_lpage(slot, gfn);

	if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K))
		kvm_flush_remote_tlbs_gfn(kvm, gfn, PG_LEVEL_4K);
}

void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	/*
	 * If it's possible to replace the shadow page with an NX huge page,
	 * i.e. if the shadow page is the only thing currently preventing KVM
	 * from using a huge page, add the shadow page to the list of "to be
	 * zapped for NX recovery" pages.  Note, the shadow page can already be
	 * on the list if KVM is reusing an existing shadow page, i.e. if KVM
	 * links a shadow page at multiple points.
	 */
	if (!list_empty(&sp->possible_nx_huge_page_link))
		return;

	++kvm->stat.nx_lpage_splits;
	list_add_tail(&sp->possible_nx_huge_page_link,
		      &kvm->arch.possible_nx_huge_pages);
}

static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				 bool nx_huge_page_possible)
{
	sp->nx_huge_page_disallowed = true;

	if (nx_huge_page_possible)
		track_possible_nx_huge_page(kvm, sp);
}
static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *slot;
	gfn_t gfn;

	kvm->arch.indirect_shadow_pages--;
	gfn = sp->gfn;
	slots = kvm_memslots_for_spte_role(kvm, sp->role);
	slot = __gfn_to_memslot(slots, gfn);
	if (sp->role.level > PG_LEVEL_4K)
		return __kvm_write_track_remove_gfn(kvm, slot, gfn);

	kvm_mmu_gfn_allow_lpage(slot, gfn);
}

void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	if (list_empty(&sp->possible_nx_huge_page_link))
		return;

	--kvm->stat.nx_lpage_splits;
	list_del_init(&sp->possible_nx_huge_page_link);
}

static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	sp->nx_huge_page_disallowed = false;

	untrack_possible_nx_huge_page(kvm, sp);
}

static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu,
							    gfn_t gfn,
							    bool no_dirty_log)
{
	struct kvm_memory_slot *slot;

	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
		return NULL;
	if (no_dirty_log && kvm_slot_dirty_track_enabled(slot))
		return NULL;

	return slot;
}
/*
 * About rmap_head encoding:
 *
 * If the bit zero of rmap_head->val is clear, then it points to the only spte
 * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
 * pte_list_desc containing more mappings.
 */
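/*
 * Illustrative example (not from the original source): for a gfn mapped by a
 * single SPTE, rmap_head->val holds the sptep pointer itself (bit zero is
 * clear because the pointer is naturally aligned); once a second SPTE is
 * added, rmap_head->val becomes (unsigned long)desc | 1.
 */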
/*
 * Returns the number of pointers in the rmap chain, not counting the new one.
 */
static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
			struct kvm_rmap_head *rmap_head)
{
	struct pte_list_desc *desc;
	int count = 0;

	if (!rmap_head->val) {
		rmap_head->val = (unsigned long)spte;
	} else if (!(rmap_head->val & 1)) {
		desc = kvm_mmu_memory_cache_alloc(cache);
		desc->sptes[0] = (u64 *)rmap_head->val;
		desc->sptes[1] = spte;
		desc->spte_count = 2;
		desc->tail_count = 0;
		rmap_head->val = (unsigned long)desc | 1;
		++count;
	} else {
		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
		count = desc->tail_count + desc->spte_count;

		/*
		 * If the previous head is full, allocate a new head descriptor
		 * as tail descriptors are always kept full.
		 */
		if (desc->spte_count == PTE_LIST_EXT) {
			desc = kvm_mmu_memory_cache_alloc(cache);
			desc->more = (struct pte_list_desc *)(rmap_head->val & ~1ul);
			desc->spte_count = 0;
			desc->tail_count = count;
			rmap_head->val = (unsigned long)desc | 1;
		}
		desc->sptes[desc->spte_count++] = spte;
	}
	return count;
}
static void pte_list_desc_remove_entry(struct kvm *kvm,
				       struct kvm_rmap_head *rmap_head,
				       struct pte_list_desc *desc, int i)
{
	struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
	int j = head_desc->spte_count - 1;

	/*
	 * The head descriptor should never be empty.  A new head is added only
	 * when adding an entry and the previous head is full, and heads are
	 * removed (this flow) when they become empty.
	 */
	KVM_BUG_ON_DATA_CORRUPTION(j < 0, kvm);

	/*
	 * Replace the to-be-freed SPTE with the last valid entry from the head
	 * descriptor to ensure that tail descriptors are full at all times.
	 * Note, this also means that tail_count is stable for each descriptor.
	 */
	desc->sptes[i] = head_desc->sptes[j];
	head_desc->sptes[j] = NULL;
	head_desc->spte_count--;
	if (head_desc->spte_count)
		return;

	/*
	 * The head descriptor is empty.  If there are no tail descriptors,
	 * nullify the rmap head to mark the list as empty, else point the rmap
	 * head at the next descriptor, i.e. the new head.
	 */
	if (!head_desc->more)
		rmap_head->val = 0;
	else
		rmap_head->val = (unsigned long)head_desc->more | 1;
	mmu_free_pte_list_desc(head_desc);
}

static void pte_list_remove(struct kvm *kvm, u64 *spte,
			    struct kvm_rmap_head *rmap_head)
{
	struct pte_list_desc *desc;
	int i;

	if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm))
		return;

	if (!(rmap_head->val & 1)) {
		if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm))
			return;

		rmap_head->val = 0;
		return;
	}

	desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
	while (desc) {
		for (i = 0; i < desc->spte_count; ++i) {
			if (desc->sptes[i] == spte) {
				pte_list_desc_remove_entry(kvm, rmap_head,
							   desc, i);
				return;
			}
		}
		desc = desc->more;
	}

	KVM_BUG_ON_DATA_CORRUPTION(true, kvm);
}
static void kvm_zap_one_rmap_spte(struct kvm *kvm,
				  struct kvm_rmap_head *rmap_head, u64 *sptep)
{
	mmu_spte_clear_track_bits(kvm, sptep);
	pte_list_remove(kvm, sptep, rmap_head);
}

/* Return true if at least one SPTE was zapped, false otherwise */
static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
				   struct kvm_rmap_head *rmap_head)
{
	struct pte_list_desc *desc, *next;
	int i;

	if (!rmap_head->val)
		return false;

	if (!(rmap_head->val & 1)) {
		mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val);
		goto out;
	}

	desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);

	for (; desc; desc = next) {
		for (i = 0; i < desc->spte_count; i++)
			mmu_spte_clear_track_bits(kvm, desc->sptes[i]);
		next = desc->more;
		mmu_free_pte_list_desc(desc);
	}
out:
	/* rmap_head is meaningless now, remember to reset it */
	rmap_head->val = 0;
	return true;
}

unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
{
	struct pte_list_desc *desc;

	if (!rmap_head->val)
		return 0;
	else if (!(rmap_head->val & 1))
		return 1;

	desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
	return desc->tail_count + desc->spte_count;
}
static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
					 const struct kvm_memory_slot *slot)
{
	unsigned long idx;

	idx = gfn_to_index(gfn, slot->base_gfn, level);
	return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
}

static void rmap_remove(struct kvm *kvm, u64 *spte)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *slot;
	struct kvm_mmu_page *sp;
	gfn_t gfn;
	struct kvm_rmap_head *rmap_head;

	sp = sptep_to_sp(spte);
	gfn = kvm_mmu_page_get_gfn(sp, spte_index(spte));

	/*
	 * Unlike rmap_add, rmap_remove does not run in the context of a vCPU
	 * so we have to determine which memslots to use based on context
	 * information in sp->role.
	 */
	slots = kvm_memslots_for_spte_role(kvm, sp->role);

	slot = __gfn_to_memslot(slots, gfn);
	rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);

	pte_list_remove(kvm, spte, rmap_head);
}

/*
 * Used by the following functions to iterate through the sptes linked by a
 * rmap.  All fields are private and not assumed to be used outside.
 */
struct rmap_iterator {
	/* private fields */
	struct pte_list_desc *desc;	/* holds the sptep if not NULL */
	int pos;			/* index of the sptep */
};
/*
 * Iteration must be started by this function.  This should also be used after
 * removing/dropping sptes from the rmap link because in such cases the
 * information in the iterator may not be valid.
 *
 * Returns sptep if found, NULL otherwise.
 */
static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
			   struct rmap_iterator *iter)
{
	u64 *sptep;

	if (!rmap_head->val)
		return NULL;

	if (!(rmap_head->val & 1)) {
		iter->desc = NULL;
		sptep = (u64 *)rmap_head->val;
		goto out;
	}

	iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
	iter->pos = 0;
	sptep = iter->desc->sptes[iter->pos];
out:
	BUG_ON(!is_shadow_present_pte(*sptep));
	return sptep;
}

/*
 * Must be used with a valid iterator: e.g. after rmap_get_first().
 *
 * Returns sptep if found, NULL otherwise.
 */
static u64 *rmap_get_next(struct rmap_iterator *iter)
{
	u64 *sptep;

	if (iter->desc) {
		if (iter->pos < PTE_LIST_EXT - 1) {
			++iter->pos;
			sptep = iter->desc->sptes[iter->pos];
			if (sptep)
				goto out;
		}

		iter->desc = iter->desc->more;

		if (iter->desc) {
			iter->pos = 0;
			/* desc->sptes[0] cannot be NULL */
			sptep = iter->desc->sptes[iter->pos];
			goto out;
		}
	}

	return NULL;
out:
	BUG_ON(!is_shadow_present_pte(*sptep));
	return sptep;
}

#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_)			\
	for (_spte_ = rmap_get_first(_rmap_head_, _iter_);		\
	     _spte_; _spte_ = rmap_get_next(_iter_))
static void drop_spte(struct kvm *kvm, u64 *sptep)
{
	u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep);

	if (is_shadow_present_pte(old_spte))
		rmap_remove(kvm, sptep);
}

static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush)
{
	struct kvm_mmu_page *sp;

	sp = sptep_to_sp(sptep);
	WARN_ON_ONCE(sp->role.level == PG_LEVEL_4K);

	drop_spte(kvm, sptep);

	if (flush)
		kvm_flush_remote_tlbs_sptep(kvm, sptep);
}

/*
 * Write-protect on the specified @sptep, @pt_protect indicates whether
 * spte write-protection is caused by protecting shadow page table.
 *
 * Note: write protection differs between dirty logging and spte protection:
 * - for dirty logging, the spte can be set to writable at anytime if
 *   its dirty bitmap is properly set.
 * - for spte protection, the spte can be writable only after unsync-ing
 *   shadow page.
 *
 * Return true if tlb need be flushed.
 */
static bool spte_write_protect(u64 *sptep, bool pt_protect)
{
	u64 spte = *sptep;

	if (!is_writable_pte(spte) &&
	    !(pt_protect && is_mmu_writable_spte(spte)))
		return false;

	if (pt_protect)
		spte &= ~shadow_mmu_writable_mask;
	spte = spte & ~PT_WRITABLE_MASK;

	return mmu_spte_update(sptep, spte);
}

static bool rmap_write_protect(struct kvm_rmap_head *rmap_head,
			       bool pt_protect)
{
	u64 *sptep;
	struct rmap_iterator iter;
	bool flush = false;

	for_each_rmap_spte(rmap_head, &iter, sptep)
		flush |= spte_write_protect(sptep, pt_protect);

	return flush;
}
static bool spte_clear_dirty(u64 *sptep)
{
	u64 spte = *sptep;

	KVM_MMU_WARN_ON(!spte_ad_enabled(spte));
	spte &= ~shadow_dirty_mask;
	return mmu_spte_update(sptep, spte);
}

static bool spte_wrprot_for_clear_dirty(u64 *sptep)
{
	bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
					       (unsigned long *)sptep);
	if (was_writable && !spte_ad_enabled(*sptep))
		kvm_set_pfn_dirty(spte_to_pfn(*sptep));

	return was_writable;
}

/*
 * Gets the GFN ready for another round of dirty logging by clearing the
 *	- D bit on ad-enabled SPTEs, and
 *	- W bit on ad-disabled SPTEs.
 * Returns true iff any D or W bits were cleared.
 */
static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			       const struct kvm_memory_slot *slot)
{
	u64 *sptep;
	struct rmap_iterator iter;
	bool flush = false;

	for_each_rmap_spte(rmap_head, &iter, sptep)
		if (spte_ad_need_write_protect(*sptep))
			flush |= spte_wrprot_for_clear_dirty(sptep);
		else
			flush |= spte_clear_dirty(sptep);

	return flush;
}
/**
 * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
 * @kvm: kvm instance
 * @slot: slot to protect
 * @gfn_offset: start of the BITS_PER_LONG pages we care about
 * @mask: indicates which pages we should protect
 *
 * Used when we do not need to care about huge page mappings.
 */
static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
					    struct kvm_memory_slot *slot,
					    gfn_t gfn_offset, unsigned long mask)
{
	struct kvm_rmap_head *rmap_head;

	if (tdp_mmu_enabled)
		kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
				slot->base_gfn + gfn_offset, mask, true);

	if (!kvm_memslots_have_rmaps(kvm))
		return;

	while (mask) {
		rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
					PG_LEVEL_4K, slot);
		rmap_write_protect(rmap_head, false);

		/* clear the first set bit */
		mask &= mask - 1;
	}
}

/**
 * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
 *				   protect the page if the D-bit isn't supported.
 * @kvm: kvm instance
 * @slot: slot to clear D-bit
 * @gfn_offset: start of the BITS_PER_LONG pages we care about
 * @mask: indicates which pages we should clear D-bit
 *
 * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
 */
static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
					  struct kvm_memory_slot *slot,
					  gfn_t gfn_offset, unsigned long mask)
{
	struct kvm_rmap_head *rmap_head;

	if (tdp_mmu_enabled)
		kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
				slot->base_gfn + gfn_offset, mask, false);

	if (!kvm_memslots_have_rmaps(kvm))
		return;

	while (mask) {
		rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
					PG_LEVEL_4K, slot);
		__rmap_clear_dirty(kvm, rmap_head, slot);

		/* clear the first set bit */
		mask &= mask - 1;
	}
}
/**
 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
 *					     PT level pages.
 *
 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
 * enable dirty logging for them.
 *
 * We need to care about huge page mappings: e.g. during dirty logging we may
 * have such mappings.
 */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
					     struct kvm_memory_slot *slot,
					     gfn_t gfn_offset, unsigned long mask)
{
	/*
	 * Huge pages are NOT write protected when we start dirty logging in
	 * initially-all-set mode; must write protect them here so that they
	 * are split to 4K on the first write.
	 *
	 * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn
	 * of memslot has no such restriction, so the range can cross two large
	 * pages.
	 */
	if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
		gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
		gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);

		if (READ_ONCE(eager_page_split))
			kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K);

		kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);

		/* Cross two large pages? */
		if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) !=
		    ALIGN(end << PAGE_SHIFT, PMD_SIZE))
			kvm_mmu_slot_gfn_write_protect(kvm, slot, end,
						       PG_LEVEL_2M);
	}

	/* Now handle 4K PTEs. */
	if (kvm_x86_ops.cpu_dirty_log_size)
		kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
	else
		kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}

int kvm_cpu_dirty_log_size(void)
{
	return kvm_x86_ops.cpu_dirty_log_size;
}
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
				    struct kvm_memory_slot *slot, u64 gfn,
				    int min_level)
{
	struct kvm_rmap_head *rmap_head;
	int i;
	bool write_protected = false;

	if (kvm_memslots_have_rmaps(kvm)) {
		for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
			rmap_head = gfn_to_rmap(gfn, i, slot);
			write_protected |= rmap_write_protect(rmap_head, true);
		}
	}

	if (tdp_mmu_enabled)
		write_protected |=
			kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level);

	return write_protected;
}

static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn)
{
	struct kvm_memory_slot *slot;

	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
	return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K);
}
static bool __kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			   const struct kvm_memory_slot *slot)
{
	return kvm_zap_all_rmap_sptes(kvm, rmap_head);
}

static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			 struct kvm_memory_slot *slot, gfn_t gfn, int level,
			 pte_t unused)
{
	return __kvm_zap_rmap(kvm, rmap_head, slot);
}

static bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			     struct kvm_memory_slot *slot, gfn_t gfn, int level,
			     pte_t pte)
{
	u64 *sptep;
	struct rmap_iterator iter;
	bool need_flush = false;
	u64 new_spte;
	kvm_pfn_t new_pfn;

	WARN_ON_ONCE(pte_huge(pte));
	new_pfn = pte_pfn(pte);

restart:
	for_each_rmap_spte(rmap_head, &iter, sptep) {
		need_flush = true;

		if (pte_write(pte)) {
			kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
			goto restart;
		} else {
			new_spte = kvm_mmu_changed_pte_notifier_make_spte(
					*sptep, new_pfn);

			mmu_spte_clear_track_bits(kvm, sptep);
			mmu_spte_set(sptep, new_spte);
		}
	}

	if (need_flush && kvm_available_flush_remote_tlbs_range()) {
		kvm_flush_remote_tlbs_gfn(kvm, gfn, level);
		return false;
	}

	return need_flush;
}
struct slot_rmap_walk_iterator {
	/* input fields. */
	const struct kvm_memory_slot *slot;
	gfn_t start_gfn;
	gfn_t end_gfn;
	int start_level;
	int end_level;

	/* output fields. */
	gfn_t gfn;
	struct kvm_rmap_head *rmap;
	int level;

	/* private field. */
	struct kvm_rmap_head *end_rmap;
};

static void rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator,
				 int level)
{
	iterator->level = level;
	iterator->gfn = iterator->start_gfn;
	iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot);
	iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot);
}

static void slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
				const struct kvm_memory_slot *slot,
				int start_level, int end_level,
				gfn_t start_gfn, gfn_t end_gfn)
{
	iterator->slot = slot;
	iterator->start_level = start_level;
	iterator->end_level = end_level;
	iterator->start_gfn = start_gfn;
	iterator->end_gfn = end_gfn;

	rmap_walk_init_level(iterator, iterator->start_level);
}

static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
{
	return !!iterator->rmap;
}

static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
{
	while (++iterator->rmap <= iterator->end_rmap) {
		iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));

		if (iterator->rmap->val)
			return;
	}

	if (++iterator->level > iterator->end_level) {
		iterator->rmap = NULL;
		return;
	}

	rmap_walk_init_level(iterator, iterator->level);
}

#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_,	\
	   _start_gfn, _end_gfn, _iter_)				\
	for (slot_rmap_walk_init(_iter_, _slot_, _start_level_,	\
				 _end_level_, _start_gfn, _end_gfn);	\
	     slot_rmap_walk_okay(_iter_);				\
	     slot_rmap_walk_next(_iter_))
typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			       struct kvm_memory_slot *slot, gfn_t gfn,
			       int level, pte_t pte);

static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
						 struct kvm_gfn_range *range,
						 rmap_handler_t handler)
{
	struct slot_rmap_walk_iterator iterator;
	bool ret = false;

	for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
				 range->start, range->end - 1, &iterator)
		ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
			       iterator.level, range->arg.pte);

	return ret;
}

bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	bool flush = false;

	if (kvm_memslots_have_rmaps(kvm))
		flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap);

	if (tdp_mmu_enabled)
		flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);

	if (kvm_x86_ops.set_apic_access_page_addr &&
	    range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT)
		kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);

	return flush;
}

bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	bool flush = false;

	if (kvm_memslots_have_rmaps(kvm))
		flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmap);

	if (tdp_mmu_enabled)
		flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);

	return flush;
}
static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			 struct kvm_memory_slot *slot, gfn_t gfn, int level,
			 pte_t unused)
{
	u64 *sptep;
	struct rmap_iterator iter;
	int young = 0;

	for_each_rmap_spte(rmap_head, &iter, sptep)
		young |= mmu_spte_age(sptep);

	return young;
}

static bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			      struct kvm_memory_slot *slot, gfn_t gfn,
			      int level, pte_t unused)
{
	u64 *sptep;
	struct rmap_iterator iter;

	for_each_rmap_spte(rmap_head, &iter, sptep)
		if (is_accessed_spte(*sptep))
			return true;

	return false;
}

#define RMAP_RECYCLE_THRESHOLD 1000
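/*
 * Note (not from the original source): once a single gfn accumulates more
 * than RMAP_RECYCLE_THRESHOLD rmap entries, __rmap_add() below zaps the
 * entire chain and flushes remote TLBs, which bounds the length of any one
 * rmap list.
 */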
static void __rmap_add(struct kvm *kvm,
		       struct kvm_mmu_memory_cache *cache,
		       const struct kvm_memory_slot *slot,
		       u64 *spte, gfn_t gfn, unsigned int access)
{
	struct kvm_mmu_page *sp;
	struct kvm_rmap_head *rmap_head;
	int rmap_count;

	sp = sptep_to_sp(spte);
	kvm_mmu_page_set_translation(sp, spte_index(spte), gfn, access);
	kvm_update_page_stats(kvm, sp->role.level, 1);

	rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
	rmap_count = pte_list_add(cache, spte, rmap_head);

	if (rmap_count > kvm->stat.max_mmu_rmap_size)
		kvm->stat.max_mmu_rmap_size = rmap_count;
	if (rmap_count > RMAP_RECYCLE_THRESHOLD) {
		kvm_zap_all_rmap_sptes(kvm, rmap_head);
		kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level);
	}
}

static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot,
		     u64 *spte, gfn_t gfn, unsigned int access)
{
	struct kvm_mmu_memory_cache *cache = &vcpu->arch.mmu_pte_list_desc_cache;

	__rmap_add(vcpu->kvm, cache, slot, spte, gfn, access);
}

bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	bool young = false;

	if (kvm_memslots_have_rmaps(kvm))
		young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap);

	if (tdp_mmu_enabled)
		young |= kvm_tdp_mmu_age_gfn_range(kvm, range);

	return young;
}

bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	bool young = false;

	if (kvm_memslots_have_rmaps(kvm))
		young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap);

	if (tdp_mmu_enabled)
		young |= kvm_tdp_mmu_test_age_gfn(kvm, range);

	return young;
}
static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp)
{
#ifdef CONFIG_KVM_PROVE_MMU
	int i;

	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
		if (KVM_MMU_WARN_ON(is_shadow_present_pte(sp->spt[i])))
			pr_err_ratelimited("SPTE %llx (@ %p) for gfn %llx shadow-present at free",
					   sp->spt[i], &sp->spt[i],
					   kvm_mmu_page_get_gfn(sp, i));
	}
#endif
}

/*
 * This value is the sum of all of the kvm instances'
 * kvm->arch.n_used_mmu_pages values.  We need a global,
 * aggregate version in order to make the slab shrinker
 * work.
 */
static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
{
	kvm->arch.n_used_mmu_pages += nr;
	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
}

static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	kvm_mod_used_mmu_pages(kvm, +1);
	kvm_account_pgtable_pages((void *)sp->spt, +1);
}

static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	kvm_mod_used_mmu_pages(kvm, -1);
	kvm_account_pgtable_pages((void *)sp->spt, -1);
}

static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp)
{
	kvm_mmu_check_sptes_at_free(sp);

	hlist_del(&sp->hash_link);
	list_del(&sp->link);
	free_page((unsigned long)sp->spt);
	if (!sp->role.direct)
		free_page((unsigned long)sp->shadowed_translation);
	kmem_cache_free(mmu_page_header_cache, sp);
}

static unsigned kvm_page_table_hashfn(gfn_t gfn)
{
	return hash_64(gfn, KVM_MMU_HASH_SHIFT);
}
static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache,
				    struct kvm_mmu_page *sp, u64 *parent_pte)
{
	if (!parent_pte)
		return;

	pte_list_add(cache, parent_pte, &sp->parent_ptes);
}

static void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
					u64 *parent_pte)
{
	pte_list_remove(kvm, parent_pte, &sp->parent_ptes);
}

static void drop_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
			    u64 *parent_pte)
{
	mmu_page_remove_parent_pte(kvm, sp, parent_pte);
	mmu_spte_clear_no_track(parent_pte);
}

static void mark_unsync(u64 *spte);
static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
{
	u64 *sptep;
	struct rmap_iterator iter;

	for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
		mark_unsync(sptep);
	}
}

static void mark_unsync(u64 *spte)
{
	struct kvm_mmu_page *sp;

	sp = sptep_to_sp(spte);
	if (__test_and_set_bit(spte_index(spte), sp->unsync_child_bitmap))
		return;
	if (sp->unsync_children++)
		return;
	kvm_mmu_mark_parents_unsync(sp);
}

#define KVM_PAGE_ARRAY_NR 16

struct kvm_mmu_pages {
	struct mmu_page_and_offset {
		struct kvm_mmu_page *sp;
		unsigned int idx;
	} page[KVM_PAGE_ARRAY_NR];
	unsigned int nr;
};
static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
			 int idx)
{
	int i;

	if (sp->unsync)
		for (i = 0; i < pvec->nr; i++)
			if (pvec->page[i].sp == sp)
				return 0;

	pvec->page[pvec->nr].sp = sp;
	pvec->page[pvec->nr].idx = idx;
	pvec->nr++;
	return (pvec->nr == KVM_PAGE_ARRAY_NR);
}

static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
{
	--sp->unsync_children;
	WARN_ON_ONCE((int)sp->unsync_children < 0);
	__clear_bit(idx, sp->unsync_child_bitmap);
}

static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
			     struct kvm_mmu_pages *pvec)
{
	int i, ret, nr_unsync_leaf = 0;

	for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
		struct kvm_mmu_page *child;
		u64 ent = sp->spt[i];

		if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
			clear_unsync_child_bit(sp, i);
			continue;
		}

		child = spte_to_child_sp(ent);

		if (child->unsync_children) {
			if (mmu_pages_add(pvec, child, i))
				return -ENOSPC;

			ret = __mmu_unsync_walk(child, pvec);
			if (!ret) {
				clear_unsync_child_bit(sp, i);
				continue;
			} else if (ret > 0) {
				nr_unsync_leaf += ret;
			} else
				return ret;
		} else if (child->unsync) {
			nr_unsync_leaf++;
			if (mmu_pages_add(pvec, child, i))
				return -ENOSPC;
		} else
			clear_unsync_child_bit(sp, i);
	}

	return nr_unsync_leaf;
}

#define INVALID_INDEX (-1)

static int mmu_unsync_walk(struct kvm_mmu_page *sp,
			   struct kvm_mmu_pages *pvec)
{
	pvec->nr = 0;
	if (!sp->unsync_children)
		return 0;

	mmu_pages_add(pvec, sp, INVALID_INDEX);
	return __mmu_unsync_walk(sp, pvec);
}
static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	WARN_ON_ONCE(!sp->unsync);
	trace_kvm_mmu_sync_page(sp);
	sp->unsync = 0;
	--kvm->stat.mmu_unsync;
}

static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				     struct list_head *invalid_list);
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
				    struct list_head *invalid_list);

static bool sp_has_gptes(struct kvm_mmu_page *sp)
{
	if (sp->role.direct)
		return false;

	if (sp->role.passthrough)
		return false;

	return true;
}

#define for_each_valid_sp(_kvm, _sp, _list)				\
	hlist_for_each_entry(_sp, _list, hash_link)			\
		if (is_obsolete_sp((_kvm), (_sp))) {			\
		} else

#define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn)		\
	for_each_valid_sp(_kvm, _sp,					\
	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)])	\
		if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else
static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
	union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role;

	/*
	 * Ignore various flags when verifying that it's safe to sync a shadow
	 * page using the current MMU context.
	 *
	 *  - level: not part of the overall MMU role and will never match as the MMU's
	 *    level tracks the root level
	 *  - access: updated based on the new guest PTE
	 *  - quadrant: not part of the overall MMU role (similar to level)
	 */
	const union kvm_mmu_page_role sync_role_ign = {
		.level = 0xf,
		.access = 0x7,
		.quadrant = 0x3,
		.passthrough = 0x1,
	};

	/*
	 * Direct pages can never be unsync, and KVM should never attempt to
	 * sync a shadow page for a different MMU context, e.g. if the role
	 * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the
	 * reserved bits checks will be wrong, etc...
	 */
	if (WARN_ON_ONCE(sp->role.direct || !vcpu->arch.mmu->sync_spte ||
			 (sp->role.word ^ root_role.word) & ~sync_role_ign.word))
		return false;

	return true;
}

static int kvm_sync_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
{
	if (!sp->spt[i])
		return 0;

	return vcpu->arch.mmu->sync_spte(vcpu, sp, i);
}

static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
	bool flush = false;
	int i;

	if (!kvm_sync_page_check(vcpu, sp))
		return -1;

	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
		int ret = kvm_sync_spte(vcpu, sp, i);

		if (ret < 0)
			return -1;
		if (ret)
			flush = true;
	}

	/*
	 * Note, any flush is purely for KVM's correctness, e.g. when dropping
	 * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier
	 * unmap or dirty logging event doesn't fail to flush.  The guest is
	 * responsible for flushing the TLB to ensure any changes in protection
	 * bits are recognized, i.e. until the guest flushes or page faults on
	 * a relevant address, KVM is architecturally allowed to let vCPUs use
	 * cached translations with the old protection bits.
	 */
	return flush;
}

static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
			 struct list_head *invalid_list)
{
	int ret = __kvm_sync_page(vcpu, sp);

	if (ret < 0)
		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
	return ret;
}
*kvm
,
2002 struct list_head
*invalid_list
,
2005 if (!remote_flush
&& list_empty(invalid_list
))
2008 if (!list_empty(invalid_list
))
2009 kvm_mmu_commit_zap_page(kvm
, invalid_list
);
2011 kvm_flush_remote_tlbs(kvm
);
2015 static bool is_obsolete_sp(struct kvm
*kvm
, struct kvm_mmu_page
*sp
)
2017 if (sp
->role
.invalid
)
2020 /* TDP MMU pages do not use the MMU generation. */
2021 return !is_tdp_mmu_page(sp
) &&
2022 unlikely(sp
->mmu_valid_gen
!= kvm
->arch
.mmu_valid_gen
);
2025 struct mmu_page_path
{
2026 struct kvm_mmu_page
*parent
[PT64_ROOT_MAX_LEVEL
];
2027 unsigned int idx
[PT64_ROOT_MAX_LEVEL
];
2030 #define for_each_sp(pvec, sp, parents, i) \
2031 for (i = mmu_pages_first(&pvec, &parents); \
2032 i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
2033 i = mmu_pages_next(&pvec, &parents, i))
static int mmu_pages_next(struct kvm_mmu_pages *pvec,
			  struct mmu_page_path *parents,
			  int i)
{
	int n;

	for (n = i + 1; n < pvec->nr; n++) {
		struct kvm_mmu_page *sp = pvec->page[n].sp;
		unsigned idx = pvec->page[n].idx;
		int level = sp->role.level;

		parents->idx[level-1] = idx;
		if (level == PG_LEVEL_4K)
			break;

		parents->parent[level-2] = sp;
	}

	return n;
}

static int mmu_pages_first(struct kvm_mmu_pages *pvec,
			   struct mmu_page_path *parents)
{
	struct kvm_mmu_page *sp;
	int level;

	if (pvec->nr == 0)
		return 0;

	WARN_ON_ONCE(pvec->page[0].idx != INVALID_INDEX);

	sp = pvec->page[0].sp;
	level = sp->role.level;
	WARN_ON_ONCE(level == PG_LEVEL_4K);

	parents->parent[level-2] = sp;

	/* Also set up a sentinel.  Further entries in pvec are all
	 * children of sp, so this element is never overwritten.
	 */
	parents->parent[level-1] = NULL;
	return mmu_pages_next(pvec, parents, 0);
}

static void mmu_pages_clear_parents(struct mmu_page_path *parents)
{
	struct kvm_mmu_page *sp;
	unsigned int level = 0;

	do {
		unsigned int idx = parents->idx[level];
		sp = parents->parent[level];
		if (!sp)
			return;

		WARN_ON_ONCE(idx == INVALID_INDEX);
		clear_unsync_child_bit(sp, idx);
		level++;
	} while (!sp->unsync_children);
}
static int mmu_sync_children(struct kvm_vcpu *vcpu,
			     struct kvm_mmu_page *parent, bool can_yield)
{
	int i;
	struct kvm_mmu_page *sp;
	struct mmu_page_path parents;
	struct kvm_mmu_pages pages;
	LIST_HEAD(invalid_list);
	bool flush = false;

	while (mmu_unsync_walk(parent, &pages)) {
		bool protected = false;

		for_each_sp(pages, sp, parents, i)
			protected |= kvm_vcpu_write_protect_gfn(vcpu, sp->gfn);

		if (protected) {
			kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true);
			flush = false;
		}

		for_each_sp(pages, sp, parents, i) {
			kvm_unlink_unsync_page(vcpu->kvm, sp);
			flush |= kvm_sync_page(vcpu, sp, &invalid_list) > 0;
			mmu_pages_clear_parents(&parents);
		}
		if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
			kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
			if (!can_yield) {
				kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
				return -EINTR;
			}

			cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
			flush = false;
		}
	}

	kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
	return 0;
}

static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
{
	atomic_set(&sp->write_flooding_count, 0);
}

static void clear_sp_write_flooding_count(u64 *spte)
{
	__clear_sp_write_flooding_count(sptep_to_sp(spte));
}
/*
 * The vCPU is required when finding indirect shadow pages; the shadow
 * page may already exist and syncing it needs the vCPU pointer in
 * order to read guest page tables.  Direct shadow pages are never
 * unsync, thus @vcpu can be NULL if @role.direct is true.
 */
static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm,
						     struct kvm_vcpu *vcpu,
						     gfn_t gfn,
						     struct hlist_head *sp_list,
						     union kvm_mmu_page_role role)
{
	struct kvm_mmu_page *sp;
	int ret;
	int collisions = 0;
	LIST_HEAD(invalid_list);

	for_each_valid_sp(kvm, sp, sp_list) {
		if (sp->gfn != gfn) {
			collisions++;
			continue;
		}

		if (sp->role.word != role.word) {
			/*
			 * If the guest is creating an upper-level page, zap
			 * unsync pages for the same gfn.  While it's possible
			 * the guest is using recursive page tables, in all
			 * likelihood the guest has stopped using the unsync
			 * page and is installing a completely unrelated page.
			 * Unsync pages must not be left as is, because the new
			 * upper-level page will be write-protected.
			 */
			if (role.level > PG_LEVEL_4K && sp->unsync)
				kvm_mmu_prepare_zap_page(kvm, sp,
							 &invalid_list);
			continue;
		}

		/* unsync and write-flooding only apply to indirect SPs. */
		if (sp->role.direct)
			goto out;

		if (sp->unsync) {
			if (KVM_BUG_ON(!vcpu, kvm))
				break;

			/*
			 * The page is good, but is stale.  kvm_sync_page does
			 * get the latest guest state, but (unlike mmu_unsync_children)
			 * it doesn't write-protect the page or mark it synchronized!
			 * This way the validity of the mapping is ensured, but the
			 * overhead of write protection is not incurred until the
			 * guest invalidates the TLB mapping.  This allows multiple
			 * SPs for a single gfn to be unsync.
			 *
			 * If the sync fails, the page is zapped.  If so, break
			 * in order to rebuild it.
			 */
			ret = kvm_sync_page(vcpu, sp, &invalid_list);
			if (ret < 0)
				break;

			WARN_ON_ONCE(!list_empty(&invalid_list));
			if (ret > 0)
				kvm_flush_remote_tlbs(kvm);
		}

		__clear_sp_write_flooding_count(sp);

		goto out;
	}

	sp = NULL;
	++kvm->stat.mmu_cache_miss;

out:
	kvm_mmu_commit_zap_page(kvm, &invalid_list);

	if (collisions > kvm->stat.max_mmu_page_hash_collisions)
		kvm->stat.max_mmu_page_hash_collisions = collisions;
	return sp;
}

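/*
 * In short, the lookup above ends one of three ways: an existing page with a
 * matching gfn and role is reused (after being resynced if it was unsync), a
 * failed sync zaps the page and breaks out so it can be rebuilt, or no match
 * is found and the caller must allocate a new shadow page.
 */
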
/* Caches used when allocating a new shadow page. */
struct shadow_page_caches {
	struct kvm_mmu_memory_cache *page_header_cache;
	struct kvm_mmu_memory_cache *shadow_page_cache;
	struct kvm_mmu_memory_cache *shadowed_info_cache;
};

static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
						      struct shadow_page_caches *caches,
						      gfn_t gfn,
						      struct hlist_head *sp_list,
						      union kvm_mmu_page_role role)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache);
	if (!role.direct)
		sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);

	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);

	/*
	 * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
	 * depends on valid pages being added to the head of the list.  See
	 * comments in kvm_zap_obsolete_pages().
	 */
	sp->mmu_valid_gen = kvm->arch.mmu_valid_gen;
	list_add(&sp->link, &kvm->arch.active_mmu_pages);
	kvm_account_mmu_page(kvm, sp);

	sp->gfn = gfn;
	sp->role = role;
	hlist_add_head(&sp->hash_link, sp_list);
	if (sp_has_gptes(sp))
		account_shadowed(kvm, sp);

	return sp;
}

/* Note, @vcpu may be NULL if @role.direct is true; see kvm_mmu_find_shadow_page. */
static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm,
						      struct kvm_vcpu *vcpu,
						      struct shadow_page_caches *caches,
						      gfn_t gfn,
						      union kvm_mmu_page_role role)
{
	struct hlist_head *sp_list;
	struct kvm_mmu_page *sp;
	bool created = false;

	sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];

	sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role);
	if (!sp) {
		created = true;
		sp = kvm_mmu_alloc_shadow_page(kvm, caches, gfn, sp_list, role);
	}

	trace_kvm_mmu_get_page(sp, created);
	return sp;
}

static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu,
						    gfn_t gfn,
						    union kvm_mmu_page_role role)
{
	struct shadow_page_caches caches = {
		.page_header_cache = &vcpu->arch.mmu_page_header_cache,
		.shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache,
		.shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache,
	};

	return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role);
}

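/*
 * The caches handed to the allocation path are the vCPU's own memory caches,
 * which are topped up before mmu_lock is taken; kvm_mmu_memory_cache_alloc()
 * therefore never has to sleep while the shadow page is being installed.
 */
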
static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct,
						  unsigned int access)
{
	struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep);
	union kvm_mmu_page_role role;

	role = parent_sp->role;
	role.level--;
	role.access = access;
	role.direct = direct;
	role.passthrough = 0;

	/*
	 * If the guest has 4-byte PTEs then that means it's using 32-bit,
	 * 2-level, non-PAE paging.  KVM shadows such guests with PAE paging
	 * (i.e. 8-byte PTEs).  The difference in PTE size means that KVM must
	 * shadow each guest page table with multiple shadow page tables, which
	 * requires extra bookkeeping in the role.
	 *
	 * Specifically, to shadow the guest's page directory (which covers a
	 * 4GiB address space), KVM uses 4 PAE page directories, each mapping
	 * 1GiB of the address space.  @role.quadrant encodes which quarter of
	 * the address space each maps.
	 *
	 * To shadow the guest's page tables (which each map a 4MiB region), KVM
	 * uses 2 PAE page tables, each mapping a 2MiB region.  For these,
	 * @role.quadrant encodes which half of the region they map.
	 *
	 * Concretely, a 4-byte PDE consumes bits 31:22, while an 8-byte PDE
	 * consumes bits 29:21.  To consume bits 31:30, KVM uses 4 shadow
	 * PDPTEs; those 4 PAE page directories are pre-allocated and their
	 * quadrant is assigned in mmu_alloc_root().  A 4-byte PTE consumes
	 * bits 21:12, while an 8-byte PTE consumes bits 20:12.  To consume
	 * bit 21 in the PTE (the child here), KVM propagates that bit to the
	 * quadrant, i.e. sets quadrant to '0' or '1'.  The parent 8-byte PDE
	 * covers bit 21 (see above), thus the quadrant is calculated from the
	 * _least_ significant bit of the PDE index.
	 */
	if (role.has_4_byte_gpte) {
		WARN_ON_ONCE(role.level != PG_LEVEL_4K);
		role.quadrant = spte_index(sptep) & 1;
	}

	return role;
}

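/*
 * Worked example for the quadrant logic above: with 4-byte guest PTEs, GVA
 * 0x00601000 has guest PTE index 513 (bits 21:12), but a shadow page table
 * only has 512 entries (bits 20:12).  The shadow PDE index for that GVA is 3
 * (bits 29:21), so spte_index(sptep) & 1 == 1 selects the second of the two
 * shadow page tables covering that guest page table's 4MiB region, and the
 * entry used within it is 1.
 */
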
static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu,
						 u64 *sptep, gfn_t gfn,
						 bool direct, unsigned int access)
{
	union kvm_mmu_page_role role;

	if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
		return ERR_PTR(-EEXIST);

	role = kvm_mmu_child_role(sptep, direct, access);
	return kvm_mmu_get_shadow_page(vcpu, gfn, role);
}

static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
					struct kvm_vcpu *vcpu, hpa_t root,
					u64 addr)
{
	iterator->addr = addr;
	iterator->shadow_addr = root;
	iterator->level = vcpu->arch.mmu->root_role.level;

	if (iterator->level >= PT64_ROOT_4LEVEL &&
	    vcpu->arch.mmu->cpu_role.base.level < PT64_ROOT_4LEVEL &&
	    !vcpu->arch.mmu->root_role.direct)
		iterator->level = PT32E_ROOT_LEVEL;

	if (iterator->level == PT32E_ROOT_LEVEL) {
		/*
		 * prev_root is currently only used for 64-bit hosts. So only
		 * the active root_hpa is valid here.
		 */
		BUG_ON(root != vcpu->arch.mmu->root.hpa);

		iterator->shadow_addr
			= vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
		iterator->shadow_addr &= SPTE_BASE_ADDR_MASK;
		--iterator->level;
		if (!iterator->shadow_addr)
			iterator->level = 0;
	}
}

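/*
 * Example: with a PAE (PT32E) root and addr == 0xd0000000, (addr >> 30) & 3
 * is 3, so the walk starts from pae_root[3], the page directory covering the
 * top 1GiB of the guest address space, one level below a normal root.
 */
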
static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
			     struct kvm_vcpu *vcpu, u64 addr)
{
	shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root.hpa,
				    addr);
}

static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
{
	if (iterator->level < PG_LEVEL_4K)
		return false;

	iterator->index = SPTE_INDEX(iterator->addr, iterator->level);
	iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
	return true;
}

static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
			       u64 spte)
{
	if (!is_shadow_present_pte(spte) || is_last_spte(spte, iterator->level)) {
		iterator->level = 0;
		return;
	}

	iterator->shadow_addr = spte & SPTE_BASE_ADDR_MASK;
	--iterator->level;
}

static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
{
	__shadow_walk_next(iterator, *iterator->sptep);
}

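/*
 * These helpers back the for_each_shadow_entry*() iterators used by the
 * fault and walk paths below (e.g. direct_map() and get_walk()): init seeds
 * the iterator from a root, okay computes sptep for the current level, and
 * next descends into the child table referenced by the current SPTE, with
 * level == 0 terminating the loop.
 */
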
static void __link_shadow_page(struct kvm *kvm,
			       struct kvm_mmu_memory_cache *cache, u64 *sptep,
			       struct kvm_mmu_page *sp, bool flush)
{
	u64 spte;

	BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);

	/*
	 * If an SPTE is present already, it must be a leaf and therefore
	 * a large one.  Drop it, and flush the TLB if needed, before
	 * installing sp.
	 */
	if (is_shadow_present_pte(*sptep))
		drop_large_spte(kvm, sptep, flush);

	spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp));

	mmu_spte_set(sptep, spte);

	mmu_page_add_parent_pte(cache, sp, sptep);

	/*
	 * The non-direct sub-pagetable must be updated before linking.  For
	 * L1 sp, the pagetable is updated via kvm_sync_page() in
	 * kvm_mmu_find_shadow_page() without write-protecting the gfn,
	 * so sp->unsync can be true or false.  For higher level non-direct
	 * sp, the pagetable is updated/synced via mmu_sync_children() in
	 * FNAME(fetch)(), so sp->unsync_children can only be false.
	 * WARN_ON_ONCE() if anything happens unexpectedly.
	 */
	if (WARN_ON_ONCE(sp->unsync_children) || sp->unsync)
		mark_unsync(sptep);
}

static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
			     struct kvm_mmu_page *sp)
{
	__link_shadow_page(vcpu->kvm, &vcpu->arch.mmu_pte_list_desc_cache, sptep, sp, true);
}

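/*
 * Net effect of link_shadow_page(): *sptep becomes a non-leaf SPTE whose
 * physical address points at sp->spt, sp gains a parent-PTE rmap entry for
 * sptep, and if sp carries unsync state the parent chain is marked so that
 * the next root sync will reach it.
 */
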
static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
				 unsigned direct_access)
{
	if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
		struct kvm_mmu_page *child;

		/*
		 * For the direct sp, if the guest pte's dirty bit
		 * changed from clean to dirty, it will corrupt the
		 * sp's access: allow writable in the read-only sp,
		 * so we should update the spte at this point to get
		 * a new sp with the correct access.
		 */
		child = spte_to_child_sp(*sptep);
		if (child->role.access == direct_access)
			return;

		drop_parent_pte(vcpu->kvm, child, sptep);
		kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep);
	}
}

/* Returns the number of zapped non-leaf child shadow pages. */
static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
			    u64 *spte, struct list_head *invalid_list)
{
	u64 pte;
	struct kvm_mmu_page *child;

	pte = *spte;
	if (is_shadow_present_pte(pte)) {
		if (is_last_spte(pte, sp->role.level)) {
			drop_spte(kvm, spte);
		} else {
			child = spte_to_child_sp(pte);
			drop_parent_pte(kvm, child, spte);

			/*
			 * Recursively zap nested TDP SPs, parentless SPs are
			 * unlikely to be used again in the near future.  This
			 * avoids retaining a large number of stale nested SPs.
			 */
			if (tdp_enabled && invalid_list &&
			    child->role.guest_mode && !child->parent_ptes.val)
				return kvm_mmu_prepare_zap_page(kvm, child,
								invalid_list);
		}
	} else if (is_mmio_spte(pte)) {
		mmu_spte_clear_no_track(spte);
	}
	return 0;
}

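/*
 * Summary of the cases above: a present leaf SPTE is simply dropped, a
 * present non-leaf SPTE is unhooked from its child page (and that child is
 * zapped recursively when it is an orphaned nested-TDP page), and an MMIO
 * SPTE is cleared without touching any rmap state.
 */
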
2527 static int kvm_mmu_page_unlink_children(struct kvm
*kvm
,
2528 struct kvm_mmu_page
*sp
,
2529 struct list_head
*invalid_list
)
2534 for (i
= 0; i
< SPTE_ENT_PER_PAGE
; ++i
)
2535 zapped
+= mmu_page_zap_pte(kvm
, sp
, sp
->spt
+ i
, invalid_list
);
2540 static void kvm_mmu_unlink_parents(struct kvm
*kvm
, struct kvm_mmu_page
*sp
)
2543 struct rmap_iterator iter
;
2545 while ((sptep
= rmap_get_first(&sp
->parent_ptes
, &iter
)))
2546 drop_parent_pte(kvm
, sp
, sptep
);
2549 static int mmu_zap_unsync_children(struct kvm
*kvm
,
2550 struct kvm_mmu_page
*parent
,
2551 struct list_head
*invalid_list
)
2554 struct mmu_page_path parents
;
2555 struct kvm_mmu_pages pages
;
2557 if (parent
->role
.level
== PG_LEVEL_4K
)
2560 while (mmu_unsync_walk(parent
, &pages
)) {
2561 struct kvm_mmu_page
*sp
;
2563 for_each_sp(pages
, sp
, parents
, i
) {
2564 kvm_mmu_prepare_zap_page(kvm
, sp
, invalid_list
);
2565 mmu_pages_clear_parents(&parents
);
2573 static bool __kvm_mmu_prepare_zap_page(struct kvm
*kvm
,
2574 struct kvm_mmu_page
*sp
,
2575 struct list_head
*invalid_list
,
2578 bool list_unstable
, zapped_root
= false;
2580 lockdep_assert_held_write(&kvm
->mmu_lock
);
2581 trace_kvm_mmu_prepare_zap_page(sp
);
2582 ++kvm
->stat
.mmu_shadow_zapped
;
2583 *nr_zapped
= mmu_zap_unsync_children(kvm
, sp
, invalid_list
);
2584 *nr_zapped
+= kvm_mmu_page_unlink_children(kvm
, sp
, invalid_list
);
2585 kvm_mmu_unlink_parents(kvm
, sp
);
2587 /* Zapping children means active_mmu_pages has become unstable. */
2588 list_unstable
= *nr_zapped
;
2590 if (!sp
->role
.invalid
&& sp_has_gptes(sp
))
2591 unaccount_shadowed(kvm
, sp
);
2594 kvm_unlink_unsync_page(kvm
, sp
);
2595 if (!sp
->root_count
) {
2600 * Already invalid pages (previously active roots) are not on
2601 * the active page list. See list_del() in the "else" case of
2604 if (sp
->role
.invalid
)
2605 list_add(&sp
->link
, invalid_list
);
2607 list_move(&sp
->link
, invalid_list
);
2608 kvm_unaccount_mmu_page(kvm
, sp
);
2611 * Remove the active root from the active page list, the root
2612 * will be explicitly freed when the root_count hits zero.
2614 list_del(&sp
->link
);
2617 * Obsolete pages cannot be used on any vCPUs, see the comment
2618 * in kvm_mmu_zap_all_fast(). Note, is_obsolete_sp() also
2619 * treats invalid shadow pages as being obsolete.
2621 zapped_root
= !is_obsolete_sp(kvm
, sp
);
2624 if (sp
->nx_huge_page_disallowed
)
2625 unaccount_nx_huge_page(kvm
, sp
);
2627 sp
->role
.invalid
= 1;
2630 * Make the request to free obsolete roots after marking the root
2631 * invalid, otherwise other vCPUs may not see it as invalid.
2634 kvm_make_all_cpus_request(kvm
, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS
);
2635 return list_unstable
;
2638 static bool kvm_mmu_prepare_zap_page(struct kvm
*kvm
, struct kvm_mmu_page
*sp
,
2639 struct list_head
*invalid_list
)
2643 __kvm_mmu_prepare_zap_page(kvm
, sp
, invalid_list
, &nr_zapped
);
2647 static void kvm_mmu_commit_zap_page(struct kvm
*kvm
,
2648 struct list_head
*invalid_list
)
2650 struct kvm_mmu_page
*sp
, *nsp
;
2652 if (list_empty(invalid_list
))
2656 * We need to make sure everyone sees our modifications to
2657 * the page tables and see changes to vcpu->mode here. The barrier
2658 * in the kvm_flush_remote_tlbs() achieves this. This pairs
2659 * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
2661 * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
2662 * guest mode and/or lockless shadow page table walks.
2664 kvm_flush_remote_tlbs(kvm
);
2666 list_for_each_entry_safe(sp
, nsp
, invalid_list
, link
) {
2667 WARN_ON_ONCE(!sp
->role
.invalid
|| sp
->root_count
);
2668 kvm_mmu_free_shadow_page(sp
);
2672 static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm
*kvm
,
2673 unsigned long nr_to_zap
)
2675 unsigned long total_zapped
= 0;
2676 struct kvm_mmu_page
*sp
, *tmp
;
2677 LIST_HEAD(invalid_list
);
2681 if (list_empty(&kvm
->arch
.active_mmu_pages
))
2685 list_for_each_entry_safe_reverse(sp
, tmp
, &kvm
->arch
.active_mmu_pages
, link
) {
2687 * Don't zap active root pages, the page itself can't be freed
2688 * and zapping it will just force vCPUs to realloc and reload.
2693 unstable
= __kvm_mmu_prepare_zap_page(kvm
, sp
, &invalid_list
,
2695 total_zapped
+= nr_zapped
;
2696 if (total_zapped
>= nr_to_zap
)
2703 kvm_mmu_commit_zap_page(kvm
, &invalid_list
);
2705 kvm
->stat
.mmu_recycled
+= total_zapped
;
2706 return total_zapped
;
2709 static inline unsigned long kvm_mmu_available_pages(struct kvm
*kvm
)
2711 if (kvm
->arch
.n_max_mmu_pages
> kvm
->arch
.n_used_mmu_pages
)
2712 return kvm
->arch
.n_max_mmu_pages
-
2713 kvm
->arch
.n_used_mmu_pages
;
2718 static int make_mmu_pages_available(struct kvm_vcpu
*vcpu
)
2720 unsigned long avail
= kvm_mmu_available_pages(vcpu
->kvm
);
2722 if (likely(avail
>= KVM_MIN_FREE_MMU_PAGES
))
2725 kvm_mmu_zap_oldest_mmu_pages(vcpu
->kvm
, KVM_REFILL_PAGES
- avail
);
2728 * Note, this check is intentionally soft, it only guarantees that one
2729 * page is available, while the caller may end up allocating as many as
2730 * four pages, e.g. for PAE roots or for 5-level paging. Temporarily
2731 * exceeding the (arbitrary by default) limit will not harm the host,
2732 * being too aggressive may unnecessarily kill the guest, and getting an
2733 * exact count is far more trouble than it's worth, especially in the
2736 if (!kvm_mmu_available_pages(vcpu
->kvm
))
2742 * Changing the number of mmu pages allocated to the vm
2743 * Note: if goal_nr_mmu_pages is too small, you will get dead lock
2745 void kvm_mmu_change_mmu_pages(struct kvm
*kvm
, unsigned long goal_nr_mmu_pages
)
2747 write_lock(&kvm
->mmu_lock
);
2749 if (kvm
->arch
.n_used_mmu_pages
> goal_nr_mmu_pages
) {
2750 kvm_mmu_zap_oldest_mmu_pages(kvm
, kvm
->arch
.n_used_mmu_pages
-
2753 goal_nr_mmu_pages
= kvm
->arch
.n_used_mmu_pages
;
2756 kvm
->arch
.n_max_mmu_pages
= goal_nr_mmu_pages
;
2758 write_unlock(&kvm
->mmu_lock
);
2761 int kvm_mmu_unprotect_page(struct kvm
*kvm
, gfn_t gfn
)
2763 struct kvm_mmu_page
*sp
;
2764 LIST_HEAD(invalid_list
);
2768 write_lock(&kvm
->mmu_lock
);
2769 for_each_gfn_valid_sp_with_gptes(kvm
, sp
, gfn
) {
2771 kvm_mmu_prepare_zap_page(kvm
, sp
, &invalid_list
);
2773 kvm_mmu_commit_zap_page(kvm
, &invalid_list
);
2774 write_unlock(&kvm
->mmu_lock
);
2779 static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu
*vcpu
, gva_t gva
)
2784 if (vcpu
->arch
.mmu
->root_role
.direct
)
2787 gpa
= kvm_mmu_gva_to_gpa_read(vcpu
, gva
, NULL
);
2789 r
= kvm_mmu_unprotect_page(vcpu
->kvm
, gpa
>> PAGE_SHIFT
);
2794 static void kvm_unsync_page(struct kvm
*kvm
, struct kvm_mmu_page
*sp
)
2796 trace_kvm_mmu_unsync_page(sp
);
2797 ++kvm
->stat
.mmu_unsync
;
2800 kvm_mmu_mark_parents_unsync(sp
);
2804 * Attempt to unsync any shadow pages that can be reached by the specified gfn,
2805 * KVM is creating a writable mapping for said gfn. Returns 0 if all pages
2806 * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must
2807 * be write-protected.
2809 int mmu_try_to_unsync_pages(struct kvm
*kvm
, const struct kvm_memory_slot
*slot
,
2810 gfn_t gfn
, bool can_unsync
, bool prefetch
)
2812 struct kvm_mmu_page
*sp
;
2813 bool locked
= false;
2816 * Force write-protection if the page is being tracked. Note, the page
2817 * track machinery is used to write-protect upper-level shadow pages,
2818 * i.e. this guards the role.level == 4K assertion below!
2820 if (kvm_gfn_is_write_tracked(kvm
, slot
, gfn
))
2824 * The page is not write-tracked, mark existing shadow pages unsync
2825 * unless KVM is synchronizing an unsync SP (can_unsync = false). In
2826 * that case, KVM must complete emulation of the guest TLB flush before
2827 * allowing shadow pages to become unsync (writable by the guest).
2829 for_each_gfn_valid_sp_with_gptes(kvm
, sp
, gfn
) {
2840 * TDP MMU page faults require an additional spinlock as they
2841 * run with mmu_lock held for read, not write, and the unsync
 * logic is not thread safe.  Take the spinlock regardless of
2843 * the MMU type to avoid extra conditionals/parameters, there's
2844 * no meaningful penalty if mmu_lock is held for write.
2848 spin_lock(&kvm
->arch
.mmu_unsync_pages_lock
);
2851 * Recheck after taking the spinlock, a different vCPU
2852 * may have since marked the page unsync. A false
2853 * positive on the unprotected check above is not
2854 * possible as clearing sp->unsync _must_ hold mmu_lock
2855 * for write, i.e. unsync cannot transition from 0->1
2856 * while this CPU holds mmu_lock for read (or write).
2858 if (READ_ONCE(sp
->unsync
))
2862 WARN_ON_ONCE(sp
->role
.level
!= PG_LEVEL_4K
);
2863 kvm_unsync_page(kvm
, sp
);
2866 spin_unlock(&kvm
->arch
.mmu_unsync_pages_lock
);
2869 * We need to ensure that the marking of unsync pages is visible
2870 * before the SPTE is updated to allow writes because
2871 * kvm_mmu_sync_roots() checks the unsync flags without holding
2872 * the MMU lock and so can race with this. If the SPTE was updated
2873 * before the page had been marked as unsync-ed, something like the
2874 * following could happen:
2877 * ---------------------------------------------------------------------
2878 * 1.2 Host updates SPTE
2880 * 2.1 Guest writes a GPTE for GVA X.
2881 * (GPTE being in the guest page table shadowed
2882 * by the SP from CPU 1.)
2883 * This reads SPTE during the page table walk.
2884 * Since SPTE.W is read as 1, there is no
2887 * 2.2 Guest issues TLB flush.
2888 * That causes a VM Exit.
2890 * 2.3 Walking of unsync pages sees sp->unsync is
2891 * false and skips the page.
2893 * 2.4 Guest accesses GVA X.
2894 * Since the mapping in the SP was not updated,
2895 * so the old mapping for GVA X incorrectly
2899 * (sp->unsync = true)
2901 * The write barrier below ensures that 1.1 happens before 1.2 and thus
2902 * the situation in 2.4 does not arise. It pairs with the read barrier
2903 * in is_unsync_root(), placed between 2.1's load of SPTE.W and 2.3.
2910 static int mmu_set_spte(struct kvm_vcpu
*vcpu
, struct kvm_memory_slot
*slot
,
2911 u64
*sptep
, unsigned int pte_access
, gfn_t gfn
,
2912 kvm_pfn_t pfn
, struct kvm_page_fault
*fault
)
2914 struct kvm_mmu_page
*sp
= sptep_to_sp(sptep
);
2915 int level
= sp
->role
.level
;
2916 int was_rmapped
= 0;
2917 int ret
= RET_PF_FIXED
;
2922 /* Prefetching always gets a writable pfn. */
2923 bool host_writable
= !fault
|| fault
->map_writable
;
2924 bool prefetch
= !fault
|| fault
->prefetch
;
2925 bool write_fault
= fault
&& fault
->write
;
2927 if (unlikely(is_noslot_pfn(pfn
))) {
2928 vcpu
->stat
.pf_mmio_spte_created
++;
2929 mark_mmio_spte(vcpu
, sptep
, gfn
, pte_access
);
2930 return RET_PF_EMULATE
;
2933 if (is_shadow_present_pte(*sptep
)) {
2935 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
2936 * the parent of the now unreachable PTE.
2938 if (level
> PG_LEVEL_4K
&& !is_large_pte(*sptep
)) {
2939 struct kvm_mmu_page
*child
;
2942 child
= spte_to_child_sp(pte
);
2943 drop_parent_pte(vcpu
->kvm
, child
, sptep
);
2945 } else if (pfn
!= spte_to_pfn(*sptep
)) {
2946 drop_spte(vcpu
->kvm
, sptep
);
2952 wrprot
= make_spte(vcpu
, sp
, slot
, pte_access
, gfn
, pfn
, *sptep
, prefetch
,
2953 true, host_writable
, &spte
);
2955 if (*sptep
== spte
) {
2956 ret
= RET_PF_SPURIOUS
;
2958 flush
|= mmu_spte_update(sptep
, spte
);
2959 trace_kvm_mmu_set_spte(level
, gfn
, sptep
);
2964 ret
= RET_PF_EMULATE
;
2968 kvm_flush_remote_tlbs_gfn(vcpu
->kvm
, gfn
, level
);
2971 WARN_ON_ONCE(ret
== RET_PF_SPURIOUS
);
2972 rmap_add(vcpu
, slot
, sptep
, gfn
, pte_access
);
2974 /* Already rmapped but the pte_access bits may have changed. */
2975 kvm_mmu_page_set_access(sp
, spte_index(sptep
), pte_access
);
2981 static int direct_pte_prefetch_many(struct kvm_vcpu
*vcpu
,
2982 struct kvm_mmu_page
*sp
,
2983 u64
*start
, u64
*end
)
2985 struct page
*pages
[PTE_PREFETCH_NUM
];
2986 struct kvm_memory_slot
*slot
;
2987 unsigned int access
= sp
->role
.access
;
2991 gfn
= kvm_mmu_page_get_gfn(sp
, spte_index(start
));
2992 slot
= gfn_to_memslot_dirty_bitmap(vcpu
, gfn
, access
& ACC_WRITE_MASK
);
2996 ret
= gfn_to_page_many_atomic(slot
, gfn
, pages
, end
- start
);
3000 for (i
= 0; i
< ret
; i
++, gfn
++, start
++) {
3001 mmu_set_spte(vcpu
, slot
, start
, access
, gfn
,
3002 page_to_pfn(pages
[i
]), NULL
);
3009 static void __direct_pte_prefetch(struct kvm_vcpu
*vcpu
,
3010 struct kvm_mmu_page
*sp
, u64
*sptep
)
3012 u64
*spte
, *start
= NULL
;
3015 WARN_ON_ONCE(!sp
->role
.direct
);
3017 i
= spte_index(sptep
) & ~(PTE_PREFETCH_NUM
- 1);
3020 for (i
= 0; i
< PTE_PREFETCH_NUM
; i
++, spte
++) {
3021 if (is_shadow_present_pte(*spte
) || spte
== sptep
) {
3024 if (direct_pte_prefetch_many(vcpu
, sp
, start
, spte
) < 0)
3031 direct_pte_prefetch_many(vcpu
, sp
, start
, spte
);
3034 static void direct_pte_prefetch(struct kvm_vcpu
*vcpu
, u64
*sptep
)
3036 struct kvm_mmu_page
*sp
;
3038 sp
= sptep_to_sp(sptep
);
3041 * Without accessed bits, there's no way to distinguish between
3042 * actually accessed translations and prefetched, so disable pte
3043 * prefetch if accessed bits aren't available.
3045 if (sp_ad_disabled(sp
))
3048 if (sp
->role
.level
> PG_LEVEL_4K
)
3052 * If addresses are being invalidated, skip prefetching to avoid
3053 * accidentally prefetching those addresses.
3055 if (unlikely(vcpu
->kvm
->mmu_invalidate_in_progress
))
3058 __direct_pte_prefetch(vcpu
, sp
, sptep
);
}

/*
 * Lookup the mapping level for @gfn in the current mm.
 *
 * WARNING!  Use of host_pfn_mapping_level() requires the caller and the end
 * consumer to be tied into KVM's handlers for MMU notifier events!
 *
 * There are several ways to safely use this helper:
 *
 * - Check mmu_invalidate_retry_gfn() after grabbing the mapping level, before
 *   consuming it.  In this case, mmu_lock doesn't need to be held during the
 *   lookup, but it does need to be held while checking the MMU notifier.
 *
 * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation
 *   event for the hva.  This can be done by explicitly checking the MMU notifier
 *   or by ensuring that KVM already has a valid mapping that covers the hva.
 *
 * - Do not use the result to install new mappings, e.g. use the host mapping
 *   level only to decide whether or not to zap an entry.  In this case, it's
 *   not required to hold mmu_lock (though it's highly likely the caller will
 *   want to hold mmu_lock anyways, e.g. to modify SPTEs).
 *
 * Note!  The lookup can still race with modifications to host page tables, but
 * the above "rules" ensure KVM will not _consume_ the result of the walk if a
 * race with the primary MMU occurs.
 */
3086 static int host_pfn_mapping_level(struct kvm
*kvm
, gfn_t gfn
,
3087 const struct kvm_memory_slot
*slot
)
3089 int level
= PG_LEVEL_4K
;
3091 unsigned long flags
;
3098 * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
3099 * is not solely for performance, it's also necessary to avoid the
3100 * "writable" check in __gfn_to_hva_many(), which will always fail on
3101 * read-only memslots due to gfn_to_hva() assuming writes. Earlier
3102 * page fault steps have already verified the guest isn't writing a
3103 * read-only memslot.
3105 hva
= __gfn_to_hva_memslot(slot
, gfn
);
3108 * Disable IRQs to prevent concurrent tear down of host page tables,
3109 * e.g. if the primary MMU promotes a P*D to a huge page and then frees
3110 * the original page table.
3112 local_irq_save(flags
);
3115 * Read each entry once. As above, a non-leaf entry can be promoted to
3116 * a huge page _during_ this walk. Re-reading the entry could send the
 * walk into the weeds, e.g. p*d_large() returns false (sees the old
3118 * value) and then p*d_offset() walks into the target huge page instead
3119 * of the old page table (sees the new value).
3121 pgd
= READ_ONCE(*pgd_offset(kvm
->mm
, hva
));
3125 p4d
= READ_ONCE(*p4d_offset(&pgd
, hva
));
3126 if (p4d_none(p4d
) || !p4d_present(p4d
))
3129 pud
= READ_ONCE(*pud_offset(&p4d
, hva
));
3130 if (pud_none(pud
) || !pud_present(pud
))
3133 if (pud_large(pud
)) {
3134 level
= PG_LEVEL_1G
;
3138 pmd
= READ_ONCE(*pmd_offset(&pud
, hva
));
3139 if (pmd_none(pmd
) || !pmd_present(pmd
))
3143 level
= PG_LEVEL_2M
;
3146 local_irq_restore(flags
);
3150 int kvm_mmu_max_mapping_level(struct kvm
*kvm
,
3151 const struct kvm_memory_slot
*slot
, gfn_t gfn
,
3154 struct kvm_lpage_info
*linfo
;
3157 max_level
= min(max_level
, max_huge_page_level
);
3158 for ( ; max_level
> PG_LEVEL_4K
; max_level
--) {
3159 linfo
= lpage_info_slot(gfn
, slot
, max_level
);
3160 if (!linfo
->disallow_lpage
)
3164 if (max_level
== PG_LEVEL_4K
)
3167 host_level
= host_pfn_mapping_level(kvm
, gfn
, slot
);
3168 return min(host_level
, max_level
);
3171 void kvm_mmu_hugepage_adjust(struct kvm_vcpu
*vcpu
, struct kvm_page_fault
*fault
)
3173 struct kvm_memory_slot
*slot
= fault
->slot
;
3176 fault
->huge_page_disallowed
= fault
->exec
&& fault
->nx_huge_page_workaround_enabled
;
3178 if (unlikely(fault
->max_level
== PG_LEVEL_4K
))
3181 if (is_error_noslot_pfn(fault
->pfn
))
3184 if (kvm_slot_dirty_track_enabled(slot
))
3188 * Enforce the iTLB multihit workaround after capturing the requested
3189 * level, which will be used to do precise, accurate accounting.
3191 fault
->req_level
= kvm_mmu_max_mapping_level(vcpu
->kvm
, slot
,
3192 fault
->gfn
, fault
->max_level
);
3193 if (fault
->req_level
== PG_LEVEL_4K
|| fault
->huge_page_disallowed
)
3197 * mmu_invalidate_retry() was successful and mmu_lock is held, so
3198 * the pmd can't be split from under us.
3200 fault
->goal_level
= fault
->req_level
;
3201 mask
= KVM_PAGES_PER_HPAGE(fault
->goal_level
) - 1;
3202 VM_BUG_ON((fault
->gfn
& mask
) != (fault
->pfn
& mask
));
3203 fault
->pfn
&= ~mask
;
3206 void disallowed_hugepage_adjust(struct kvm_page_fault
*fault
, u64 spte
, int cur_level
)
3208 if (cur_level
> PG_LEVEL_4K
&&
3209 cur_level
== fault
->goal_level
&&
3210 is_shadow_present_pte(spte
) &&
3211 !is_large_pte(spte
) &&
3212 spte_to_child_sp(spte
)->nx_huge_page_disallowed
) {
3214 * A small SPTE exists for this pfn, but FNAME(fetch),
3215 * direct_map(), or kvm_tdp_mmu_map() would like to create a
3216 * large PTE instead: just force them to go down another level,
3217 * patching back for them into pfn the next 9 bits of the
3220 u64 page_mask
= KVM_PAGES_PER_HPAGE(cur_level
) -
3221 KVM_PAGES_PER_HPAGE(cur_level
- 1);
3222 fault
->pfn
|= fault
->gfn
& page_mask
;
3223 fault
->goal_level
--;
3227 static int direct_map(struct kvm_vcpu
*vcpu
, struct kvm_page_fault
*fault
)
3229 struct kvm_shadow_walk_iterator it
;
3230 struct kvm_mmu_page
*sp
;
3232 gfn_t base_gfn
= fault
->gfn
;
3234 kvm_mmu_hugepage_adjust(vcpu
, fault
);
3236 trace_kvm_mmu_spte_requested(fault
);
3237 for_each_shadow_entry(vcpu
, fault
->addr
, it
) {
3239 * We cannot overwrite existing page tables with an NX
3240 * large page, as the leaf could be executable.
3242 if (fault
->nx_huge_page_workaround_enabled
)
3243 disallowed_hugepage_adjust(fault
, *it
.sptep
, it
.level
);
3245 base_gfn
= gfn_round_for_level(fault
->gfn
, it
.level
);
3246 if (it
.level
== fault
->goal_level
)
3249 sp
= kvm_mmu_get_child_sp(vcpu
, it
.sptep
, base_gfn
, true, ACC_ALL
);
3250 if (sp
== ERR_PTR(-EEXIST
))
3253 link_shadow_page(vcpu
, it
.sptep
, sp
);
3254 if (fault
->huge_page_disallowed
)
3255 account_nx_huge_page(vcpu
->kvm
, sp
,
3256 fault
->req_level
>= it
.level
);
3259 if (WARN_ON_ONCE(it
.level
!= fault
->goal_level
))
3262 ret
= mmu_set_spte(vcpu
, fault
->slot
, it
.sptep
, ACC_ALL
,
3263 base_gfn
, fault
->pfn
, fault
);
3264 if (ret
== RET_PF_SPURIOUS
)
3267 direct_pte_prefetch(vcpu
, it
.sptep
);
3271 static void kvm_send_hwpoison_signal(struct kvm_memory_slot
*slot
, gfn_t gfn
)
3273 unsigned long hva
= gfn_to_hva_memslot(slot
, gfn
);
3275 send_sig_mceerr(BUS_MCEERR_AR
, (void __user
*)hva
, PAGE_SHIFT
, current
);
3278 static int kvm_handle_error_pfn(struct kvm_vcpu
*vcpu
, struct kvm_page_fault
*fault
)
3280 if (is_sigpending_pfn(fault
->pfn
)) {
3281 kvm_handle_signal_exit(vcpu
);
3286 * Do not cache the mmio info caused by writing the readonly gfn
3287 * into the spte otherwise read access on readonly gfn also can
3288 * caused mmio page fault and treat it as mmio access.
3290 if (fault
->pfn
== KVM_PFN_ERR_RO_FAULT
)
3291 return RET_PF_EMULATE
;
3293 if (fault
->pfn
== KVM_PFN_ERR_HWPOISON
) {
3294 kvm_send_hwpoison_signal(fault
->slot
, fault
->gfn
);
3295 return RET_PF_RETRY
;
3301 static int kvm_handle_noslot_fault(struct kvm_vcpu
*vcpu
,
3302 struct kvm_page_fault
*fault
,
3303 unsigned int access
)
3305 gva_t gva
= fault
->is_tdp
? 0 : fault
->addr
;
3307 vcpu_cache_mmio_info(vcpu
, gva
, fault
->gfn
,
3308 access
& shadow_mmio_access_mask
);
3311 * If MMIO caching is disabled, emulate immediately without
3312 * touching the shadow page tables as attempting to install an
3313 * MMIO SPTE will just be an expensive nop.
3315 if (unlikely(!enable_mmio_caching
))
3316 return RET_PF_EMULATE
;
3319 * Do not create an MMIO SPTE for a gfn greater than host.MAXPHYADDR,
3320 * any guest that generates such gfns is running nested and is being
3321 * tricked by L0 userspace (you can observe gfn > L1.MAXPHYADDR if and
3322 * only if L1's MAXPHYADDR is inaccurate with respect to the
3325 if (unlikely(fault
->gfn
> kvm_mmu_max_gfn()))
3326 return RET_PF_EMULATE
;
3328 return RET_PF_CONTINUE
;
3331 static bool page_fault_can_be_fast(struct kvm_page_fault
*fault
)
3334 * Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only
3335 * reach the common page fault handler if the SPTE has an invalid MMIO
3336 * generation number. Refreshing the MMIO generation needs to go down
3337 * the slow path. Note, EPT Misconfigs do NOT set the PRESENT flag!
3343 * #PF can be fast if:
3345 * 1. The shadow page table entry is not present and A/D bits are
3346 * disabled _by KVM_, which could mean that the fault is potentially
3347 * caused by access tracking (if enabled). If A/D bits are enabled
3348 * by KVM, but disabled by L1 for L2, KVM is forced to disable A/D
3349 * bits for L2 and employ access tracking, but the fast page fault
3350 * mechanism only supports direct MMUs.
3351 * 2. The shadow page table entry is present, the access is a write,
3352 * and no reserved bits are set (MMIO SPTEs cannot be "fixed"), i.e.
3353 * the fault was caused by a write-protection violation. If the
3354 * SPTE is MMU-writable (determined later), the fault can be fixed
3355 * by setting the Writable bit, which can be done out of mmu_lock.
3357 if (!fault
->present
)
3358 return !kvm_ad_enabled();
3361 * Note, instruction fetches and writes are mutually exclusive, ignore
3364 return fault
->write
;
3368 * Returns true if the SPTE was fixed successfully. Otherwise,
3369 * someone else modified the SPTE from its original value.
3371 static bool fast_pf_fix_direct_spte(struct kvm_vcpu
*vcpu
,
3372 struct kvm_page_fault
*fault
,
3373 u64
*sptep
, u64 old_spte
, u64 new_spte
)
3376 * Theoretically we could also set dirty bit (and flush TLB) here in
3377 * order to eliminate unnecessary PML logging. See comments in
3378 * set_spte. But fast_page_fault is very unlikely to happen with PML
3379 * enabled, so we do not do this. This might result in the same GPA
3380 * to be logged in PML buffer again when the write really happens, and
3381 * eventually to be called by mark_page_dirty twice. But it's also no
3382 * harm. This also avoids the TLB flush needed after setting dirty bit
3383 * so non-PML cases won't be impacted.
3385 * Compare with set_spte where instead shadow_dirty_mask is set.
3387 if (!try_cmpxchg64(sptep
, &old_spte
, new_spte
))
3390 if (is_writable_pte(new_spte
) && !is_writable_pte(old_spte
))
3391 mark_page_dirty_in_slot(vcpu
->kvm
, fault
->slot
, fault
->gfn
);
3396 static bool is_access_allowed(struct kvm_page_fault
*fault
, u64 spte
)
3399 return is_executable_pte(spte
);
3402 return is_writable_pte(spte
);
3404 /* Fault was on Read access */
3405 return spte
& PT_PRESENT_MASK
;
3409 * Returns the last level spte pointer of the shadow page walk for the given
3410 * gpa, and sets *spte to the spte value. This spte may be non-preset. If no
3411 * walk could be performed, returns NULL and *spte does not contain valid data.
3414 * - Must be called between walk_shadow_page_lockless_{begin,end}.
3415 * - The returned sptep must not be used after walk_shadow_page_lockless_end.
3417 static u64
*fast_pf_get_last_sptep(struct kvm_vcpu
*vcpu
, gpa_t gpa
, u64
*spte
)
3419 struct kvm_shadow_walk_iterator iterator
;
3423 for_each_shadow_entry_lockless(vcpu
, gpa
, iterator
, old_spte
) {
3424 sptep
= iterator
.sptep
;
3432 * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
3434 static int fast_page_fault(struct kvm_vcpu
*vcpu
, struct kvm_page_fault
*fault
)
3436 struct kvm_mmu_page
*sp
;
3437 int ret
= RET_PF_INVALID
;
3440 uint retry_count
= 0;
3442 if (!page_fault_can_be_fast(fault
))
3445 walk_shadow_page_lockless_begin(vcpu
);
3450 if (tdp_mmu_enabled
)
3451 sptep
= kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu
, fault
->addr
, &spte
);
3453 sptep
= fast_pf_get_last_sptep(vcpu
, fault
->addr
, &spte
);
3456 * It's entirely possible for the mapping to have been zapped
3457 * by a different task, but the root page should always be
3458 * available as the vCPU holds a reference to its root(s).
3460 if (WARN_ON_ONCE(!sptep
))
3461 spte
= REMOVED_SPTE
;
3463 if (!is_shadow_present_pte(spte
))
3466 sp
= sptep_to_sp(sptep
);
3467 if (!is_last_spte(spte
, sp
->role
.level
))
3471 * Check whether the memory access that caused the fault would
3472 * still cause it if it were to be performed right now. If not,
3473 * then this is a spurious fault caused by TLB lazily flushed,
3474 * or some other CPU has already fixed the PTE after the
3475 * current CPU took the fault.
3477 * Need not check the access of upper level table entries since
3478 * they are always ACC_ALL.
3480 if (is_access_allowed(fault
, spte
)) {
3481 ret
= RET_PF_SPURIOUS
;
3488 * KVM only supports fixing page faults outside of MMU lock for
3489 * direct MMUs, nested MMUs are always indirect, and KVM always
3490 * uses A/D bits for non-nested MMUs. Thus, if A/D bits are
3491 * enabled, the SPTE can't be an access-tracked SPTE.
3493 if (unlikely(!kvm_ad_enabled()) && is_access_track_spte(spte
))
3494 new_spte
= restore_acc_track_spte(new_spte
);
3497 * To keep things simple, only SPTEs that are MMU-writable can
3498 * be made fully writable outside of mmu_lock, e.g. only SPTEs
3499 * that were write-protected for dirty-logging or access
3500 * tracking are handled here. Don't bother checking if the
3501 * SPTE is writable to prioritize running with A/D bits enabled.
3502 * The is_access_allowed() check above handles the common case
3503 * of the fault being spurious, and the SPTE is known to be
3504 * shadow-present, i.e. except for access tracking restoration
3505 * making the new SPTE writable, the check is wasteful.
3507 if (fault
->write
&& is_mmu_writable_spte(spte
)) {
3508 new_spte
|= PT_WRITABLE_MASK
;
3511 * Do not fix write-permission on the large spte when
3512 * dirty logging is enabled. Since we only dirty the
3513 * first page into the dirty-bitmap in
3514 * fast_pf_fix_direct_spte(), other pages are missed
3515 * if its slot has dirty logging enabled.
3517 * Instead, we let the slow page fault path create a
3518 * normal spte to fix the access.
3520 if (sp
->role
.level
> PG_LEVEL_4K
&&
3521 kvm_slot_dirty_track_enabled(fault
->slot
))
3525 /* Verify that the fault can be handled in the fast path */
3526 if (new_spte
== spte
||
3527 !is_access_allowed(fault
, new_spte
))
3531 * Currently, fast page fault only works for direct mapping
3532 * since the gfn is not stable for indirect shadow page. See
3533 * Documentation/virt/kvm/locking.rst to get more detail.
3535 if (fast_pf_fix_direct_spte(vcpu
, fault
, sptep
, spte
, new_spte
)) {
3540 if (++retry_count
> 4) {
3541 pr_warn_once("Fast #PF retrying more than 4 times.\n");
3547 trace_fast_page_fault(vcpu
, fault
, sptep
, spte
, ret
);
3548 walk_shadow_page_lockless_end(vcpu
);
3550 if (ret
!= RET_PF_INVALID
)
3551 vcpu
->stat
.pf_fast
++;
3556 static void mmu_free_root_page(struct kvm
*kvm
, hpa_t
*root_hpa
,
3557 struct list_head
*invalid_list
)
3559 struct kvm_mmu_page
*sp
;
3561 if (!VALID_PAGE(*root_hpa
))
3564 sp
= root_to_sp(*root_hpa
);
3565 if (WARN_ON_ONCE(!sp
))
3568 if (is_tdp_mmu_page(sp
))
3569 kvm_tdp_mmu_put_root(kvm
, sp
, false);
3570 else if (!--sp
->root_count
&& sp
->role
.invalid
)
3571 kvm_mmu_prepare_zap_page(kvm
, sp
, invalid_list
);
3573 *root_hpa
= INVALID_PAGE
;
3576 /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
3577 void kvm_mmu_free_roots(struct kvm
*kvm
, struct kvm_mmu
*mmu
,
3578 ulong roots_to_free
)
3581 LIST_HEAD(invalid_list
);
3582 bool free_active_root
;
3584 WARN_ON_ONCE(roots_to_free
& ~KVM_MMU_ROOTS_ALL
);
3586 BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS
>= BITS_PER_LONG
);
3588 /* Before acquiring the MMU lock, see if we need to do any real work. */
3589 free_active_root
= (roots_to_free
& KVM_MMU_ROOT_CURRENT
)
3590 && VALID_PAGE(mmu
->root
.hpa
);
3592 if (!free_active_root
) {
3593 for (i
= 0; i
< KVM_MMU_NUM_PREV_ROOTS
; i
++)
3594 if ((roots_to_free
& KVM_MMU_ROOT_PREVIOUS(i
)) &&
3595 VALID_PAGE(mmu
->prev_roots
[i
].hpa
))
3598 if (i
== KVM_MMU_NUM_PREV_ROOTS
)
3602 write_lock(&kvm
->mmu_lock
);
3604 for (i
= 0; i
< KVM_MMU_NUM_PREV_ROOTS
; i
++)
3605 if (roots_to_free
& KVM_MMU_ROOT_PREVIOUS(i
))
3606 mmu_free_root_page(kvm
, &mmu
->prev_roots
[i
].hpa
,
3609 if (free_active_root
) {
3610 if (kvm_mmu_is_dummy_root(mmu
->root
.hpa
)) {
3611 /* Nothing to cleanup for dummy roots. */
3612 } else if (root_to_sp(mmu
->root
.hpa
)) {
3613 mmu_free_root_page(kvm
, &mmu
->root
.hpa
, &invalid_list
);
3614 } else if (mmu
->pae_root
) {
3615 for (i
= 0; i
< 4; ++i
) {
3616 if (!IS_VALID_PAE_ROOT(mmu
->pae_root
[i
]))
3619 mmu_free_root_page(kvm
, &mmu
->pae_root
[i
],
3621 mmu
->pae_root
[i
] = INVALID_PAE_ROOT
;
3624 mmu
->root
.hpa
= INVALID_PAGE
;
3628 kvm_mmu_commit_zap_page(kvm
, &invalid_list
);
3629 write_unlock(&kvm
->mmu_lock
);
3631 EXPORT_SYMBOL_GPL(kvm_mmu_free_roots
);
3633 void kvm_mmu_free_guest_mode_roots(struct kvm
*kvm
, struct kvm_mmu
*mmu
)
3635 unsigned long roots_to_free
= 0;
3636 struct kvm_mmu_page
*sp
;
3641 * This should not be called while L2 is active, L2 can't invalidate
3642 * _only_ its own roots, e.g. INVVPID unconditionally exits.
3644 WARN_ON_ONCE(mmu
->root_role
.guest_mode
);
3646 for (i
= 0; i
< KVM_MMU_NUM_PREV_ROOTS
; i
++) {
3647 root_hpa
= mmu
->prev_roots
[i
].hpa
;
3648 if (!VALID_PAGE(root_hpa
))
3651 sp
= root_to_sp(root_hpa
);
3652 if (!sp
|| sp
->role
.guest_mode
)
3653 roots_to_free
|= KVM_MMU_ROOT_PREVIOUS(i
);
3656 kvm_mmu_free_roots(kvm
, mmu
, roots_to_free
);
3658 EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots
);
3660 static hpa_t
mmu_alloc_root(struct kvm_vcpu
*vcpu
, gfn_t gfn
, int quadrant
,
3663 union kvm_mmu_page_role role
= vcpu
->arch
.mmu
->root_role
;
3664 struct kvm_mmu_page
*sp
;
3667 role
.quadrant
= quadrant
;
3669 WARN_ON_ONCE(quadrant
&& !role
.has_4_byte_gpte
);
3670 WARN_ON_ONCE(role
.direct
&& role
.has_4_byte_gpte
);
3672 sp
= kvm_mmu_get_shadow_page(vcpu
, gfn
, role
);
3675 return __pa(sp
->spt
);
3678 static int mmu_alloc_direct_roots(struct kvm_vcpu
*vcpu
)
3680 struct kvm_mmu
*mmu
= vcpu
->arch
.mmu
;
3681 u8 shadow_root_level
= mmu
->root_role
.level
;
3686 write_lock(&vcpu
->kvm
->mmu_lock
);
3687 r
= make_mmu_pages_available(vcpu
);
3691 if (tdp_mmu_enabled
) {
3692 root
= kvm_tdp_mmu_get_vcpu_root_hpa(vcpu
);
3693 mmu
->root
.hpa
= root
;
3694 } else if (shadow_root_level
>= PT64_ROOT_4LEVEL
) {
3695 root
= mmu_alloc_root(vcpu
, 0, 0, shadow_root_level
);
3696 mmu
->root
.hpa
= root
;
3697 } else if (shadow_root_level
== PT32E_ROOT_LEVEL
) {
3698 if (WARN_ON_ONCE(!mmu
->pae_root
)) {
3703 for (i
= 0; i
< 4; ++i
) {
3704 WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu
->pae_root
[i
]));
3706 root
= mmu_alloc_root(vcpu
, i
<< (30 - PAGE_SHIFT
), 0,
3708 mmu
->pae_root
[i
] = root
| PT_PRESENT_MASK
|
3711 mmu
->root
.hpa
= __pa(mmu
->pae_root
);
3713 WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level
);
3718 /* root.pgd is ignored for direct MMUs. */
3721 write_unlock(&vcpu
->kvm
->mmu_lock
);
3725 static int mmu_first_shadow_root_alloc(struct kvm
*kvm
)
3727 struct kvm_memslots
*slots
;
3728 struct kvm_memory_slot
*slot
;
3732 * Check if this is the first shadow root being allocated before
3735 if (kvm_shadow_root_allocated(kvm
))
3738 mutex_lock(&kvm
->slots_arch_lock
);
3740 /* Recheck, under the lock, whether this is the first shadow root. */
3741 if (kvm_shadow_root_allocated(kvm
))
3745 * Check if anything actually needs to be allocated, e.g. all metadata
3746 * will be allocated upfront if TDP is disabled.
3748 if (kvm_memslots_have_rmaps(kvm
) &&
3749 kvm_page_track_write_tracking_enabled(kvm
))
3752 for (i
= 0; i
< KVM_ADDRESS_SPACE_NUM
; i
++) {
3753 slots
= __kvm_memslots(kvm
, i
);
3754 kvm_for_each_memslot(slot
, bkt
, slots
) {
3756 * Both of these functions are no-ops if the target is
3757 * already allocated, so unconditionally calling both
3758 * is safe. Intentionally do NOT free allocations on
3759 * failure to avoid having to track which allocations
3760 * were made now versus when the memslot was created.
3761 * The metadata is guaranteed to be freed when the slot
3762 * is freed, and will be kept/used if userspace retries
3763 * KVM_RUN instead of killing the VM.
3765 r
= memslot_rmap_alloc(slot
, slot
->npages
);
3768 r
= kvm_page_track_write_tracking_alloc(slot
);
3775 * Ensure that shadow_root_allocated becomes true strictly after
3776 * all the related pointers are set.
3779 smp_store_release(&kvm
->arch
.shadow_root_allocated
, true);
3782 mutex_unlock(&kvm
->slots_arch_lock
);
3786 static int mmu_alloc_shadow_roots(struct kvm_vcpu
*vcpu
)
3788 struct kvm_mmu
*mmu
= vcpu
->arch
.mmu
;
3789 u64 pdptrs
[4], pm_mask
;
3790 gfn_t root_gfn
, root_pgd
;
3794 root_pgd
= kvm_mmu_get_guest_pgd(vcpu
, mmu
);
3795 root_gfn
= root_pgd
>> PAGE_SHIFT
;
3797 if (!kvm_vcpu_is_visible_gfn(vcpu
, root_gfn
)) {
3798 mmu
->root
.hpa
= kvm_mmu_get_dummy_root();
3803 * On SVM, reading PDPTRs might access guest memory, which might fault
3804 * and thus might sleep. Grab the PDPTRs before acquiring mmu_lock.
3806 if (mmu
->cpu_role
.base
.level
== PT32E_ROOT_LEVEL
) {
3807 for (i
= 0; i
< 4; ++i
) {
3808 pdptrs
[i
] = mmu
->get_pdptr(vcpu
, i
);
3809 if (!(pdptrs
[i
] & PT_PRESENT_MASK
))
3812 if (!kvm_vcpu_is_visible_gfn(vcpu
, pdptrs
[i
] >> PAGE_SHIFT
))
3817 r
= mmu_first_shadow_root_alloc(vcpu
->kvm
);
3821 write_lock(&vcpu
->kvm
->mmu_lock
);
3822 r
= make_mmu_pages_available(vcpu
);
3827 * Do we shadow a long mode page table? If so we need to
3828 * write-protect the guests page table root.
3830 if (mmu
->cpu_role
.base
.level
>= PT64_ROOT_4LEVEL
) {
3831 root
= mmu_alloc_root(vcpu
, root_gfn
, 0,
3832 mmu
->root_role
.level
);
3833 mmu
->root
.hpa
= root
;
3837 if (WARN_ON_ONCE(!mmu
->pae_root
)) {
3843 * We shadow a 32 bit page table. This may be a legacy 2-level
3844 * or a PAE 3-level page table. In either case we need to be aware that
3845 * the shadow page table may be a PAE or a long mode page table.
3847 pm_mask
= PT_PRESENT_MASK
| shadow_me_value
;
3848 if (mmu
->root_role
.level
>= PT64_ROOT_4LEVEL
) {
3849 pm_mask
|= PT_ACCESSED_MASK
| PT_WRITABLE_MASK
| PT_USER_MASK
;
3851 if (WARN_ON_ONCE(!mmu
->pml4_root
)) {
3855 mmu
->pml4_root
[0] = __pa(mmu
->pae_root
) | pm_mask
;
3857 if (mmu
->root_role
.level
== PT64_ROOT_5LEVEL
) {
3858 if (WARN_ON_ONCE(!mmu
->pml5_root
)) {
3862 mmu
->pml5_root
[0] = __pa(mmu
->pml4_root
) | pm_mask
;
3866 for (i
= 0; i
< 4; ++i
) {
3867 WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu
->pae_root
[i
]));
3869 if (mmu
->cpu_role
.base
.level
== PT32E_ROOT_LEVEL
) {
3870 if (!(pdptrs
[i
] & PT_PRESENT_MASK
)) {
3871 mmu
->pae_root
[i
] = INVALID_PAE_ROOT
;
3874 root_gfn
= pdptrs
[i
] >> PAGE_SHIFT
;
3878 * If shadowing 32-bit non-PAE page tables, each PAE page
3879 * directory maps one quarter of the guest's non-PAE page
 * directory.  Otherwise each PAE page directory shadows one guest
3881 * PAE page directory so that quadrant should be 0.
3883 quadrant
= (mmu
->cpu_role
.base
.level
== PT32_ROOT_LEVEL
) ? i
: 0;
3885 root
= mmu_alloc_root(vcpu
, root_gfn
, quadrant
, PT32_ROOT_LEVEL
);
3886 mmu
->pae_root
[i
] = root
| pm_mask
;
3889 if (mmu
->root_role
.level
== PT64_ROOT_5LEVEL
)
3890 mmu
->root
.hpa
= __pa(mmu
->pml5_root
);
3891 else if (mmu
->root_role
.level
== PT64_ROOT_4LEVEL
)
3892 mmu
->root
.hpa
= __pa(mmu
->pml4_root
);
3894 mmu
->root
.hpa
= __pa(mmu
->pae_root
);
3897 mmu
->root
.pgd
= root_pgd
;
3899 write_unlock(&vcpu
->kvm
->mmu_lock
);
3904 static int mmu_alloc_special_roots(struct kvm_vcpu
*vcpu
)
3906 struct kvm_mmu
*mmu
= vcpu
->arch
.mmu
;
3907 bool need_pml5
= mmu
->root_role
.level
> PT64_ROOT_4LEVEL
;
3908 u64
*pml5_root
= NULL
;
3909 u64
*pml4_root
= NULL
;
3913 * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
3914 * tables are allocated and initialized at root creation as there is no
3915 * equivalent level in the guest's NPT to shadow. Allocate the tables
3916 * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
3918 if (mmu
->root_role
.direct
||
3919 mmu
->cpu_role
.base
.level
>= PT64_ROOT_4LEVEL
||
3920 mmu
->root_role
.level
< PT64_ROOT_4LEVEL
)
3924 * NPT, the only paging mode that uses this horror, uses a fixed number
3925 * of levels for the shadow page tables, e.g. all MMUs are 4-level or
 * all MMUs are 5-level.  Thus, this can safely require that pml5_root
3927 * is allocated if the other roots are valid and pml5 is needed, as any
3928 * prior MMU would also have required pml5.
3930 if (mmu
->pae_root
&& mmu
->pml4_root
&& (!need_pml5
|| mmu
->pml5_root
))
3934 * The special roots should always be allocated in concert. Yell and
3935 * bail if KVM ends up in a state where only one of the roots is valid.
3937 if (WARN_ON_ONCE(!tdp_enabled
|| mmu
->pae_root
|| mmu
->pml4_root
||
3938 (need_pml5
&& mmu
->pml5_root
)))
3942 * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and
3943 * doesn't need to be decrypted.
3945 pae_root
= (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT
);
3949 #ifdef CONFIG_X86_64
3950 pml4_root
= (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT
);
3955 pml5_root
= (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT
);
3961 mmu
->pae_root
= pae_root
;
3962 mmu
->pml4_root
= pml4_root
;
3963 mmu
->pml5_root
= pml5_root
;
3967 #ifdef CONFIG_X86_64
3969 free_page((unsigned long)pml4_root
);
3971 free_page((unsigned long)pae_root
);
3976 static bool is_unsync_root(hpa_t root
)
3978 struct kvm_mmu_page
*sp
;
3980 if (!VALID_PAGE(root
) || kvm_mmu_is_dummy_root(root
))
3984 * The read barrier orders the CPU's read of SPTE.W during the page table
3985 * walk before the reads of sp->unsync/sp->unsync_children here.
3987 * Even if another CPU was marking the SP as unsync-ed simultaneously,
3988 * any guest page table changes are not guaranteed to be visible anyway
3989 * until this VCPU issues a TLB flush strictly after those changes are
3990 * made. We only need to ensure that the other CPU sets these flags
3991 * before any actual changes to the page tables are made. The comments
3992 * in mmu_try_to_unsync_pages() describe what could go wrong if this
3993 * requirement isn't satisfied.
3996 sp
= root_to_sp(root
);
3999 * PAE roots (somewhat arbitrarily) aren't backed by shadow pages, the
4000 * PDPTEs for a given PAE root need to be synchronized individually.
4002 if (WARN_ON_ONCE(!sp
))
4005 if (sp
->unsync
|| sp
->unsync_children
)
4011 void kvm_mmu_sync_roots(struct kvm_vcpu
*vcpu
)
4014 struct kvm_mmu_page
*sp
;
4016 if (vcpu
->arch
.mmu
->root_role
.direct
)
4019 if (!VALID_PAGE(vcpu
->arch
.mmu
->root
.hpa
))
4022 vcpu_clear_mmio_info(vcpu
, MMIO_GVA_ANY
);
4024 if (vcpu
->arch
.mmu
->cpu_role
.base
.level
>= PT64_ROOT_4LEVEL
) {
4025 hpa_t root
= vcpu
->arch
.mmu
->root
.hpa
;
4027 if (!is_unsync_root(root
))
4030 sp
= root_to_sp(root
);
4032 write_lock(&vcpu
->kvm
->mmu_lock
);
4033 mmu_sync_children(vcpu
, sp
, true);
4034 write_unlock(&vcpu
->kvm
->mmu_lock
);
4038 write_lock(&vcpu
->kvm
->mmu_lock
);
4040 for (i
= 0; i
< 4; ++i
) {
4041 hpa_t root
= vcpu
->arch
.mmu
->pae_root
[i
];
4043 if (IS_VALID_PAE_ROOT(root
)) {
4044 sp
= spte_to_child_sp(root
);
4045 mmu_sync_children(vcpu
, sp
, true);
4049 write_unlock(&vcpu
->kvm
->mmu_lock
);
4052 void kvm_mmu_sync_prev_roots(struct kvm_vcpu
*vcpu
)
4054 unsigned long roots_to_free
= 0;
4057 for (i
= 0; i
< KVM_MMU_NUM_PREV_ROOTS
; i
++)
4058 if (is_unsync_root(vcpu
->arch
.mmu
->prev_roots
[i
].hpa
))
4059 roots_to_free
|= KVM_MMU_ROOT_PREVIOUS(i
);
4061 /* sync prev_roots by simply freeing them */
4062 kvm_mmu_free_roots(vcpu
->kvm
, vcpu
->arch
.mmu
, roots_to_free
);
4065 static gpa_t
nonpaging_gva_to_gpa(struct kvm_vcpu
*vcpu
, struct kvm_mmu
*mmu
,
4066 gpa_t vaddr
, u64 access
,
4067 struct x86_exception
*exception
)
4070 exception
->error_code
= 0;
4071 return kvm_translate_gpa(vcpu
, mmu
, vaddr
, access
, exception
);
4074 static bool mmio_info_in_cache(struct kvm_vcpu
*vcpu
, u64 addr
, bool direct
)
4077 * A nested guest cannot use the MMIO cache if it is using nested
4078 * page tables, because cr2 is a nGPA while the cache stores GPAs.
4080 if (mmu_is_nested(vcpu
))
4084 return vcpu_match_mmio_gpa(vcpu
, addr
);
4086 return vcpu_match_mmio_gva(vcpu
, addr
);
4090 * Return the level of the lowest level SPTE added to sptes.
4091 * That SPTE may be non-present.
4093 * Must be called between walk_shadow_page_lockless_{begin,end}.
4095 static int get_walk(struct kvm_vcpu
*vcpu
, u64 addr
, u64
*sptes
, int *root_level
)
4097 struct kvm_shadow_walk_iterator iterator
;
4101 for (shadow_walk_init(&iterator
, vcpu
, addr
),
4102 *root_level
= iterator
.level
;
4103 shadow_walk_okay(&iterator
);
4104 __shadow_walk_next(&iterator
, spte
)) {
4105 leaf
= iterator
.level
;
4106 spte
= mmu_spte_get_lockless(iterator
.sptep
);
4114 /* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */
4115 static bool get_mmio_spte(struct kvm_vcpu
*vcpu
, u64 addr
, u64
*sptep
)
4117 u64 sptes
[PT64_ROOT_MAX_LEVEL
+ 1];
4118 struct rsvd_bits_validate
*rsvd_check
;
4119 int root
, leaf
, level
;
4120 bool reserved
= false;
4122 walk_shadow_page_lockless_begin(vcpu
);
4124 if (is_tdp_mmu_active(vcpu
))
4125 leaf
= kvm_tdp_mmu_get_walk(vcpu
, addr
, sptes
, &root
);
4127 leaf
= get_walk(vcpu
, addr
, sptes
, &root
);
4129 walk_shadow_page_lockless_end(vcpu
);
4131 if (unlikely(leaf
< 0)) {
4136 *sptep
= sptes
[leaf
];
4139 * Skip reserved bits checks on the terminal leaf if it's not a valid
4140 * SPTE. Note, this also (intentionally) skips MMIO SPTEs, which, by
4141 * design, always have reserved bits set. The purpose of the checks is
4142 * to detect reserved bits on non-MMIO SPTEs. i.e. buggy SPTEs.
4144 if (!is_shadow_present_pte(sptes
[leaf
]))
4147 rsvd_check
= &vcpu
->arch
.mmu
->shadow_zero_check
;
4149 for (level
= root
; level
>= leaf
; level
--)
4150 reserved
|= is_rsvd_spte(rsvd_check
, sptes
[level
], level
);
4153 pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n",
4155 for (level
= root
; level
>= leaf
; level
--)
4156 pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx",
4157 sptes
[level
], level
,
4158 get_rsvd_bits(rsvd_check
, sptes
[level
], level
));
4164 static int handle_mmio_page_fault(struct kvm_vcpu
*vcpu
, u64 addr
, bool direct
)
4169 if (mmio_info_in_cache(vcpu
, addr
, direct
))
4170 return RET_PF_EMULATE
;
4172 reserved
= get_mmio_spte(vcpu
, addr
, &spte
);
4173 if (WARN_ON_ONCE(reserved
))
4176 if (is_mmio_spte(spte
)) {
4177 gfn_t gfn
= get_mmio_spte_gfn(spte
);
4178 unsigned int access
= get_mmio_spte_access(spte
);
4180 if (!check_mmio_spte(vcpu
, spte
))
4181 return RET_PF_INVALID
;
4186 trace_handle_mmio_page_fault(addr
, gfn
, access
);
4187 vcpu_cache_mmio_info(vcpu
, addr
, gfn
, access
);
4188 return RET_PF_EMULATE
;
4192 * If the page table is zapped by other cpus, let CPU fault again on
4195 return RET_PF_RETRY
;
4198 static bool page_fault_handle_page_track(struct kvm_vcpu
*vcpu
,
4199 struct kvm_page_fault
*fault
)
4201 if (unlikely(fault
->rsvd
))
4204 if (!fault
->present
|| !fault
->write
)
4208 * guest is writing the page which is write tracked which can
4209 * not be fixed by page fault handler.
4211 if (kvm_gfn_is_write_tracked(vcpu
->kvm
, fault
->slot
, fault
->gfn
))
4217 static void shadow_page_table_clear_flood(struct kvm_vcpu
*vcpu
, gva_t addr
)
4219 struct kvm_shadow_walk_iterator iterator
;
4222 walk_shadow_page_lockless_begin(vcpu
);
4223 for_each_shadow_entry_lockless(vcpu
, addr
, iterator
, spte
)
4224 clear_sp_write_flooding_count(iterator
.sptep
);
4225 walk_shadow_page_lockless_end(vcpu
);
4228 static u32
alloc_apf_token(struct kvm_vcpu
*vcpu
)
4230 /* make sure the token value is not 0 */
4231 u32 id
= vcpu
->arch
.apf
.id
;
4234 vcpu
->arch
.apf
.id
= 1;
4236 return (vcpu
->arch
.apf
.id
++ << 12) | vcpu
->vcpu_id
;
4239 static bool kvm_arch_setup_async_pf(struct kvm_vcpu
*vcpu
, gpa_t cr2_or_gpa
,
4242 struct kvm_arch_async_pf arch
;
4244 arch
.token
= alloc_apf_token(vcpu
);
4246 arch
.direct_map
= vcpu
->arch
.mmu
->root_role
.direct
;
4247 arch
.cr3
= kvm_mmu_get_guest_pgd(vcpu
, vcpu
->arch
.mmu
);
4249 return kvm_setup_async_pf(vcpu
, cr2_or_gpa
,
4250 kvm_vcpu_gfn_to_hva(vcpu
, gfn
), &arch
);
4253 void kvm_arch_async_page_ready(struct kvm_vcpu
*vcpu
, struct kvm_async_pf
*work
)
4257 if ((vcpu
->arch
.mmu
->root_role
.direct
!= work
->arch
.direct_map
) ||
4261 r
= kvm_mmu_reload(vcpu
);
4265 if (!vcpu
->arch
.mmu
->root_role
.direct
&&
4266 work
->arch
.cr3
!= kvm_mmu_get_guest_pgd(vcpu
, vcpu
->arch
.mmu
))
4269 kvm_mmu_do_page_fault(vcpu
, work
->cr2_or_gpa
, 0, true, NULL
);
static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
	struct kvm_memory_slot *slot = fault->slot;
	bool async;

	/*
	 * Retry the page fault if the gfn hit a memslot that is being deleted
	 * or moved.  This ensures any existing SPTEs for the old memslot will
	 * be zapped before KVM inserts a new MMIO SPTE for the gfn.
	 */
	if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
		return RET_PF_RETRY;

	if (!kvm_is_visible_memslot(slot)) {
		/* Don't expose private memslots to L2. */
		if (is_guest_mode(vcpu)) {
			fault->slot = NULL;
			fault->pfn = KVM_PFN_NOSLOT;
			fault->map_writable = false;
			return RET_PF_CONTINUE;
		}
		/*
		 * If the APIC access page exists but is disabled, go directly
		 * to emulation without caching the MMIO access or creating a
		 * MMIO SPTE.  That way the cache doesn't need to be purged
		 * when the AVIC is re-enabled.
		 */
		if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
		    !kvm_apicv_activated(vcpu->kvm))
			return RET_PF_EMULATE;
	}

	async = false;
	fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async,
					  fault->write, &fault->map_writable,
					  &fault->hva);
	if (!async)
		return RET_PF_CONTINUE; /* *pfn has correct page already */

	if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) {
		trace_kvm_try_async_get_page(fault->addr, fault->gfn);
		if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) {
			trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn);
			kvm_make_request(KVM_REQ_APF_HALT, vcpu);
			return RET_PF_RETRY;
		} else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) {
			return RET_PF_RETRY;
		}
	}

	/*
	 * Allow gup to bail on pending non-fatal signals when it's also allowed
	 * to wait for IO.  Note, gup always bails if it is unable to quickly
	 * get a page and a fatal signal, i.e. SIGKILL, is pending.
	 */
	fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, true, NULL,
					  fault->write, &fault->map_writable,
					  &fault->hva);
	return RET_PF_CONTINUE;
}
static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
			   unsigned int access)
{
	int ret;

	fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
	smp_rmb();

	ret = __kvm_faultin_pfn(vcpu, fault);
	if (ret != RET_PF_CONTINUE)
		return ret;

	if (unlikely(is_error_pfn(fault->pfn)))
		return kvm_handle_error_pfn(vcpu, fault);

	if (unlikely(!fault->slot))
		return kvm_handle_noslot_fault(vcpu, fault, access);

	return RET_PF_CONTINUE;
}
/*
 * Returns true if the page fault is stale and needs to be retried, i.e. if the
 * root was invalidated by a memslot update or a relevant mmu_notifier fired.
 */
static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
				struct kvm_page_fault *fault)
{
	struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa);

	/* Special roots, e.g. pae_root, are not backed by shadow pages. */
	if (sp && is_obsolete_sp(vcpu->kvm, sp))
		return true;

	/*
	 * Roots without an associated shadow page are considered invalid if
	 * there is a pending request to free obsolete roots.  The request is
	 * only a hint that the current root _may_ be obsolete and needs to be
	 * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a
	 * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs
	 * to reload even if no vCPU is actively using the root.
	 */
	if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
		return true;

	return fault->slot &&
	       mmu_invalidate_retry_gfn(vcpu->kvm, fault->mmu_seq, fault->gfn);
}
static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
	int r;

	/* Dummy roots are used only for shadowing bad guest roots. */
	if (WARN_ON_ONCE(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa)))
		return RET_PF_RETRY;

	if (page_fault_handle_page_track(vcpu, fault))
		return RET_PF_EMULATE;

	r = fast_page_fault(vcpu, fault);
	if (r != RET_PF_INVALID)
		return r;

	r = mmu_topup_memory_caches(vcpu, false);
	if (r)
		return r;

	r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
	if (r != RET_PF_CONTINUE)
		return r;

	r = RET_PF_RETRY;
	write_lock(&vcpu->kvm->mmu_lock);

	if (is_page_fault_stale(vcpu, fault))
		goto out_unlock;

	r = make_mmu_pages_available(vcpu);
	if (r)
		goto out_unlock;

	r = direct_map(vcpu, fault);

out_unlock:
	write_unlock(&vcpu->kvm->mmu_lock);
	kvm_release_pfn_clean(fault->pfn);
	return r;
}

static int nonpaging_page_fault(struct kvm_vcpu *vcpu,
				struct kvm_page_fault *fault)
{
	/* This path builds a PAE pagetable, we can map 2mb pages at maximum. */
	fault->max_level = PG_LEVEL_2M;
	return direct_page_fault(vcpu, fault);
}
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
			  u64 fault_address, char *insn, int insn_len)
{
	int r = 1;
	u32 flags = vcpu->arch.apf.host_apf_flags;

#ifndef CONFIG_X86_64
	/* A 64-bit CR2 should be impossible on 32-bit KVM. */
	if (WARN_ON_ONCE(fault_address >> 32))
		return -EFAULT;
#endif

	vcpu->arch.l1tf_flush_l1d = true;
	if (!flags) {
		trace_kvm_page_fault(vcpu, fault_address, error_code);

		if (kvm_event_needs_reinjection(vcpu))
			kvm_mmu_unprotect_page_virt(vcpu, fault_address);
		r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
				       insn_len);
	} else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
		vcpu->arch.apf.host_apf_flags = 0;
		local_irq_disable();
		kvm_async_pf_task_wait_schedule(fault_address);
		local_irq_enable();
	} else {
		WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags);
	}

	return r;
}
EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
#ifdef CONFIG_X86_64
static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
				  struct kvm_page_fault *fault)
{
	int r;

	if (page_fault_handle_page_track(vcpu, fault))
		return RET_PF_EMULATE;

	r = fast_page_fault(vcpu, fault);
	if (r != RET_PF_INVALID)
		return r;

	r = mmu_topup_memory_caches(vcpu, false);
	if (r)
		return r;

	r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
	if (r != RET_PF_CONTINUE)
		return r;

	r = RET_PF_RETRY;
	read_lock(&vcpu->kvm->mmu_lock);

	if (is_page_fault_stale(vcpu, fault))
		goto out_unlock;

	r = kvm_tdp_mmu_map(vcpu, fault);

out_unlock:
	read_unlock(&vcpu->kvm->mmu_lock);
	kvm_release_pfn_clean(fault->pfn);
	return r;
}
#endif
bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma)
{
	/*
	 * If host MTRRs are ignored (shadow_memtype_mask is non-zero), and the
	 * VM has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is
	 * to honor the memtype from the guest's MTRRs so that guest accesses
	 * to memory that is DMA'd aren't cached against the guest's wishes.
	 *
	 * Note, KVM may still ultimately ignore guest MTRRs for certain PFNs,
	 * e.g. KVM will force UC memtype for host MMIO.
	 */
	return vm_has_noncoherent_dma && shadow_memtype_mask;
}

int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
	/*
	 * If the guest's MTRRs may be used to compute the "real" memtype,
	 * restrict the mapping level to ensure KVM uses a consistent memtype
	 * across the entire mapping.
	 */
	if (kvm_mmu_honors_guest_mtrrs(vcpu->kvm)) {
		for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) {
			int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
			gfn_t base = gfn_round_for_level(fault->gfn,
							 fault->max_level);

			if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
				break;
		}
	}

#ifdef CONFIG_X86_64
	if (tdp_mmu_enabled)
		return kvm_tdp_mmu_page_fault(vcpu, fault);
#endif

	return direct_page_fault(vcpu, fault);
}
static void nonpaging_init_context(struct kvm_mmu *context)
{
	context->page_fault = nonpaging_page_fault;
	context->gva_to_gpa = nonpaging_gva_to_gpa;
	context->sync_spte = NULL;
}

static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
				  union kvm_mmu_page_role role)
{
	struct kvm_mmu_page *sp;

	if (!VALID_PAGE(root->hpa))
		return false;

	if (!role.direct && pgd != root->pgd)
		return false;

	sp = root_to_sp(root->hpa);
	if (WARN_ON_ONCE(!sp))
		return false;

	return role.word == sp->role.word;
}
/*
 * Find out if a previously cached root matching the new pgd/role is available,
 * and insert the current root as the MRU in the cache.
 * If a matching root is found, it is assigned to kvm_mmu->root and
 * true is returned.
 * If no match is found, kvm_mmu->root is left invalid, the LRU root is
 * evicted to make room for the current root, and false is returned.
 */
static bool cached_root_find_and_keep_current(struct kvm *kvm, struct kvm_mmu *mmu,
					      gpa_t new_pgd,
					      union kvm_mmu_page_role new_role)
{
	uint i;

	if (is_root_usable(&mmu->root, new_pgd, new_role))
		return true;

	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
		/*
		 * The swaps end up rotating the cache like this:
		 *   C   0   1   2   3   (on entry to the function)
		 *   0   C   1   2   3
		 *   1   C   0   2   3
		 *   2   C   0   1   3
		 *   3   C   0   1   2   (on exit from the loop)
		 */
		swap(mmu->root, mmu->prev_roots[i]);
		if (is_root_usable(&mmu->root, new_pgd, new_role))
			return true;
	}

	kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
	return false;
}
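
/*
 * Illustration of the rotation above, assuming KVM_MMU_NUM_PREV_ROOTS == 3:
 * if the cache is C | 0 1 2 on entry and previous root "2" matches the new
 * pgd/role, the successive swaps yield 0 | C 1 2, then 1 | C 0 2, then
 * 2 | C 0 1, i.e. the matching root becomes the current root and the old
 * current root ends up as the MRU entry of the previous-roots cache.
 */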
/*
 * Find out if a previously cached root matching the new pgd/role is available.
 * On entry, mmu->root is invalid.
 * If a matching root is found, it is assigned to kvm_mmu->root, the LRU entry
 * of the cache becomes invalid, and true is returned.
 * If no match is found, kvm_mmu->root is left invalid and false is returned.
 */
static bool cached_root_find_without_current(struct kvm *kvm, struct kvm_mmu *mmu,
					     gpa_t new_pgd,
					     union kvm_mmu_page_role new_role)
{
	uint i;

	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
		if (is_root_usable(&mmu->prev_roots[i], new_pgd, new_role))
			goto hit;

	return false;

hit:
	swap(mmu->root, mmu->prev_roots[i]);
	/* Bubble up the remaining roots. */
	for (; i < KVM_MMU_NUM_PREV_ROOTS - 1; i++)
		mmu->prev_roots[i] = mmu->prev_roots[i + 1];
	mmu->prev_roots[i].hpa = INVALID_PAGE;
	return true;
}
static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu,
			    gpa_t new_pgd, union kvm_mmu_page_role new_role)
{
	/*
	 * Limit reuse to 64-bit hosts+VMs without "special" roots in order to
	 * avoid having to deal with PDPTEs and other complexities.
	 */
	if (VALID_PAGE(mmu->root.hpa) && !root_to_sp(mmu->root.hpa))
		kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);

	if (VALID_PAGE(mmu->root.hpa))
		return cached_root_find_and_keep_current(kvm, mmu, new_pgd, new_role);

	return cached_root_find_without_current(kvm, mmu, new_pgd, new_role);
}
void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	union kvm_mmu_page_role new_role = mmu->root_role;

	/*
	 * Return immediately if no usable root was found, kvm_mmu_reload()
	 * will establish a valid root prior to the next VM-Enter.
	 */
	if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role))
		return;

	/*
	 * It's possible that the cached previous root page is obsolete because
	 * of a change in the MMU generation number.  However, changing the
	 * generation number is accompanied by KVM_REQ_MMU_FREE_OBSOLETE_ROOTS,
	 * which will free the root set here and allocate a new one.
	 */
	kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);

	if (force_flush_and_sync_on_reuse) {
		kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
	}

	/*
	 * The last MMIO access's GVA and GPA are cached in the VCPU.  When
	 * switching to a new CR3, that GVA->GPA mapping may no longer be
	 * valid.  So clear any cached MMIO info even when we don't need to
	 * sync the shadow page tables.
	 */
	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);

	/*
	 * If this is a direct root page, it doesn't have a write flooding
	 * count.  Otherwise, clear the write flooding count.
	 */
	if (!new_role.direct) {
		struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa);

		if (!WARN_ON_ONCE(!sp))
			__clear_sp_write_flooding_count(sp);
	}
}
EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
			   unsigned int access)
{
	if (unlikely(is_mmio_spte(*sptep))) {
		if (gfn != get_mmio_spte_gfn(*sptep)) {
			mmu_spte_clear_no_track(sptep);
			return true;
		}

		mark_mmio_spte(vcpu, sptep, gfn, access);
		return true;
	}

	return false;
}

#define PTTYPE_EPT 18 /* arbitrary */
#define PTTYPE PTTYPE_EPT
#include "paging_tmpl.h"
#undef PTTYPE

#define PTTYPE 64
#include "paging_tmpl.h"
#undef PTTYPE

#define PTTYPE 32
#include "paging_tmpl.h"
#undef PTTYPE
4719 static void __reset_rsvds_bits_mask(struct rsvd_bits_validate
*rsvd_check
,
4720 u64 pa_bits_rsvd
, int level
, bool nx
,
4721 bool gbpages
, bool pse
, bool amd
)
4723 u64 gbpages_bit_rsvd
= 0;
4724 u64 nonleaf_bit8_rsvd
= 0;
4727 rsvd_check
->bad_mt_xwr
= 0;
4730 gbpages_bit_rsvd
= rsvd_bits(7, 7);
4732 if (level
== PT32E_ROOT_LEVEL
)
4733 high_bits_rsvd
= pa_bits_rsvd
& rsvd_bits(0, 62);
4735 high_bits_rsvd
= pa_bits_rsvd
& rsvd_bits(0, 51);
4737 /* Note, NX doesn't exist in PDPTEs, this is handled below. */
4739 high_bits_rsvd
|= rsvd_bits(63, 63);
4742 * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
4743 * leaf entries) on AMD CPUs only.
4746 nonleaf_bit8_rsvd
= rsvd_bits(8, 8);
4749 case PT32_ROOT_LEVEL
:
4750 /* no rsvd bits for 2 level 4K page table entries */
4751 rsvd_check
->rsvd_bits_mask
[0][1] = 0;
4752 rsvd_check
->rsvd_bits_mask
[0][0] = 0;
4753 rsvd_check
->rsvd_bits_mask
[1][0] =
4754 rsvd_check
->rsvd_bits_mask
[0][0];
4757 rsvd_check
->rsvd_bits_mask
[1][1] = 0;
4761 if (is_cpuid_PSE36())
4762 /* 36bits PSE 4MB page */
4763 rsvd_check
->rsvd_bits_mask
[1][1] = rsvd_bits(17, 21);
4765 /* 32 bits PSE 4MB page */
4766 rsvd_check
->rsvd_bits_mask
[1][1] = rsvd_bits(13, 21);
4768 case PT32E_ROOT_LEVEL
:
4769 rsvd_check
->rsvd_bits_mask
[0][2] = rsvd_bits(63, 63) |
4772 rsvd_bits(1, 2); /* PDPTE */
4773 rsvd_check
->rsvd_bits_mask
[0][1] = high_bits_rsvd
; /* PDE */
4774 rsvd_check
->rsvd_bits_mask
[0][0] = high_bits_rsvd
; /* PTE */
4775 rsvd_check
->rsvd_bits_mask
[1][1] = high_bits_rsvd
|
4776 rsvd_bits(13, 20); /* large page */
4777 rsvd_check
->rsvd_bits_mask
[1][0] =
4778 rsvd_check
->rsvd_bits_mask
[0][0];
4780 case PT64_ROOT_5LEVEL
:
4781 rsvd_check
->rsvd_bits_mask
[0][4] = high_bits_rsvd
|
4784 rsvd_check
->rsvd_bits_mask
[1][4] =
4785 rsvd_check
->rsvd_bits_mask
[0][4];
4787 case PT64_ROOT_4LEVEL
:
4788 rsvd_check
->rsvd_bits_mask
[0][3] = high_bits_rsvd
|
4791 rsvd_check
->rsvd_bits_mask
[0][2] = high_bits_rsvd
|
4793 rsvd_check
->rsvd_bits_mask
[0][1] = high_bits_rsvd
;
4794 rsvd_check
->rsvd_bits_mask
[0][0] = high_bits_rsvd
;
4795 rsvd_check
->rsvd_bits_mask
[1][3] =
4796 rsvd_check
->rsvd_bits_mask
[0][3];
4797 rsvd_check
->rsvd_bits_mask
[1][2] = high_bits_rsvd
|
4800 rsvd_check
->rsvd_bits_mask
[1][1] = high_bits_rsvd
|
4801 rsvd_bits(13, 20); /* large page */
4802 rsvd_check
->rsvd_bits_mask
[1][0] =
4803 rsvd_check
->rsvd_bits_mask
[0][0];
4808 static void reset_guest_rsvds_bits_mask(struct kvm_vcpu
*vcpu
,
4809 struct kvm_mmu
*context
)
4811 __reset_rsvds_bits_mask(&context
->guest_rsvd_check
,
4812 vcpu
->arch
.reserved_gpa_bits
,
4813 context
->cpu_role
.base
.level
, is_efer_nx(context
),
4814 guest_can_use(vcpu
, X86_FEATURE_GBPAGES
),
4815 is_cr4_pse(context
),
4816 guest_cpuid_is_amd_or_hygon(vcpu
));
4819 static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate
*rsvd_check
,
4820 u64 pa_bits_rsvd
, bool execonly
,
4821 int huge_page_level
)
4823 u64 high_bits_rsvd
= pa_bits_rsvd
& rsvd_bits(0, 51);
4824 u64 large_1g_rsvd
= 0, large_2m_rsvd
= 0;
4827 if (huge_page_level
< PG_LEVEL_1G
)
4828 large_1g_rsvd
= rsvd_bits(7, 7);
4829 if (huge_page_level
< PG_LEVEL_2M
)
4830 large_2m_rsvd
= rsvd_bits(7, 7);
4832 rsvd_check
->rsvd_bits_mask
[0][4] = high_bits_rsvd
| rsvd_bits(3, 7);
4833 rsvd_check
->rsvd_bits_mask
[0][3] = high_bits_rsvd
| rsvd_bits(3, 7);
4834 rsvd_check
->rsvd_bits_mask
[0][2] = high_bits_rsvd
| rsvd_bits(3, 6) | large_1g_rsvd
;
4835 rsvd_check
->rsvd_bits_mask
[0][1] = high_bits_rsvd
| rsvd_bits(3, 6) | large_2m_rsvd
;
4836 rsvd_check
->rsvd_bits_mask
[0][0] = high_bits_rsvd
;
4839 rsvd_check
->rsvd_bits_mask
[1][4] = rsvd_check
->rsvd_bits_mask
[0][4];
4840 rsvd_check
->rsvd_bits_mask
[1][3] = rsvd_check
->rsvd_bits_mask
[0][3];
4841 rsvd_check
->rsvd_bits_mask
[1][2] = high_bits_rsvd
| rsvd_bits(12, 29) | large_1g_rsvd
;
4842 rsvd_check
->rsvd_bits_mask
[1][1] = high_bits_rsvd
| rsvd_bits(12, 20) | large_2m_rsvd
;
4843 rsvd_check
->rsvd_bits_mask
[1][0] = rsvd_check
->rsvd_bits_mask
[0][0];
4845 bad_mt_xwr
= 0xFFull
<< (2 * 8); /* bits 3..5 must not be 2 */
4846 bad_mt_xwr
|= 0xFFull
<< (3 * 8); /* bits 3..5 must not be 3 */
4847 bad_mt_xwr
|= 0xFFull
<< (7 * 8); /* bits 3..5 must not be 7 */
4848 bad_mt_xwr
|= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */
4849 bad_mt_xwr
|= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */
4851 /* bits 0..2 must not be 100 unless VMX capabilities allow it */
4852 bad_mt_xwr
|= REPEAT_BYTE(1ull << 4);
4854 rsvd_check
->bad_mt_xwr
= bad_mt_xwr
;
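
/*
 * bad_mt_xwr is indexed by the low 6 bits of a leaf EPT SPTE: memory type in
 * bits 5:3, XWR permissions in bits 2:0.  For example, 0xFFull << (2 * 8)
 * flags every XWR combination whose memory type field is 2 as reserved, while
 * REPEAT_BYTE(1ull << 2) flags the write-only combination (XWR == 010b) as
 * reserved for every memory type.
 */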
4857 static void reset_rsvds_bits_mask_ept(struct kvm_vcpu
*vcpu
,
4858 struct kvm_mmu
*context
, bool execonly
, int huge_page_level
)
4860 __reset_rsvds_bits_mask_ept(&context
->guest_rsvd_check
,
4861 vcpu
->arch
.reserved_gpa_bits
, execonly
,
4865 static inline u64
reserved_hpa_bits(void)
4867 return rsvd_bits(shadow_phys_bits
, 63);
4871 * the page table on host is the shadow page table for the page
4872 * table in guest or amd nested guest, its mmu features completely
4873 * follow the features in guest.
4875 static void reset_shadow_zero_bits_mask(struct kvm_vcpu
*vcpu
,
4876 struct kvm_mmu
*context
)
4878 /* @amd adds a check on bit of SPTEs, which KVM shouldn't use anyways. */
4880 /* KVM doesn't use 2-level page tables for the shadow MMU. */
4881 bool is_pse
= false;
4882 struct rsvd_bits_validate
*shadow_zero_check
;
4885 WARN_ON_ONCE(context
->root_role
.level
< PT32E_ROOT_LEVEL
);
4887 shadow_zero_check
= &context
->shadow_zero_check
;
4888 __reset_rsvds_bits_mask(shadow_zero_check
, reserved_hpa_bits(),
4889 context
->root_role
.level
,
4890 context
->root_role
.efer_nx
,
4891 guest_can_use(vcpu
, X86_FEATURE_GBPAGES
),
4894 if (!shadow_me_mask
)
4897 for (i
= context
->root_role
.level
; --i
>= 0;) {
4899 * So far shadow_me_value is a constant during KVM's life
4900 * time. Bits in shadow_me_value are allowed to be set.
4901 * Bits in shadow_me_mask but not in shadow_me_value are
4902 * not allowed to be set.
4904 shadow_zero_check
->rsvd_bits_mask
[0][i
] |= shadow_me_mask
;
4905 shadow_zero_check
->rsvd_bits_mask
[1][i
] |= shadow_me_mask
;
4906 shadow_zero_check
->rsvd_bits_mask
[0][i
] &= ~shadow_me_value
;
4907 shadow_zero_check
->rsvd_bits_mask
[1][i
] &= ~shadow_me_value
;
4912 static inline bool boot_cpu_is_amd(void)
4914 WARN_ON_ONCE(!tdp_enabled
);
4915 return shadow_x_mask
== 0;
4919 * the direct page table on host, use as much mmu features as
4920 * possible, however, kvm currently does not do execution-protection.
4922 static void reset_tdp_shadow_zero_bits_mask(struct kvm_mmu
*context
)
4924 struct rsvd_bits_validate
*shadow_zero_check
;
4927 shadow_zero_check
= &context
->shadow_zero_check
;
4929 if (boot_cpu_is_amd())
4930 __reset_rsvds_bits_mask(shadow_zero_check
, reserved_hpa_bits(),
4931 context
->root_role
.level
, true,
4932 boot_cpu_has(X86_FEATURE_GBPAGES
),
4935 __reset_rsvds_bits_mask_ept(shadow_zero_check
,
4936 reserved_hpa_bits(), false,
4937 max_huge_page_level
);
4939 if (!shadow_me_mask
)
4942 for (i
= context
->root_role
.level
; --i
>= 0;) {
4943 shadow_zero_check
->rsvd_bits_mask
[0][i
] &= ~shadow_me_mask
;
4944 shadow_zero_check
->rsvd_bits_mask
[1][i
] &= ~shadow_me_mask
;
4949 * as the comments in reset_shadow_zero_bits_mask() except it
4950 * is the shadow page table for intel nested guest.
4953 reset_ept_shadow_zero_bits_mask(struct kvm_mmu
*context
, bool execonly
)
4955 __reset_rsvds_bits_mask_ept(&context
->shadow_zero_check
,
4956 reserved_hpa_bits(), execonly
,
4957 max_huge_page_level
);
#define BYTE_MASK(access) \
	((1 & (access) ? 2 : 0) | \
	 (2 & (access) ? 4 : 0) | \
	 (3 & (access) ? 8 : 0) | \
	 (4 & (access) ? 16 : 0) | \
	 (5 & (access) ? 32 : 0) | \
	 (6 & (access) ? 64 : 0) | \
	 (7 & (access) ? 128 : 0))
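
/*
 * BYTE_MASK(access) sets bit N (for N = 1..7) iff the UWX combination N
 * includes at least one of the requested access bits.  For example,
 * BYTE_MASK(ACC_EXEC_MASK) == 0xaa, BYTE_MASK(ACC_WRITE_MASK) == 0xcc and
 * BYTE_MASK(ACC_USER_MASK) == 0xf0, which are the x/w/u masks consumed by
 * update_permission_bitmask() below.
 */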
4970 static void update_permission_bitmask(struct kvm_mmu
*mmu
, bool ept
)
4974 const u8 x
= BYTE_MASK(ACC_EXEC_MASK
);
4975 const u8 w
= BYTE_MASK(ACC_WRITE_MASK
);
4976 const u8 u
= BYTE_MASK(ACC_USER_MASK
);
4978 bool cr4_smep
= is_cr4_smep(mmu
);
4979 bool cr4_smap
= is_cr4_smap(mmu
);
4980 bool cr0_wp
= is_cr0_wp(mmu
);
4981 bool efer_nx
= is_efer_nx(mmu
);
4983 for (byte
= 0; byte
< ARRAY_SIZE(mmu
->permissions
); ++byte
) {
4984 unsigned pfec
= byte
<< 1;
4987 * Each "*f" variable has a 1 bit for each UWX value
4988 * that causes a fault with the given PFEC.
4991 /* Faults from writes to non-writable pages */
4992 u8 wf
= (pfec
& PFERR_WRITE_MASK
) ? (u8
)~w
: 0;
4993 /* Faults from user mode accesses to supervisor pages */
4994 u8 uf
= (pfec
& PFERR_USER_MASK
) ? (u8
)~u
: 0;
4995 /* Faults from fetches of non-executable pages*/
4996 u8 ff
= (pfec
& PFERR_FETCH_MASK
) ? (u8
)~x
: 0;
4997 /* Faults from kernel mode fetches of user pages */
4999 /* Faults from kernel mode accesses of user pages */
5003 /* Faults from kernel mode accesses to user pages */
5004 u8 kf
= (pfec
& PFERR_USER_MASK
) ? 0 : u
;
5006 /* Not really needed: !nx will cause pte.nx to fault */
5010 /* Allow supervisor writes if !cr0.wp */
5012 wf
= (pfec
& PFERR_USER_MASK
) ? wf
: 0;
5014 /* Disallow supervisor fetches of user code if cr4.smep */
5016 smepf
= (pfec
& PFERR_FETCH_MASK
) ? kf
: 0;
5019 * SMAP:kernel-mode data accesses from user-mode
5020 * mappings should fault. A fault is considered
5021 * as a SMAP violation if all of the following
5022 * conditions are true:
5023 * - X86_CR4_SMAP is set in CR4
5024 * - A user page is accessed
5025 * - The access is not a fetch
5026 * - The access is supervisor mode
5027 * - If implicit supervisor access or X86_EFLAGS_AC is clear
5029 * Here, we cover the first four conditions.
5030 * The fifth is computed dynamically in permission_fault();
5031 * PFERR_RSVD_MASK bit will be set in PFEC if the access is
5032 * *not* subject to SMAP restrictions.
5035 smapf
= (pfec
& (PFERR_RSVD_MASK
|PFERR_FETCH_MASK
)) ? 0 : kf
;
5038 mmu
->permissions
[byte
] = ff
| uf
| wf
| smepf
| smapf
;
5043 * PKU is an additional mechanism by which the paging controls access to
5044 * user-mode addresses based on the value in the PKRU register. Protection
5045 * key violations are reported through a bit in the page fault error code.
5046 * Unlike other bits of the error code, the PK bit is not known at the
5047 * call site of e.g. gva_to_gpa; it must be computed directly in
5048 * permission_fault based on two bits of PKRU, on some machine state (CR4,
5049 * CR0, EFER, CPL), and on other bits of the error code and the page tables.
5051 * In particular the following conditions come from the error code, the
5052 * page tables and the machine state:
5053 * - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
5054 * - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
5055 * - PK is always zero if U=0 in the page tables
5056 * - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
5058 * The PKRU bitmask caches the result of these four conditions. The error
5059 * code (minus the P bit) and the page table's U bit form an index into the
5060 * PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed
5061 * with the two bits of the PKRU register corresponding to the protection key.
5062 * For the first three conditions above the bits will be 00, thus masking
5063 * away both AD and WD. For all reads or if the last condition holds, WD
5064 * only will be masked away.
5066 static void update_pkru_bitmask(struct kvm_mmu
*mmu
)
5073 if (!is_cr4_pke(mmu
))
5076 wp
= is_cr0_wp(mmu
);
5078 for (bit
= 0; bit
< ARRAY_SIZE(mmu
->permissions
); ++bit
) {
5079 unsigned pfec
, pkey_bits
;
5080 bool check_pkey
, check_write
, ff
, uf
, wf
, pte_user
;
5083 ff
= pfec
& PFERR_FETCH_MASK
;
5084 uf
= pfec
& PFERR_USER_MASK
;
5085 wf
= pfec
& PFERR_WRITE_MASK
;
5087 /* PFEC.RSVD is replaced by ACC_USER_MASK. */
5088 pte_user
= pfec
& PFERR_RSVD_MASK
;
5091 * Only need to check the access which is not an
5092 * instruction fetch and is to a user page.
5094 check_pkey
= (!ff
&& pte_user
);
5096 * write access is controlled by PKRU if it is a
5097 * user access or CR0.WP = 1.
5099 check_write
= check_pkey
&& wf
&& (uf
|| wp
);
5101 /* PKRU.AD stops both read and write access. */
5102 pkey_bits
= !!check_pkey
;
5103 /* PKRU.WD stops write access. */
5104 pkey_bits
|= (!!check_write
) << 1;
5106 mmu
->pkru_mask
|= (pkey_bits
& 3) << pfec
;
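
/*
 * Worked example of the resulting mask: a user-mode write to a user page
 * (PFEC.USER | PFEC.WRITE set) yields pkey_bits == 3, so both PKRU.AD and
 * PKRU.WD of the page's protection key can deny the access; a supervisor
 * read with CR0.WP == 0 yields pkey_bits == 1, so only PKRU.AD is consulted.
 */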
5110 static void reset_guest_paging_metadata(struct kvm_vcpu
*vcpu
,
5111 struct kvm_mmu
*mmu
)
5113 if (!is_cr0_pg(mmu
))
5116 reset_guest_rsvds_bits_mask(vcpu
, mmu
);
5117 update_permission_bitmask(mmu
, false);
5118 update_pkru_bitmask(mmu
);
5121 static void paging64_init_context(struct kvm_mmu
*context
)
5123 context
->page_fault
= paging64_page_fault
;
5124 context
->gva_to_gpa
= paging64_gva_to_gpa
;
5125 context
->sync_spte
= paging64_sync_spte
;
5128 static void paging32_init_context(struct kvm_mmu
*context
)
5130 context
->page_fault
= paging32_page_fault
;
5131 context
->gva_to_gpa
= paging32_gva_to_gpa
;
5132 context
->sync_spte
= paging32_sync_spte
;
5135 static union kvm_cpu_role
kvm_calc_cpu_role(struct kvm_vcpu
*vcpu
,
5136 const struct kvm_mmu_role_regs
*regs
)
5138 union kvm_cpu_role role
= {0};
5140 role
.base
.access
= ACC_ALL
;
5141 role
.base
.smm
= is_smm(vcpu
);
5142 role
.base
.guest_mode
= is_guest_mode(vcpu
);
5145 if (!____is_cr0_pg(regs
)) {
5146 role
.base
.direct
= 1;
5150 role
.base
.efer_nx
= ____is_efer_nx(regs
);
5151 role
.base
.cr0_wp
= ____is_cr0_wp(regs
);
5152 role
.base
.smep_andnot_wp
= ____is_cr4_smep(regs
) && !____is_cr0_wp(regs
);
5153 role
.base
.smap_andnot_wp
= ____is_cr4_smap(regs
) && !____is_cr0_wp(regs
);
5154 role
.base
.has_4_byte_gpte
= !____is_cr4_pae(regs
);
5156 if (____is_efer_lma(regs
))
5157 role
.base
.level
= ____is_cr4_la57(regs
) ? PT64_ROOT_5LEVEL
5159 else if (____is_cr4_pae(regs
))
5160 role
.base
.level
= PT32E_ROOT_LEVEL
;
5162 role
.base
.level
= PT32_ROOT_LEVEL
;
5164 role
.ext
.cr4_smep
= ____is_cr4_smep(regs
);
5165 role
.ext
.cr4_smap
= ____is_cr4_smap(regs
);
5166 role
.ext
.cr4_pse
= ____is_cr4_pse(regs
);
5168 /* PKEY and LA57 are active iff long mode is active. */
5169 role
.ext
.cr4_pke
= ____is_efer_lma(regs
) && ____is_cr4_pke(regs
);
5170 role
.ext
.cr4_la57
= ____is_efer_lma(regs
) && ____is_cr4_la57(regs
);
5171 role
.ext
.efer_lma
= ____is_efer_lma(regs
);
5175 void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu
*vcpu
,
5176 struct kvm_mmu
*mmu
)
5178 const bool cr0_wp
= kvm_is_cr0_bit_set(vcpu
, X86_CR0_WP
);
5180 BUILD_BUG_ON((KVM_MMU_CR0_ROLE_BITS
& KVM_POSSIBLE_CR0_GUEST_BITS
) != X86_CR0_WP
);
5181 BUILD_BUG_ON((KVM_MMU_CR4_ROLE_BITS
& KVM_POSSIBLE_CR4_GUEST_BITS
));
5183 if (is_cr0_wp(mmu
) == cr0_wp
)
5186 mmu
->cpu_role
.base
.cr0_wp
= cr0_wp
;
5187 reset_guest_paging_metadata(vcpu
, mmu
);
static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
{
	/* tdp_root_level is architecture forced level, use it if nonzero */
	if (tdp_root_level)
		return tdp_root_level;

	/* Use 5-level TDP if and only if it's useful/necessary. */
	if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
		return 4;

	return max_tdp_level;
}
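
/*
 * For example, if cpuid_maxphyaddr() reports 46, KVM uses 4-level TDP even
 * when max_tdp_level is 5, as 48 bits of guest-physical address space
 * suffice; a nonzero tdp_root_level forced by the vendor module takes
 * precedence over both.
 */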
5203 static union kvm_mmu_page_role
5204 kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu
*vcpu
,
5205 union kvm_cpu_role cpu_role
)
5207 union kvm_mmu_page_role role
= {0};
5209 role
.access
= ACC_ALL
;
5211 role
.efer_nx
= true;
5212 role
.smm
= cpu_role
.base
.smm
;
5213 role
.guest_mode
= cpu_role
.base
.guest_mode
;
5214 role
.ad_disabled
= !kvm_ad_enabled();
5215 role
.level
= kvm_mmu_get_tdp_level(vcpu
);
5217 role
.has_4_byte_gpte
= false;
5222 static void init_kvm_tdp_mmu(struct kvm_vcpu
*vcpu
,
5223 union kvm_cpu_role cpu_role
)
5225 struct kvm_mmu
*context
= &vcpu
->arch
.root_mmu
;
5226 union kvm_mmu_page_role root_role
= kvm_calc_tdp_mmu_root_page_role(vcpu
, cpu_role
);
5228 if (cpu_role
.as_u64
== context
->cpu_role
.as_u64
&&
5229 root_role
.word
== context
->root_role
.word
)
5232 context
->cpu_role
.as_u64
= cpu_role
.as_u64
;
5233 context
->root_role
.word
= root_role
.word
;
5234 context
->page_fault
= kvm_tdp_page_fault
;
5235 context
->sync_spte
= NULL
;
5236 context
->get_guest_pgd
= get_guest_cr3
;
5237 context
->get_pdptr
= kvm_pdptr_read
;
5238 context
->inject_page_fault
= kvm_inject_page_fault
;
5240 if (!is_cr0_pg(context
))
5241 context
->gva_to_gpa
= nonpaging_gva_to_gpa
;
5242 else if (is_cr4_pae(context
))
5243 context
->gva_to_gpa
= paging64_gva_to_gpa
;
5245 context
->gva_to_gpa
= paging32_gva_to_gpa
;
5247 reset_guest_paging_metadata(vcpu
, context
);
5248 reset_tdp_shadow_zero_bits_mask(context
);
5251 static void shadow_mmu_init_context(struct kvm_vcpu
*vcpu
, struct kvm_mmu
*context
,
5252 union kvm_cpu_role cpu_role
,
5253 union kvm_mmu_page_role root_role
)
5255 if (cpu_role
.as_u64
== context
->cpu_role
.as_u64
&&
5256 root_role
.word
== context
->root_role
.word
)
5259 context
->cpu_role
.as_u64
= cpu_role
.as_u64
;
5260 context
->root_role
.word
= root_role
.word
;
5262 if (!is_cr0_pg(context
))
5263 nonpaging_init_context(context
);
5264 else if (is_cr4_pae(context
))
5265 paging64_init_context(context
);
5267 paging32_init_context(context
);
5269 reset_guest_paging_metadata(vcpu
, context
);
5270 reset_shadow_zero_bits_mask(vcpu
, context
);
5273 static void kvm_init_shadow_mmu(struct kvm_vcpu
*vcpu
,
5274 union kvm_cpu_role cpu_role
)
5276 struct kvm_mmu
*context
= &vcpu
->arch
.root_mmu
;
5277 union kvm_mmu_page_role root_role
;
5279 root_role
= cpu_role
.base
;
5281 /* KVM uses PAE paging whenever the guest isn't using 64-bit paging. */
5282 root_role
.level
= max_t(u32
, root_role
.level
, PT32E_ROOT_LEVEL
);
5285 * KVM forces EFER.NX=1 when TDP is disabled, reflect it in the MMU role.
5286 * KVM uses NX when TDP is disabled to handle a variety of scenarios,
5287 * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
5288 * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
5289 * The iTLB multi-hit workaround can be toggled at any time, so assume
5290 * NX can be used by any non-nested shadow MMU to avoid having to reset
5293 root_role
.efer_nx
= true;
5295 shadow_mmu_init_context(vcpu
, context
, cpu_role
, root_role
);
5298 void kvm_init_shadow_npt_mmu(struct kvm_vcpu
*vcpu
, unsigned long cr0
,
5299 unsigned long cr4
, u64 efer
, gpa_t nested_cr3
)
5301 struct kvm_mmu
*context
= &vcpu
->arch
.guest_mmu
;
5302 struct kvm_mmu_role_regs regs
= {
5304 .cr4
= cr4
& ~X86_CR4_PKE
,
5307 union kvm_cpu_role cpu_role
= kvm_calc_cpu_role(vcpu
, ®s
);
5308 union kvm_mmu_page_role root_role
;
5310 /* NPT requires CR0.PG=1. */
5311 WARN_ON_ONCE(cpu_role
.base
.direct
);
5313 root_role
= cpu_role
.base
;
5314 root_role
.level
= kvm_mmu_get_tdp_level(vcpu
);
5315 if (root_role
.level
== PT64_ROOT_5LEVEL
&&
5316 cpu_role
.base
.level
== PT64_ROOT_4LEVEL
)
5317 root_role
.passthrough
= 1;
5319 shadow_mmu_init_context(vcpu
, context
, cpu_role
, root_role
);
5320 kvm_mmu_new_pgd(vcpu
, nested_cr3
);
5322 EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu
);
5324 static union kvm_cpu_role
5325 kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu
*vcpu
, bool accessed_dirty
,
5326 bool execonly
, u8 level
)
5328 union kvm_cpu_role role
= {0};
5331 * KVM does not support SMM transfer monitors, and consequently does not
5332 * support the "entry to SMM" control either. role.base.smm is always 0.
5334 WARN_ON_ONCE(is_smm(vcpu
));
5335 role
.base
.level
= level
;
5336 role
.base
.has_4_byte_gpte
= false;
5337 role
.base
.direct
= false;
5338 role
.base
.ad_disabled
= !accessed_dirty
;
5339 role
.base
.guest_mode
= true;
5340 role
.base
.access
= ACC_ALL
;
5343 role
.ext
.execonly
= execonly
;
5349 void kvm_init_shadow_ept_mmu(struct kvm_vcpu
*vcpu
, bool execonly
,
5350 int huge_page_level
, bool accessed_dirty
,
5353 struct kvm_mmu
*context
= &vcpu
->arch
.guest_mmu
;
5354 u8 level
= vmx_eptp_page_walk_level(new_eptp
);
5355 union kvm_cpu_role new_mode
=
5356 kvm_calc_shadow_ept_root_page_role(vcpu
, accessed_dirty
,
5359 if (new_mode
.as_u64
!= context
->cpu_role
.as_u64
) {
5360 /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */
5361 context
->cpu_role
.as_u64
= new_mode
.as_u64
;
5362 context
->root_role
.word
= new_mode
.base
.word
;
5364 context
->page_fault
= ept_page_fault
;
5365 context
->gva_to_gpa
= ept_gva_to_gpa
;
5366 context
->sync_spte
= ept_sync_spte
;
5368 update_permission_bitmask(context
, true);
5369 context
->pkru_mask
= 0;
5370 reset_rsvds_bits_mask_ept(vcpu
, context
, execonly
, huge_page_level
);
5371 reset_ept_shadow_zero_bits_mask(context
, execonly
);
5374 kvm_mmu_new_pgd(vcpu
, new_eptp
);
5376 EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu
);
5378 static void init_kvm_softmmu(struct kvm_vcpu
*vcpu
,
5379 union kvm_cpu_role cpu_role
)
5381 struct kvm_mmu
*context
= &vcpu
->arch
.root_mmu
;
5383 kvm_init_shadow_mmu(vcpu
, cpu_role
);
5385 context
->get_guest_pgd
= get_guest_cr3
;
5386 context
->get_pdptr
= kvm_pdptr_read
;
5387 context
->inject_page_fault
= kvm_inject_page_fault
;
5390 static void init_kvm_nested_mmu(struct kvm_vcpu
*vcpu
,
5391 union kvm_cpu_role new_mode
)
5393 struct kvm_mmu
*g_context
= &vcpu
->arch
.nested_mmu
;
5395 if (new_mode
.as_u64
== g_context
->cpu_role
.as_u64
)
5398 g_context
->cpu_role
.as_u64
= new_mode
.as_u64
;
5399 g_context
->get_guest_pgd
= get_guest_cr3
;
5400 g_context
->get_pdptr
= kvm_pdptr_read
;
5401 g_context
->inject_page_fault
= kvm_inject_page_fault
;
5404 * L2 page tables are never shadowed, so there is no need to sync
5407 g_context
->sync_spte
= NULL
;
5410 * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
5411 * L1's nested page tables (e.g. EPT12). The nested translation
5412 * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
5413 * L2's page tables as the first level of translation and L1's
5414 * nested page tables as the second level of translation. Basically
5415 * the gva_to_gpa functions between mmu and nested_mmu are swapped.
5417 if (!is_paging(vcpu
))
5418 g_context
->gva_to_gpa
= nonpaging_gva_to_gpa
;
5419 else if (is_long_mode(vcpu
))
5420 g_context
->gva_to_gpa
= paging64_gva_to_gpa
;
5421 else if (is_pae(vcpu
))
5422 g_context
->gva_to_gpa
= paging64_gva_to_gpa
;
5424 g_context
->gva_to_gpa
= paging32_gva_to_gpa
;
5426 reset_guest_paging_metadata(vcpu
, g_context
);
5429 void kvm_init_mmu(struct kvm_vcpu
*vcpu
)
5431 struct kvm_mmu_role_regs regs
= vcpu_to_role_regs(vcpu
);
5432 union kvm_cpu_role cpu_role
= kvm_calc_cpu_role(vcpu
, ®s
);
5434 if (mmu_is_nested(vcpu
))
5435 init_kvm_nested_mmu(vcpu
, cpu_role
);
5436 else if (tdp_enabled
)
5437 init_kvm_tdp_mmu(vcpu
, cpu_role
);
5439 init_kvm_softmmu(vcpu
, cpu_role
);
5441 EXPORT_SYMBOL_GPL(kvm_init_mmu
);
5443 void kvm_mmu_after_set_cpuid(struct kvm_vcpu
*vcpu
)
5446 * Invalidate all MMU roles to force them to reinitialize as CPUID
5447 * information is factored into reserved bit calculations.
5449 * Correctly handling multiple vCPU models with respect to paging and
5450 * physical address properties) in a single VM would require tracking
5451 * all relevant CPUID information in kvm_mmu_page_role. That is very
5452 * undesirable as it would increase the memory requirements for
5453 * gfn_write_track (see struct kvm_mmu_page_role comments). For now
5454 * that problem is swept under the rug; KVM's CPUID API is horrific and
5455 * it's all but impossible to solve it without introducing a new API.
5457 vcpu
->arch
.root_mmu
.root_role
.word
= 0;
5458 vcpu
->arch
.guest_mmu
.root_role
.word
= 0;
5459 vcpu
->arch
.nested_mmu
.root_role
.word
= 0;
5460 vcpu
->arch
.root_mmu
.cpu_role
.ext
.valid
= 0;
5461 vcpu
->arch
.guest_mmu
.cpu_role
.ext
.valid
= 0;
5462 vcpu
->arch
.nested_mmu
.cpu_role
.ext
.valid
= 0;
5463 kvm_mmu_reset_context(vcpu
);
5466 * Changing guest CPUID after KVM_RUN is forbidden, see the comment in
5467 * kvm_arch_vcpu_ioctl().
5469 KVM_BUG_ON(kvm_vcpu_has_run(vcpu
), vcpu
->kvm
);
5472 void kvm_mmu_reset_context(struct kvm_vcpu
*vcpu
)
5474 kvm_mmu_unload(vcpu
);
5477 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context
);
5479 int kvm_mmu_load(struct kvm_vcpu
*vcpu
)
5483 r
= mmu_topup_memory_caches(vcpu
, !vcpu
->arch
.mmu
->root_role
.direct
);
5486 r
= mmu_alloc_special_roots(vcpu
);
5489 if (vcpu
->arch
.mmu
->root_role
.direct
)
5490 r
= mmu_alloc_direct_roots(vcpu
);
5492 r
= mmu_alloc_shadow_roots(vcpu
);
5496 kvm_mmu_sync_roots(vcpu
);
5498 kvm_mmu_load_pgd(vcpu
);
5501 * Flush any TLB entries for the new root, the provenance of the root
5502 * is unknown. Even if KVM ensures there are no stale TLB entries
5503 * for a freed root, in theory another hypervisor could have left
5504 * stale entries. Flushing on alloc also allows KVM to skip the TLB
5505 * flush when freeing a root (see kvm_tdp_mmu_put_root()).
5507 static_call(kvm_x86_flush_tlb_current
)(vcpu
);
5512 void kvm_mmu_unload(struct kvm_vcpu
*vcpu
)
5514 struct kvm
*kvm
= vcpu
->kvm
;
5516 kvm_mmu_free_roots(kvm
, &vcpu
->arch
.root_mmu
, KVM_MMU_ROOTS_ALL
);
5517 WARN_ON_ONCE(VALID_PAGE(vcpu
->arch
.root_mmu
.root
.hpa
));
5518 kvm_mmu_free_roots(kvm
, &vcpu
->arch
.guest_mmu
, KVM_MMU_ROOTS_ALL
);
5519 WARN_ON_ONCE(VALID_PAGE(vcpu
->arch
.guest_mmu
.root
.hpa
));
5520 vcpu_clear_mmio_info(vcpu
, MMIO_GVA_ANY
);
5523 static bool is_obsolete_root(struct kvm
*kvm
, hpa_t root_hpa
)
5525 struct kvm_mmu_page
*sp
;
5527 if (!VALID_PAGE(root_hpa
))
5531 * When freeing obsolete roots, treat roots as obsolete if they don't
5532 * have an associated shadow page, as it's impossible to determine if
5533 * such roots are fresh or stale. This does mean KVM will get false
5534 * positives and free roots that don't strictly need to be freed, but
5535 * such false positives are relatively rare:
5537 * (a) only PAE paging and nested NPT have roots without shadow pages
5538 * (or any shadow paging flavor with a dummy root, see note below)
5539 * (b) remote reloads due to a memslot update obsoletes _all_ roots
5540 * (c) KVM doesn't track previous roots for PAE paging, and the guest
5541 * is unlikely to zap an in-use PGD.
5543 * Note! Dummy roots are unique in that they are obsoleted by memslot
5544 * _creation_! See also FNAME(fetch).
5546 sp
= root_to_sp(root_hpa
);
5547 return !sp
|| is_obsolete_sp(kvm
, sp
);
5550 static void __kvm_mmu_free_obsolete_roots(struct kvm
*kvm
, struct kvm_mmu
*mmu
)
5552 unsigned long roots_to_free
= 0;
5555 if (is_obsolete_root(kvm
, mmu
->root
.hpa
))
5556 roots_to_free
|= KVM_MMU_ROOT_CURRENT
;
5558 for (i
= 0; i
< KVM_MMU_NUM_PREV_ROOTS
; i
++) {
5559 if (is_obsolete_root(kvm
, mmu
->prev_roots
[i
].hpa
))
5560 roots_to_free
|= KVM_MMU_ROOT_PREVIOUS(i
);
5564 kvm_mmu_free_roots(kvm
, mmu
, roots_to_free
);
5567 void kvm_mmu_free_obsolete_roots(struct kvm_vcpu
*vcpu
)
5569 __kvm_mmu_free_obsolete_roots(vcpu
->kvm
, &vcpu
->arch
.root_mmu
);
5570 __kvm_mmu_free_obsolete_roots(vcpu
->kvm
, &vcpu
->arch
.guest_mmu
);
5573 static u64
mmu_pte_write_fetch_gpte(struct kvm_vcpu
*vcpu
, gpa_t
*gpa
,
5580 * Assume that the pte write on a page table of the same type
5581 * as the current vcpu paging mode since we update the sptes only
5582 * when they have the same mode.
5584 if (is_pae(vcpu
) && *bytes
== 4) {
5585 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
5590 if (*bytes
== 4 || *bytes
== 8) {
5591 r
= kvm_vcpu_read_guest_atomic(vcpu
, *gpa
, &gentry
, *bytes
);
5600 * If we're seeing too many writes to a page, it may no longer be a page table,
5601 * or we may be forking, in which case it is better to unmap the page.
5603 static bool detect_write_flooding(struct kvm_mmu_page
*sp
)
5606 * Skip write-flooding detected for the sp whose level is 1, because
5607 * it can become unsync, then the guest page is not write-protected.
5609 if (sp
->role
.level
== PG_LEVEL_4K
)
5612 atomic_inc(&sp
->write_flooding_count
);
5613 return atomic_read(&sp
->write_flooding_count
) >= 3;
/*
 * Misaligned accesses are too much trouble to fix up; also, they usually
 * indicate a page is not used as a page table.
 */
static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
				    int bytes)
{
	unsigned offset, pte_size, misaligned;

	offset = offset_in_page(gpa);
	pte_size = sp->role.has_4_byte_gpte ? 4 : 8;

	/*
	 * Sometimes, the OS only writes the last byte to update status
	 * bits, for example, in Linux, andb is used in clear_bit().
	 */
	if (!(offset & (pte_size - 1)) && bytes == 1)
		return false;

	misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
	misaligned |= bytes < 4;

	return misaligned;
}
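
/*
 * Example with 8-byte gptes: an 8-byte write at page offset 8 stays within a
 * single gpte (8 ^ 15 == 7, masked to 0) and is considered aligned, while the
 * same write at offset 4 straddles two gptes (4 ^ 11 == 15, masked to 8) and
 * is flagged as misaligned.  Writes shorter than 4 bytes are also flagged,
 * except for the aligned single-byte status-bit updates handled above.
 */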
static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
{
	unsigned page_offset, quadrant;
	u64 *spte;
	int level;

	page_offset = offset_in_page(gpa);
	level = sp->role.level;
	*nspte = 1;
	if (sp->role.has_4_byte_gpte) {
		page_offset <<= 1;	/* 32->64 */
		/*
		 * A 32-bit pde maps 4MB while the shadow pdes map
		 * only 2MB.  So we need to double the offset again
		 * and zap two pdes instead of one.
		 */
		if (level == PT32_ROOT_LEVEL) {
			page_offset &= ~7; /* kill rounding error */
			page_offset <<= 1;
			*nspte = 2;
		}
		quadrant = page_offset >> PAGE_SHIFT;
		page_offset &= ~PAGE_MASK;
		if (quadrant != sp->role.quadrant)
			return NULL;
	}

	spte = &sp->spt[page_offset / sizeof(*spte)];
	return spte;
}
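
/*
 * Quadrant example for 4-byte gptes: a write at page offset 0x800 covers
 * guest entries 512 and up; doubling the offset maps it to shadow offset
 * 0x1000, i.e. quadrant 1.  A shadow page tracking a different quadrant of
 * the same guest page is unaffected and yields no sptes to zap.
 */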
5672 void kvm_mmu_track_write(struct kvm_vcpu
*vcpu
, gpa_t gpa
, const u8
*new,
5675 gfn_t gfn
= gpa
>> PAGE_SHIFT
;
5676 struct kvm_mmu_page
*sp
;
5677 LIST_HEAD(invalid_list
);
5678 u64 entry
, gentry
, *spte
;
5683 * If we don't have indirect shadow pages, it means no page is
5684 * write-protected, so we can exit simply.
5686 if (!READ_ONCE(vcpu
->kvm
->arch
.indirect_shadow_pages
))
5689 write_lock(&vcpu
->kvm
->mmu_lock
);
5691 gentry
= mmu_pte_write_fetch_gpte(vcpu
, &gpa
, &bytes
);
5693 ++vcpu
->kvm
->stat
.mmu_pte_write
;
5695 for_each_gfn_valid_sp_with_gptes(vcpu
->kvm
, sp
, gfn
) {
5696 if (detect_write_misaligned(sp
, gpa
, bytes
) ||
5697 detect_write_flooding(sp
)) {
5698 kvm_mmu_prepare_zap_page(vcpu
->kvm
, sp
, &invalid_list
);
5699 ++vcpu
->kvm
->stat
.mmu_flooded
;
5703 spte
= get_written_sptes(sp
, gpa
, &npte
);
5709 mmu_page_zap_pte(vcpu
->kvm
, sp
, spte
, NULL
);
5710 if (gentry
&& sp
->role
.level
!= PG_LEVEL_4K
)
5711 ++vcpu
->kvm
->stat
.mmu_pde_zapped
;
5712 if (is_shadow_present_pte(entry
))
5717 kvm_mmu_remote_flush_or_zap(vcpu
->kvm
, &invalid_list
, flush
);
5718 write_unlock(&vcpu
->kvm
->mmu_lock
);
5721 int noinline
kvm_mmu_page_fault(struct kvm_vcpu
*vcpu
, gpa_t cr2_or_gpa
, u64 error_code
,
5722 void *insn
, int insn_len
)
5724 int r
, emulation_type
= EMULTYPE_PF
;
5725 bool direct
= vcpu
->arch
.mmu
->root_role
.direct
;
5728 * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP
5729 * checks when emulating instructions that triggers implicit access.
5730 * WARN if hardware generates a fault with an error code that collides
5731 * with the KVM-defined value. Clear the flag and continue on, i.e.
5732 * don't terminate the VM, as KVM can't possibly be relying on a flag
5733 * that KVM doesn't know about.
5735 if (WARN_ON_ONCE(error_code
& PFERR_IMPLICIT_ACCESS
))
5736 error_code
&= ~PFERR_IMPLICIT_ACCESS
;
5738 if (WARN_ON_ONCE(!VALID_PAGE(vcpu
->arch
.mmu
->root
.hpa
)))
5739 return RET_PF_RETRY
;
5742 if (unlikely(error_code
& PFERR_RSVD_MASK
)) {
5743 r
= handle_mmio_page_fault(vcpu
, cr2_or_gpa
, direct
);
5744 if (r
== RET_PF_EMULATE
)
5748 if (r
== RET_PF_INVALID
) {
5749 r
= kvm_mmu_do_page_fault(vcpu
, cr2_or_gpa
,
5750 lower_32_bits(error_code
), false,
5752 if (KVM_BUG_ON(r
== RET_PF_INVALID
, vcpu
->kvm
))
5758 if (r
!= RET_PF_EMULATE
)
5762 * Before emulating the instruction, check if the error code
5763 * was due to a RO violation while translating the guest page.
5764 * This can occur when using nested virtualization with nested
5765 * paging in both guests. If true, we simply unprotect the page
5766 * and resume the guest.
5768 if (vcpu
->arch
.mmu
->root_role
.direct
&&
5769 (error_code
& PFERR_NESTED_GUEST_PAGE
) == PFERR_NESTED_GUEST_PAGE
) {
5770 kvm_mmu_unprotect_page(vcpu
->kvm
, gpa_to_gfn(cr2_or_gpa
));
5775 * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
5776 * optimistically try to just unprotect the page and let the processor
5777 * re-execute the instruction that caused the page fault. Do not allow
5778 * retrying MMIO emulation, as it's not only pointless but could also
5779 * cause us to enter an infinite loop because the processor will keep
5780 * faulting on the non-existent MMIO address. Retrying an instruction
5781 * from a nested guest is also pointless and dangerous as we are only
5782 * explicitly shadowing L1's page tables, i.e. unprotecting something
5783 * for L1 isn't going to magically fix whatever issue cause L2 to fail.
5785 if (!mmio_info_in_cache(vcpu
, cr2_or_gpa
, direct
) && !is_guest_mode(vcpu
))
5786 emulation_type
|= EMULTYPE_ALLOW_RETRY_PF
;
5788 return x86_emulate_instruction(vcpu
, cr2_or_gpa
, emulation_type
, insn
,
5791 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault
);
5793 static void __kvm_mmu_invalidate_addr(struct kvm_vcpu
*vcpu
, struct kvm_mmu
*mmu
,
5794 u64 addr
, hpa_t root_hpa
)
5796 struct kvm_shadow_walk_iterator iterator
;
5798 vcpu_clear_mmio_info(vcpu
, addr
);
5801 * Walking and synchronizing SPTEs both assume they are operating in
5802 * the context of the current MMU, and would need to be reworked if
5803 * this is ever used to sync the guest_mmu, e.g. to emulate INVEPT.
5805 if (WARN_ON_ONCE(mmu
!= vcpu
->arch
.mmu
))
5808 if (!VALID_PAGE(root_hpa
))
5811 write_lock(&vcpu
->kvm
->mmu_lock
);
5812 for_each_shadow_entry_using_root(vcpu
, root_hpa
, addr
, iterator
) {
5813 struct kvm_mmu_page
*sp
= sptep_to_sp(iterator
.sptep
);
5816 int ret
= kvm_sync_spte(vcpu
, sp
, iterator
.index
);
5819 mmu_page_zap_pte(vcpu
->kvm
, sp
, iterator
.sptep
, NULL
);
5821 kvm_flush_remote_tlbs_sptep(vcpu
->kvm
, iterator
.sptep
);
5824 if (!sp
->unsync_children
)
5827 write_unlock(&vcpu
->kvm
->mmu_lock
);
5830 void kvm_mmu_invalidate_addr(struct kvm_vcpu
*vcpu
, struct kvm_mmu
*mmu
,
5831 u64 addr
, unsigned long roots
)
5835 WARN_ON_ONCE(roots
& ~KVM_MMU_ROOTS_ALL
);
5837 /* It's actually a GPA for vcpu->arch.guest_mmu. */
5838 if (mmu
!= &vcpu
->arch
.guest_mmu
) {
5839 /* INVLPG on a non-canonical address is a NOP according to the SDM. */
5840 if (is_noncanonical_address(addr
, vcpu
))
5843 static_call(kvm_x86_flush_tlb_gva
)(vcpu
, addr
);
5846 if (!mmu
->sync_spte
)
5849 if (roots
& KVM_MMU_ROOT_CURRENT
)
5850 __kvm_mmu_invalidate_addr(vcpu
, mmu
, addr
, mmu
->root
.hpa
);
5852 for (i
= 0; i
< KVM_MMU_NUM_PREV_ROOTS
; i
++) {
5853 if (roots
& KVM_MMU_ROOT_PREVIOUS(i
))
5854 __kvm_mmu_invalidate_addr(vcpu
, mmu
, addr
, mmu
->prev_roots
[i
].hpa
);
5857 EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_addr
);
5859 void kvm_mmu_invlpg(struct kvm_vcpu
*vcpu
, gva_t gva
)
5862 * INVLPG is required to invalidate any global mappings for the VA,
5863 * irrespective of PCID. Blindly sync all roots as it would take
5864 * roughly the same amount of work/time to determine whether any of the
5865 * previous roots have a global mapping.
5867 * Mappings not reachable via the current or previous cached roots will
5868 * be synced when switching to that new cr3, so nothing needs to be
5869 * done here for them.
5871 kvm_mmu_invalidate_addr(vcpu
, vcpu
->arch
.walk_mmu
, gva
, KVM_MMU_ROOTS_ALL
);
5872 ++vcpu
->stat
.invlpg
;
5874 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg
);
5877 void kvm_mmu_invpcid_gva(struct kvm_vcpu
*vcpu
, gva_t gva
, unsigned long pcid
)
5879 struct kvm_mmu
*mmu
= vcpu
->arch
.mmu
;
5880 unsigned long roots
= 0;
5883 if (pcid
== kvm_get_active_pcid(vcpu
))
5884 roots
|= KVM_MMU_ROOT_CURRENT
;
5886 for (i
= 0; i
< KVM_MMU_NUM_PREV_ROOTS
; i
++) {
5887 if (VALID_PAGE(mmu
->prev_roots
[i
].hpa
) &&
5888 pcid
== kvm_get_pcid(vcpu
, mmu
->prev_roots
[i
].pgd
))
5889 roots
|= KVM_MMU_ROOT_PREVIOUS(i
);
5893 kvm_mmu_invalidate_addr(vcpu
, mmu
, gva
, roots
);
5894 ++vcpu
->stat
.invlpg
;
5897 * Mappings not reachable via the current cr3 or the prev_roots will be
5898 * synced when switching to that cr3, so nothing needs to be done here
5903 void kvm_configure_mmu(bool enable_tdp
, int tdp_forced_root_level
,
5904 int tdp_max_root_level
, int tdp_huge_page_level
)
5906 tdp_enabled
= enable_tdp
;
5907 tdp_root_level
= tdp_forced_root_level
;
5908 max_tdp_level
= tdp_max_root_level
;
5910 #ifdef CONFIG_X86_64
5911 tdp_mmu_enabled
= tdp_mmu_allowed
&& tdp_enabled
;
5914 * max_huge_page_level reflects KVM's MMU capabilities irrespective
5915 * of kernel support, e.g. KVM may be capable of using 1GB pages when
5916 * the kernel is not. But, KVM never creates a page size greater than
5917 * what is used by the kernel for any given HVA, i.e. the kernel's
5918 * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust().
5921 max_huge_page_level
= tdp_huge_page_level
;
5922 else if (boot_cpu_has(X86_FEATURE_GBPAGES
))
5923 max_huge_page_level
= PG_LEVEL_1G
;
5925 max_huge_page_level
= PG_LEVEL_2M
;
5927 EXPORT_SYMBOL_GPL(kvm_configure_mmu
);
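
/*
 * For example, with TDP enabled a hypothetical vendor call such as
 * kvm_configure_mmu(true, 0, 5, PG_LEVEL_1G) leaves max_huge_page_level at
 * PG_LEVEL_1G even if the host lacks X86_FEATURE_GBPAGES, whereas shadow
 * paging on the same host would be capped at PG_LEVEL_2M.
 */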
5929 /* The return value indicates if tlb flush on all vcpus is needed. */
5930 typedef bool (*slot_rmaps_handler
) (struct kvm
*kvm
,
5931 struct kvm_rmap_head
*rmap_head
,
5932 const struct kvm_memory_slot
*slot
);
5934 static __always_inline
bool __walk_slot_rmaps(struct kvm
*kvm
,
5935 const struct kvm_memory_slot
*slot
,
5936 slot_rmaps_handler fn
,
5937 int start_level
, int end_level
,
5938 gfn_t start_gfn
, gfn_t end_gfn
,
5939 bool flush_on_yield
, bool flush
)
5941 struct slot_rmap_walk_iterator iterator
;
5943 lockdep_assert_held_write(&kvm
->mmu_lock
);
5945 for_each_slot_rmap_range(slot
, start_level
, end_level
, start_gfn
,
5946 end_gfn
, &iterator
) {
5948 flush
|= fn(kvm
, iterator
.rmap
, slot
);
5950 if (need_resched() || rwlock_needbreak(&kvm
->mmu_lock
)) {
5951 if (flush
&& flush_on_yield
) {
5952 kvm_flush_remote_tlbs_range(kvm
, start_gfn
,
5953 iterator
.gfn
- start_gfn
+ 1);
5956 cond_resched_rwlock_write(&kvm
->mmu_lock
);
5963 static __always_inline
bool walk_slot_rmaps(struct kvm
*kvm
,
5964 const struct kvm_memory_slot
*slot
,
5965 slot_rmaps_handler fn
,
5966 int start_level
, int end_level
,
5967 bool flush_on_yield
)
5969 return __walk_slot_rmaps(kvm
, slot
, fn
, start_level
, end_level
,
5970 slot
->base_gfn
, slot
->base_gfn
+ slot
->npages
- 1,
5971 flush_on_yield
, false);
5974 static __always_inline
bool walk_slot_rmaps_4k(struct kvm
*kvm
,
5975 const struct kvm_memory_slot
*slot
,
5976 slot_rmaps_handler fn
,
5977 bool flush_on_yield
)
5979 return walk_slot_rmaps(kvm
, slot
, fn
, PG_LEVEL_4K
, PG_LEVEL_4K
, flush_on_yield
);
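
/*
 * Typical usage, as in kvm_mmu_slot_remove_write_access() below: pass a
 * slot_rmaps_handler such as slot_rmap_write_protect() along with the level
 * range to visit, e.g.
 *	walk_slot_rmaps(kvm, memslot, slot_rmap_write_protect,
 *			start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
 * with mmu_lock held for write; flush_on_yield controls whether pending TLB
 * flushes are performed before the walk yields mmu_lock.
 */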
5982 static void free_mmu_pages(struct kvm_mmu
*mmu
)
5984 if (!tdp_enabled
&& mmu
->pae_root
)
5985 set_memory_encrypted((unsigned long)mmu
->pae_root
, 1);
5986 free_page((unsigned long)mmu
->pae_root
);
5987 free_page((unsigned long)mmu
->pml4_root
);
5988 free_page((unsigned long)mmu
->pml5_root
);
5991 static int __kvm_mmu_create(struct kvm_vcpu
*vcpu
, struct kvm_mmu
*mmu
)
5996 mmu
->root
.hpa
= INVALID_PAGE
;
5998 for (i
= 0; i
< KVM_MMU_NUM_PREV_ROOTS
; i
++)
5999 mmu
->prev_roots
[i
] = KVM_MMU_ROOT_INFO_INVALID
;
6001 /* vcpu->arch.guest_mmu isn't used when !tdp_enabled. */
6002 if (!tdp_enabled
&& mmu
== &vcpu
->arch
.guest_mmu
)
6006 * When using PAE paging, the four PDPTEs are treated as 'root' pages,
6007 * while the PDP table is a per-vCPU construct that's allocated at MMU
6008 * creation. When emulating 32-bit mode, cr3 is only 32 bits even on
6009 * x86_64. Therefore we need to allocate the PDP table in the first
6010 * 4GB of memory, which happens to fit the DMA32 zone. TDP paging
6011 * generally doesn't use PAE paging and can skip allocating the PDP
6012 * table. The main exception, handled here, is SVM's 32-bit NPT. The
6013 * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
6014 * KVM; that horror is handled on-demand by mmu_alloc_special_roots().
6016 if (tdp_enabled
&& kvm_mmu_get_tdp_level(vcpu
) > PT32E_ROOT_LEVEL
)
6019 page
= alloc_page(GFP_KERNEL_ACCOUNT
| __GFP_DMA32
);
6023 mmu
->pae_root
= page_address(page
);
6026 * CR3 is only 32 bits when PAE paging is used, thus it's impossible to
6027 * get the CPU to treat the PDPTEs as encrypted. Decrypt the page so
6028 * that KVM's writes and the CPU's reads get along. Note, this is
6029 * only necessary when using shadow paging, as 64-bit NPT can get at
6030 * the C-bit even when shadowing 32-bit NPT, and SME isn't supported
6031 * by 32-bit kernels (when KVM itself uses 32-bit NPT).
6034 set_memory_decrypted((unsigned long)mmu
->pae_root
, 1);
6036 WARN_ON_ONCE(shadow_me_value
);
6038 for (i
= 0; i
< 4; ++i
)
6039 mmu
->pae_root
[i
] = INVALID_PAE_ROOT
;
6044 int kvm_mmu_create(struct kvm_vcpu
*vcpu
)
6048 vcpu
->arch
.mmu_pte_list_desc_cache
.kmem_cache
= pte_list_desc_cache
;
6049 vcpu
->arch
.mmu_pte_list_desc_cache
.gfp_zero
= __GFP_ZERO
;
6051 vcpu
->arch
.mmu_page_header_cache
.kmem_cache
= mmu_page_header_cache
;
6052 vcpu
->arch
.mmu_page_header_cache
.gfp_zero
= __GFP_ZERO
;
6054 vcpu
->arch
.mmu_shadow_page_cache
.gfp_zero
= __GFP_ZERO
;
6056 vcpu
->arch
.mmu
= &vcpu
->arch
.root_mmu
;
6057 vcpu
->arch
.walk_mmu
= &vcpu
->arch
.root_mmu
;
6059 ret
= __kvm_mmu_create(vcpu
, &vcpu
->arch
.guest_mmu
);
6063 ret
= __kvm_mmu_create(vcpu
, &vcpu
->arch
.root_mmu
);
6065 goto fail_allocate_root
;
6069 free_mmu_pages(&vcpu
->arch
.guest_mmu
);
6073 #define BATCH_ZAP_PAGES 10
6074 static void kvm_zap_obsolete_pages(struct kvm
*kvm
)
6076 struct kvm_mmu_page
*sp
, *node
;
6077 int nr_zapped
, batch
= 0;
6081 list_for_each_entry_safe_reverse(sp
, node
,
6082 &kvm
->arch
.active_mmu_pages
, link
) {
6084 * No obsolete valid page exists before a newly created page
6085 * since active_mmu_pages is a FIFO list.
6087 if (!is_obsolete_sp(kvm
, sp
))
6091 * Invalid pages should never land back on the list of active
6092 * pages. Skip the bogus page, otherwise we'll get stuck in an
6093 * infinite loop if the page gets put back on the list (again).
6095 if (WARN_ON_ONCE(sp
->role
.invalid
))
6099 * No need to flush the TLB since we're only zapping shadow
6100 * pages with an obsolete generation number and all vCPUS have
6101 * loaded a new root, i.e. the shadow pages being zapped cannot
6102 * be in active use by the guest.
6104 if (batch
>= BATCH_ZAP_PAGES
&&
6105 cond_resched_rwlock_write(&kvm
->mmu_lock
)) {
6110 unstable
= __kvm_mmu_prepare_zap_page(kvm
, sp
,
6111 &kvm
->arch
.zapped_obsolete_pages
, &nr_zapped
);
6119 * Kick all vCPUs (via remote TLB flush) before freeing the page tables
6120 * to ensure KVM is not in the middle of a lockless shadow page table
6121 * walk, which may reference the pages. The remote TLB flush itself is
6122 * not required and is simply a convenient way to kick vCPUs as needed.
6123 * KVM performs a local TLB flush when allocating a new root (see
6124 * kvm_mmu_load()), and the reload in the caller ensure no vCPUs are
6125 * running with an obsolete MMU.
6127 kvm_mmu_commit_zap_page(kvm
, &kvm
->arch
.zapped_obsolete_pages
);
/*
 * Fast invalidate all shadow pages and use lock-break technique
 * to zap obsolete pages.
 *
 * It's required when memslot is being deleted or VM is being
 * destroyed, in these cases, we should ensure that KVM MMU does
 * not use any resource of the being-deleted slot or all slots
 * after calling the function.
 */
static void kvm_mmu_zap_all_fast(struct kvm *kvm)
{
	lockdep_assert_held(&kvm->slots_lock);

	write_lock(&kvm->mmu_lock);
	trace_kvm_mmu_zap_all_fast(kvm);

	/*
	 * Toggle mmu_valid_gen between '0' and '1'.  Because slots_lock is
	 * held for the entire duration of zapping obsolete pages, it's
	 * impossible for there to be multiple invalid generations associated
	 * with *valid* shadow pages at any given time, i.e. there is exactly
	 * one valid generation and (at most) one invalid generation.
	 */
	kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;

	/*
	 * In order to ensure all vCPUs drop their soon-to-be invalid roots,
	 * invalidating TDP MMU roots must be done while holding mmu_lock for
	 * write and in the same critical section as making the reload request,
	 * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield.
	 */
	if (tdp_mmu_enabled)
		kvm_tdp_mmu_invalidate_all_roots(kvm);

	/*
	 * Notify all vcpus to reload their shadow page tables and flush TLBs.
	 * Then all vcpus will switch to the new shadow page table with the new
	 * mmu_valid_gen.
	 *
	 * Note: we need to do this under the protection of mmu_lock,
	 * otherwise, a vcpu would purge its shadow pages but miss the tlb flush.
	 */
	kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);

	kvm_zap_obsolete_pages(kvm);

	write_unlock(&kvm->mmu_lock);

	/*
	 * Zap the invalidated TDP MMU roots, all SPTEs must be dropped before
	 * returning to the caller, e.g. if the zap is in response to a memslot
	 * deletion, mmu_notifier callbacks will be unable to reach the SPTEs
	 * associated with the deleted memslot once the update completes.
	 * Deferring the zap until the final reference to the root is put would
	 * lead to use-after-free.
	 */
	if (tdp_mmu_enabled)
		kvm_tdp_mmu_zap_invalidated_roots(kvm);
}
static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
{
	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
}

void kvm_mmu_init_vm(struct kvm *kvm)
{
	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
	INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
	INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
	spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);

	if (tdp_mmu_enabled)
		kvm_mmu_init_tdp_mmu(kvm);

	kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache;
	kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO;

	kvm->arch.split_shadow_page_cache.gfp_zero = __GFP_ZERO;

	kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache;
	kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO;
}

static void mmu_free_vm_memory_caches(struct kvm *kvm)
{
	kvm_mmu_free_memory_cache(&kvm->arch.split_desc_cache);
	kvm_mmu_free_memory_cache(&kvm->arch.split_page_header_cache);
	kvm_mmu_free_memory_cache(&kvm->arch.split_shadow_page_cache);
}

void kvm_mmu_uninit_vm(struct kvm *kvm)
{
	if (tdp_mmu_enabled)
		kvm_mmu_uninit_tdp_mmu(kvm);

	mmu_free_vm_memory_caches(kvm);
}
static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
{
	const struct kvm_memory_slot *memslot;
	struct kvm_memslots *slots;
	struct kvm_memslot_iter iter;
	gfn_t start, end;
	bool flush = false;
	int i;

	if (!kvm_memslots_have_rmaps(kvm))
		return flush;

	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		slots = __kvm_memslots(kvm, i);

		kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) {
			memslot = iter.slot;
			start = max(gfn_start, memslot->base_gfn);
			end = min(gfn_end, memslot->base_gfn + memslot->npages);
			if (WARN_ON_ONCE(start >= end))
				continue;

			flush = __walk_slot_rmaps(kvm, memslot, __kvm_zap_rmap,
						  PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
						  start, end - 1, true, flush);
		}
	}

	return flush;
}
/*
 * Invalidate (zap) SPTEs that cover GFNs from gfn_start and up to gfn_end
 * (not including it)
 */
void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
{
	bool flush;

	if (WARN_ON_ONCE(gfn_end <= gfn_start))
		return;

	write_lock(&kvm->mmu_lock);

	kvm_mmu_invalidate_begin(kvm);

	kvm_mmu_invalidate_range_add(kvm, gfn_start, gfn_end);

	flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);

	if (tdp_mmu_enabled)
		flush = kvm_tdp_mmu_zap_leafs(kvm, gfn_start, gfn_end, flush);

	if (flush)
		kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start);

	kvm_mmu_invalidate_end(kvm);

	write_unlock(&kvm->mmu_lock);
}
static bool slot_rmap_write_protect(struct kvm *kvm,
				    struct kvm_rmap_head *rmap_head,
				    const struct kvm_memory_slot *slot)
{
	return rmap_write_protect(rmap_head, false);
}

void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
				      const struct kvm_memory_slot *memslot,
				      int start_level)
{
	if (kvm_memslots_have_rmaps(kvm)) {
		write_lock(&kvm->mmu_lock);
		walk_slot_rmaps(kvm, memslot, slot_rmap_write_protect,
				start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
		write_unlock(&kvm->mmu_lock);
	}

	if (tdp_mmu_enabled) {
		read_lock(&kvm->mmu_lock);
		kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
		read_unlock(&kvm->mmu_lock);
	}
}
static inline bool need_topup(struct kvm_mmu_memory_cache *cache, int min)
{
	return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
}

static bool need_topup_split_caches_or_resched(struct kvm *kvm)
{
	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
		return true;

	/*
	 * In the worst case, SPLIT_DESC_CACHE_MIN_NR_OBJECTS descriptors are needed
	 * to split a single huge page.  Calculating how many are actually needed
	 * is possible but not worth the complexity.
	 */
	return need_topup(&kvm->arch.split_desc_cache, SPLIT_DESC_CACHE_MIN_NR_OBJECTS) ||
	       need_topup(&kvm->arch.split_page_header_cache, 1) ||
	       need_topup(&kvm->arch.split_shadow_page_cache, 1);
}

static int topup_split_caches(struct kvm *kvm)
{
	/*
	 * Allocating rmap list entries when splitting huge pages for nested
	 * MMUs is uncommon as KVM needs to use a list if and only if there is
	 * more than one rmap entry for a gfn, i.e. requires an L1 gfn to be
	 * aliased by multiple L2 gfns and/or from multiple nested roots with
	 * different roles.  Aliasing gfns when using TDP is atypical for VMMs;
	 * a few gfns are often aliased during boot, e.g. when remapping BIOS,
	 * but aliasing rarely occurs post-boot or for many gfns.  If there is
	 * only one rmap entry, rmap->val points directly at that one entry and
	 * doesn't need to allocate a list.  Buffer the cache by the default
	 * capacity so that KVM doesn't have to drop mmu_lock to topup if KVM
	 * encounters an aliased gfn or two.
	 */
	const int capacity = SPLIT_DESC_CACHE_MIN_NR_OBJECTS +
			     KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE;
	int r;

	lockdep_assert_held(&kvm->slots_lock);

	r = __kvm_mmu_topup_memory_cache(&kvm->arch.split_desc_cache, capacity,
					 SPLIT_DESC_CACHE_MIN_NR_OBJECTS);
	if (r)
		return r;

	r = kvm_mmu_topup_memory_cache(&kvm->arch.split_page_header_cache, 1);
	if (r)
		return r;

	return kvm_mmu_topup_memory_cache(&kvm->arch.split_shadow_page_cache, 1);
}
static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *huge_sptep)
{
	struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep);
	struct shadow_page_caches caches = {};
	union kvm_mmu_page_role role;
	unsigned int access;
	gfn_t gfn;

	gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep));
	access = kvm_mmu_page_get_access(huge_sp, spte_index(huge_sptep));

	/*
	 * Note, huge page splitting always uses direct shadow pages, regardless
	 * of whether the huge page itself is mapped by a direct or indirect
	 * shadow page, since the huge page region itself is being directly
	 * mapped with smaller pages.
	 */
	role = kvm_mmu_child_role(huge_sptep, /*direct=*/true, access);

	/* Direct SPs do not require a shadowed_info_cache. */
	caches.page_header_cache = &kvm->arch.split_page_header_cache;
	caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache;

	/* Safe to pass NULL for vCPU since requesting a direct SP. */
	return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role);
}
static void shadow_mmu_split_huge_page(struct kvm *kvm,
				       const struct kvm_memory_slot *slot,
				       u64 *huge_sptep)
{
	struct kvm_mmu_memory_cache *cache = &kvm->arch.split_desc_cache;
	u64 huge_spte = READ_ONCE(*huge_sptep);
	struct kvm_mmu_page *sp;
	bool flush = false;
	u64 *sptep, spte;
	gfn_t gfn;
	int index;

	sp = shadow_mmu_get_sp_for_split(kvm, huge_sptep);

	for (index = 0; index < SPTE_ENT_PER_PAGE; index++) {
		sptep = &sp->spt[index];
		gfn = kvm_mmu_page_get_gfn(sp, index);

		/*
		 * The SP may already have populated SPTEs, e.g. if this huge
		 * page is aliased by multiple sptes with the same access
		 * permissions.  These entries are guaranteed to map the same
		 * gfn-to-pfn translation since the SP is direct, so no need to
		 * modify them.
		 *
		 * However, if a given SPTE points to a lower level page table,
		 * that lower level page table may only be partially populated.
		 * Installing such SPTEs would effectively unmap a portion of the
		 * huge page.  Unmapping guest memory always requires a TLB flush
		 * since a subsequent operation on the unmapped regions would
		 * fail to detect the need to flush.
		 */
		if (is_shadow_present_pte(*sptep)) {
			flush |= !is_last_spte(*sptep, sp->role.level);
			continue;
		}

		spte = make_huge_page_split_spte(kvm, huge_spte, sp->role, index);
		mmu_spte_set(sptep, spte);
		__rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access);
	}

	__link_shadow_page(kvm, cache, huge_sptep, sp, flush);
}
static int shadow_mmu_try_split_huge_page(struct kvm *kvm,
					  const struct kvm_memory_slot *slot,
					  u64 *huge_sptep)
{
	struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep);
	int level, r = 0;
	gfn_t gfn;
	u64 spte;

	/* Grab information for the tracepoint before dropping the MMU lock. */
	gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep));
	level = huge_sp->role.level;
	spte = *huge_sptep;

	if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES) {
		r = -ENOSPC;
		goto out;
	}

	if (need_topup_split_caches_or_resched(kvm)) {
		write_unlock(&kvm->mmu_lock);
		cond_resched();
		/*
		 * If the topup succeeds, return -EAGAIN to indicate that the
		 * rmap iterator should be restarted because the MMU lock was
		 * dropped.
		 */
		r = topup_split_caches(kvm) ?: -EAGAIN;
		write_lock(&kvm->mmu_lock);
		goto out;
	}

	shadow_mmu_split_huge_page(kvm, slot, huge_sptep);

out:
	trace_kvm_mmu_split_huge_page(gfn, spte, level, r);
	return r;
}
static bool shadow_mmu_try_split_huge_pages(struct kvm *kvm,
					    struct kvm_rmap_head *rmap_head,
					    const struct kvm_memory_slot *slot)
{
	struct rmap_iterator iter;
	struct kvm_mmu_page *sp;
	u64 *huge_sptep;
	int r;

restart:
	for_each_rmap_spte(rmap_head, &iter, huge_sptep) {
		sp = sptep_to_sp(huge_sptep);

		/* TDP MMU is enabled, so rmap only contains nested MMU SPs. */
		if (WARN_ON_ONCE(!sp->role.guest_mode))
			continue;

		/* The rmaps should never contain non-leaf SPTEs. */
		if (WARN_ON_ONCE(!is_large_pte(*huge_sptep)))
			continue;

		/* SPs with level > PG_LEVEL_4K should never be unsync. */
		if (WARN_ON_ONCE(sp->unsync))
			continue;

		/* Don't bother splitting huge pages on invalid SPs. */
		if (sp->role.invalid)
			continue;

		r = shadow_mmu_try_split_huge_page(kvm, slot, huge_sptep);

		/*
		 * The split succeeded or needs to be retried because the MMU
		 * lock was dropped.  Either way, restart the iterator to get it
		 * back into a consistent state.
		 */
		if (!r || r == -EAGAIN)
			goto restart;

		/* The split failed and shouldn't be retried (e.g. -ENOMEM). */
		break;
	}

	return false;
}
static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm,
						const struct kvm_memory_slot *slot,
						gfn_t start, gfn_t end,
						int target_level)
{
	int level;

	/*
	 * Split huge pages starting with KVM_MAX_HUGEPAGE_LEVEL and working
	 * down to the target level.  This ensures pages are recursively split
	 * all the way to the target level.  There's no need to split pages
	 * already at the target level.
	 */
	for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--)
		__walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages,
				  level, level, start, end - 1, true, false);
}
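/*
 * For example (a sketch of the loop above): requesting target_level ==
 * PG_LEVEL_4K for a slot backed by 1GiB mappings first walks the rmaps at
 * the 1GiB level and splits each 1GiB SPTE into 512 2MiB SPTEs, then walks
 * the 2MiB level and splits those into 4KiB SPTEs, i.e. each pass only ever
 * splits one level at a time.
 */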
/* Must be called with the mmu_lock held in write-mode. */
void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
				  const struct kvm_memory_slot *memslot,
				  u64 start, u64 end,
				  int target_level)
{
	if (!tdp_mmu_enabled)
		return;

	if (kvm_memslots_have_rmaps(kvm))
		kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);

	kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, false);

	/*
	 * A TLB flush is unnecessary at this point for the same reasons as in
	 * kvm_mmu_slot_try_split_huge_pages().
	 */
}
void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
				       const struct kvm_memory_slot *memslot,
				       int target_level)
{
	u64 start = memslot->base_gfn;
	u64 end = start + memslot->npages;

	if (!tdp_mmu_enabled)
		return;

	if (kvm_memslots_have_rmaps(kvm)) {
		write_lock(&kvm->mmu_lock);
		kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
		write_unlock(&kvm->mmu_lock);
	}

	read_lock(&kvm->mmu_lock);
	kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);
	read_unlock(&kvm->mmu_lock);

	/*
	 * No TLB flush is necessary here.  KVM will flush TLBs after
	 * write-protecting and/or clearing dirty on the newly split SPTEs to
	 * ensure that guest writes are reflected in the dirty log before the
	 * ioctl to enable dirty logging on this memslot completes.  Since the
	 * split SPTEs retain the write and dirty bits of the huge SPTE, it is
	 * safe for KVM to decide if a TLB flush is necessary based on the split
	 * SPTEs.
	 */
}
static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
					 struct kvm_rmap_head *rmap_head,
					 const struct kvm_memory_slot *slot)
{
	u64 *sptep;
	struct rmap_iterator iter;
	int need_tlb_flush = 0;
	struct kvm_mmu_page *sp;

restart:
	for_each_rmap_spte(rmap_head, &iter, sptep) {
		sp = sptep_to_sp(sptep);

		/*
		 * We cannot do huge page mapping for indirect shadow pages,
		 * which are found on the last rmap (level = 1) when not using
		 * tdp; such shadow pages are synced with the page table in
		 * the guest, and the guest page table is using 4K page size
		 * mapping if the indirect sp has level = 1.
		 */
		if (sp->role.direct &&
		    sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
								PG_LEVEL_NUM)) {
			kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);

			if (kvm_available_flush_remote_tlbs_range())
				kvm_flush_remote_tlbs_sptep(kvm, sptep);
			else
				need_tlb_flush = 1;

			goto restart;
		}
	}

	return need_tlb_flush;
}
static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
					   const struct kvm_memory_slot *slot)
{
	/*
	 * Note, use KVM_MAX_HUGEPAGE_LEVEL - 1 since there's no need to zap
	 * pages that are already mapped at the maximum hugepage level.
	 */
	if (walk_slot_rmaps(kvm, slot, kvm_mmu_zap_collapsible_spte,
			    PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true))
		kvm_flush_remote_tlbs_memslot(kvm, slot);
}

void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
				   const struct kvm_memory_slot *slot)
{
	if (kvm_memslots_have_rmaps(kvm)) {
		write_lock(&kvm->mmu_lock);
		kvm_rmap_zap_collapsible_sptes(kvm, slot);
		write_unlock(&kvm->mmu_lock);
	}

	if (tdp_mmu_enabled) {
		read_lock(&kvm->mmu_lock);
		kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
		read_unlock(&kvm->mmu_lock);
	}
}
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
				   const struct kvm_memory_slot *memslot)
{
	if (kvm_memslots_have_rmaps(kvm)) {
		write_lock(&kvm->mmu_lock);
		/*
		 * Clear dirty bits only on 4k SPTEs since the legacy MMU only
		 * supports dirty logging at a 4k granularity.
		 */
		walk_slot_rmaps_4k(kvm, memslot, __rmap_clear_dirty, false);
		write_unlock(&kvm->mmu_lock);
	}

	if (tdp_mmu_enabled) {
		read_lock(&kvm->mmu_lock);
		kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
		read_unlock(&kvm->mmu_lock);
	}

	/*
	 * The caller will flush the TLBs after this function returns.
	 *
	 * It's also safe to flush TLBs out of mmu lock here as currently this
	 * function is only used for dirty logging, in which case flushing TLB
	 * out of mmu lock also guarantees no dirty pages will be lost in
	 * dirty_bitmap.
	 */
}
static void kvm_mmu_zap_all(struct kvm *kvm)
{
	struct kvm_mmu_page *sp, *node;
	LIST_HEAD(invalid_list);
	int ign;

	write_lock(&kvm->mmu_lock);
restart:
	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
		if (WARN_ON_ONCE(sp->role.invalid))
			continue;
		if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
			goto restart;
		if (cond_resched_rwlock_write(&kvm->mmu_lock))
			goto restart;
	}

	kvm_mmu_commit_zap_page(kvm, &invalid_list);

	if (tdp_mmu_enabled)
		kvm_tdp_mmu_zap_all(kvm);

	write_unlock(&kvm->mmu_lock);
}

void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
	kvm_mmu_zap_all(kvm);
}

void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
				   struct kvm_memory_slot *slot)
{
	kvm_mmu_zap_all_fast(kvm);
}
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
{
	WARN_ON_ONCE(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);

	gen &= MMIO_SPTE_GEN_MASK;

	/*
	 * Generation numbers are incremented in multiples of the number of
	 * address spaces in order to provide unique generations across all
	 * address spaces.  Strip what is effectively the address space
	 * modifier prior to checking for a wrap of the MMIO generation so
	 * that a wrap in any address space is detected.
	 */
	gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);

	/*
	 * The very rare case: if the MMIO generation number has wrapped,
	 * zap all shadow pages.
	 */
	if (unlikely(gen == 0)) {
		kvm_debug_ratelimited("zapping shadow pages for mmio generation wraparound\n");
		kvm_mmu_zap_all_fast(kvm);
	}
}
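/*
 * Worked example, assuming two address spaces (KVM_ADDRESS_SPACE_NUM == 2,
 * e.g. regular and SMM): the low bit of the generation effectively encodes
 * the address space, so the "gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1)" above
 * clears bit 0, and a wrap in either address space then shows up as gen == 0.
 */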
static unsigned long mmu_shrink_scan(struct shrinker *shrink,
				     struct shrink_control *sc)
{
	struct kvm *kvm;
	int nr_to_scan = sc->nr_to_scan;
	unsigned long freed = 0;

	mutex_lock(&kvm_lock);

	list_for_each_entry(kvm, &vm_list, vm_list) {
		int idx;
		LIST_HEAD(invalid_list);

		/*
		 * Never scan more than sc->nr_to_scan VM instances.
		 * Will not hit this condition practically since we do not try
		 * to shrink more than one VM and it is very unlikely to see
		 * !n_used_mmu_pages so many times.
		 */
		if (!nr_to_scan--)
			break;

		/*
		 * n_used_mmu_pages is accessed without holding kvm->mmu_lock
		 * here.  We may skip a VM instance erroneously, but we do not
		 * want to shrink a VM that only started to populate its MMU
		 * anyway.
		 */
		if (!kvm->arch.n_used_mmu_pages &&
		    !kvm_has_zapped_obsolete_pages(kvm))
			continue;

		idx = srcu_read_lock(&kvm->srcu);
		write_lock(&kvm->mmu_lock);

		if (kvm_has_zapped_obsolete_pages(kvm)) {
			kvm_mmu_commit_zap_page(kvm,
						&kvm->arch.zapped_obsolete_pages);
			goto unlock;
		}

		freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);

unlock:
		write_unlock(&kvm->mmu_lock);
		srcu_read_unlock(&kvm->srcu, idx);

		/*
		 * unfair on small ones
		 * per-vm shrinkers cry out
		 * sadness comes quickly
		 */
		list_move_tail(&kvm->vm_list, &vm_list);
		break;
	}

	mutex_unlock(&kvm_lock);
	return freed;
}

static unsigned long mmu_shrink_count(struct shrinker *shrink,
				      struct shrink_control *sc)
{
	return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
}

static struct shrinker mmu_shrinker = {
	.count_objects = mmu_shrink_count,
	.scan_objects = mmu_shrink_scan,
	.seeks = DEFAULT_SEEKS * 10,
};

static void mmu_destroy_caches(void)
{
	kmem_cache_destroy(pte_list_desc_cache);
	kmem_cache_destroy(mmu_page_header_cache);
}
static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp)
{
	if (nx_hugepage_mitigation_hard_disabled)
		return sysfs_emit(buffer, "never\n");

	return param_get_bool(buffer, kp);
}

static bool get_nx_auto_mode(void)
{
	/* Return true when CPU has the bug, and mitigations are ON */
	return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
}

static void __set_nx_huge_pages(bool val)
{
	nx_huge_pages = itlb_multihit_kvm_mitigation = val;
}
static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
{
	bool old_val = nx_huge_pages;
	bool new_val;

	if (nx_hugepage_mitigation_hard_disabled)
		return -EPERM;

	/* In "auto" mode deploy workaround only if CPU has the bug. */
	if (sysfs_streq(val, "off")) {
		new_val = 0;
	} else if (sysfs_streq(val, "force")) {
		new_val = 1;
	} else if (sysfs_streq(val, "auto")) {
		new_val = get_nx_auto_mode();
	} else if (sysfs_streq(val, "never")) {
		new_val = 0;

		mutex_lock(&kvm_lock);
		if (!list_empty(&vm_list)) {
			mutex_unlock(&kvm_lock);
			return -EBUSY;
		}
		nx_hugepage_mitigation_hard_disabled = true;
		mutex_unlock(&kvm_lock);
	} else if (kstrtobool(val, &new_val) < 0) {
		return -EINVAL;
	}

	__set_nx_huge_pages(new_val);

	if (new_val != old_val) {
		struct kvm *kvm;

		mutex_lock(&kvm_lock);

		list_for_each_entry(kvm, &vm_list, vm_list) {
			mutex_lock(&kvm->slots_lock);
			kvm_mmu_zap_all_fast(kvm);
			mutex_unlock(&kvm->slots_lock);

			wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
		}
		mutex_unlock(&kvm_lock);
	}

	return 0;
}
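/*
 * Usage sketch, assuming the standard module parameter location for kvm.ko
 * (not something this file defines):
 *
 *   echo force > /sys/module/kvm/parameters/nx_huge_pages  # always mitigate
 *   echo auto  > /sys/module/kvm/parameters/nx_huge_pages  # follow CPU bug state
 *   echo never > /sys/module/kvm/parameters/nx_huge_pages  # hard-disable; only
 *                                                          # accepted while no VMs
 *                                                          # exist, sticky until
 *                                                          # the module is reloaded
 *
 * Any change of the effective value zaps all shadow pages in every VM (via
 * kvm_mmu_zap_all_fast()) and wakes each VM's NX recovery thread, as above.
 */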
/*
 * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as
 * its default value of -1 is technically undefined behavior for a boolean.
 * Forward the module init call to SPTE code so that it too can handle module
 * params that need to be resolved/snapshot.
 */
void __init kvm_mmu_x86_module_init(void)
{
	if (nx_huge_pages == -1)
		__set_nx_huge_pages(get_nx_auto_mode());

	/*
	 * Snapshot userspace's desire to enable the TDP MMU.  Whether or not the
	 * TDP MMU is actually enabled is determined in kvm_configure_mmu()
	 * when the vendor module is loaded.
	 */
	tdp_mmu_allowed = tdp_mmu_enabled;

	kvm_mmu_spte_module_init();
}
/*
 * The bulk of the MMU initialization is deferred until the vendor module is
 * loaded as many of the masks/values may be modified by VMX or SVM, i.e. need
 * to be reset when a potentially different vendor module is loaded.
 */
int kvm_mmu_vendor_module_init(void)
{
	int ret = -ENOMEM;

	/*
	 * MMU roles use union aliasing which is, generally speaking, an
	 * undefined behavior.  However, we supposedly know how compilers behave
	 * and the current status quo is unlikely to change.  Guardians below are
	 * supposed to let us know if the assumption becomes false.
	 */
	BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
	BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
	BUILD_BUG_ON(sizeof(union kvm_cpu_role) != sizeof(u64));

	kvm_mmu_reset_all_pte_masks();

	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
						sizeof(struct pte_list_desc),
						0, SLAB_ACCOUNT, NULL);
	if (!pte_list_desc_cache)
		goto out;

	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
						  sizeof(struct kvm_mmu_page),
						  0, SLAB_ACCOUNT, NULL);
	if (!mmu_page_header_cache)
		goto out;

	if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
		goto out;

	ret = register_shrinker(&mmu_shrinker, "x86-mmu");
	if (ret)
		goto out_shrinker;

	return 0;

out_shrinker:
	percpu_counter_destroy(&kvm_total_used_mmu_pages);
out:
	mmu_destroy_caches();
	return ret;
}
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_mmu_unload(vcpu);
	free_mmu_pages(&vcpu->arch.root_mmu);
	free_mmu_pages(&vcpu->arch.guest_mmu);
	mmu_free_memory_caches(vcpu);
}

void kvm_mmu_vendor_module_exit(void)
{
	mmu_destroy_caches();
	percpu_counter_destroy(&kvm_total_used_mmu_pages);
	unregister_shrinker(&mmu_shrinker);
}
/*
 * Calculate the effective recovery period, accounting for '0' meaning "let KVM
 * select a halving time of 1 hour".  Returns true if recovery is enabled.
 */
static bool calc_nx_huge_pages_recovery_period(uint *period)
{
	/*
	 * Use READ_ONCE to get the params, this may be called outside of the
	 * param setters, e.g. by the kthread to compute its next timeout.
	 */
	bool enabled = READ_ONCE(nx_huge_pages);
	uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);

	if (!enabled || !ratio)
		return false;

	*period = READ_ONCE(nx_huge_pages_recovery_period_ms);
	if (!*period) {
		/* Make sure the period is not less than one second. */
		ratio = min(ratio, 3600u);
		*period = 60 * 60 * 1000 / ratio;
	}

	return true;
}
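/*
 * Worked example: with nx_huge_pages enabled, a recovery ratio of 60 and
 * nx_huge_pages_recovery_period_ms of 0, the fallback above computes
 * 60 * 60 * 1000 / 60 = 60000ms, i.e. the recovery worker wakes up once per
 * minute and each run zaps roughly 1/60th of the eligible shadow pages (see
 * the DIV_ROUND_UP(nx_lpage_splits, ratio) calculation further below).
 */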
static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp)
{
	bool was_recovery_enabled, is_recovery_enabled;
	uint old_period, new_period;
	int err;

	if (nx_hugepage_mitigation_hard_disabled)
		return -EPERM;

	was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period);

	err = param_set_uint(val, kp);
	if (err)
		return err;

	is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period);

	if (is_recovery_enabled &&
	    (!was_recovery_enabled || old_period > new_period)) {
		struct kvm *kvm;

		mutex_lock(&kvm_lock);

		list_for_each_entry(kvm, &vm_list, vm_list)
			wake_up_process(kvm->arch.nx_huge_page_recovery_thread);

		mutex_unlock(&kvm_lock);
	}

	return err;
}
static void kvm_recover_nx_huge_pages(struct kvm *kvm)
{
	unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
	struct kvm_memory_slot *slot;
	int rcu_idx;
	struct kvm_mmu_page *sp;
	unsigned int ratio;
	LIST_HEAD(invalid_list);
	bool flush = false;
	ulong to_zap;

	rcu_idx = srcu_read_lock(&kvm->srcu);
	write_lock(&kvm->mmu_lock);

	/*
	 * Zapping TDP MMU shadow pages, including the remote TLB flush, must
	 * be done under RCU protection, because the pages are freed via RCU
	 * callback.
	 */
	rcu_read_lock();

	ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
	to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
	for ( ; to_zap; --to_zap) {
		if (list_empty(&kvm->arch.possible_nx_huge_pages))
			break;

		/*
		 * We use a separate list instead of just using active_mmu_pages
		 * because the number of shadow pages that can be replaced with
		 * an NX huge page is expected to be relatively small compared
		 * to the total number of shadow pages.  And because the TDP MMU
		 * doesn't use active_mmu_pages.
		 */
		sp = list_first_entry(&kvm->arch.possible_nx_huge_pages,
				      struct kvm_mmu_page,
				      possible_nx_huge_page_link);
		WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
		WARN_ON_ONCE(!sp->role.direct);

		/*
		 * Unaccount and do not attempt to recover any NX Huge Pages
		 * that are being dirty tracked, as they would just be faulted
		 * back in as 4KiB pages.  The NX Huge Pages in this slot will be
		 * recovered, along with all the other huge pages in the slot,
		 * when dirty logging is disabled.
		 *
		 * Since gfn_to_memslot() is relatively expensive, it helps to
		 * skip it if the test cannot possibly return true.  On the
		 * other hand, if any memslot has logging enabled, chances are
		 * good that all of them do, in which case unaccount_nx_huge_page()
		 * is much cheaper than zapping the page.
		 *
		 * If a memslot update is in progress, reading an incorrect value
		 * of kvm->nr_memslots_dirty_logging is not a problem: if it is
		 * becoming zero, gfn_to_memslot() will be done unnecessarily; if
		 * it is becoming nonzero, the page will be zapped unnecessarily.
		 * Either way, this only affects efficiency in racy situations,
		 * and not correctness.
		 */
		slot = NULL;
		if (atomic_read(&kvm->nr_memslots_dirty_logging)) {
			struct kvm_memslots *slots;

			slots = kvm_memslots_for_spte_role(kvm, sp->role);
			slot = __gfn_to_memslot(slots, sp->gfn);
			WARN_ON_ONCE(!slot);
		}

		if (slot && kvm_slot_dirty_track_enabled(slot))
			unaccount_nx_huge_page(kvm, sp);
		else if (is_tdp_mmu_page(sp))
			flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
		else
			kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
		WARN_ON_ONCE(sp->nx_huge_page_disallowed);

		if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
			kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
			rcu_read_unlock();

			cond_resched_rwlock_write(&kvm->mmu_lock);
			flush = false;

			rcu_read_lock();
		}
	}
	kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);

	rcu_read_unlock();

	write_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, rcu_idx);
}
static long get_nx_huge_page_recovery_timeout(u64 start_time)
{
	bool enabled;
	uint period;

	enabled = calc_nx_huge_pages_recovery_period(&period);

	return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
		       : MAX_SCHEDULE_TIMEOUT;
}

static int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data)
{
	u64 start_time;
	long remaining_time;

	while (true) {
		start_time = get_jiffies_64();
		remaining_time = get_nx_huge_page_recovery_timeout(start_time);

		set_current_state(TASK_INTERRUPTIBLE);
		while (!kthread_should_stop() && remaining_time > 0) {
			schedule_timeout(remaining_time);
			remaining_time = get_nx_huge_page_recovery_timeout(start_time);
			set_current_state(TASK_INTERRUPTIBLE);
		}

		set_current_state(TASK_RUNNING);

		if (kthread_should_stop())
			return 0;

		kvm_recover_nx_huge_pages(kvm);
	}
}
int kvm_mmu_post_init_vm(struct kvm *kvm)
{
	int err;

	if (nx_hugepage_mitigation_hard_disabled)
		return 0;

	err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0,
					  "kvm-nx-lpage-recovery",
					  &kvm->arch.nx_huge_page_recovery_thread);
	if (!err)
		kthread_unpark(kvm->arch.nx_huge_page_recovery_thread);

	return err;
}

void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
{
	if (kvm->arch.nx_huge_page_recovery_thread)
		kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
}
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
				int level)
{
	return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG;
}

static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
				 int level)
{
	lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG;
}

static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
			       int level)
{
	lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG;
}

static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot,
			       gfn_t gfn, int level, unsigned long attrs)
{
	const unsigned long start = gfn;
	const unsigned long end = start + KVM_PAGES_PER_HPAGE(level);

	if (level == PG_LEVEL_2M)
		return kvm_range_has_memory_attributes(kvm, start, end, attrs);

	for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) {
		if (hugepage_test_mixed(slot, gfn, level - 1) ||
		    attrs != kvm_get_memory_attributes(kvm, gfn))
			return false;
	}
	return true;
}
bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
					 struct kvm_gfn_range *range)
{
	unsigned long attrs = range->arg.attributes;
	struct kvm_memory_slot *slot = range->slot;
	int level;

	lockdep_assert_held_write(&kvm->mmu_lock);
	lockdep_assert_held(&kvm->slots_lock);

	/*
	 * Calculate which ranges can be mapped with hugepages even if the slot
	 * can't map memory PRIVATE.  KVM mustn't create a SHARED hugepage over
	 * a range that has PRIVATE GFNs, and conversely converting a range to
	 * SHARED may now allow hugepages.
	 */
	if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm)))
		return false;

	/*
	 * The sequence matters here: upper levels consume the result of lower
	 * level's scanning.
	 */
	for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
		gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level);
		gfn_t gfn = gfn_round_for_level(range->start, level);

		/* Process the head page if it straddles the range. */
		if (gfn != range->start || gfn + nr_pages > range->end) {
			/*
			 * Skip mixed tracking if the aligned gfn isn't covered
			 * by the memslot, KVM can't use a hugepage due to the
			 * misaligned address regardless of memory attributes.
			 */
			if (gfn >= slot->base_gfn) {
				if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
					hugepage_clear_mixed(slot, gfn, level);
				else
					hugepage_set_mixed(slot, gfn, level);
			}
			gfn += nr_pages;
		}

		/*
		 * Pages entirely covered by the range are guaranteed to have
		 * only the attributes which were just set.
		 */
		for ( ; gfn + nr_pages <= range->end; gfn += nr_pages)
			hugepage_clear_mixed(slot, gfn, level);

		/*
		 * Process the last tail page if it straddles the range and is
		 * contained by the memslot.  Like the head page, KVM can't
		 * create a hugepage if the slot size is misaligned.
		 */
		if (gfn < range->end &&
		    (gfn + nr_pages) <= (slot->base_gfn + slot->npages)) {
			if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
				hugepage_clear_mixed(slot, gfn, level);
			else
				hugepage_set_mixed(slot, gfn, level);
		}
	}

	return false;
}
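/*
 * Worked example at the 2MiB level (512 GFNs per hugepage), assuming the
 * whole range lies inside the memslot: for an update covering GFNs
 * [0x300, 0x900), the head hugepage [0x200, 0x400) straddles the start and
 * is re-scanned for mixed attributes, the fully covered hugepages
 * [0x400, 0x600) and [0x600, 0x800) are unconditionally marked non-mixed,
 * and the tail hugepage [0x800, 0xa00) straddles the end and is re-scanned
 * like the head.
 */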
void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm,
					    struct kvm_memory_slot *slot)
{
	int level;

	if (!kvm_arch_has_private_mem(kvm))
		return;

	for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
		/*
		 * Don't bother tracking mixed attributes for pages that can't
		 * be huge due to alignment, i.e. process only pages that are
		 * entirely contained by the memslot.
		 */
		gfn_t end = gfn_round_for_level(slot->base_gfn + slot->npages, level);
		gfn_t start = gfn_round_for_level(slot->base_gfn, level);
		gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level);
		gfn_t gfn;

		if (start < slot->base_gfn)
			start += nr_pages;

		/*
		 * Unlike setting attributes, every potential hugepage needs to
		 * be manually checked as the attributes may already be mixed.
		 */
		for (gfn = start; gfn < end; gfn += nr_pages) {
			unsigned long attrs = kvm_get_memory_attributes(kvm, gfn);

			if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
				hugepage_clear_mixed(slot, gfn, level);
			else
				hugepage_set_mixed(slot, gfn, level);
		}
	}
}
#endif