/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 */

/*
 * The MMU needs to be able to access/walk 32-bit and 64-bit guest page tables,
 * as well as guest EPT tables, so the code in this file is compiled thrice,
 * once per guest PTE type.  The per-type defines are #undef'd at the end.
 */

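/*
 * Editor's sketch (not part of the original file): mmu.c instantiates this
 * template three times, roughly as below, so that e.g. FNAME(page_fault)
 * becomes paging64_page_fault, paging32_page_fault and ept_page_fault.
 */
#if 0	/* illustrative only; see the real #include sites in mmu.c */
#define PTTYPE_EPT 18
#define PTTYPE PTTYPE_EPT
#include "paging_tmpl.h"
#undef PTTYPE

#define PTTYPE 64
#include "paging_tmpl.h"
#undef PTTYPE

#define PTTYPE 32
#include "paging_tmpl.h"
#undef PTTYPE
#endif
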
#if PTTYPE == 64
	#define pt_element_t u64
	#define guest_walker guest_walker64
	#define FNAME(name) paging##64_##name
	#define PT_LEVEL_BITS 9
	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
	#define PT_HAVE_ACCESSED_DIRTY(mmu) true
	#ifdef CONFIG_X86_64
	#define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
	#else
	#define PT_MAX_FULL_LEVELS 2
	#endif
#elif PTTYPE == 32
	#define pt_element_t u32
	#define guest_walker guest_walker32
	#define FNAME(name) paging##32_##name
	#define PT_LEVEL_BITS 10
	#define PT_MAX_FULL_LEVELS 2
	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
	#define PT_HAVE_ACCESSED_DIRTY(mmu) true

	#define PT32_DIR_PSE36_SIZE 4
	#define PT32_DIR_PSE36_SHIFT 13
	#define PT32_DIR_PSE36_MASK \
		(((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
#elif PTTYPE == PTTYPE_EPT
	#define pt_element_t u64
	#define guest_walker guest_walkerEPT
	#define FNAME(name) ept_##name
	#define PT_LEVEL_BITS 9
	#define PT_GUEST_DIRTY_SHIFT 9
	#define PT_GUEST_ACCESSED_SHIFT 8
	#define PT_HAVE_ACCESSED_DIRTY(mmu) (!(mmu)->cpu_role.base.ad_disabled)
	#define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
#else
	#error Invalid PTTYPE value
#endif

/* Common logic, but per-type values.  These also need to be undefined. */
#define PT_BASE_ADDR_MASK	((pt_element_t)(((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)))
#define PT_LVL_ADDR_MASK(lvl)	__PT_LVL_ADDR_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS)
#define PT_LVL_OFFSET_MASK(lvl)	__PT_LVL_OFFSET_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS)
#define PT_INDEX(addr, lvl)	__PT_INDEX(addr, lvl, PT_LEVEL_BITS)

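/*
 * Illustrative worked example (editor's note), assuming the common
 * __PT_INDEX(addr, lvl, bits) == (addr >> (PAGE_SHIFT + (lvl - 1) * bits))
 * masked to 'bits' bits: with PT_LEVEL_BITS == 9,
 *   PT_INDEX(addr, 1) == (addr >> 12) & 511   (PTE index)
 *   PT_INDEX(addr, 2) == (addr >> 21) & 511   (PDE index)
 * and PT_LVL_OFFSET_MASK(2) == 0x1fffff, the byte offset inside a 2MiB page.
 */
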
#define PT_GUEST_DIRTY_MASK    (1 << PT_GUEST_DIRTY_SHIFT)
#define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT)

#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PG_LEVEL_4K)

/*
 * The guest_walker structure emulates the behavior of the hardware page
 * table walker.
 */
struct guest_walker {
	int level;
	unsigned max_level;
	gfn_t table_gfn[PT_MAX_FULL_LEVELS];
	pt_element_t ptes[PT_MAX_FULL_LEVELS];
	pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
	gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
	pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
	bool pte_writable[PT_MAX_FULL_LEVELS];
	unsigned int pt_access[PT_MAX_FULL_LEVELS];
	unsigned int pte_access;
	gfn_t gfn;
	struct x86_exception fault;
};

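/*
 * Editor's note (illustrative): the per-level arrays above are indexed by
 * (level - 1).  After a successful 4-level walk, ptes[3] holds the PML4E
 * and ptes[0] the final PTE; table_gfn[0] is the frame of the page table
 * containing that PTE, and gfn is the translated frame itself.
 */
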
#if PTTYPE == 32
static inline gfn_t pse36_gfn_delta(u32 gpte)
{
	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;

	return (gpte & PT32_DIR_PSE36_MASK) << shift;
}
#endif

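/*
 * Illustrative worked example (editor's note): with PSE-36, bits 16:13 of
 * a 4MiB PDE carry physical-address bits 35:32.  shift evaluates to
 * 32 - 13 - 12 == 7, so a gpte with bit 13 set yields
 *   pse36_gfn_delta(gpte) == (1 << 13) << 7 == 1 << 20,
 * i.e. gfn bit 20, which is physical-address bit 32 after the PAGE_SHIFT.
 */
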
static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
{
	return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
}

static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *access,
					     unsigned gpte)
{
	unsigned mask;

	/* dirty bit is not supported, so no need to track it */
	if (!PT_HAVE_ACCESSED_DIRTY(mmu))
		return;

	BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);

	mask = (unsigned)~ACC_WRITE_MASK;
	/* Allow write access to dirty gptes */
	mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
		PT_WRITABLE_MASK;
	*access &= mask;
}

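/*
 * Illustrative worked example (editor's note): for 64-bit and 32-bit
 * paging, PT_GUEST_DIRTY_SHIFT == 6 and PT_WRITABLE_SHIFT == 1, so the
 * dirty bit shifts down by 5 to line up with the writable bit.  A clean
 * gpte (D=0) thus clears ACC_WRITE_MASK from *access, forcing the first
 * write to fault so that the dirty bit can be set; a dirty gpte keeps
 * write access.
 */
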
static inline int FNAME(is_present_gpte)(unsigned long pte)
{
#if PTTYPE != PTTYPE_EPT
	return pte & PT_PRESENT_MASK;
#else
	return pte & 7;
#endif
}

static bool FNAME(is_bad_mt_xwr)(struct rsvd_bits_validate *rsvd_check, u64 gpte)
{
#if PTTYPE != PTTYPE_EPT
	return false;
#else
	return __is_bad_mt_xwr(rsvd_check, gpte);
#endif
}

static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
{
	return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level) ||
	       FNAME(is_bad_mt_xwr)(&mmu->guest_rsvd_check, gpte);
}

static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
					 struct kvm_mmu_page *sp, u64 *spte,
					 u64 gpte)
{
	if (!FNAME(is_present_gpte)(gpte))
		goto no_present;

	/* Prefetch only accessed entries (unless A/D bits are disabled). */
	if (PT_HAVE_ACCESSED_DIRTY(vcpu->arch.mmu) &&
	    !(gpte & PT_GUEST_ACCESSED_MASK))
		goto no_present;

	if (FNAME(is_rsvd_bits_set)(vcpu->arch.mmu, gpte, PG_LEVEL_4K))
		goto no_present;

	return false;

no_present:
	drop_spte(vcpu->kvm, spte);
	return true;
}

/*
 * For PTTYPE_EPT, a page table can be executable but not readable
 * on supported processors.  Therefore, set_spte does not automatically
 * set bit 0 if execute only is supported.  Here, we repurpose ACC_USER_MASK
 * to signify readability since it isn't used in the EPT case.
 */
static inline unsigned FNAME(gpte_access)(u64 gpte)
{
	unsigned access;
#if PTTYPE == PTTYPE_EPT
	access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
		 ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
		 ((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0);
#else
	BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK);
	BUILD_BUG_ON(ACC_EXEC_MASK != 1);
	access = gpte & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK);
	/* Combine NX with P (which is set here) to get ACC_EXEC_MASK. */
	access ^= (gpte >> PT64_NX_SHIFT);
#endif

	return access;
}

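/*
 * Illustrative worked example (editor's note): for non-EPT gptes,
 * ACC_EXEC_MASK == PT_PRESENT_MASK == bit 0 (see the BUILD_BUG_ONs above)
 * and NX is bit 63.  A present, writable gpte with NX set yields
 *   access = (PT_PRESENT_MASK | PT_WRITABLE_MASK) ^ 1 == PT_WRITABLE_MASK,
 * i.e. folding the NX bit into the copied P bit clears ACC_EXEC_MASK, so
 * NX can later be ANDed along the walk like the other permission bits.
 */
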
static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
					     struct kvm_mmu *mmu,
					     struct guest_walker *walker,
					     gpa_t addr, int write_fault)
{
	unsigned level, index;
	pt_element_t pte, orig_pte;
	pt_element_t __user *ptep_user;
	gfn_t table_gfn;
	int ret;

	/* dirty/accessed bits are not supported, so no need to update them */
	if (!PT_HAVE_ACCESSED_DIRTY(mmu))
		return 0;

	for (level = walker->max_level; level >= walker->level; --level) {
		pte = orig_pte = walker->ptes[level - 1];
		table_gfn = walker->table_gfn[level - 1];
		ptep_user = walker->ptep_user[level - 1];
		index = offset_in_page(ptep_user) / sizeof(pt_element_t);
		if (!(pte & PT_GUEST_ACCESSED_MASK)) {
			trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
			pte |= PT_GUEST_ACCESSED_MASK;
		}
		if (level == walker->level && write_fault &&
		    !(pte & PT_GUEST_DIRTY_MASK)) {
			trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
#if PTTYPE == PTTYPE_EPT
			if (kvm_x86_ops.nested_ops->write_log_dirty(vcpu, addr))
				return -EINVAL;
#endif
			pte |= PT_GUEST_DIRTY_MASK;
		}
		if (pte == orig_pte)
			continue;

		/*
		 * If the slot is read-only, simply do not process the accessed
		 * and dirty bits.  This is the correct thing to do if the slot
		 * is ROM, and page tables in read-as-ROM/write-as-MMIO slots
		 * are only supported if the accessed and dirty bits are already
		 * set in the ROM (so that MMIO writes are never needed).
		 *
		 * Note that NPT does not allow this at all and faults, since
		 * it always wants nested page table entries for the guest
		 * page tables to be writable.  And EPT works but will simply
		 * overwrite the read-only memory to set the accessed and dirty
		 * bits.
		 */
		if (unlikely(!walker->pte_writable[level - 1]))
			continue;

		ret = __try_cmpxchg_user(ptep_user, &orig_pte, pte, fault);
		if (ret)
			return ret;

		kvm_vcpu_mark_page_dirty(vcpu, table_gfn);
		walker->ptes[level - 1] = pte;
	}
	return 0;
}

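/*
 * Editor's note (illustrative): as used by FNAME(walk_addr_generic) below,
 * a negative return (e.g. a fault while writing the guest PTE through the
 * user mapping) aborts the walk with an error, while any other nonzero
 * return means the cmpxchg lost a race with the guest updating its own
 * page tables, and the whole walk is restarted via retry_walk.
 */
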
static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
{
	unsigned pkeys = 0;
#if PTTYPE == 64
	pte_t pte = {.pte = gpte};

	pkeys = pte_flags_pkey(pte_flags(pte));
#endif
	return pkeys;
}

static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu,
				       unsigned int level, unsigned int gpte)
{
	/*
	 * For EPT and PAE paging (both variants), bit 7 is either reserved at
	 * all levels or indicates a huge page (ignoring CR3/EPTP).  In either
	 * case, bit 7 being set terminates the walk.
	 */
#if PTTYPE == 32
	/*
	 * 32-bit paging requires special handling because bit 7 is ignored if
	 * CR4.PSE=0, not reserved.  Clear bit 7 in the gpte if the level is
	 * greater than the last level for which bit 7 is the PAGE_SIZE bit.
	 *
	 * The RHS has bit 7 set iff level < (2 + PSE).  If it is clear, bit 7
	 * is not reserved and does not indicate a large page at this level,
	 * so clear PT_PAGE_SIZE_MASK in gpte if that is the case.
	 */
	gpte &= level - (PT32_ROOT_LEVEL + mmu->cpu_role.ext.cr4_pse);
#endif
	/*
	 * PG_LEVEL_4K always terminates.  The RHS has bit 7 set
	 * iff level <= PG_LEVEL_4K, which for our purpose means
	 * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then.
	 */
	gpte |= level - PG_LEVEL_4K - 1;

	return gpte & PT_PAGE_SIZE_MASK;
}
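
/*
 * Illustrative worked example (editor's note): PG_LEVEL_4K == 1, so for
 * level == 1 the RHS (level - PG_LEVEL_4K - 1) is -1, which has bit 7
 * (PT_PAGE_SIZE_MASK) set and unconditionally terminates the walk.  For
 * 32-bit paging with CR4.PSE == 0 at level 2, gpte &= 2 - (2 + 0) == 0
 * zeroes the gpte, clearing the (ignored) PS bit, so the walk always
 * continues down to the PTE level.
 */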
/*
 * Fetch a guest pte for a guest virtual address, or for an L2's GPA.
 */
static int FNAME(walk_addr_generic)(struct guest_walker *walker,
				    struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
				    gpa_t addr, u64 access)
{
	int ret;
	pt_element_t pte;
	pt_element_t __user *ptep_user;
	gfn_t table_gfn;
	u64 pt_access, pte_access;
	unsigned index, accessed_dirty, pte_pkey;
	u64 nested_access;
	gpa_t pte_gpa;
	bool have_ad;
	int offset;
	u64 walk_nx_mask = 0;
	const int write_fault = access & PFERR_WRITE_MASK;
	const int user_fault  = access & PFERR_USER_MASK;
	const int fetch_fault = access & PFERR_FETCH_MASK;
	u16 errcode = 0;
	gpa_t real_gpa;
	gfn_t gfn;

	trace_kvm_mmu_pagetable_walk(addr, access);
retry_walk:
	walker->level = mmu->cpu_role.base.level;
	pte           = mmu->get_guest_pgd(vcpu);
	have_ad       = PT_HAVE_ACCESSED_DIRTY(mmu);

#if PTTYPE == 64
	walk_nx_mask = 1ULL << PT64_NX_SHIFT;
	if (walker->level == PT32E_ROOT_LEVEL) {
		pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
		trace_kvm_mmu_paging_element(pte, walker->level);
		if (!FNAME(is_present_gpte)(pte))
			goto error;
		--walker->level;
	}
#endif
	walker->max_level = walker->level;
	ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu)));

	/*
	 * FIXME: on Intel processors, loads of the PDPTE registers for PAE paging
	 * by the MOV to CR instruction are treated as reads and do not cause the
	 * processor to set the dirty flag in any EPT paging-structure entry.
	 */
	nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK;

	pte_access = ~0;
	++walker->level;

	do {
		unsigned long host_addr;

		pt_access = pte_access;
		--walker->level;

		index = PT_INDEX(addr, walker->level);
		table_gfn = gpte_to_gfn(pte);
		offset = index * sizeof(pt_element_t);
		pte_gpa = gfn_to_gpa(table_gfn) + offset;

		BUG_ON(walker->level < 1);
		walker->table_gfn[walker->level - 1] = table_gfn;
		walker->pte_gpa[walker->level - 1] = pte_gpa;

		real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(table_gfn),
					     nested_access, &walker->fault);

		/*
		 * FIXME: This can happen if emulation (for an INS/OUTS
		 * instruction) triggers a nested page fault.  The exit
		 * qualification / exit info field will incorrectly have
		 * "guest page access" as the nested page fault's cause,
		 * instead of "guest page structure access".  To fix this,
		 * the x86_exception struct should be augmented with enough
		 * information to fix the exit_qualification or exit_info_1
		 * fields.
		 */
		if (unlikely(real_gpa == INVALID_GPA))
			return 0;

		host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gpa_to_gfn(real_gpa),
					    &walker->pte_writable[walker->level - 1]);
		if (unlikely(kvm_is_error_hva(host_addr)))
			goto error;

		ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
		if (unlikely(__get_user(pte, ptep_user)))
			goto error;
		walker->ptep_user[walker->level - 1] = ptep_user;

		trace_kvm_mmu_paging_element(pte, walker->level);

		/*
		 * Inverting the NX bit lets us AND it like the other
		 * permission bits.
		 */
		pte_access = pt_access & (pte ^ walk_nx_mask);

		if (unlikely(!FNAME(is_present_gpte)(pte)))
			goto error;

		if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, walker->level))) {
			errcode = PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
			goto error;
		}

		walker->ptes[walker->level - 1] = pte;

		/* Convert to ACC_*_MASK flags for struct guest_walker.  */
		walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
	} while (!FNAME(is_last_gpte)(mmu, walker->level, pte));

	pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
	accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;

	/* Convert to ACC_*_MASK flags for struct guest_walker.  */
	walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask);
	errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access);
	if (unlikely(errcode))
		goto error;

	gfn = gpte_to_gfn_lvl(pte, walker->level);
	gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;

#if PTTYPE == 32
	if (walker->level > PG_LEVEL_4K && is_cpuid_PSE36())
		gfn += pse36_gfn_delta(pte);
#endif

	real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault);
	if (real_gpa == INVALID_GPA)
		return 0;

	walker->gfn = real_gpa >> PAGE_SHIFT;

	if (!write_fault)
		FNAME(protect_clean_gpte)(mmu, &walker->pte_access, pte);
	else
		/*
		 * On a write fault, fold the dirty bit into accessed_dirty.
		 * For modes without A/D bits support accessed_dirty will be
		 * always clear.
		 */
		accessed_dirty &= pte >>
			(PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);

	if (unlikely(!accessed_dirty)) {
		ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker,
							addr, write_fault);
		if (unlikely(ret < 0))
			goto error;
		else if (ret)
			goto retry_walk;
	}

	pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
		 __func__, (u64)pte, walker->pte_access,
		 walker->pt_access[walker->level - 1]);
	return 1;

error:
	errcode |= write_fault | user_fault;
	if (fetch_fault && (is_efer_nx(mmu) || is_cr4_smep(mmu)))
		errcode |= PFERR_FETCH_MASK;

	walker->fault.vector = PF_VECTOR;
	walker->fault.error_code_valid = true;
	walker->fault.error_code = errcode;

#if PTTYPE == PTTYPE_EPT
	/*
	 * Use PFERR_RSVD_MASK in error_code to tell if an EPT
	 * misconfiguration needs to be injected.  The detection is
	 * done by is_rsvd_bits_set() above.
	 *
	 * We set up the value of exit_qualification to inject:
	 * [2:0] - Derive from the access bits.  The exit_qualification might be
	 *         out of date if it is serving an EPT misconfiguration.
	 * [5:3] - Calculated by the page walk of the guest EPT page tables
	 * [8:7] - Derived from [8:7] of the real exit_qualification
	 *
	 * The other bits are set to 0.
	 */
	if (!(errcode & PFERR_RSVD_MASK)) {
		vcpu->arch.exit_qualification &= (EPT_VIOLATION_GVA_IS_VALID |
						  EPT_VIOLATION_GVA_TRANSLATED);
		if (write_fault)
			vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_WRITE;
		if (user_fault)
			vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_READ;
		if (fetch_fault)
			vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_INSTR;

		/*
		 * Note, pte_access holds the raw RWX bits from the EPTE, not
		 * ACC_*_MASK flags!
		 */
		vcpu->arch.exit_qualification |= (pte_access & VMX_EPT_RWX_MASK) <<
						 EPT_VIOLATION_RWX_SHIFT;
	}
#endif
	walker->fault.address = addr;
	walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
	walker->fault.async_page_fault = false;

	trace_kvm_mmu_walker_error(walker->fault.error_code);
	return 0;
}

static int FNAME(walk_addr)(struct guest_walker *walker,
			    struct kvm_vcpu *vcpu, gpa_t addr, u64 access)
{
	return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr,
					access);
}

static bool
FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
		     u64 *spte, pt_element_t gpte, bool no_dirty_log)
{
	struct kvm_memory_slot *slot;
	unsigned pte_access;
	gfn_t gfn;
	kvm_pfn_t pfn;

	if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
		return false;

	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);

	gfn = gpte_to_gfn(gpte);
	pte_access = sp->role.access & FNAME(gpte_access)(gpte);
	FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);

	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn,
			no_dirty_log && (pte_access & ACC_WRITE_MASK));
	if (!slot)
		return false;

	pfn = gfn_to_pfn_memslot_atomic(slot, gfn);
	if (is_error_pfn(pfn))
		return false;

	mmu_set_spte(vcpu, slot, spte, pte_access, gfn, pfn, NULL);
	kvm_release_pfn_clean(pfn);
	return true;
}

static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
				struct guest_walker *gw, int level)
{
	pt_element_t curr_pte;
	gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
	u64 mask;
	int r, index;

	if (level == PG_LEVEL_4K) {
		mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
		base_gpa = pte_gpa & ~mask;
		index = (pte_gpa - base_gpa) / sizeof(pt_element_t);

		r = kvm_vcpu_read_guest_atomic(vcpu, base_gpa,
				gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
		curr_pte = gw->prefetch_ptes[index];
	} else
		r = kvm_vcpu_read_guest_atomic(vcpu, pte_gpa,
				  &curr_pte, sizeof(curr_pte));

	return r || curr_pte != gw->ptes[level - 1];
}

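/*
 * Illustrative worked example (editor's note), assuming PTE_PREFETCH_NUM
 * == 8 (its value in mmu.c) and 8-byte ptes: mask == 63, so for
 * pte_gpa == 0x1150 the window is base_gpa == 0x1140 (eight ptes read in
 * one atomic copy) and index == 2 picks the pte of interest.  The filled
 * prefetch_ptes[] buffer is then reused by FNAME(pte_prefetch)() below.
 */
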
static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
				u64 *sptep)
{
	struct kvm_mmu_page *sp;
	pt_element_t *gptep = gw->prefetch_ptes;
	u64 *spte;
	int i;

	sp = sptep_to_sp(sptep);

	if (sp->role.level > PG_LEVEL_4K)
		return;

	/*
	 * If addresses are being invalidated, skip prefetching to avoid
	 * accidentally prefetching those addresses.
	 */
	if (unlikely(vcpu->kvm->mmu_invalidate_in_progress))
		return;

	if (sp->role.direct)
		return __direct_pte_prefetch(vcpu, sp, sptep);

	i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1);
	spte = sp->spt + i;

	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
		if (spte == sptep)
			continue;

		if (is_shadow_present_pte(*spte))
			continue;

		if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true))
			break;
	}
}

/*
 * Fetch a shadow pte for a specific level in the paging hierarchy.
 * If the guest tries to write a write-protected page, we need to
 * emulate this operation, return 1 to indicate this case.
 */
static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
			struct guest_walker *gw)
{
	struct kvm_mmu_page *sp = NULL;
	struct kvm_shadow_walk_iterator it;
	unsigned int direct_access, access;
	int top_level, ret;
	gfn_t base_gfn = fault->gfn;

	WARN_ON_ONCE(gw->gfn != base_gfn);
	direct_access = gw->pte_access;

	top_level = vcpu->arch.mmu->cpu_role.base.level;
	if (top_level == PT32E_ROOT_LEVEL)
		top_level = PT32_ROOT_LEVEL;
	/*
	 * Verify that the top-level gpte is still there.  Since the page
	 * is a root page, it is either write protected (and cannot be
	 * changed from now on) or it is invalid (in which case, we don't
	 * really care if it changes underneath us after this point).
	 */
	if (FNAME(gpte_changed)(vcpu, gw, top_level))
		goto out_gpte_changed;

	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
		goto out_gpte_changed;

	for (shadow_walk_init(&it, vcpu, fault->addr);
	     shadow_walk_okay(&it) && it.level > gw->level;
	     shadow_walk_next(&it)) {
		gfn_t table_gfn;

		clear_sp_write_flooding_count(it.sptep);

		table_gfn = gw->table_gfn[it.level - 2];
		access = gw->pt_access[it.level - 2];
		sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn,
					  false, access);

		if (sp != ERR_PTR(-EEXIST)) {
			/*
			 * We must synchronize the pagetable before linking it
			 * because the guest doesn't need to flush tlb when
			 * the gpte is changed from non-present to present.
			 * Otherwise, the guest may use the wrong mapping.
			 *
			 * For PG_LEVEL_4K, kvm_mmu_get_page() has already
			 * synchronized it transiently via kvm_sync_page().
			 *
			 * For higher level pagetable, we synchronize it via
			 * the slower mmu_sync_children().  If it needs to
			 * break, some progress has been made; return
			 * RET_PF_RETRY and retry on the next #PF.
			 * KVM_REQ_MMU_SYNC is not necessary but it
			 * expedites the process.
			 */
			if (sp->unsync_children &&
			    mmu_sync_children(vcpu, sp, false))
				return RET_PF_RETRY;
		}

		/*
		 * Verify that the gpte in the page we've just write
		 * protected is still there.
		 */
		if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
			goto out_gpte_changed;

		if (sp != ERR_PTR(-EEXIST))
			link_shadow_page(vcpu, it.sptep, sp);
	}

	kvm_mmu_hugepage_adjust(vcpu, fault);

	trace_kvm_mmu_spte_requested(fault);

	for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
		clear_sp_write_flooding_count(it.sptep);

		/*
		 * We cannot overwrite existing page tables with an NX
		 * large page, as the leaf could be executable.
		 */
		if (fault->nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(fault, *it.sptep, it.level);

		base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
		if (it.level == fault->goal_level)
			break;

		validate_direct_spte(vcpu, it.sptep, direct_access);

		sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn,
					  true, direct_access);
		if (sp == ERR_PTR(-EEXIST))
			continue;

		link_shadow_page(vcpu, it.sptep, sp);
		if (fault->huge_page_disallowed &&
		    fault->req_level >= it.level)
			account_huge_nx_page(vcpu->kvm, sp);
	}

	if (WARN_ON_ONCE(it.level != fault->goal_level))
		return -EFAULT;

	ret = mmu_set_spte(vcpu, fault->slot, it.sptep, gw->pte_access,
			   base_gfn, fault->pfn, fault);
	if (ret == RET_PF_SPURIOUS)
		return ret;

	FNAME(pte_prefetch)(vcpu, gw, it.sptep);
	return ret;

out_gpte_changed:
	return RET_PF_RETRY;
}

/*
 * Check whether the mapped gfn can write its own page table in the current
 * mapping.
 *
 * This is a helper for FNAME(page_fault).  When the guest uses a large page
 * to map a writable gfn that is itself in use as a guest page table, KVM
 * must be forced to map it with a small page: the shadow page created when
 * KVM shadows that page table would otherwise prevent the large mapping
 * anyway.  Doing this early avoids unnecessary #PFs and emulation.
 *
 * @write_fault_to_shadow_pgtable will return true if the fault gfn is
 * currently used as its page table.
 *
 * Note: the PDPT page table is not checked for PAE-32 bit guests.  That is
 * ok since the PDPT is always shadowed, which means a large page can never
 * be used to map the gfn that holds the PDPT.
 */
static bool
FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
			      struct guest_walker *walker, bool user_fault,
			      bool *write_fault_to_shadow_pgtable)
{
	int level;
	gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
	bool self_changed = false;

	if (!(walker->pte_access & ACC_WRITE_MASK ||
	    (!is_cr0_wp(vcpu->arch.mmu) && !user_fault)))
		return false;

	for (level = walker->level; level <= walker->max_level; level++) {
		gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];

		self_changed |= !(gfn & mask);
		*write_fault_to_shadow_pgtable |= !gfn;
	}

	return self_changed;
}

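/*
 * Illustrative worked example (editor's note): for walker->level == 2,
 * mask spans a huge-page frame (512 or 1024 small pages).  If the faulting
 * walker->gfn differs from some walker->table_gfn[] only within that
 * frame, (gfn ^ table_gfn) & mask == 0 and the mapping is self-changing,
 * so the fault is mapped with a 4K page; if the two gfns are identical
 * (XOR == 0), the write hits the page-table page itself and
 * *write_fault_to_shadow_pgtable is also set.
 */
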
/*
 * Page fault handler.  There are several causes for a page fault:
 * - there is no shadow pte for the guest pte
 * - write access through a shadow pte marked read only so that we can set
 *   the dirty bit
 * - write access to a shadow pte marked read only so we can update the page
 *   dirty bitmap, when userspace requests it
 * - mmio access; in this case we will never install a present shadow pte
 * - normal guest page fault due to the guest pte marked not present, not
 *   writable, or not executable
 *
 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
 *          a negative value on error.
 */
static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
	struct guest_walker walker;
	int r;
	unsigned long mmu_seq;
	bool is_self_change_mapping;

	pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code);
	WARN_ON_ONCE(fault->is_tdp);

	/*
	 * Look up the guest pte for the faulting address.
	 * If PFEC.RSVD is set, this is a shadow page fault.
	 * The bit needs to be cleared before walking guest page tables.
	 */
	r = FNAME(walk_addr)(&walker, vcpu, fault->addr,
			     fault->error_code & ~PFERR_RSVD_MASK);

	/*
	 * The page is not mapped by the guest.  Let the guest handle it.
	 */
	if (!r) {
		pgprintk("%s: guest page fault\n", __func__);
		if (!fault->prefetch)
			kvm_inject_emulated_page_fault(vcpu, &walker.fault);

		return RET_PF_RETRY;
	}

	fault->gfn = walker.gfn;
	fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn);

	if (page_fault_handle_page_track(vcpu, fault)) {
		shadow_page_table_clear_flood(vcpu, fault->addr);
		return RET_PF_EMULATE;
	}

	r = mmu_topup_memory_caches(vcpu, true);
	if (r)
		return r;

	vcpu->arch.write_fault_to_shadow_pgtable = false;

	is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
	      &walker, fault->user, &vcpu->arch.write_fault_to_shadow_pgtable);

	if (is_self_change_mapping)
		fault->max_level = PG_LEVEL_4K;
	else
		fault->max_level = walker.level;

	mmu_seq = vcpu->kvm->mmu_invalidate_seq;
	smp_rmb();

	r = kvm_faultin_pfn(vcpu, fault);
	if (r != RET_PF_CONTINUE)
		return r;

	r = handle_abnormal_pfn(vcpu, fault, walker.pte_access);
	if (r != RET_PF_CONTINUE)
		return r;

	/*
	 * Do not change pte_access if the pfn is a mmio page, otherwise
	 * we will cache the incorrect access into mmio spte.
	 */
	if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) &&
	    !is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) {
		walker.pte_access |= ACC_WRITE_MASK;
		walker.pte_access &= ~ACC_USER_MASK;

		/*
		 * If we converted a user page to a kernel page,
		 * so that the kernel can write to it when cr0.wp=0,
		 * then we should prevent the kernel from executing it
		 * if SMEP is enabled.
		 */
		if (is_cr4_smep(vcpu->arch.mmu))
			walker.pte_access &= ~ACC_EXEC_MASK;
	}

	r = RET_PF_RETRY;
	write_lock(&vcpu->kvm->mmu_lock);

	if (is_page_fault_stale(vcpu, fault, mmu_seq))
		goto out_unlock;

	r = make_mmu_pages_available(vcpu);
	if (r)
		goto out_unlock;
	r = FNAME(fetch)(vcpu, fault, &walker);

out_unlock:
	write_unlock(&vcpu->kvm->mmu_lock);
	kvm_release_pfn_clean(fault->pfn);
	return r;
}

static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
{
	int offset = 0;

	WARN_ON(sp->role.level != PG_LEVEL_4K);

	if (PTTYPE == 32)
		offset = sp->role.quadrant << SPTE_LEVEL_BITS;

	return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
}

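/*
 * Illustrative worked example (editor's note): a 32-bit guest page table
 * holds 1024 4-byte ptes, but a shadow page holds only 512 8-byte sptes,
 * so each guest table is shadowed by two pages distinguished by
 * role.quadrant.  With SPTE_LEVEL_BITS == 9, quadrant 1 gives
 * offset == 512, i.e. the gpa of the second half of the guest table:
 * gfn_to_gpa(gfn) + 512 * sizeof(u32) == gfn_to_gpa(gfn) + 2048.
 */
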
static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
{
	struct kvm_shadow_walk_iterator iterator;
	struct kvm_mmu_page *sp;
	u64 old_spte;
	int level;
	u64 *sptep;

	vcpu_clear_mmio_info(vcpu, gva);

	/*
	 * No need to check return value here, rmap_can_add() can
	 * help us to skip pte prefetch later.
	 */
	mmu_topup_memory_caches(vcpu, true);

	if (!VALID_PAGE(root_hpa)) {
		WARN_ON(1);
		return;
	}

	write_lock(&vcpu->kvm->mmu_lock);
	for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) {
		level = iterator.level;
		sptep = iterator.sptep;

		sp = sptep_to_sp(sptep);
		old_spte = *sptep;
		if (is_last_spte(old_spte, level)) {
			pt_element_t gpte;
			gpa_t pte_gpa;

			if (!sp->unsync)
				break;

			pte_gpa = FNAME(get_level1_sp_gpa)(sp);
			pte_gpa += spte_index(sptep) * sizeof(pt_element_t);

			mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL);
			if (is_shadow_present_pte(old_spte))
				kvm_flush_remote_tlbs_with_address(vcpu->kvm,
					sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));

			if (!rmap_can_add(vcpu))
				break;

			if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
						       sizeof(pt_element_t)))
				break;

			FNAME(prefetch_gpte)(vcpu, sp, sptep, gpte, false);
		}

		if (!sp->unsync_children)
			break;
	}
	write_unlock(&vcpu->kvm->mmu_lock);
}

/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
			       gpa_t addr, u64 access,
			       struct x86_exception *exception)
{
	struct guest_walker walker;
	gpa_t gpa = INVALID_GPA;
	int r;

#ifndef CONFIG_X86_64
	/* A 64-bit GVA should be impossible on 32-bit KVM. */
	WARN_ON_ONCE((addr >> 32) && mmu == vcpu->arch.walk_mmu);
#endif

	r = FNAME(walk_addr_generic)(&walker, vcpu, mmu, addr, access);

	if (r) {
		gpa = gfn_to_gpa(walker.gfn);
		gpa |= addr & ~PAGE_MASK;
	} else if (exception)
		*exception = walker.fault;

	return gpa;
}

/*
 * Using the information in sp->shadowed_translation (kvm_mmu_page_get_gfn()) is
 * safe because:
 * - The spte has a reference to the struct page, so the pfn for a given gfn
 *   can't change unless all sptes pointing to it are nuked first.
 *
 * Returns
 * < 0: the sp should be zapped
 *   0: the sp is synced and no tlb flushing is required
 * > 0: the sp is synced and tlb flushing is required
 */
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
	union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role;
	int i;
	bool host_writable;
	gpa_t first_pte_gpa;
	bool flush = false;

	/*
	 * Ignore various flags when verifying that it's safe to sync a shadow
	 * page using the current MMU context.
	 *
	 *  - level: not part of the overall MMU role and will never match as
	 *    the MMU's level tracks the root level
	 *  - access: updated based on the new guest PTE
	 *  - quadrant: not part of the overall MMU role (similar to level)
	 */
	const union kvm_mmu_page_role sync_role_ign = {
		.level = 0xf,
		.access = 0x7,
		.quadrant = 0x3,
		.passthrough = 0x1,
	};

	/*
	 * Direct pages can never be unsync, and KVM should never attempt to
	 * sync a shadow page for a different MMU context, e.g. if the role
	 * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the
	 * reserved bits checks will be wrong, etc...
	 */
	if (WARN_ON_ONCE(sp->role.direct ||
			 (sp->role.word ^ root_role.word) & ~sync_role_ign.word))
		return -1;

	first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);

	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
		u64 *sptep, spte;
		struct kvm_memory_slot *slot;
		unsigned pte_access;
		pt_element_t gpte;
		gpa_t pte_gpa;
		gfn_t gfn;

		if (!sp->spt[i])
			continue;

		pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);

		if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
					       sizeof(pt_element_t)))
			return -1;

		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
			flush = true;
			continue;
		}

		gfn = gpte_to_gfn(gpte);
		pte_access = sp->role.access;
		pte_access &= FNAME(gpte_access)(gpte);
		FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);

		if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access))
			continue;

		/*
		 * Drop the SPTE if the new protections would result in a RWX=0
		 * SPTE or if the gfn is changing.  The RWX=0 case only affects
		 * EPT with execute-only support, i.e. EPT without an effective
		 * "present" bit, as all other paging modes will create a
		 * read-only SPTE if pte_access is zero.
		 */
		if ((!pte_access && !shadow_present_mask) ||
		    gfn != kvm_mmu_page_get_gfn(sp, i)) {
			drop_spte(vcpu->kvm, &sp->spt[i]);
			flush = true;
			continue;
		}

		/* Update the shadowed access bits in case they changed. */
		kvm_mmu_page_set_access(sp, i, pte_access);

		sptep = &sp->spt[i];
		spte = *sptep;
		host_writable = spte & shadow_host_writable_mask;
		slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
		make_spte(vcpu, sp, slot, pte_access, gfn,
			  spte_to_pfn(spte), spte, true, false,
			  host_writable, &spte);

		flush |= mmu_spte_update(sptep, spte);
	}

	/*
	 * Note, any flush is purely for KVM's correctness, e.g. when dropping
	 * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier
	 * unmap or dirty logging event doesn't fail to flush.  The guest is
	 * responsible for flushing the TLB to ensure any changes in protection
	 * bits are recognized, i.e. until the guest flushes or page faults on
	 * a relevant address, KVM is architecturally allowed to let vCPUs use
	 * cached translations with the old protection bits.
	 */
	return flush;
}

#undef pt_element_t
#undef guest_walker
#undef FNAME
#undef PT_BASE_ADDR_MASK
#undef PT_INDEX
#undef PT_LVL_ADDR_MASK
#undef PT_LVL_OFFSET_MASK
#undef PT_LEVEL_BITS
#undef PT_MAX_FULL_LEVELS
#undef gpte_to_gfn
#undef gpte_to_gfn_lvl
#undef PT_GUEST_ACCESSED_MASK
#undef PT_GUEST_DIRTY_MASK
#undef PT_GUEST_DIRTY_SHIFT
#undef PT_GUEST_ACCESSED_SHIFT
#undef PT_HAVE_ACCESSED_DIRTY