// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2020-2023 Loongson Technology Corporation Limited
 */

#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kvm_host.h>
#include <linux/page-flags.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/kvm_mmu.h>

static inline bool kvm_hugepage_capable(struct kvm_memory_slot *slot)
{
	return slot->arch.flags & KVM_MEM_HUGEPAGE_CAPABLE;
}

static inline bool kvm_hugepage_incapable(struct kvm_memory_slot *slot)
{
	return slot->arch.flags & KVM_MEM_HUGEPAGE_INCAPABLE;
}

static inline void kvm_ptw_prepare(struct kvm *kvm, kvm_ptw_ctx *ctx)
{
	ctx->level = kvm->arch.root_level;
	/* pte table */
	ctx->invalid_ptes = kvm->arch.invalid_ptes;
	ctx->pte_shifts = kvm->arch.pte_shifts;
	ctx->pgtable_shift = ctx->pte_shifts[ctx->level];
	ctx->invalid_entry = ctx->invalid_ptes[ctx->level];
	ctx->opaque = kvm;
}

/*
 * Mark a range of guest physical address space old (all accesses fault) in the
 * VM's GPA page table to allow detection of commonly used pages.
 */
static int kvm_mkold_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx)
{
	if (kvm_pte_young(*pte)) {
		*pte = kvm_pte_mkold(*pte);
		return 1;
	}

	return 0;
}

/*
 * Mark a range of guest physical address space clean (writes fault) in the VM's
 * GPA page table to allow dirty page tracking.
 */
static int kvm_mkclean_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx)
{
	gfn_t offset;
	kvm_pte_t val;

	val = *pte;
	/*
	 * When called from kvm_arch_mmu_enable_log_dirty_pt_masked() with a
	 * mask, the [start, end) range may cross huge pages: for the first
	 * huge page the addr parameter equals start, but for any later huge
	 * page addr is the base address of that huge page rather than start
	 * or end.
	 */
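	/*
	 * Illustrative example (values are hypothetical): with ctx->gfn = 0x100
	 * and ctx->mask = 0x5, only the PTEs covering gfn 0x100 and gfn 0x102
	 * pass the filter below (bits 0 and 2 of the mask are set); the PTE
	 * for gfn 0x101 is left untouched. Huge PTEs bypass this filter.
	 */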
	if ((ctx->flag & _KVM_HAS_PGMASK) && !kvm_pte_huge(val)) {
		offset = (addr >> PAGE_SHIFT) - ctx->gfn;
		if (!(BIT(offset) & ctx->mask))
			return 0;
	}

	/*
	 * There is no need to split the huge page now; just set the
	 * write-protect bit. The huge page will be split on the next
	 * write fault.
	 */
	if (kvm_pte_dirty(val)) {
		*pte = kvm_pte_mkclean(val);
		return 1;
	}

	return 0;
}

/*
 * Clear pte entry
 */
static int kvm_flush_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx)
{
	struct kvm *kvm;

	kvm = ctx->opaque;
	if (ctx->level)
		kvm->stat.hugepages--;
	else
		kvm->stat.pages--;

	*pte = ctx->invalid_entry;

	return 1;
}

/*
 * kvm_pgd_alloc() - Allocate and initialise a KVM GPA page directory.
 *
 * Allocate a blank KVM GPA page directory (PGD) for representing guest physical
 * to host physical page mappings.
 *
 * Returns:	Pointer to new KVM GPA page directory.
 *		NULL on allocation failure.
 */
kvm_pte_t *kvm_pgd_alloc(void)
{
	kvm_pte_t *pgd;

	pgd = (kvm_pte_t *)__get_free_pages(GFP_KERNEL, 0);
	if (pgd)
		pgd_init((void *)pgd);

	return pgd;
}

static void _kvm_pte_init(void *addr, unsigned long val)
{
	unsigned long *p, *end;

	p = (unsigned long *)addr;
	end = p + PTRS_PER_PTE;
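	/*
	 * The loop below is manually unrolled: each iteration stores val into
	 * eight consecutive entries (p[0]..p[4], then p[-3]..p[-1] after the
	 * pointer has been advanced by 8), so PTRS_PER_PTE is assumed to be a
	 * multiple of eight.
	 */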
	do {
		p[0] = val;
		p[1] = val;
		p[2] = val;
		p[3] = val;
		p[4] = val;
		p += 8;
		p[-3] = val;
		p[-2] = val;
		p[-1] = val;
	} while (p != end);
}

/*
 * Caller must hold kvm->mmu_lock
 *
 * Walk the page tables of kvm to find the PTE corresponding to the
 * address @addr. If page tables don't exist for @addr, they will be created
 * from the MMU cache if @cache is not NULL.
 */
static kvm_pte_t *kvm_populate_gpa(struct kvm *kvm,
				struct kvm_mmu_memory_cache *cache,
				unsigned long addr, int level)
{
	kvm_ptw_ctx ctx;
	kvm_pte_t *entry, *child;

	kvm_ptw_prepare(kvm, &ctx);
	child = kvm->arch.pgd;
	while (ctx.level > level) {
		entry = kvm_pgtable_offset(&ctx, child, addr);
		if (kvm_pte_none(&ctx, entry)) {
			if (!cache)
				return NULL;

			child = kvm_mmu_memory_cache_alloc(cache);
			_kvm_pte_init(child, ctx.invalid_ptes[ctx.level - 1]);
			kvm_set_pte(entry, __pa(child));
		} else if (kvm_pte_huge(*entry)) {
			return entry;
		} else
			child = (kvm_pte_t *)__va(PHYSADDR(*entry));
		kvm_ptw_enter(&ctx);
	}

	entry = kvm_pgtable_offset(&ctx, child, addr);

	return entry;
}

/*
 * Page walker for VM shadow mmu at the last level
 * The last level is a small PTE page or a huge PMD entry
 */
static int kvm_ptw_leaf(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx)
{
	int ret;
	phys_addr_t next, start, size;
	struct list_head *list;
	kvm_pte_t *entry, *child;

	ret = 0;
	start = addr;
	child = (kvm_pte_t *)__va(PHYSADDR(*dir));
	entry = kvm_pgtable_offset(ctx, child, addr);
	do {
		next = addr + (0x1UL << ctx->pgtable_shift);
		if (!kvm_pte_present(ctx, entry))
			continue;

		ret |= ctx->ops(entry, addr, ctx);
	} while (entry++, addr = next, addr < end);

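	/*
	 * size is the span covered by one full table page: PAGE_SIZE / 8
	 * entries, each mapping 1 << pgtable_shift bytes. On a flush walk, if
	 * [start, end) covered the whole table, the child page is dead: queue
	 * it on ctx->list so it can be freed after mmu_lock is dropped, and
	 * point the parent entry back at the invalid table.
	 */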
	if (kvm_need_flush(ctx)) {
		size = 0x1UL << (ctx->pgtable_shift + PAGE_SHIFT - 3);
		if (start + size == end) {
			list = (struct list_head *)child;
			list_add_tail(list, &ctx->list);
			*dir = ctx->invalid_ptes[ctx->level + 1];
		}
	}

	return ret;
}

/*
 * Page walker for VM shadow mmu at page table dir level
 */
static int kvm_ptw_dir(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx)
{
	int ret;
	phys_addr_t next, start, size;
	struct list_head *list;
	kvm_pte_t *entry, *child;

	ret = 0;
	start = addr;
	child = (kvm_pte_t *)__va(PHYSADDR(*dir));
	entry = kvm_pgtable_offset(ctx, child, addr);
	do {
		next = kvm_pgtable_addr_end(ctx, addr, end);
		if (!kvm_pte_present(ctx, entry))
			continue;

		if (kvm_pte_huge(*entry)) {
			ret |= ctx->ops(entry, addr, ctx);
			continue;
		}

		kvm_ptw_enter(ctx);
		if (ctx->level == 0)
			ret |= kvm_ptw_leaf(entry, addr, next, ctx);
		else
			ret |= kvm_ptw_dir(entry, addr, next, ctx);
		kvm_ptw_exit(ctx);
	} while (entry++, addr = next, addr < end);

	if (kvm_need_flush(ctx)) {
		size = 0x1UL << (ctx->pgtable_shift + PAGE_SHIFT - 3);
		if (start + size == end) {
			list = (struct list_head *)child;
			list_add_tail(list, &ctx->list);
			*dir = ctx->invalid_ptes[ctx->level + 1];
		}
	}

	return ret;
}

/*
 * Page walker for VM shadow mmu at the page root table
 */
static int kvm_ptw_top(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx)
{
	int ret;
	phys_addr_t next;
	kvm_pte_t *entry;

	ret = 0;
	entry = kvm_pgtable_offset(ctx, dir, addr);
	do {
		next = kvm_pgtable_addr_end(ctx, addr, end);
		if (!kvm_pte_present(ctx, entry))
			continue;

		kvm_ptw_enter(ctx);
		ret |= kvm_ptw_dir(entry, addr, next, ctx);
		kvm_ptw_exit(ctx);
	} while (entry++, addr = next, addr < end);

	return ret;
}

/*
 * kvm_flush_range() - Flush a range of guest physical addresses.
 * @kvm:	KVM pointer.
 * @start_gfn:	Guest frame number of first page in GPA range to flush.
 * @end_gfn:	Guest frame number of last page in GPA range to flush.
 * @lock:	Whether to hold mmu_lock or not
 *
 * Flushes a range of GPA mappings from the GPA page tables.
 */
static void kvm_flush_range(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn, int lock)
{
	int ret;
	kvm_ptw_ctx ctx;
	struct list_head *pos, *temp;

	ctx.ops = kvm_flush_pte;
	ctx.flag = _KVM_FLUSH_PGTABLE;
	kvm_ptw_prepare(kvm, &ctx);
	INIT_LIST_HEAD(&ctx.list);

	if (lock) {
		spin_lock(&kvm->mmu_lock);
		ret = kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT,
					end_gfn << PAGE_SHIFT, &ctx);
		spin_unlock(&kvm->mmu_lock);
	} else
		ret = kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT,
					end_gfn << PAGE_SHIFT, &ctx);

	/* Flush vpid for each vCPU individually */
	if (ret)
		kvm_flush_remote_tlbs(kvm);

	/*
	 * Free the dead PTE table pages after mmu_lock is dropped; they are
	 * linked together on ctx.list.
	 */
	list_for_each_safe(pos, temp, &ctx.list) {
		list_del(pos);
		free_page((unsigned long)pos);
	}
}

/*
 * kvm_mkclean_gpa_pt() - Make a range of guest physical addresses clean.
 * @kvm:	KVM pointer.
 * @start_gfn:	Guest frame number of first page in GPA range to flush.
 * @end_gfn:	Guest frame number of last page in GPA range to flush.
 *
 * Make a range of GPA mappings clean so that guest writes will fault and
 * trigger dirty page logging.
 *
 * The caller must hold the @kvm->mmu_lock spinlock.
 *
 * Returns:	Whether any GPA mappings were modified, which would require
 *		derived mappings (GVA page tables & TLB entries) to be
 *		invalidated.
 */
static int kvm_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn)
{
	kvm_ptw_ctx ctx;

	ctx.ops = kvm_mkclean_pte;
	ctx.flag = 0;
	kvm_ptw_prepare(kvm, &ctx);
	return kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT, end_gfn << PAGE_SHIFT, &ctx);
}

/*
 * kvm_arch_mmu_enable_log_dirty_pt_masked() - write protect dirty pages
 * @kvm:	The KVM pointer
 * @slot:	The memory slot associated with mask
 * @gfn_offset:	The gfn offset in memory slot
 * @mask:	The mask of dirty pages at offset 'gfn_offset' in this memory
 *		slot to be write protected
 *
 * Walks the bits set in @mask and write-protects the associated PTEs.
 * Caller must acquire @kvm->mmu_lock.
 */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
		struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask)
{
	kvm_ptw_ctx ctx;
	gfn_t base_gfn = slot->base_gfn + gfn_offset;
	gfn_t start = base_gfn + __ffs(mask);
	gfn_t end = base_gfn + __fls(mask) + 1;

	ctx.ops = kvm_mkclean_pte;
	ctx.flag = _KVM_HAS_PGMASK;
	ctx.mask = mask;
	ctx.gfn = base_gfn;
	kvm_ptw_prepare(kvm, &ctx);

	kvm_ptw_top(kvm->arch.pgd, start << PAGE_SHIFT, end << PAGE_SHIFT, &ctx);
}

int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old,
				   struct kvm_memory_slot *new, enum kvm_mr_change change)
{
	gpa_t gpa_start;
	hva_t hva_start;
	size_t size, gpa_offset, hva_offset;

	if ((change != KVM_MR_MOVE) && (change != KVM_MR_CREATE))
		return 0;
	/*
	 * Prevent userspace from creating a memory region outside of the
	 * VM GPA address space
	 */
	if ((new->base_gfn + new->npages) > (kvm->arch.gpa_size >> PAGE_SHIFT))
		return -ENOMEM;

	new->arch.flags = 0;
	size = new->npages * PAGE_SIZE;
	gpa_start = new->base_gfn << PAGE_SHIFT;
	hva_start = new->userspace_addr;
	if (IS_ALIGNED(size, PMD_SIZE) && IS_ALIGNED(gpa_start, PMD_SIZE)
			&& IS_ALIGNED(hva_start, PMD_SIZE))
		new->arch.flags |= KVM_MEM_HUGEPAGE_CAPABLE;
	else {
		/*
		 * Pages belonging to memslots that don't have the same
		 * alignment within a PMD for userspace and GPA cannot be
		 * mapped with PMD entries, because we'll end up mapping
		 * the wrong pages.
		 *
		 * Consider a layout like the following:
		 *
		 * memslot->userspace_addr:
		 * +-----+--------------------+--------------------+---+
		 * |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
		 * +-----+--------------------+--------------------+---+
		 *
		 * memslot->base_gfn << PAGE_SHIFT:
		 *   +---+--------------------+--------------------+-----+
		 *   |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
		 *   +---+--------------------+--------------------+-----+
		 *
		 * If we create those stage-2 blocks, we'll end up with this
		 * incorrect mapping:
		 *   d -> f
		 *   e -> g
		 *   f -> h
		 */
		gpa_offset = gpa_start & (PMD_SIZE - 1);
		hva_offset = hva_start & (PMD_SIZE - 1);
		if (gpa_offset != hva_offset) {
			new->arch.flags |= KVM_MEM_HUGEPAGE_INCAPABLE;
		} else {
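			/*
			 * Matching offsets alone are not enough: the slot must
			 * also fully cover at least one PMD-aligned block.
			 * Treating a zero offset as PMD_SIZE lets the single
			 * check below handle the aligned case too (the slot
			 * then needs to be at least PMD_SIZE bytes).
			 */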
			if (gpa_offset == 0)
				gpa_offset = PMD_SIZE;
			if ((size + gpa_offset) < (PMD_SIZE * 2))
				new->arch.flags |= KVM_MEM_HUGEPAGE_INCAPABLE;
		}
	}

	return 0;
}

void kvm_arch_commit_memory_region(struct kvm *kvm,
				   struct kvm_memory_slot *old,
				   const struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	int needs_flush;

	/*
	 * If dirty page logging is enabled, write protect all pages in the slot
	 * ready for dirty logging.
	 *
	 * There is no need to do this in any of the following cases:
	 * CREATE:	No dirty mappings will already exist.
	 * MOVE/DELETE:	The old mappings will already have been cleaned up by
	 *		kvm_arch_flush_shadow_memslot()
	 */
	if (change == KVM_MR_FLAGS_ONLY &&
	    (!(old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
	     new->flags & KVM_MEM_LOG_DIRTY_PAGES)) {
		spin_lock(&kvm->mmu_lock);
		/* Write protect GPA page table entries */
		needs_flush = kvm_mkclean_gpa_pt(kvm, new->base_gfn,
					new->base_gfn + new->npages);
		spin_unlock(&kvm->mmu_lock);
		if (needs_flush)
			kvm_flush_remote_tlbs(kvm);
	}
}

void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
	kvm_flush_range(kvm, 0, kvm->arch.gpa_size >> PAGE_SHIFT, 0);
}

void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	/*
	 * The slot has been made invalid (ready for moving or deletion), so we
	 * need to ensure that it can no longer be accessed by any guest vCPUs.
	 */
	kvm_flush_range(kvm, slot->base_gfn, slot->base_gfn + slot->npages, 1);
}

bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	kvm_ptw_ctx ctx;

	ctx.flag = 0;
	ctx.ops = kvm_flush_pte;
	kvm_ptw_prepare(kvm, &ctx);
	INIT_LIST_HEAD(&ctx.list);

	return kvm_ptw_top(kvm->arch.pgd, range->start << PAGE_SHIFT,
			range->end << PAGE_SHIFT, &ctx);
}

bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	unsigned long prot_bits;
	kvm_pte_t *ptep;
	kvm_pfn_t pfn = pte_pfn(range->arg.pte);
	gpa_t gpa = range->start << PAGE_SHIFT;

	ptep = kvm_populate_gpa(kvm, NULL, gpa, 0);
	if (!ptep)
		return false;

	/* Replacing an absent or old page doesn't need flushes */
	if (!kvm_pte_present(NULL, ptep) || !kvm_pte_young(*ptep)) {
		kvm_set_pte(ptep, 0);
		return false;
	}

	/* Fill the new pte if the page is write-protected or has migrated */
	prot_bits = _PAGE_PRESENT | __READABLE;
	prot_bits |= _CACHE_MASK & pte_val(range->arg.pte);

	/*
	 * Carry over _PAGE_WRITE and _PAGE_DIRTY only if both the old and new
	 * ptes allow them: _PAGE_WRITE so that the next write fault can be
	 * handled by kvm_map_page_fast(), and _PAGE_DIRTY since the gpa has
	 * already been recorded as a dirty page.
	 */
	prot_bits |= __WRITEABLE & *ptep & pte_val(range->arg.pte);
	kvm_set_pte(ptep, kvm_pfn_pte(pfn, __pgprot(prot_bits)));

	return true;
}

bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	kvm_ptw_ctx ctx;

	ctx.flag = 0;
	ctx.ops = kvm_mkold_pte;
	kvm_ptw_prepare(kvm, &ctx);

	return kvm_ptw_top(kvm->arch.pgd, range->start << PAGE_SHIFT,
				range->end << PAGE_SHIFT, &ctx);
}

bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	gpa_t gpa = range->start << PAGE_SHIFT;
	kvm_pte_t *ptep = kvm_populate_gpa(kvm, NULL, gpa, 0);

	if (ptep && kvm_pte_present(NULL, ptep) && kvm_pte_young(*ptep))
		return true;

	return false;
}

/*
 * kvm_map_page_fast() - Fast path GPA fault handler.
 * @vcpu:	vCPU pointer.
 * @gpa:	Guest physical address of fault.
 * @write:	Whether the fault was due to a write.
 *
 * Perform fast path GPA fault handling, doing all that can be done without
 * calling into KVM. This handles marking old pages young (for idle page
 * tracking), and dirtying of clean pages (for dirty page logging).
 *
 * Returns:	0 on success, in which case we can update derived mappings and
 *		resume guest execution.
 *		-EFAULT on failure due to absent GPA mapping or write to
 *		read-only page, in which case KVM must be consulted.
 */
static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, bool write)
{
	int ret = 0;
	kvm_pfn_t pfn = 0;
	kvm_pte_t *ptep, changed, new;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_memory_slot *slot;

	spin_lock(&kvm->mmu_lock);

	/* Fast path - just check GPA page table for an existing entry */
	ptep = kvm_populate_gpa(kvm, NULL, gpa, 0);
	if (!ptep || !kvm_pte_present(NULL, ptep)) {
		ret = -EFAULT;
		goto out;
	}

	/* Track access to pages marked old */
	new = *ptep;
	if (!kvm_pte_young(new))
		new = kvm_pte_mkyoung(new);
	/* call kvm_set_pfn_accessed() after unlock */

	if (write && !kvm_pte_dirty(new)) {
		if (!kvm_pte_write(new)) {
			ret = -EFAULT;
			goto out;
		}

		if (kvm_pte_huge(new)) {
			/*
			 * Do not set write permission when dirty logging is
			 * enabled for HugePages
			 */
			slot = gfn_to_memslot(kvm, gfn);
			if (kvm_slot_dirty_track_enabled(slot)) {
				ret = -EFAULT;
				goto out;
			}
		}

		/* Track dirtying of writeable pages */
		new = kvm_pte_mkdirty(new);
	}

	changed = new ^ (*ptep);
	if (changed) {
		kvm_set_pte(ptep, new);
		pfn = kvm_pte_pfn(new);
	}
	spin_unlock(&kvm->mmu_lock);

	/*
	 * Fixme: pfn may be freed after mmu_lock
	 * kvm_try_get_pfn(pfn)/kvm_release_pfn pair to prevent this?
	 */
	if (kvm_pte_young(changed))
		kvm_set_pfn_accessed(pfn);

	if (kvm_pte_dirty(changed)) {
		mark_page_dirty(kvm, gfn);
		kvm_set_pfn_dirty(pfn);
	}
	return ret;
out:
	spin_unlock(&kvm->mmu_lock);
	return ret;
}

static bool fault_supports_huge_mapping(struct kvm_memory_slot *memslot,
				unsigned long hva, bool write)
{
	hva_t start, end;

	/* Disable dirty logging on HugePages */
	if (kvm_slot_dirty_track_enabled(memslot) && write)
		return false;

	if (kvm_hugepage_capable(memslot))
		return true;

	if (kvm_hugepage_incapable(memslot))
		return false;

	start = memslot->userspace_addr;
	end = start + memslot->npages * PAGE_SIZE;

	/*
	 * Next, let's make sure we're not trying to map anything not covered
	 * by the memslot. This means we have to prohibit block size mappings
	 * for the beginning and end of a non-block aligned and non-block sized
	 * memory slot (illustrated by the head and tail parts of the
	 * userspace view above containing pages 'abcde' and 'xyz',
	 * respectively).
	 *
	 * Note that it doesn't matter if we do the check using the
	 * userspace_addr or the base_gfn, as both are equally aligned (per
	 * the check above) and equally sized.
	 */
	return (hva >= ALIGN(start, PMD_SIZE)) && (hva < ALIGN_DOWN(end, PMD_SIZE));
}

/*
 * Lookup the mapping level for @gfn in the current mm.
 *
 * WARNING! Use of host_pfn_mapping_level() requires the caller and the end
 * consumer to be tied into KVM's handlers for MMU notifier events!
 *
 * There are several ways to safely use this helper:
 *
 * - Check mmu_invalidate_retry_hva() after grabbing the mapping level, before
 *   consuming it. In this case, mmu_lock doesn't need to be held during the
 *   lookup, but it does need to be held while checking the MMU notifier.
 *
 * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation
 *   event for the hva. This can be done by explicitly checking the MMU notifier
 *   or by ensuring that KVM already has a valid mapping that covers the hva.
 *
 * - Do not use the result to install new mappings, e.g. use the host mapping
 *   level only to decide whether or not to zap an entry. In this case, it's
 *   not required to hold mmu_lock (though it's highly likely the caller will
 *   want to hold mmu_lock anyways, e.g. to modify SPTEs).
 *
 * Note! The lookup can still race with modifications to host page tables, but
 * the above "rules" ensure KVM will not _consume_ the result of the walk if a
 * race with the primary MMU occurs.
 */
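/*
 * Returns the level at which the host maps the hva backing @gfn: 0 for a base
 * page mapping, 1 when the host uses a huge (PMD-level) mapping.
 */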
static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
				const struct kvm_memory_slot *slot)
{
	int level = 0;
	unsigned long hva;
	unsigned long flags;
	pgd_t pgd;
	p4d_t p4d;
	pud_t pud;
	pmd_t pmd;

	/*
	 * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
	 * is not solely for performance, it's also necessary to avoid the
	 * "writable" check in __gfn_to_hva_many(), which will always fail on
	 * read-only memslots due to gfn_to_hva() assuming writes. Earlier
	 * page fault steps have already verified the guest isn't writing a
	 * read-only memslot.
	 */
	hva = __gfn_to_hva_memslot(slot, gfn);

	/*
	 * Disable IRQs to prevent concurrent tear down of host page tables,
	 * e.g. if the primary MMU promotes a P*D to a huge page and then frees
	 * the original page table.
	 */
	local_irq_save(flags);

	/*
	 * Read each entry once. As above, a non-leaf entry can be promoted to
	 * a huge page _during_ this walk. Re-reading the entry could send the
	 * walk into the weeds, e.g. p*d_large() returns false (sees the old
	 * value) and then p*d_offset() walks into the target huge page instead
	 * of the old page table (sees the new value).
	 */
	pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
	if (pgd_none(pgd))
		goto out;

	p4d = READ_ONCE(*p4d_offset(&pgd, hva));
	if (p4d_none(p4d) || !p4d_present(p4d))
		goto out;

	pud = READ_ONCE(*pud_offset(&p4d, hva));
	if (pud_none(pud) || !pud_present(pud))
		goto out;

	pmd = READ_ONCE(*pmd_offset(&pud, hva));
	if (pmd_none(pmd) || !pmd_present(pmd))
		goto out;

	if (kvm_pte_huge(pmd_val(pmd)))
		level = 1;

out:
	local_irq_restore(flags);
	return level;
}

/*
 * Split huge page
 */
static kvm_pte_t *kvm_split_huge(struct kvm_vcpu *vcpu, kvm_pte_t *ptep, gfn_t gfn)
{
	int i;
	kvm_pte_t val, *child;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_memory_cache *memcache;

	memcache = &vcpu->arch.mmu_page_cache;
	child = kvm_mmu_memory_cache_alloc(memcache);
	val = kvm_pte_mksmall(*ptep);
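	/*
	 * Populate the new last-level table: entry i maps the i-th small page
	 * of the former huge mapping, so the pte value is advanced by
	 * PAGE_SIZE for each slot.
	 */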
	for (i = 0; i < PTRS_PER_PTE; i++) {
		kvm_set_pte(child + i, val);
		val += PAGE_SIZE;
	}

	/* The later kvm_flush_tlb_gpa() will flush the huge page TLB entry */
	kvm_set_pte(ptep, __pa(child));

	kvm->stat.hugepages--;
	kvm->stat.pages += PTRS_PER_PTE;

	return child + (gfn & (PTRS_PER_PTE - 1));
}

/*
 * kvm_map_page() - Map a guest physical page.
 * @vcpu:	vCPU pointer.
 * @gpa:	Guest physical address of fault.
 * @write:	Whether the fault was due to a write.
 *
 * Handle GPA faults by creating a new GPA mapping (or updating an existing
 * one).
 *
 * This takes care of marking pages young or dirty (idle/dirty page tracking),
 * asking KVM for the corresponding PFN, and creating a mapping in the GPA page
 * tables. Derived mappings (GVA page tables and TLBs) must be handled by the
 * caller.
 *
 * Returns:	0 on success
 *		-EFAULT if there is no memory region at @gpa or a write was
 *		attempted to a read-only memory region. This is usually handled
 *		as an MMIO access.
 */
static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, bool write)
{
	bool writeable;
	int srcu_idx, err, retry_no = 0, level;
	unsigned long hva, mmu_seq, prot_bits;
	kvm_pfn_t pfn;
	kvm_pte_t *ptep, new_pte;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_memory_slot *memslot;
	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;

	/* Try the fast path to handle old / clean pages */
	srcu_idx = srcu_read_lock(&kvm->srcu);
	err = kvm_map_page_fast(vcpu, gpa, write);
	if (!err)
		goto out;

	memslot = gfn_to_memslot(kvm, gfn);
	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writeable);
	if (kvm_is_error_hva(hva) || (write && !writeable)) {
		err = -EFAULT;
		goto out;
	}

	/* We need a minimum of cached pages ready for page table creation */
	err = kvm_mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES);
	if (err)
		goto out;

retry:
	/*
	 * Used to check for invalidations in progress, of the pfn that is
	 * returned by gfn_to_pfn_prot() below.
	 */
	mmu_seq = kvm->mmu_invalidate_seq;
	/*
	 * Ensure the read of mmu_invalidate_seq isn't reordered with PTE reads in
	 * gfn_to_pfn_prot() (which calls get_user_pages()), so that we don't
	 * risk the page we get a reference to getting unmapped before we have a
	 * chance to grab the mmu_lock without mmu_invalidate_retry() noticing.
	 *
	 * This smp_rmb() pairs with the effective smp_wmb() of the combination
	 * of the pte_unmap_unlock() after the PTE is zapped, and the
	 * spin_lock() in kvm_mmu_notifier_invalidate_<page|range_end>() before
	 * mmu_invalidate_seq is incremented.
	 */
	smp_rmb();

	/* Slow path - ask KVM core whether we can access this GPA */
	pfn = gfn_to_pfn_prot(kvm, gfn, write, &writeable);
	if (is_error_noslot_pfn(pfn)) {
		err = -EFAULT;
		goto out;
	}

	/* Check if an invalidation has taken place since we got pfn */
	spin_lock(&kvm->mmu_lock);
	if (mmu_invalidate_retry_hva(kvm, mmu_seq, hva)) {
		/*
		 * This can happen when mappings are changed asynchronously, but
		 * also synchronously if a COW is triggered by
		 * gfn_to_pfn_prot().
		 */
		spin_unlock(&kvm->mmu_lock);
		kvm_release_pfn_clean(pfn);
		if (retry_no > 100) {
			retry_no = 0;
			schedule();
		}
		retry_no++;
		goto retry;
	}

	/*
	 * For emulated devices such as virtio devices, the actual cache
	 * attribute is determined by the physical machine.
	 * For a pass-through physical device, it should be uncachable.
	 */
	prot_bits = _PAGE_PRESENT | __READABLE;
	if (pfn_valid(pfn))
		prot_bits |= _CACHE_CC;
	else
		prot_bits |= _CACHE_SUC;

	if (writeable) {
		prot_bits |= _PAGE_WRITE;
		if (write)
			prot_bits |= __WRITEABLE;
	}

	/* Disable dirty logging on HugePages */
	level = 0;
	if (!fault_supports_huge_mapping(memslot, hva, write)) {
		level = 0;
	} else {
		level = host_pfn_mapping_level(kvm, gfn, memslot);
		if (level == 1) {
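			/*
			 * Align both the guest frame and the host frame down
			 * to the huge page boundary so that the huge mapping
			 * installed below starts at the base of the block.
			 */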
			gfn = gfn & ~(PTRS_PER_PTE - 1);
			pfn = pfn & ~(PTRS_PER_PTE - 1);
		}
	}

	/* Ensure page tables are allocated */
	ptep = kvm_populate_gpa(kvm, memcache, gpa, level);
	new_pte = kvm_pfn_pte(pfn, __pgprot(prot_bits));
	if (level == 1) {
		new_pte = kvm_pte_mkhuge(new_pte);
		/*
		 * The previous pmd entry pointed at invalid_pte_table, so stale
		 * small-page TLB entries may exist for this range; flush them
		 * for the current vCPU.
		 */
		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
		++kvm->stat.hugepages;
	} else if (kvm_pte_huge(*ptep) && write)
		ptep = kvm_split_huge(vcpu, ptep, gfn);
	else
		++kvm->stat.pages;
	kvm_set_pte(ptep, new_pte);
	spin_unlock(&kvm->mmu_lock);

	if (prot_bits & _PAGE_DIRTY) {
		mark_page_dirty_in_slot(kvm, memslot, gfn);
		kvm_set_pfn_dirty(pfn);
	}

	kvm_set_pfn_accessed(pfn);
	kvm_release_pfn_clean(pfn);
out:
	srcu_read_unlock(&kvm->srcu, srcu_idx);
	return err;
}

int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long gpa, bool write)
{
	int ret;

	ret = kvm_map_page(vcpu, gpa, write);
	if (ret)
		return ret;

	/* Invalidate this entry in the TLB */
	kvm_flush_tlb_gpa(vcpu, gpa);

	return 0;
}

void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
}

void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
					const struct kvm_memory_slot *memslot)
{
	kvm_flush_remote_tlbs(kvm);
}