// SPDX-License-Identifier: GPL-2.0-only
/*
 *
 * Copyright 2016 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/debugfs.h>

#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/page.h>
#include <asm/mmu.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/pte-walk.h>
#include <asm/ultravisor.h>
#include <asm/kvm_book3s_uvmem.h>

/*
 * Supported radix tree geometry.
 * Like p9, we support either 5 or 9 bits at the first (lowest) level,
 * for a page size of 64k or 4k.
 */
static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };

unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
					      gva_t eaddr, void *to, void *from,
					      unsigned long n)
{
	int uninitialized_var(old_pid), old_lpid;
	unsigned long quadrant, ret = n;
	bool is_load = !!to;

	/* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */
	if (kvmhv_on_pseries())
		return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr,
					  __pa(to), __pa(from), n);

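	/*
	 * In radix mode the two most-significant EA bits select the
	 * "quadrant", i.e. which translation context the access uses:
	 * quadrant 1 translates with the current LPIDR and PIDR (a guest
	 * process address), quadrant 2 with LPIDR and PID 0 (a guest
	 * kernel address).  Tagging the guest EA with the right quadrant
	 * therefore lets the hypervisor load/store directly through the
	 * guest's own translation.
	 */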
	quadrant = 1;
	if (!pid)
		quadrant = 2;
	if (is_load)
		from = (void *) (eaddr | (quadrant << 62));
	else
		to = (void *) (eaddr | (quadrant << 62));

	preempt_disable();

	/* switch the lpid first to avoid running host with unallocated pid */
	old_lpid = mfspr(SPRN_LPID);
	if (old_lpid != lpid)
		mtspr(SPRN_LPID, lpid);
	if (quadrant == 1) {
		old_pid = mfspr(SPRN_PID);
		if (old_pid != pid)
			mtspr(SPRN_PID, pid);
	}
	isync();

	if (is_load)
		ret = probe_user_read(to, (const void __user *)from, n);
	else
		ret = probe_user_write((void __user *)to, from, n);

	/* switch the pid first to avoid running host with unallocated pid */
	if (quadrant == 1 && pid != old_pid)
		mtspr(SPRN_PID, old_pid);
	if (lpid != old_lpid)
		mtspr(SPRN_LPID, old_lpid);
	isync();

	preempt_enable();

	return ret;
}
EXPORT_SYMBOL_GPL(__kvmhv_copy_tofrom_guest_radix);

static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
					  void *to, void *from, unsigned long n)
{
	int lpid = vcpu->kvm->arch.lpid;
	int pid = vcpu->arch.pid;

	/* This would cause a data segment interrupt so don't allow the access */
	if (eaddr & (0x3FFUL << 52))
		return -EINVAL;

	/* Should we be using the nested lpid? */
	if (vcpu->arch.nested)
		lpid = vcpu->arch.nested->shadow_lpid;

	/* If accessing quadrant 3 then pid is expected to be 0 */
	if (((eaddr >> 62) & 0x3) == 0x3)
		pid = 0;

	eaddr &= ~(0xFFFUL << 52);

	return __kvmhv_copy_tofrom_guest_radix(lpid, pid, eaddr, to, from, n);
}

long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to,
				 unsigned long n)
{
	long ret;

	ret = kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, to, NULL, n);
	if (ret > 0)
		memset(to + (n - ret), 0, ret);

	return ret;
}
EXPORT_SYMBOL_GPL(kvmhv_copy_from_guest_radix);

long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *from,
			       unsigned long n)
{
	return kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, NULL, from, n);
}
EXPORT_SYMBOL_GPL(kvmhv_copy_to_guest_radix);

int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
			       struct kvmppc_pte *gpte, u64 root,
			       u64 *pte_ret_p)
{
	struct kvm *kvm = vcpu->kvm;
	int ret, level, ps;
	unsigned long rts, bits, offset, index;
	u64 pte, base, gpa;
	__be64 rpte;

	rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
		((root & RTS2_MASK) >> RTS2_SHIFT);
	bits = root & RPDS_MASK;
	base = root & RPDB_MASK;

	offset = rts + 31;

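	/*
	 * The 5-bit radix tree size (RTS) field is split across two
	 * non-contiguous fields of the root doubleword; the size of the
	 * translated space is 2^(RTS + 31) bytes, so RTS = 21 gives the
	 * 52-bit space checked for below.
	 */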
	/* Current implementations only support 52-bit space */
	if (offset != 52)
		return -EINVAL;

	/* Walk each level of the radix tree */
	for (level = 3; level >= 0; --level) {
		u64 addr;
		/* Check for a valid size */
		if (level && bits != p9_supported_radix_bits[level])
			return -EINVAL;
		if (level == 0 && !(bits == 5 || bits == 9))
			return -EINVAL;
		offset -= bits;
		index = (eaddr >> offset) & ((1UL << bits) - 1);
		/* Check that low bits of page table base are zero */
		if (base & ((1UL << (bits + 3)) - 1))
			return -EINVAL;
		/* Read the entry from guest memory */
		addr = base + (index * sizeof(rpte));
		ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
		if (ret) {
			if (pte_ret_p)
				*pte_ret_p = addr;
			return ret;
		}
		pte = __be64_to_cpu(rpte);
		if (!(pte & _PAGE_PRESENT))
			return -ENOENT;
		/* Check if a leaf entry */
		if (pte & _PAGE_PTE)
			break;
		/* Get ready to walk the next level */
		base = pte & RPDB_MASK;
		bits = pte & RPDS_MASK;
	}

	/* Need a leaf at lowest level; 512GB pages not supported */
	if (level < 0 || level == 3)
		return -EINVAL;

	/* We found a valid leaf PTE */
	/* Offset is now log base 2 of the page size */
	gpa = pte & 0x01fffffffffff000ul;
	if (gpa & ((1ul << offset) - 1))
		return -EINVAL;
	gpa |= eaddr & ((1ul << offset) - 1);
	for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
		if (offset == mmu_psize_defs[ps].shift)
			break;
	gpte->page_size = ps;
	gpte->page_shift = offset;

	gpte->eaddr = eaddr;
	gpte->raddr = gpa;

	/* Work out permissions */
	gpte->may_read = !!(pte & _PAGE_READ);
	gpte->may_write = !!(pte & _PAGE_WRITE);
	gpte->may_execute = !!(pte & _PAGE_EXEC);

	gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);

	if (pte_ret_p)
		*pte_ret_p = pte;

	return 0;
}

/*
 * Used to walk a partition or process table radix tree in guest memory.
 * Note: We exploit the fact that a partition table and a process
 * table have the same layout, a partition-scoped page table and a
 * process-scoped page table have the same layout, and the 2nd
 * doubleword of a partition table entry has the same layout as
 * the PTCR register.
 */
int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
				     struct kvmppc_pte *gpte, u64 table,
				     int table_index, u64 *pte_ret_p)
{
	struct kvm *kvm = vcpu->kvm;
	int ret;
	unsigned long size, ptbl, root;
	struct prtb_entry entry;

	if ((table & PRTS_MASK) > 24)
		return -EINVAL;
	size = 1ul << ((table & PRTS_MASK) + 12);

	/* Is the table big enough to contain this entry? */
	if ((table_index * sizeof(entry)) >= size)
		return -EINVAL;

	/* Read the table to find the root of the radix tree */
	ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
	ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
	if (ret)
		return ret;

	/* Root is stored in the first double word */
	root = be64_to_cpu(entry.prtb0);

	return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
}

int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
			   struct kvmppc_pte *gpte, bool data, bool iswrite)
{
	u32 pid;
	u64 pte;
	int ret;

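	/*
	 * EA quadrant 0 is a guest user address, translated with the
	 * guest's current PID, and quadrant 3 is a guest kernel address,
	 * translated with PID 0; quadrants 1 and 2 are not usable by a
	 * non-hypervisor guest, hence -EINVAL below.
	 */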
	/* Work out effective PID */
	switch (eaddr >> 62) {
	case 0:
		pid = vcpu->arch.pid;
		break;
	case 3:
		pid = 0;
		break;
	default:
		return -EINVAL;
	}

	ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte,
				vcpu->kvm->arch.process_table, pid, &pte);
	if (ret)
		return ret;

	/* Check privilege (applies only to process scoped translations) */
	if (kvmppc_get_msr(vcpu) & MSR_PR) {
		if (pte & _PAGE_PRIVILEGED) {
			gpte->may_read = 0;
			gpte->may_write = 0;
			gpte->may_execute = 0;
		}
	} else {
		if (!(pte & _PAGE_PRIVILEGED)) {
			/* Check AMR/IAMR to see if strict mode is in force */
			if (vcpu->arch.amr & (1ul << 62))
				gpte->may_read = 0;
			if (vcpu->arch.amr & (1ul << 63))
				gpte->may_write = 0;
			if (vcpu->arch.iamr & (1ul << 62))
				gpte->may_execute = 0;
		}
	}

	return 0;
}

void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
			     unsigned int pshift, unsigned int lpid)
{
	unsigned long psize = PAGE_SIZE;
	int psi;
	long rc;
	unsigned long rb;

	if (pshift)
		psize = 1UL << pshift;
	else
		pshift = PAGE_SHIFT;

	addr &= ~(psize - 1);

	if (!kvmhv_on_pseries()) {
		radix__flush_tlb_lpid_page(lpid, addr, psize);
		return;
	}

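	/*
	 * Running as a nested hypervisor, we cannot execute tlbie
	 * ourselves, so ask the real hypervisor via H_TLB_INVALIDATE.
	 * rb packs the page address with the "actual page size" (AP)
	 * encoding at IBM bit 58, as in the tlbie RB operand.
	 */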
	psi = shift_to_mmu_psize(pshift);
	rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
	rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
				lpid, rb);
	if (rc)
		pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
}

static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid)
{
	long rc;

	if (!kvmhv_on_pseries()) {
		radix__flush_pwc_lpid(lpid);
		return;
	}

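	/*
	 * H_TLBIE_P1_ENC(1, 0, 1) appears to select RIC=1 (invalidate the
	 * page-walk cache only), PRS=0 (partition-scoped) and R=1 (radix),
	 * matching the tlbie the non-nested path would issue.
	 */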
	rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
				lpid, TLBIEL_INVAL_SET_LPID);
	if (rc)
		pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
}

static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
				unsigned long clr, unsigned long set,
				unsigned long addr, unsigned int shift)
{
	return __radix_pte_update(ptep, clr, set);
}

void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
			     pte_t *ptep, pte_t pte)
{
	radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
}

static struct kmem_cache *kvm_pte_cache;
static struct kmem_cache *kvm_pmd_cache;

static pte_t *kvmppc_pte_alloc(void)
{
	return kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
}

static void kvmppc_pte_free(pte_t *ptep)
{
	kmem_cache_free(kvm_pte_cache, ptep);
}

static pmd_t *kvmppc_pmd_alloc(void)
{
	return kmem_cache_alloc(kvm_pmd_cache, GFP_KERNEL);
}

static void kvmppc_pmd_free(pmd_t *pmdp)
{
	kmem_cache_free(kvm_pmd_cache, pmdp);
}

/* Called with kvm->mmu_lock held */
void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
		      unsigned int shift,
		      const struct kvm_memory_slot *memslot,
		      unsigned int lpid)
{
	unsigned long old;
	unsigned long gfn = gpa >> PAGE_SHIFT;
	unsigned long page_size = PAGE_SIZE;
	unsigned long hpa;

	old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
	kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);

	/* The following only applies to L1 entries */
	if (lpid != kvm->arch.lpid)
		return;

	if (!memslot) {
		memslot = gfn_to_memslot(kvm, gfn);
		if (!memslot)
			return;
	}
	if (shift) { /* 1GB or 2MB page */
		page_size = 1ul << shift;
		if (shift == PMD_SHIFT)
			kvm->stat.num_2M_pages--;
		else if (shift == PUD_SHIFT)
			kvm->stat.num_1G_pages--;
	}

	gpa &= ~(page_size - 1);
	hpa = old & PTE_RPN_MASK;
	kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);

	if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
		kvmppc_update_dirty_map(memslot, gfn, page_size);
}

/*
 * kvmppc_free_p?d are used to free existing page tables; they recursively
 * descend, clearing and freeing child tables.
 * Callers are responsible for flushing the PWC.
 *
 * When page tables are being unmapped/freed as part of the page fault path
 * (full == false), ptes are not expected. There is code to unmap them
 * and emit a warning if encountered, but there may already be data
 * corruption due to the unexpected mappings.
 */
static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
				  unsigned int lpid)
{
	if (full) {
		memset(pte, 0, sizeof(long) << RADIX_PTE_INDEX_SIZE);
	} else {
		pte_t *p = pte;
		unsigned long it;

		for (it = 0; it < PTRS_PER_PTE; ++it, ++p) {
			if (pte_val(*p) == 0)
				continue;
			WARN_ON_ONCE(1);
			kvmppc_unmap_pte(kvm, p,
					 pte_pfn(*p) << PAGE_SHIFT,
					 PAGE_SHIFT, NULL, lpid);
		}
	}

	kvmppc_pte_free(pte);
}

static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
				  unsigned int lpid)
{
	unsigned long im;
	pmd_t *p = pmd;

	for (im = 0; im < PTRS_PER_PMD; ++im, ++p) {
		if (!pmd_present(*p))
			continue;
		if (pmd_is_leaf(*p)) {
			if (full) {
				pmd_clear(p);
			} else {
				WARN_ON_ONCE(1);
				kvmppc_unmap_pte(kvm, (pte_t *)p,
					 pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
					 PMD_SHIFT, NULL, lpid);
			}
		} else {
			pte_t *pte;

			pte = pte_offset_map(p, 0);
			kvmppc_unmap_free_pte(kvm, pte, full, lpid);
			pmd_clear(p);
		}
	}
	kvmppc_pmd_free(pmd);
}

static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
				  unsigned int lpid)
{
	unsigned long iu;
	pud_t *p = pud;

	for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++p) {
		if (!pud_present(*p))
			continue;
		if (pud_is_leaf(*p)) {
			pud_clear(p);
		} else {
			pmd_t *pmd;

			pmd = pmd_offset(p, 0);
			kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
			pud_clear(p);
		}
	}
	pud_free(kvm->mm, pud);
}

void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
{
	unsigned long ig;

	for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
		pud_t *pud;

		if (!pgd_present(*pgd))
			continue;
		pud = pud_offset(pgd, 0);
		kvmppc_unmap_free_pud(kvm, pud, lpid);
		pgd_clear(pgd);
	}
}

void kvmppc_free_radix(struct kvm *kvm)
{
	if (kvm->arch.pgtable) {
		kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
					  kvm->arch.lpid);
		pgd_free(kvm->mm, kvm->arch.pgtable);
		kvm->arch.pgtable = NULL;
	}
}

static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
					unsigned long gpa, unsigned int lpid)
{
	pte_t *pte = pte_offset_kernel(pmd, 0);

	/*
	 * Clearing the pmd entry then flushing the PWC ensures that the pte
	 * page can no longer be cached by the MMU, so it can be freed without
	 * flushing the PWC again.
	 */
	pmd_clear(pmd);
	kvmppc_radix_flush_pwc(kvm, lpid);

	kvmppc_unmap_free_pte(kvm, pte, false, lpid);
}

static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
					unsigned long gpa, unsigned int lpid)
{
	pmd_t *pmd = pmd_offset(pud, 0);

	/*
	 * Clearing the pud entry then flushing the PWC ensures that the pmd
	 * page and any child pte pages can no longer be cached by the MMU,
	 * so they can be freed without flushing the PWC again.
	 */
	pud_clear(pud);
	kvmppc_radix_flush_pwc(kvm, lpid);

	kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
}

/*
 * There are a number of bits which may differ between different faults to
 * the same partition scope entry. The RC bits change in the course of
 * cleaning and aging, and the write bit can change: either the access
 * could have been upgraded, or a read fault could happen concurrently
 * with a write fault that sets those bits first.
 */
#define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))

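/*
 * Insert a leaf into the partition-scoped radix tree for @gpa.  @level
 * selects where the leaf goes: 0 inserts a PAGE_SIZE pte, 1 a 2MB leaf
 * at the pmd level, 2 a 1GB leaf at the pud level.  New table pages are
 * allocated before taking the mmu_lock; if an MMU notifier invalidation
 * ran in the meantime, -EAGAIN is returned so the caller can let the
 * guest retry the faulting access.
 */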
int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
		      unsigned long gpa, unsigned int level,
		      unsigned long mmu_seq, unsigned int lpid,
		      unsigned long *rmapp, struct rmap_nested **n_rmap)
{
	pgd_t *pgd;
	pud_t *pud, *new_pud = NULL;
	pmd_t *pmd, *new_pmd = NULL;
	pte_t *ptep, *new_ptep = NULL;
	int ret;

	/* Traverse the guest's 2nd-level tree, allocate new levels needed */
	pgd = pgtable + pgd_index(gpa);
	pud = NULL;
	if (pgd_present(*pgd))
		pud = pud_offset(pgd, gpa);
	else
		new_pud = pud_alloc_one(kvm->mm, gpa);

	pmd = NULL;
	if (pud && pud_present(*pud) && !pud_is_leaf(*pud))
		pmd = pmd_offset(pud, gpa);
	else if (level <= 1)
		new_pmd = kvmppc_pmd_alloc();

	if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd)))
		new_ptep = kvmppc_pte_alloc();

	/* Check if we might have been invalidated; let the guest retry if so */
	spin_lock(&kvm->mmu_lock);
	ret = -EAGAIN;
	if (mmu_notifier_retry(kvm, mmu_seq))
		goto out_unlock;

	/* Now traverse again under the lock and change the tree */
	ret = -ENOMEM;
	if (pgd_none(*pgd)) {
		if (!new_pud)
			goto out_unlock;
		pgd_populate(kvm->mm, pgd, new_pud);
		new_pud = NULL;
	}
	pud = pud_offset(pgd, gpa);
	if (pud_is_leaf(*pud)) {
		unsigned long hgpa = gpa & PUD_MASK;

		/* Check if we raced and someone else has set the same thing */
		if (level == 2) {
			if (pud_raw(*pud) == pte_raw(pte)) {
				ret = 0;
				goto out_unlock;
			}
			/* Valid 1GB page here already, add our extra bits */
			WARN_ON_ONCE((pud_val(*pud) ^ pte_val(pte)) &
							PTE_BITS_MUST_MATCH);
			kvmppc_radix_update_pte(kvm, (pte_t *)pud,
					      0, pte_val(pte), hgpa, PUD_SHIFT);
			ret = 0;
			goto out_unlock;
		}
		/*
		 * If we raced with another CPU which has just put
		 * a 1GB pte in after we saw a pmd page, try again.
		 */
		if (!new_pmd) {
			ret = -EAGAIN;
			goto out_unlock;
		}
		/* Valid 1GB page here already, remove it */
		kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
				 lpid);
	}
	if (level == 2) {
		if (!pud_none(*pud)) {
			/*
			 * There's a page table page here, but we wanted to
			 * install a large page, so remove and free the page
			 * table page.
			 */
			kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
		}
		kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
		if (rmapp && n_rmap)
			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
		ret = 0;
		goto out_unlock;
	}
	if (pud_none(*pud)) {
		if (!new_pmd)
			goto out_unlock;
		pud_populate(kvm->mm, pud, new_pmd);
		new_pmd = NULL;
	}
	pmd = pmd_offset(pud, gpa);
	if (pmd_is_leaf(*pmd)) {
		unsigned long lgpa = gpa & PMD_MASK;

		/* Check if we raced and someone else has set the same thing */
		if (level == 1) {
			if (pmd_raw(*pmd) == pte_raw(pte)) {
				ret = 0;
				goto out_unlock;
			}
			/* Valid 2MB page here already, add our extra bits */
			WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
							PTE_BITS_MUST_MATCH);
			kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
					0, pte_val(pte), lgpa, PMD_SHIFT);
			ret = 0;
			goto out_unlock;
		}

		/*
		 * If we raced with another CPU which has just put
		 * a 2MB pte in after we saw a pte page, try again.
		 */
		if (!new_ptep) {
			ret = -EAGAIN;
			goto out_unlock;
		}
		/* Valid 2MB page here already, remove it */
		kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
				 lpid);
	}
	if (level == 1) {
		if (!pmd_none(*pmd)) {
			/*
			 * There's a page table page here, but we wanted to
			 * install a large page, so remove and free the page
			 * table page.
			 */
			kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
		}
		kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
		if (rmapp && n_rmap)
			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
		ret = 0;
		goto out_unlock;
	}
	if (pmd_none(*pmd)) {
		if (!new_ptep)
			goto out_unlock;
		pmd_populate(kvm->mm, pmd, new_ptep);
		new_ptep = NULL;
	}
	ptep = pte_offset_kernel(pmd, gpa);
	if (pte_present(*ptep)) {
		/* Check if someone else set the same thing */
		if (pte_raw(*ptep) == pte_raw(pte)) {
			ret = 0;
			goto out_unlock;
		}
		/* Valid page here already, add our extra bits */
		WARN_ON_ONCE((pte_val(*ptep) ^ pte_val(pte)) &
							PTE_BITS_MUST_MATCH);
		kvmppc_radix_update_pte(kvm, ptep, 0, pte_val(pte), gpa, 0);
		ret = 0;
		goto out_unlock;
	}
	kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
	if (rmapp && n_rmap)
		kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
	ret = 0;

 out_unlock:
	spin_unlock(&kvm->mmu_lock);
	if (new_pud)
		pud_free(kvm->mm, new_pud);
	if (new_pmd)
		kvmppc_pmd_free(new_pmd);
	if (new_ptep)
		kvmppc_pte_free(new_ptep);
	return ret;
}

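/*
 * Set the accessed (R) and, for a store, dirty (C) bit in a
 * partition-scoped PTE, doing just what the hardware RC update would do.
 * Returns false if there is no suitable PTE, in which case the fault has
 * to be resolved by the full page fault path.  The pgtable and lpid
 * arguments allow this to be applied to a nested guest's shadow tree as
 * well as to the L1 tree.
 */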
bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, bool writing,
			     unsigned long gpa, unsigned int lpid)
{
	unsigned long pgflags;
	unsigned int shift;
	pte_t *ptep;

	/*
	 * Need to set an R or C bit in the 2nd-level tables;
	 * since we are just helping out the hardware here,
	 * it is sufficient to do what the hardware does.
	 */
	pgflags = _PAGE_ACCESSED;
	if (writing)
		pgflags |= _PAGE_DIRTY;
	/*
	 * We are walking the secondary (partition-scoped) page table here.
	 * We can do this without disabling irq because the Linux MM
	 * subsystem doesn't do THP splits and collapses on this tree.
	 */
	ptep = __find_linux_pte(pgtable, gpa, NULL, &shift);
	if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
		kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
		return true;
	}
	return false;
}

int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
				   unsigned long gpa,
				   struct kvm_memory_slot *memslot,
				   bool writing, bool kvm_ro,
				   pte_t *inserted_pte, unsigned int *levelp)
{
	struct kvm *kvm = vcpu->kvm;
	struct page *page = NULL;
	unsigned long mmu_seq;
	unsigned long hva, gfn = gpa >> PAGE_SHIFT;
	bool upgrade_write = false;
	bool *upgrade_p = &upgrade_write;
	pte_t pte, *ptep;
	unsigned int shift, level;
	int ret;
	bool large_enable;

	/* used to check for invalidations in progress */
	mmu_seq = kvm->mmu_notifier_seq;
	smp_rmb();

	/*
	 * Do a fast check first, since __gfn_to_pfn_memslot doesn't
	 * do it with !atomic && !async, which is how we call it.
	 * We always ask for write permission since the common case
	 * is that the page is writable.
	 */
	hva = gfn_to_hva_memslot(memslot, gfn);
	if (!kvm_ro && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
		upgrade_write = true;
	} else {
		unsigned long pfn;

		/* Call KVM generic code to do the slow-path check */
		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
					   writing, upgrade_p);
		if (is_error_noslot_pfn(pfn))
			return -EFAULT;
		page = NULL;
		if (pfn_valid(pfn)) {
			page = pfn_to_page(pfn);
			if (PageReserved(page))
				page = NULL;
		}
	}

	/*
	 * Read the PTE from the process' radix tree and use that
	 * so we get the shift and attribute bits.
	 */
	local_irq_disable();
	ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
	pte = __pte(0);
	if (ptep)
		pte = *ptep;
	local_irq_enable();
	/*
	 * If the PTE disappeared temporarily due to a THP
	 * collapse, just return and let the guest try again.
	 */
	if (!pte_present(pte)) {
		if (page)
			put_page(page);
		return RESUME_GUEST;
	}

	/* If we're logging dirty pages, always map single pages */
	large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES);

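	/*
	 * A large host page can back a large guest mapping only if gpa
	 * and hva are congruent modulo the large page size, i.e. they lie
	 * at the same offset within a 2MB or 1GB region; that is what the
	 * mask comparisons below check.
	 */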
	/* Get pte level from shift/size */
	if (large_enable && shift == PUD_SHIFT &&
	    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
	    (hva & (PUD_SIZE - PAGE_SIZE))) {
		level = 2;
	} else if (large_enable && shift == PMD_SHIFT &&
		   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
		   (hva & (PMD_SIZE - PAGE_SIZE))) {
		level = 1;
	} else {
		level = 0;
		if (shift > PAGE_SHIFT) {
			/*
			 * If the pte maps more than one page, bring over
			 * bits from the virtual address to get the real
			 * address of the specific single page we want.
			 */
			unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
			pte = __pte(pte_val(pte) | (hva & rpnmask));
		}
	}

	pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
	if (writing || upgrade_write) {
		if (pte_val(pte) & _PAGE_WRITE)
			pte = __pte(pte_val(pte) | _PAGE_DIRTY);
	} else {
		pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
	}

	/* Allocate space in the tree and write the PTE */
	ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
				mmu_seq, kvm->arch.lpid, NULL, NULL);
	if (inserted_pte)
		*inserted_pte = pte;
	if (levelp)
		*levelp = level;

	if (page) {
		if (!ret && (pte_val(pte) & _PAGE_WRITE))
			set_page_dirty_lock(page);
		put_page(page);
	}

	/* Increment number of large pages if we (successfully) inserted one */
	if (!ret) {
		if (level == 1)
			kvm->stat.num_2M_pages++;
		else if (level == 2)
			kvm->stat.num_1G_pages++;
	}

	return ret;
}

int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
				   unsigned long ea, unsigned long dsisr)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long gpa, gfn;
	struct kvm_memory_slot *memslot;
	long ret;
	bool writing = !!(dsisr & DSISR_ISSTORE);
	bool kvm_ro = false;

	/* Check for unusual errors */
	if (dsisr & DSISR_UNSUPP_MMU) {
		pr_err("KVM: Got unsupported MMU fault\n");
		return -EFAULT;
	}
	if (dsisr & DSISR_BADACCESS) {
		/* Reflect to the guest as DSI */
		pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
		kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
		return RESUME_GUEST;
	}

	/* Translate the logical address */
	gpa = vcpu->arch.fault_gpa & ~0xfffUL;
	gpa &= ~0xF000000000000000ul;
	gfn = gpa >> PAGE_SHIFT;
	if (!(dsisr & DSISR_PRTABLE_FAULT))
		gpa |= ea & 0xfff;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return kvmppc_send_page_to_uv(kvm, gfn);

	/* Get the corresponding memslot */
	memslot = gfn_to_memslot(kvm, gfn);

	/* No memslot means it's an emulated MMIO region */
	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
		if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
			     DSISR_SET_RC)) {
			/*
			 * Bad address in guest page table tree, or other
			 * unusual error - reflect it to the guest as DSI.
			 */
			kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
			return RESUME_GUEST;
		}
		return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, writing);
	}

	if (memslot->flags & KVM_MEM_READONLY) {
		if (writing) {
			/* give the guest a DSI */
			kvmppc_core_queue_data_storage(vcpu, ea, DSISR_ISSTORE |
						       DSISR_PROTFAULT);
			return RESUME_GUEST;
		}
		kvm_ro = true;
	}

	/* Failed to set the reference/change bits */
	if (dsisr & DSISR_SET_RC) {
		spin_lock(&kvm->mmu_lock);
		if (kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable,
					    writing, gpa, kvm->arch.lpid))
			dsisr &= ~DSISR_SET_RC;
		spin_unlock(&kvm->mmu_lock);

		if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
			       DSISR_PROTFAULT | DSISR_SET_RC)))
			return RESUME_GUEST;
	}

	/* Try to insert a pte */
	ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing,
					     kvm_ro, NULL, NULL);

	if (ret == 0 || ret == -EAGAIN)
		ret = RESUME_GUEST;
	return ret;
}

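/*
 * The routines below implement the radix side of the MMU notifier and
 * dirty-tracking hooks used by the book3s HV code; as noted on each,
 * they run with kvm->mmu_lock held.
 */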
/* Called with kvm->mmu_lock held */
int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
		    unsigned long gfn)
{
	pte_t *ptep;
	unsigned long gpa = gfn << PAGE_SHIFT;
	unsigned int shift;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) {
		uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT);
		return 0;
	}

	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
	if (ptep && pte_present(*ptep))
		kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
				 kvm->arch.lpid);
	return 0;
}

/* Called with kvm->mmu_lock held */
int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
		  unsigned long gfn)
{
	pte_t *ptep;
	unsigned long gpa = gfn << PAGE_SHIFT;
	unsigned int shift;
	int ref = 0;
	unsigned long old, *rmapp;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return ref;

	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
	if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
					      gpa, shift);
		/* XXX need to flush tlb here? */
		/* Also clear bit in ptes in shadow pgtable for nested guests */
		rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
		kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0,
					       old & PTE_RPN_MASK,
					       1UL << shift);
		ref = 1;
	}
	return ref;
}

/* Called with kvm->mmu_lock held */
int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
		       unsigned long gfn)
{
	pte_t *ptep;
	unsigned long gpa = gfn << PAGE_SHIFT;
	unsigned int shift;
	int ref = 0;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return ref;

	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
	if (ptep && pte_present(*ptep) && pte_young(*ptep))
		ref = 1;
	return ref;
}

/* Returns the number of PAGE_SIZE pages that are dirty */
static int kvm_radix_test_clear_dirty(struct kvm *kvm,
				struct kvm_memory_slot *memslot, int pagenum)
{
	unsigned long gfn = memslot->base_gfn + pagenum;
	unsigned long gpa = gfn << PAGE_SHIFT;
	pte_t *ptep;
	unsigned int shift;
	int ret = 0;
	unsigned long old, *rmapp;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return ret;

	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
	if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) {
		ret = 1;
		if (shift)
			ret = 1 << (shift - PAGE_SHIFT);
		spin_lock(&kvm->mmu_lock);
		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
					      gpa, shift);
		kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
		/* Also clear bit in ptes in shadow pgtable for nested guests */
		rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
		kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_DIRTY, 0,
					       old & PTE_RPN_MASK,
					       1UL << shift);
		spin_unlock(&kvm->mmu_lock);
	}
	return ret;
}

long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
			struct kvm_memory_slot *memslot, unsigned long *map)
{
	unsigned long i, j;
	int npages;

	for (i = 0; i < memslot->npages; i = j) {
		npages = kvm_radix_test_clear_dirty(kvm, memslot, i);

		/*
		 * Note that if npages > 0 then i must be a multiple of npages,
		 * since huge pages are only used to back the guest at guest
		 * real addresses that are a multiple of their size.
		 * Since we have at most one PTE covering any given guest
		 * real address, if npages > 1 we can skip to i + npages.
		 */
		j = i + 1;
		if (npages) {
			set_dirty_bits(map, i, npages);
			j = i + npages;
		}
	}
	return 0;
}

void kvmppc_radix_flush_memslot(struct kvm *kvm,
				const struct kvm_memory_slot *memslot)
{
	unsigned long n;
	pte_t *ptep;
	unsigned long gpa;
	unsigned int shift;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)
		kvmppc_uvmem_drop_pages(memslot, kvm, true);

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return;

	gpa = memslot->base_gfn << PAGE_SHIFT;
	spin_lock(&kvm->mmu_lock);
	for (n = memslot->npages; n; --n) {
		ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
		if (ptep && pte_present(*ptep))
			kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
					 kvm->arch.lpid);
		gpa += PAGE_SIZE;
	}
	spin_unlock(&kvm->mmu_lock);
}

static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
				 int psize, int *indexp)
{
	if (!mmu_psize_defs[psize].shift)
		return;
	info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift |
		(mmu_psize_defs[psize].ap << 29);
	++(*indexp);
}

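/*
 * Report the supported radix geometries and page-size encodings to
 * userspace (this backs the KVM_PPC_GET_RMMU_INFO ioctl).  Each
 * ap_encodings entry, as built by add_rmmu_ap_encoding() above, packs
 * the page shift in the low bits with the 3-bit "actual page size" (AP)
 * field at bits 29-31.
 */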
int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info)
{
	int i;

	if (!radix_enabled())
		return -EINVAL;
	memset(info, 0, sizeof(*info));

	/* 4k page size */
	info->geometries[0].page_shift = 12;
	info->geometries[0].level_bits[0] = 9;
	for (i = 1; i < 4; ++i)
		info->geometries[0].level_bits[i] = p9_supported_radix_bits[i];
	/* 64k page size */
	info->geometries[1].page_shift = 16;
	for (i = 0; i < 4; ++i)
		info->geometries[1].level_bits[i] = p9_supported_radix_bits[i];

	i = 0;
	add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i);
	add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i);
	add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i);
	add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i);

	return 0;
}

int kvmppc_init_vm_radix(struct kvm *kvm)
{
	kvm->arch.pgtable = pgd_alloc(kvm->mm);
	if (!kvm->arch.pgtable)
		return -ENOMEM;
	return 0;
}

static void pte_ctor(void *addr)
{
	memset(addr, 0, RADIX_PTE_TABLE_SIZE);
}

static void pmd_ctor(void *addr)
{
	memset(addr, 0, RADIX_PMD_TABLE_SIZE);
}

struct debugfs_radix_state {
	struct kvm	*kvm;
	struct mutex	mutex;
	unsigned long	gpa;
	int		lpid;
	int		chars_left;
	int		buf_index;
	char		buf[128];
	u8		hdr;
};

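/*
 * The "radix" debugfs file dumps the partition-scoped trees: first the
 * L1 guest's tree, then each nested guest's shadow tree prefixed with
 * "Nested LPID <n>:".  Each leaf is printed as one line of the form
 * "<gpa>: <pte> <shift>", as produced by the read routine below.
 */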
static int debugfs_radix_open(struct inode *inode, struct file *file)
{
	struct kvm *kvm = inode->i_private;
	struct debugfs_radix_state *p;

	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	kvm_get_kvm(kvm);
	p->kvm = kvm;
	mutex_init(&p->mutex);
	file->private_data = p;

	return nonseekable_open(inode, file);
}

static int debugfs_radix_release(struct inode *inode, struct file *file)
{
	struct debugfs_radix_state *p = file->private_data;

	kvm_put_kvm(p->kvm);
	kfree(p);
	return 0;
}

static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
				  size_t len, loff_t *ppos)
{
	struct debugfs_radix_state *p = file->private_data;
	ssize_t ret, r;
	unsigned long n;
	struct kvm *kvm;
	unsigned long gpa;
	pgd_t *pgt;
	struct kvm_nested_guest *nested;
	pgd_t pgd, *pgdp;
	pud_t pud, *pudp;
	pmd_t pmd, *pmdp;
	pte_t *ptep;
	int shift;
	unsigned long pte;

	kvm = p->kvm;
	if (!kvm_is_radix(kvm))
		return 0;

	ret = mutex_lock_interruptible(&p->mutex);
	if (ret)
		return ret;

	if (p->chars_left) {
		n = p->chars_left;
		if (n > len)
			n = len;
		r = copy_to_user(buf, p->buf + p->buf_index, n);
		n -= r;
		p->chars_left -= n;
		p->buf_index += n;
		buf += n;
		len -= n;
		ret = n;
		if (r) {
			if (!n)
				ret = -EFAULT;
			goto out;
		}
	}

	gpa = p->gpa;
	nested = NULL;
	pgt = NULL;
	while (len != 0 && p->lpid >= 0) {
		if (gpa >= RADIX_PGTABLE_RANGE) {
			gpa = 0;
			pgt = NULL;
			if (nested) {
				kvmhv_put_nested(nested);
				nested = NULL;
			}
			p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
			p->hdr = 0;
			if (p->lpid < 0)
				break;
		}
		if (!pgt) {
			if (p->lpid == 0) {
				pgt = kvm->arch.pgtable;
			} else {
				nested = kvmhv_get_nested(kvm, p->lpid, false);
				if (!nested) {
					gpa = RADIX_PGTABLE_RANGE;
					continue;
				}
				pgt = nested->shadow_pgtable;
			}
		}
		n = 0;
		if (!p->hdr) {
			if (p->lpid > 0)
				n = scnprintf(p->buf, sizeof(p->buf),
					      "\nNested LPID %d: ", p->lpid);
			n += scnprintf(p->buf + n, sizeof(p->buf) - n,
				       "pgdir: %lx\n", (unsigned long)pgt);
			p->hdr = 1;
			goto copy;
		}

		pgdp = pgt + pgd_index(gpa);
		pgd = READ_ONCE(*pgdp);
		if (!(pgd_val(pgd) & _PAGE_PRESENT)) {
			gpa = (gpa & PGDIR_MASK) + PGDIR_SIZE;
			continue;
		}

		pudp = pud_offset(&pgd, gpa);
		pud = READ_ONCE(*pudp);
		if (!(pud_val(pud) & _PAGE_PRESENT)) {
			gpa = (gpa & PUD_MASK) + PUD_SIZE;
			continue;
		}
		if (pud_val(pud) & _PAGE_PTE) {
			pte = pud_val(pud);
			shift = PUD_SHIFT;
			goto leaf;
		}

		pmdp = pmd_offset(&pud, gpa);
		pmd = READ_ONCE(*pmdp);
		if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
			gpa = (gpa & PMD_MASK) + PMD_SIZE;
			continue;
		}
		if (pmd_val(pmd) & _PAGE_PTE) {
			pte = pmd_val(pmd);
			shift = PMD_SHIFT;
			goto leaf;
		}

		ptep = pte_offset_kernel(&pmd, gpa);
		pte = pte_val(READ_ONCE(*ptep));
		if (!(pte & _PAGE_PRESENT)) {
			gpa += PAGE_SIZE;
			continue;
		}
		shift = PAGE_SHIFT;
	leaf:
		n = scnprintf(p->buf, sizeof(p->buf),
			      " %lx: %lx %d\n", gpa, pte, shift);
		gpa += 1ul << shift;
	copy:
		p->chars_left = n;
		if (n > len)
			n = len;
		r = copy_to_user(buf, p->buf, n);
		n -= r;
		p->chars_left -= n;
		p->buf_index = n;
		buf += n;
		len -= n;
		ret += n;
		if (r) {
			if (!ret)
				ret = -EFAULT;
			break;
		}
	}
	p->gpa = gpa;
	if (nested)
		kvmhv_put_nested(nested);

 out:
	mutex_unlock(&p->mutex);
	return ret;
}

static ssize_t debugfs_radix_write(struct file *file, const char __user *buf,
				   size_t len, loff_t *ppos)
{
	return -EACCES;
}

static const struct file_operations debugfs_radix_fops = {
	.owner	 = THIS_MODULE,
	.open	 = debugfs_radix_open,
	.release = debugfs_radix_release,
	.read	 = debugfs_radix_read,
	.write	 = debugfs_radix_write,
	.llseek	 = generic_file_llseek,
};

void kvmhv_radix_debugfs_init(struct kvm *kvm)
{
	debugfs_create_file("radix", 0400, kvm->arch.debugfs_dir, kvm,
			    &debugfs_radix_fops);
}

int kvmppc_radix_init(void)
{
	unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;

	kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
	if (!kvm_pte_cache)
		return -ENOMEM;

	size = sizeof(void *) << RADIX_PMD_INDEX_SIZE;

	kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor);
	if (!kvm_pmd_cache) {
		kmem_cache_destroy(kvm_pte_cache);
		return -ENOMEM;
	}

	return 0;
}

void kvmppc_radix_exit(void)
{
	kmem_cache_destroy(kvm_pte_cache);
	kmem_cache_destroy(kvm_pmd_cache);
}