// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables kernel and guest-mode vCPU access to guest physical
 * memory with suitable invalidation mechanisms.
 *
 * Copyright © 2021 Amazon.com, Inc. or its affiliates.
 *
 *   David Woodhouse <dwmw2@infradead.org>
 */

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/errno.h>

23 * MMU notifier 'invalidate_range_start' hook.
25 void gfn_to_pfn_cache_invalidate_start(struct kvm
*kvm
, unsigned long start
,
26 unsigned long end
, bool may_block
)
28 struct gfn_to_pfn_cache
*gpc
;
30 spin_lock(&kvm
->gpc_lock
);
31 list_for_each_entry(gpc
, &kvm
->gpc_list
, list
) {
32 read_lock_irq(&gpc
->lock
);
34 /* Only a single page so no need to care about length */
35 if (gpc
->valid
&& !is_error_noslot_pfn(gpc
->pfn
) &&
36 gpc
->uhva
>= start
&& gpc
->uhva
< end
) {
37 read_unlock_irq(&gpc
->lock
);
40 * There is a small window here where the cache could
41 * be modified, and invalidation would no longer be
42 * necessary. Hence check again whether invalidation
43 * is still necessary once the write lock has been
47 write_lock_irq(&gpc
->lock
);
48 if (gpc
->valid
&& !is_error_noslot_pfn(gpc
->pfn
) &&
49 gpc
->uhva
>= start
&& gpc
->uhva
< end
)
51 write_unlock_irq(&gpc
->lock
);
55 read_unlock_irq(&gpc
->lock
);
57 spin_unlock(&kvm
->gpc_lock
);
60 static bool kvm_gpc_is_valid_len(gpa_t gpa
, unsigned long uhva
,
63 unsigned long offset
= kvm_is_error_gpa(gpa
) ? offset_in_page(uhva
) :
67 * The cached access must fit within a single page. The 'len' argument
68 * to activate() and refresh() exists only to enforce that.
70 return offset
+ len
<= PAGE_SIZE
;
73 bool kvm_gpc_check(struct gfn_to_pfn_cache
*gpc
, unsigned long len
)
75 struct kvm_memslots
*slots
= kvm_memslots(gpc
->kvm
);
81 * If the page was cached from a memslot, make sure the memslots have
82 * not been re-configured.
84 if (!kvm_is_error_gpa(gpc
->gpa
) && gpc
->generation
!= slots
->generation
)
87 if (kvm_is_error_hva(gpc
->uhva
))
90 if (!kvm_gpc_is_valid_len(gpc
->gpa
, gpc
->uhva
, len
))
99 static void *gpc_map(kvm_pfn_t pfn
)
102 return kmap(pfn_to_page(pfn
));
104 #ifdef CONFIG_HAS_IOMEM
105 return memremap(pfn_to_hpa(pfn
), PAGE_SIZE
, MEMREMAP_WB
);
111 static void gpc_unmap(kvm_pfn_t pfn
, void *khva
)
113 /* Unmap the old pfn/page if it was mapped before. */
114 if (is_error_noslot_pfn(pfn
) || !khva
)
117 if (pfn_valid(pfn
)) {
118 kunmap(pfn_to_page(pfn
));
122 #ifdef CONFIG_HAS_IOMEM
127 static inline bool mmu_notifier_retry_cache(struct kvm
*kvm
, unsigned long mmu_seq
)
130 * mn_active_invalidate_count acts for all intents and purposes
131 * like mmu_invalidate_in_progress here; but the latter cannot
132 * be used here because the invalidation of caches in the
133 * mmu_notifier event occurs _before_ mmu_invalidate_in_progress
136 * Note, it does not matter that mn_active_invalidate_count
137 * is not protected by gpc->lock. It is guaranteed to
138 * be elevated before the mmu_notifier acquires gpc->lock, and
139 * isn't dropped until after mmu_invalidate_seq is updated.
141 if (kvm
->mn_active_invalidate_count
)
145 * Ensure mn_active_invalidate_count is read before
146 * mmu_invalidate_seq. This pairs with the smp_wmb() in
147 * mmu_notifier_invalidate_range_end() to guarantee either the
148 * old (non-zero) value of mn_active_invalidate_count or the
149 * new (incremented) value of mmu_invalidate_seq is observed.
152 return kvm
->mmu_invalidate_seq
!= mmu_seq
;
155 static kvm_pfn_t
hva_to_pfn_retry(struct gfn_to_pfn_cache
*gpc
)
157 /* Note, the new page offset may be different than the old! */
158 void *old_khva
= (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc
->khva
);
159 kvm_pfn_t new_pfn
= KVM_PFN_ERR_FAULT
;
160 void *new_khva
= NULL
;
161 unsigned long mmu_seq
;
163 lockdep_assert_held(&gpc
->refresh_lock
);
165 lockdep_assert_held_write(&gpc
->lock
);
168 * Invalidate the cache prior to dropping gpc->lock, the gpa=>uhva
169 * assets have already been updated and so a concurrent check() from a
170 * different task may not fail the gpa/uhva/generation checks.
175 mmu_seq
= gpc
->kvm
->mmu_invalidate_seq
;
178 write_unlock_irq(&gpc
->lock
);
181 * If the previous iteration "failed" due to an mmu_notifier
182 * event, release the pfn and unmap the kernel virtual address
183 * from the previous attempt. Unmapping might sleep, so this
184 * needs to be done after dropping the lock. Opportunistically
185 * check for resched while the lock isn't held.
187 if (new_pfn
!= KVM_PFN_ERR_FAULT
) {
189 * Keep the mapping if the previous iteration reused
190 * the existing mapping and didn't create a new one.
192 if (new_khva
!= old_khva
)
193 gpc_unmap(new_pfn
, new_khva
);
195 kvm_release_pfn_clean(new_pfn
);
200 /* We always request a writeable mapping */
201 new_pfn
= hva_to_pfn(gpc
->uhva
, false, false, NULL
, true, NULL
);
202 if (is_error_noslot_pfn(new_pfn
))
206 * Obtain a new kernel mapping if KVM itself will access the
207 * pfn. Note, kmap() and memremap() can both sleep, so this
208 * too must be done outside of gpc->lock!
210 if (new_pfn
== gpc
->pfn
)
213 new_khva
= gpc_map(new_pfn
);
216 kvm_release_pfn_clean(new_pfn
);
220 write_lock_irq(&gpc
->lock
);
223 * Other tasks must wait for _this_ refresh to complete before
224 * attempting to refresh.
226 WARN_ON_ONCE(gpc
->valid
);
227 } while (mmu_notifier_retry_cache(gpc
->kvm
, mmu_seq
));
231 gpc
->khva
= new_khva
+ offset_in_page(gpc
->uhva
);
234 * Put the reference to the _new_ pfn. The pfn is now tracked by the
235 * cache and can be safely migrated, swapped, etc... as the cache will
236 * invalidate any mappings in response to relevant mmu_notifier events.
238 kvm_release_pfn_clean(new_pfn
);
243 write_lock_irq(&gpc
->lock
);
248 static int __kvm_gpc_refresh(struct gfn_to_pfn_cache
*gpc
, gpa_t gpa
, unsigned long uhva
,
251 unsigned long page_offset
;
252 bool unmap_old
= false;
253 unsigned long old_uhva
;
255 bool hva_change
= false;
259 /* Either gpa or uhva must be valid, but not both */
260 if (WARN_ON_ONCE(kvm_is_error_gpa(gpa
) == kvm_is_error_hva(uhva
)))
263 if (!kvm_gpc_is_valid_len(gpa
, uhva
, len
))
266 lockdep_assert_held(&gpc
->refresh_lock
);
268 write_lock_irq(&gpc
->lock
);
276 old_khva
= (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc
->khva
);
277 old_uhva
= PAGE_ALIGN_DOWN(gpc
->uhva
);
279 if (kvm_is_error_gpa(gpa
)) {
280 page_offset
= offset_in_page(uhva
);
282 gpc
->gpa
= INVALID_GPA
;
284 gpc
->uhva
= PAGE_ALIGN_DOWN(uhva
);
286 if (gpc
->uhva
!= old_uhva
)
289 struct kvm_memslots
*slots
= kvm_memslots(gpc
->kvm
);
291 page_offset
= offset_in_page(gpa
);
293 if (gpc
->gpa
!= gpa
|| gpc
->generation
!= slots
->generation
||
294 kvm_is_error_hva(gpc
->uhva
)) {
295 gfn_t gfn
= gpa_to_gfn(gpa
);
298 gpc
->generation
= slots
->generation
;
299 gpc
->memslot
= __gfn_to_memslot(slots
, gfn
);
300 gpc
->uhva
= gfn_to_hva_memslot(gpc
->memslot
, gfn
);
302 if (kvm_is_error_hva(gpc
->uhva
)) {
308 * Even if the GPA and/or the memslot generation changed, the
309 * HVA may still be the same.
311 if (gpc
->uhva
!= old_uhva
)
314 gpc
->uhva
= old_uhva
;
318 /* Note: the offset must be correct before calling hva_to_pfn_retry() */
319 gpc
->uhva
+= page_offset
;
322 * If the userspace HVA changed or the PFN was already invalid,
323 * drop the lock and do the HVA to PFN lookup again.
325 if (!gpc
->valid
|| hva_change
) {
326 ret
= hva_to_pfn_retry(gpc
);
329 * If the HVA→PFN mapping was already valid, don't unmap it.
330 * But do update gpc->khva because the offset within the page
333 gpc
->khva
= old_khva
+ page_offset
;
340 * Invalidate the cache and purge the pfn/khva if the refresh failed.
341 * Some/all of the uhva, gpa, and memslot generation info may still be
342 * valid, leave it as is.
346 gpc
->pfn
= KVM_PFN_ERR_FAULT
;
350 /* Detect a pfn change before dropping the lock! */
351 unmap_old
= (old_pfn
!= gpc
->pfn
);
354 write_unlock_irq(&gpc
->lock
);
357 gpc_unmap(old_pfn
, old_khva
);
362 int kvm_gpc_refresh(struct gfn_to_pfn_cache
*gpc
, unsigned long len
)
366 guard(mutex
)(&gpc
->refresh_lock
);
369 * If the GPA is valid then ignore the HVA, as a cache can be GPA-based
370 * or HVA-based, not both. For GPA-based caches, the HVA will be
371 * recomputed during refresh if necessary.
373 uhva
= kvm_is_error_gpa(gpc
->gpa
) ? gpc
->uhva
: KVM_HVA_ERR_BAD
;
375 return __kvm_gpc_refresh(gpc
, gpc
->gpa
, uhva
, len
);
378 void kvm_gpc_init(struct gfn_to_pfn_cache
*gpc
, struct kvm
*kvm
)
380 rwlock_init(&gpc
->lock
);
381 mutex_init(&gpc
->refresh_lock
);
384 gpc
->pfn
= KVM_PFN_ERR_FAULT
;
385 gpc
->gpa
= INVALID_GPA
;
386 gpc
->uhva
= KVM_HVA_ERR_BAD
;
387 gpc
->active
= gpc
->valid
= false;
390 static int __kvm_gpc_activate(struct gfn_to_pfn_cache
*gpc
, gpa_t gpa
, unsigned long uhva
,
393 struct kvm
*kvm
= gpc
->kvm
;
395 guard(mutex
)(&gpc
->refresh_lock
);
398 if (KVM_BUG_ON(gpc
->valid
, kvm
))
401 spin_lock(&kvm
->gpc_lock
);
402 list_add(&gpc
->list
, &kvm
->gpc_list
);
403 spin_unlock(&kvm
->gpc_lock
);
406 * Activate the cache after adding it to the list, a concurrent
407 * refresh must not establish a mapping until the cache is
408 * reachable by mmu_notifier events.
410 write_lock_irq(&gpc
->lock
);
412 write_unlock_irq(&gpc
->lock
);
414 return __kvm_gpc_refresh(gpc
, gpa
, uhva
, len
);
417 int kvm_gpc_activate(struct gfn_to_pfn_cache
*gpc
, gpa_t gpa
, unsigned long len
)
419 return __kvm_gpc_activate(gpc
, gpa
, KVM_HVA_ERR_BAD
, len
);
422 int kvm_gpc_activate_hva(struct gfn_to_pfn_cache
*gpc
, unsigned long uhva
, unsigned long len
)
424 return __kvm_gpc_activate(gpc
, INVALID_GPA
, uhva
, len
);
427 void kvm_gpc_deactivate(struct gfn_to_pfn_cache
*gpc
)
429 struct kvm
*kvm
= gpc
->kvm
;
433 guard(mutex
)(&gpc
->refresh_lock
);
437 * Deactivate the cache before removing it from the list, KVM
438 * must stall mmu_notifier events until all users go away, i.e.
439 * until gpc->lock is dropped and refresh is guaranteed to fail.
441 write_lock_irq(&gpc
->lock
);
446 * Leave the GPA => uHVA cache intact, it's protected by the
447 * memslot generation. The PFN lookup needs to be redone every
448 * time as mmu_notifier protection is lost when the cache is
449 * removed from the VM's gpc_list.
451 old_khva
= gpc
->khva
- offset_in_page(gpc
->khva
);
455 gpc
->pfn
= KVM_PFN_ERR_FAULT
;
456 write_unlock_irq(&gpc
->lock
);
458 spin_lock(&kvm
->gpc_lock
);
459 list_del(&gpc
->list
);
460 spin_unlock(&kvm
->gpc_lock
);
462 gpc_unmap(old_pfn
, old_khva
);