KVM: Pin (as in FOLL_PIN) pages during kvm_vcpu_map()
author    Sean Christopherson <seanjc@google.com>
          Thu, 10 Oct 2024 18:23:33 +0000 (11:23 -0700)
committer Paolo Bonzini <pbonzini@redhat.com>
          Fri, 25 Oct 2024 16:58:00 +0000 (12:58 -0400)
Pin, as in FOLL_PIN, pages when mapping them for direct access by KVM.
As per Documentation/core-api/pin_user_pages.rst, writing to a page that
was obtained via FOLL_GET is explicitly disallowed; see the sketch after
the excerpt below.

  Correct (uses FOLL_PIN calls):
      pin_user_pages()
      write to the data within the pages
      unpin_user_pages()

  INCORRECT (uses FOLL_GET calls):
      get_user_pages()
      write to the data within the pages
      put_page()
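
Concretely, a minimal kernel-style sketch of the required pattern
(illustrative only, not from this patch; the helper name and its
hva/src/len parameters are assumptions, and the range is assumed to fit
within a single page):

  #include <linux/highmem.h>
  #include <linux/mm.h>
  #include <linux/string.h>

  static int write_one_user_page(unsigned long hva, const void *src, size_t len)
  {
          struct page *page;
          void *kaddr;

          /* Pin exactly one page for writing (FOLL_PIN under the hood). */
          if (pin_user_pages_fast(hva, 1, FOLL_WRITE, &page) != 1)
                  return -EFAULT;

          kaddr = kmap_local_page(page);
          memcpy(kaddr + offset_in_page(hva), src, len);
          kunmap_local(kaddr);

          /* Mark the page dirty and drop the pin in one call. */
          unpin_user_pages_dirty_lock(&page, 1, true);
          return 0;
  }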

Unfortunately, FOLL_PIN is a "private", mm-internal flag, and so
kvm_follow_pfn must use a one-off bool instead of piggybacking on the
"flags" field.

Link: https://lwn.net/Articles/930667
Link: https://lore.kernel.org/all/cover.1683044162.git.lstoakes@gmail.com
Tested-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-ID: <20241010182427.1434605-32-seanjc@google.com>
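
For illustration only (not part of the commit): a hedged sketch of what
a kvm_vcpu_map() caller looks like under the new pinning scheme; the gfn
and the zeroing payload are assumptions for the example:

  struct kvm_host_map map;

  /* Map the guest frame; the backing page is now pinned via FOLL_PIN. */
  if (kvm_vcpu_map(vcpu, gfn, &map))
          return -EFAULT;

  /* Writing through the kernel mapping is legal on a pinned page. */
  memset(map.hva, 0, PAGE_SIZE);

  /* dirty=true marks the page dirty before the pin is dropped. */
  kvm_vcpu_unmap(vcpu, &map, true);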

include/linux/kvm_host.h
virt/kvm/kvm_main.c
virt/kvm/kvm_mm.h

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 35e1beb017dd..b4c541fa5a1f 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -280,7 +280,7 @@ struct kvm_host_map {
         * can be used as guest memory but they are not managed by host
         * kernel).
         */
-       struct page *refcounted_page;
+       struct page *pinned_page;
        struct page *page;
        void *hva;
        kvm_pfn_t pfn;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 842a5d5f3120..0d59f47f099e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2814,9 +2814,12 @@ static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page,
         */
        if (map) {
                pfn = map->pfn;
-               page = kvm_pfn_to_refcounted_page(pfn);
-               if (page && !get_page_unless_zero(page))
-                       return KVM_PFN_ERR_FAULT;
+
+               if (!kfp->pin) {
+                       page = kvm_pfn_to_refcounted_page(pfn);
+                       if (page && !get_page_unless_zero(page))
+                               return KVM_PFN_ERR_FAULT;
+               }
        } else {
                pfn = page_to_pfn(page);
        }
@@ -2834,16 +2837,24 @@ static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page,
 static bool hva_to_pfn_fast(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn)
 {
        struct page *page;
+       bool r;
 
        /*
-        * Fast pin a writable pfn only if it is a write fault request
-        * or the caller allows to map a writable pfn for a read fault
-        * request.
+        * Try the fast-only path when the caller wants to pin/get the page for
+        * writing.  If the caller only wants to read the page, KVM must go
+        * down the full, slow path in order to avoid racing an operation that
+        * breaks Copy-on-Write (CoW), e.g. so that KVM doesn't end up pointing
+        * at the old, read-only page while mm/ points at a new, writable page.
         */
        if (!((kfp->flags & FOLL_WRITE) || kfp->map_writable))
                return false;
 
-       if (get_user_page_fast_only(kfp->hva, FOLL_WRITE, &page)) {
+       if (kfp->pin)
+               r = pin_user_pages_fast(kfp->hva, 1, FOLL_WRITE, &page) == 1;
+       else
+               r = get_user_page_fast_only(kfp->hva, FOLL_WRITE, &page);
+
+       if (r) {
                *pfn = kvm_resolve_pfn(kfp, page, NULL, true);
                return true;
        }
@@ -2872,10 +2883,21 @@ static int hva_to_pfn_slow(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn)
        struct page *page, *wpage;
        int npages;
 
-       npages = get_user_pages_unlocked(kfp->hva, 1, &page, flags);
+       if (kfp->pin)
+               npages = pin_user_pages_unlocked(kfp->hva, 1, &page, flags);
+       else
+               npages = get_user_pages_unlocked(kfp->hva, 1, &page, flags);
        if (npages != 1)
                return npages;
 
+       /*
+        * Pinning is mutually exclusive with opportunistically mapping a read
+        * fault as writable, as KVM should never pin pages when mapping memory
+        * into the guest (pinning is only for direct accesses from KVM).
+        */
+       if (WARN_ON_ONCE(kfp->map_writable && kfp->pin))
+               goto out;
+
        /* map read fault as writable if possible */
        if (!(flags & FOLL_WRITE) && kfp->map_writable &&
            get_user_page_fast_only(kfp->hva, FOLL_WRITE, &wpage)) {
@@ -2884,6 +2906,7 @@ static int hva_to_pfn_slow(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn)
                flags |= FOLL_WRITE;
        }
 
+out:
        *pfn = kvm_resolve_pfn(kfp, page, NULL, flags & FOLL_WRITE);
        return npages;
 }
@@ -3093,10 +3116,11 @@ int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
                .slot = gfn_to_memslot(vcpu->kvm, gfn),
                .gfn = gfn,
                .flags = FOLL_WRITE,
-               .refcounted_page = &map->refcounted_page,
+               .refcounted_page = &map->pinned_page,
+               .pin = true,
        };
 
-       map->refcounted_page = NULL;
+       map->pinned_page = NULL;
        map->page = NULL;
        map->hva = NULL;
        map->gfn = gfn;
@@ -3133,16 +3157,16 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
        if (dirty)
                kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
 
-       if (map->refcounted_page) {
+       if (map->pinned_page) {
                if (dirty)
-                       kvm_release_page_dirty(map->refcounted_page);
-               else
-                       kvm_release_page_clean(map->refcounted_page);
+                       kvm_set_page_dirty(map->pinned_page);
+               kvm_set_page_accessed(map->pinned_page);
+               unpin_user_page(map->pinned_page);
        }
 
        map->hva = NULL;
        map->page = NULL;
-       map->refcounted_page = NULL;
+       map->pinned_page = NULL;
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
 
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index d3ac1ba8ba66..acef3f5c582a 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -30,6 +30,13 @@ struct kvm_follow_pfn {
        /* FOLL_* flags modifying lookup behavior, e.g. FOLL_WRITE. */
        unsigned int flags;
 
+       /*
+        * Pin the page (effectively FOLL_PIN, which is an mm/ internal flag).
+        * The page *must* be pinned if KVM will write to the page via a kernel
+        * mapping, e.g. via kmap(), mremap(), etc.
+        */
+       bool pin;
+
        /*
         * If non-NULL, try to get a writable mapping even for a read fault.
         * Set to true if a writable mapping was obtained.
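
For illustration only (not part of the commit): a hedged sketch
contrasting the two lookup modes, which the patch makes mutually
exclusive.  The first initializer mirrors the kvm_vcpu_map() hunk above;
the surrounding variables (vcpu, gfn, slot) are assumed:

  /* Direct kernel access (kvm_vcpu_map()): pin, never map_writable. */
  struct kvm_follow_pfn kfp_map = {
          .slot            = gfn_to_memslot(vcpu->kvm, gfn),
          .gfn             = gfn,
          .flags           = FOLL_WRITE,
          .refcounted_page = &map->pinned_page,
          .pin             = true,
  };

  /* Guest page-fault path: no pin, may opportunistically map a read
   * fault as writable via the map_writable out-param. */
  bool writable;
  struct page *page;
  struct kvm_follow_pfn kfp_fault = {
          .slot            = slot,
          .gfn             = gfn,
          .flags           = 0,
          .map_writable    = &writable,
          .refcounted_page = &page,
  };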