--- /dev/null
+From c3856aeb29402e94ad9b3879030165cc6a4fdc56 Mon Sep 17 00:00:00 2001
+From: Paul Mackerras <paulus@ozlabs.org>
+Date: Fri, 23 Feb 2018 21:21:12 +1100
+Subject: KVM: PPC: Book3S HV: Fix handling of large pages in radix page fault handler
+
+From: Paul Mackerras <paulus@ozlabs.org>
+
+commit c3856aeb29402e94ad9b3879030165cc6a4fdc56 upstream.
+
+This fixes several bugs in the radix page fault handler relating to
+the way large pages in the memory backing the guest were handled.
+First, the check for large pages checked only for explicit huge pages
+and missed transparent huge pages. Second, the check that the
+addresses (host virtual vs. guest physical) had appropriate
+alignment was wrong, meaning that the code never put a large page
+in the partition-scoped radix tree; the mapping was always demoted
+to a small page.
+
+Fixing this exposed bugs in kvmppc_create_pte(). We were never
+invalidating a 2MB PTE, which meant that if a page was initially
+faulted in without write permission and the guest then attempted
+to store to it, we would never update the PTE to have write permission.
+If we find a valid 2MB PTE in the PMD, we need to clear it and
+do a TLB invalidation before installing either the new 2MB PTE or
+a pointer to a page table page.
+
+This also corrects an assumption that get_user_pages_fast would set
+the _PAGE_DIRTY bit if we are writing, which is not true. Instead we
+mark the page dirty explicitly with set_page_dirty_lock(). This
+also means we don't need the dirty bit set on the host PTE when
+providing write access on a read fault.
+
+[paulus@ozlabs.org - use mark_pages_dirty instead of
+ kvmppc_update_dirty_map]
+
+Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/powerpc/kvm/book3s_64_mmu_radix.c | 72 +++++++++++++++++++++------------
+ 1 file changed, 46 insertions(+), 26 deletions(-)
+
+--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
++++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
+@@ -19,6 +19,9 @@
+ #include <asm/pgalloc.h>
+ #include <asm/pte-walk.h>
+
++static void mark_pages_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot,
++ unsigned long gfn, unsigned int order);
++
+ /*
+ * Supported radix tree geometry.
+ * Like p9, we support either 5 or 9 bits at the first (lowest) level,
+@@ -195,6 +198,12 @@ static void kvmppc_pte_free(pte_t *ptep)
+ kmem_cache_free(kvm_pte_cache, ptep);
+ }
+
++/* Like pmd_huge() and pmd_large(), but works regardless of config options */
++static inline int pmd_is_leaf(pmd_t pmd)
++{
++ return !!(pmd_val(pmd) & _PAGE_PTE);
++}
++
+ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
+ unsigned int level, unsigned long mmu_seq)
+ {
+@@ -219,7 +228,7 @@ static int kvmppc_create_pte(struct kvm
+ else
+ new_pmd = pmd_alloc_one(kvm->mm, gpa);
+
+- if (level == 0 && !(pmd && pmd_present(*pmd)))
++ if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd)))
+ new_ptep = kvmppc_pte_alloc();
+
+ /* Check if we might have been invalidated; let the guest retry if so */
+@@ -244,12 +253,30 @@ static int kvmppc_create_pte(struct kvm
+ new_pmd = NULL;
+ }
+ pmd = pmd_offset(pud, gpa);
+- if (pmd_large(*pmd)) {
+- /* Someone else has instantiated a large page here; retry */
+- ret = -EAGAIN;
+- goto out_unlock;
+- }
+- if (level == 1 && !pmd_none(*pmd)) {
++ if (pmd_is_leaf(*pmd)) {
++ unsigned long lgpa = gpa & PMD_MASK;
++
++ /*
++ * If we raced with another CPU which has just put
++ * a 2MB pte in after we saw a pte page, try again.
++ */
++ if (level == 0 && !new_ptep) {
++ ret = -EAGAIN;
++ goto out_unlock;
++ }
++ /* Valid 2MB page here already, remove it */
++ old = kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
++ ~0UL, 0, lgpa, PMD_SHIFT);
++ kvmppc_radix_tlbie_page(kvm, lgpa, PMD_SHIFT);
++ if (old & _PAGE_DIRTY) {
++ unsigned long gfn = lgpa >> PAGE_SHIFT;
++ struct kvm_memory_slot *memslot;
++ memslot = gfn_to_memslot(kvm, gfn);
++ if (memslot)
++ mark_pages_dirty(kvm, memslot, gfn,
++ PMD_SHIFT - PAGE_SHIFT);
++ }
++ } else if (level == 1 && !pmd_none(*pmd)) {
+ /*
+ * There's a page table page here, but we wanted
+ * to install a large page. Tell the caller and let
+@@ -412,28 +439,24 @@ int kvmppc_book3s_radix_page_fault(struc
+ } else {
+ page = pages[0];
+ pfn = page_to_pfn(page);
+- if (PageHuge(page)) {
+- page = compound_head(page);
+- pte_size <<= compound_order(page);
++ if (PageCompound(page)) {
++ pte_size <<= compound_order(compound_head(page));
+ /* See if we can insert a 2MB large-page PTE here */
+ if (pte_size >= PMD_SIZE &&
+- (gpa & PMD_MASK & PAGE_MASK) ==
+- (hva & PMD_MASK & PAGE_MASK)) {
++ (gpa & (PMD_SIZE - PAGE_SIZE)) ==
++ (hva & (PMD_SIZE - PAGE_SIZE))) {
+ level = 1;
+ pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1);
+ }
+ }
+ /* See if we can provide write access */
+ if (writing) {
+- /*
+- * We assume gup_fast has set dirty on the host PTE.
+- */
+ pgflags |= _PAGE_WRITE;
+ } else {
+ local_irq_save(flags);
+ ptep = find_current_mm_pte(current->mm->pgd,
+ hva, NULL, NULL);
+- if (ptep && pte_write(*ptep) && pte_dirty(*ptep))
++ if (ptep && pte_write(*ptep))
+ pgflags |= _PAGE_WRITE;
+ local_irq_restore(flags);
+ }
+@@ -459,18 +482,15 @@ int kvmppc_book3s_radix_page_fault(struc
+ pte = pfn_pte(pfn, __pgprot(pgflags));
+ ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
+ }
+- if (ret == 0 || ret == -EAGAIN)
+- ret = RESUME_GUEST;
+
+ if (page) {
+- /*
+- * We drop pages[0] here, not page because page might
+- * have been set to the head page of a compound, but
+- * we have to drop the reference on the correct tail
+- * page to match the get inside gup()
+- */
+- put_page(pages[0]);
++ if (!ret && (pgflags & _PAGE_WRITE))
++ set_page_dirty_lock(page);
++ put_page(page);
+ }
++
++ if (ret == 0 || ret == -EAGAIN)
++ ret = RESUME_GUEST;
+ return ret;
+ }
+
+@@ -676,7 +696,7 @@ void kvmppc_free_radix(struct kvm *kvm)
+ continue;
+ pmd = pmd_offset(pud, 0);
+ for (im = 0; im < PTRS_PER_PMD; ++im, ++pmd) {
+- if (pmd_huge(*pmd)) {
++ if (pmd_is_leaf(*pmd)) {
+ pmd_clear(pmd);
+ continue;
+ }