RISC-V: KVM: Transparent huge page support
author     Jessica Liu <liu.xuemei1@zte.com.cn>
           Thu, 27 Nov 2025 08:51:37 +0000 (16:51 +0800)
committer  Anup Patel <anup@brainfault.org>
           Fri, 6 Feb 2026 13:35:32 +0000 (19:05 +0530)
Use a block mapping when guest memory is backed by a THP, as already
implemented on architectures such as ARM and x86_64.

Signed-off-by: Jessica Liu <liu.xuemei1@zte.com.cn>
Reviewed-by: Anup Patel <anup@brainfault.org>
Link: https://lore.kernel.org/r/20251127165137780QbUOVPKPAfWSGAFl5qtRy@zte.com.cn
Signed-off-by: Anup Patel <anup@brainfault.org>
arch/riscv/kvm/mmu.c
arch/riscv/mm/pgtable.c
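
Before the diff itself, a standalone illustration of the rule that the new
fault_supports_gstage_huge_mapping() enforces may help: a 2 MiB (PMD) block
mapping is only attempted when the HVA and GPA share the same offset within a
PMD and the faulting address falls in a part of the memslot that a whole block
can cover. The sketch below is a userspace rendition of that arithmetic;
PMD_SIZE, the addresses and the memslot size are illustrative assumptions, not
values taken from the patch.

/*
 * Standalone sketch of the alignment rule checked by
 * fault_supports_gstage_huge_mapping().  PMD_SIZE is assumed to be
 * 2 MiB (4 KiB base pages); memslot values are invented for illustration.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PMD_SIZE	 (2UL << 20)			/* 2 MiB block */
#define ALIGN_UP(x, a)	 (((x) + (a) - 1) & ~((a) - 1))
#define ALIGN_DOWN(x, a) ((x) & ~((a) - 1))

static bool hva_fits_block(uint64_t uaddr_start, uint64_t gpa_start,
			   uint64_t size, uint64_t hva)
{
	uint64_t uaddr_end = uaddr_start + size;

	/* HVA and GPA must share the same offset within a PMD ... */
	if ((gpa_start & (PMD_SIZE - 1)) != (uaddr_start & (PMD_SIZE - 1)))
		return false;

	/* ... and the faulting HVA must lie in a fully covered block. */
	return hva >= ALIGN_UP(uaddr_start, PMD_SIZE) &&
	       hva < ALIGN_DOWN(uaddr_end, PMD_SIZE);
}

int main(void)
{
	/* Same offset within a PMD: block mapping is allowed. */
	printf("%d\n", hva_fits_block(0x7f0000201000, 0x80201000,
				      64UL << 20, 0x7f0000400000)); /* 1 */
	/* Offsets differ by one page: falls back to 4 KiB mappings. */
	printf("%d\n", hva_fits_block(0x7f0000201000, 0x80200000,
				      64UL << 20, 0x7f0000400000)); /* 0 */
	return 0;
}

With mismatched offsets (the second call) the fault handler must fall back to
4 KiB mappings, which is exactly the "d -> f, e -> g" misalignment illustrated
by the in-code comment in the patch below.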

diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index 4ab06697bfc085dc69c4284e818a83e057b7854c..0b75eb2a1820e2e7a1146f5431a3bf154ebd1f9e 100644
@@ -305,6 +305,142 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
        return pte_young(ptep_get(ptep));
 }
 
+static bool fault_supports_gstage_huge_mapping(struct kvm_memory_slot *memslot,
+                                              unsigned long hva)
+{
+       hva_t uaddr_start, uaddr_end;
+       gpa_t gpa_start;
+       size_t size;
+
+       size = memslot->npages * PAGE_SIZE;
+       uaddr_start = memslot->userspace_addr;
+       uaddr_end = uaddr_start + size;
+
+       gpa_start = memslot->base_gfn << PAGE_SHIFT;
+
+       /*
+        * Pages belonging to memslots that don't have the same alignment
+        * within a PMD for userspace and GPA cannot be mapped with g-stage
+        * PMD entries, because we'll end up mapping the wrong pages.
+        *
+        * Consider a layout like the following:
+        *
+        *    memslot->userspace_addr:
+        *    +-----+--------------------+--------------------+---+
+        *    |abcde|fgh  vs-stage block  |    vs-stage block tv|xyz|
+        *    +-----+--------------------+--------------------+---+
+        *
+        *    memslot->base_gfn << PAGE_SHIFT:
+        *      +---+--------------------+--------------------+-----+
+        *      |abc|def  g-stage block  |    g-stage block   |tvxyz|
+        *      +---+--------------------+--------------------+-----+
+        *
+        * If we create those g-stage blocks, we'll end up with this incorrect
+        * mapping:
+        *   d -> f
+        *   e -> g
+        *   f -> h
+        */
+       if ((gpa_start & (PMD_SIZE - 1)) != (uaddr_start & (PMD_SIZE - 1)))
+               return false;
+
+       /*
+        * Next, let's make sure we're not trying to map anything not covered
+        * by the memslot. This means we have to prohibit block size mappings
+        * for the beginning and end of a non-block aligned and non-block sized
+        * memory slot (illustrated by the head and tail parts of the
+        * userspace view above containing pages 'abcde' and 'xyz',
+        * respectively).
+        *
+        * Note that it doesn't matter if we do the check using the
+        * userspace_addr or the base_gfn, as both are equally aligned (per
+        * the check above) and equally sized.
+        */
+       return (hva >= ALIGN(uaddr_start, PMD_SIZE)) && (hva < ALIGN_DOWN(uaddr_end, PMD_SIZE));
+}
+
+static int get_hva_mapping_size(struct kvm *kvm,
+                               unsigned long hva)
+{
+       int size = PAGE_SIZE;
+       unsigned long flags;
+       pgd_t pgd;
+       p4d_t p4d;
+       pud_t pud;
+       pmd_t pmd;
+
+       /*
+        * Disable IRQs to prevent concurrent tear down of host page tables,
+        * e.g. if the primary MMU promotes a P*D to a huge page and then frees
+        * the original page table.
+        */
+       local_irq_save(flags);
+
+       /*
+        * Read each entry once.  As above, a non-leaf entry can be promoted to
+        * a huge page _during_ this walk.  Re-reading the entry could send the
+        * walk into the weeds, e.g. p*d_leaf() returns false (sees the old
+        * value) and then p*d_offset() walks into the target huge page instead
+        * of the old page table (sees the new value).
+        */
+       pgd = pgdp_get(pgd_offset(kvm->mm, hva));
+       if (pgd_none(pgd))
+               goto out;
+
+       p4d = p4dp_get(p4d_offset(&pgd, hva));
+       if (p4d_none(p4d) || !p4d_present(p4d))
+               goto out;
+
+       pud = pudp_get(pud_offset(&p4d, hva));
+       if (pud_none(pud) || !pud_present(pud))
+               goto out;
+
+       if (pud_leaf(pud)) {
+               size = PUD_SIZE;
+               goto out;
+       }
+
+       pmd = pmdp_get(pmd_offset(&pud, hva));
+       if (pmd_none(pmd) || !pmd_present(pmd))
+               goto out;
+
+       if (pmd_leaf(pmd))
+               size = PMD_SIZE;
+
+out:
+       local_irq_restore(flags);
+       return size;
+}
+
+static unsigned long transparent_hugepage_adjust(struct kvm *kvm,
+                                                struct kvm_memory_slot *memslot,
+                                                unsigned long hva,
+                                                kvm_pfn_t *hfnp, gpa_t *gpa)
+{
+       kvm_pfn_t hfn = *hfnp;
+
+       /*
+        * Make sure the adjustment is done only for THP pages. Also make
+        * sure that the HVA and GPA are sufficiently aligned and that the
+        * block map is contained within the memslot.
+        */
+       if (fault_supports_gstage_huge_mapping(memslot, hva)) {
+               int sz;
+
+               sz = get_hva_mapping_size(kvm, hva);
+               if (sz < PMD_SIZE)
+                       return sz;
+
+               *gpa &= PMD_MASK;
+               hfn &= ~(PTRS_PER_PMD - 1);
+               *hfnp = hfn;
+
+               return PMD_SIZE;
+       }
+
+       return PAGE_SIZE;
+}
+
 int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
                      gpa_t gpa, unsigned long hva, bool is_write,
                      struct kvm_gstage_mapping *out_map)
@@ -398,6 +534,10 @@ int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
        if (mmu_invalidate_retry(kvm, mmu_seq))
                goto out_unlock;
 
+       /* Check if the page is backed by a THP and use a block mapping if possible */
+       if (vma_pagesize == PAGE_SIZE)
+               vma_pagesize = transparent_hugepage_adjust(kvm, memslot, hva, &hfn, &gpa);
+
        if (writable) {
                mark_page_dirty_in_slot(kvm, memslot, gfn);
                ret = kvm_riscv_gstage_map_page(&gstage, pcache, gpa, hfn << PAGE_SHIFT,
diff --git a/arch/riscv/mm/pgtable.c b/arch/riscv/mm/pgtable.c
index 807c0a0de18275b1a33bcd41d7dce27ee7de0662..b02bf71db33d51147d18d514fe19f30d10ef06cc 100644
@@ -47,6 +47,7 @@ pud_t *pud_offset(p4d_t *p4d, unsigned long address)
 
        return (pud_t *)p4d;
 }
+EXPORT_SYMBOL_GPL(pud_offset);
 
 p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
 {
@@ -55,6 +56,7 @@ p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
 
        return (p4d_t *)pgd;
 }
+EXPORT_SYMBOL_GPL(p4d_offset);
 #endif
 
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
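
As a usage note, this path only pays off when the VMM backs guest RAM with
PMD-aligned, THP-eligible anonymous memory. The sketch below shows one way a
userspace VMM might set that up; the sizes, the manual alignment trick and the
trailing ioctl hint are assumptions for illustration, not requirements imposed
by this patch.

/*
 * Minimal sketch: allocate PMD-aligned anonymous memory and ask the
 * kernel to back it with transparent huge pages, so a guest memslot
 * placed on it can be mapped with g-stage PMD blocks.  Error handling
 * is trimmed; 2 MiB alignment is an assumption (4 KiB base pages).
 */
#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>

#define HUGE_ALIGN (2UL << 20)

int main(void)
{
	size_t size = 256UL << 20;		/* 256 MiB of guest RAM */

	/* Over-allocate so a 2 MiB-aligned window can be carved out. */
	size_t span = size + HUGE_ALIGN;
	unsigned char *raw = mmap(NULL, span, PROT_READ | PROT_WRITE,
				  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (raw == MAP_FAILED)
		return 1;

	unsigned char *mem = (unsigned char *)
		(((uintptr_t)raw + HUGE_ALIGN - 1) & ~(HUGE_ALIGN - 1));

	/* Hint that this range should be collapsed into THPs. */
	if (madvise(mem, size, MADV_HUGEPAGE))
		perror("madvise");

	printf("guest RAM at %p, %zu bytes\n", (void *)mem, size);
	/* ... hand (mem, size) to KVM_SET_USER_MEMORY_REGION ... */
	munmap(raw, span);
	return 0;
}

Whether the range is actually collapsed into huge pages still depends on the
host THP policy (e.g. /sys/kernel/mm/transparent_hugepage/enabled).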