git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
RISC-V: KVM: Support runtime configuration for per-VM's HGATP mode
author: Fangyu Yu <fangyu.yu@linux.alibaba.com>
Fri, 3 Apr 2026 15:30:16 +0000 (23:30 +0800)
committer: Anup Patel <anup@brainfault.org>
Sat, 4 Apr 2026 07:51:21 +0000 (13:21 +0530)
Introduce one per-VM architecture-specific field to support runtime
configuration of the G-stage page table format:

- kvm->arch.pgd_levels: the corresponding number of page table levels
  for the selected mode.

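The per-VM pgd_levels field replaces the previous global variables
kvm_riscv_gstage_mode and kvm_riscv_gstage_pgd_levels, enabling different
virtual machines to independently select their G-stage page table format
instead of being forced to share the maximum mode detected by the kernel
at boot time. The HGATP mode is now derived from the number of levels via
the kvm_riscv_gstage_mode() helper, and the maximum number of levels
detected at boot is kept in kvm_riscv_gstage_max_pgd_levels.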

Signed-off-by: Fangyu Yu <fangyu.yu@linux.alibaba.com>
Reviewed-by: Andrew Jones <andrew.jones@oss.qualcomm.com>
Reviewed-by: Anup Patel <anup@brainfault.org>
Reviewed-by: Guo Ren <guoren@kernel.org>
Reviewed-by: Nutty Liu <nutty.liu@hotmail.com>
Link: https://lore.kernel.org/r/20260403153019.9916-2-fangyu.yu@linux.alibaba.com
Signed-off-by: Anup Patel <anup@brainfault.org>
arch/riscv/include/asm/kvm_gstage.h
arch/riscv/include/asm/kvm_host.h
arch/riscv/kvm/gstage.c
arch/riscv/kvm/main.c
arch/riscv/kvm/mmu.c
arch/riscv/kvm/vm.c
arch/riscv/kvm/vmid.c

index a89d1422cc8457016d53746ec3a104793a1e0198..c35874768641c0b6098c1a7e9afb92125382e86b 100644 (file)
@@ -29,16 +29,22 @@ struct kvm_gstage_mapping {
 #define kvm_riscv_gstage_index_bits    10
 #endif
 
-extern unsigned long kvm_riscv_gstage_mode;
-extern unsigned long kvm_riscv_gstage_pgd_levels;
+extern unsigned long kvm_riscv_gstage_max_pgd_levels;
 
 #define kvm_riscv_gstage_pgd_xbits     2
 #define kvm_riscv_gstage_pgd_size      (1UL << (HGATP_PAGE_SHIFT + kvm_riscv_gstage_pgd_xbits))
-#define kvm_riscv_gstage_gpa_bits      (HGATP_PAGE_SHIFT + \
-                                        (kvm_riscv_gstage_pgd_levels * \
-                                         kvm_riscv_gstage_index_bits) + \
-                                        kvm_riscv_gstage_pgd_xbits)
-#define kvm_riscv_gstage_gpa_size      ((gpa_t)(1ULL << kvm_riscv_gstage_gpa_bits))
+
+static inline unsigned long kvm_riscv_gstage_gpa_bits(unsigned long pgd_levels)
+{
+       return (HGATP_PAGE_SHIFT +
+               pgd_levels * kvm_riscv_gstage_index_bits +
+               kvm_riscv_gstage_pgd_xbits);
+}
+
+static inline gpa_t kvm_riscv_gstage_gpa_size(unsigned long pgd_levels)
+{
+       return BIT_ULL(kvm_riscv_gstage_gpa_bits(pgd_levels));
+}
 
 bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
                               pte_t **ptepp, u32 *ptep_level);
@@ -73,4 +79,21 @@ void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end
 
 void kvm_riscv_gstage_mode_detect(void);
 
+static inline unsigned long kvm_riscv_gstage_mode(unsigned long pgd_levels)
+{
+       switch (pgd_levels) {
+       case 2:
+               return HGATP_MODE_SV32X4;
+       case 3:
+               return HGATP_MODE_SV39X4;
+       case 4:
+               return HGATP_MODE_SV48X4;
+       case 5:
+               return HGATP_MODE_SV57X4;
+       default:
+               WARN_ON_ONCE(1);
+               return HGATP_MODE_OFF;
+       }
+}
+
 #endif
index 85e1bb5b4d7e323bf57f87fafc9678dc991f43cd..75b0a951c1bc6f4111d7b44063b4c1852cd66a02 100644 (file)
@@ -83,6 +83,7 @@ struct kvm_arch {
        /* G-stage page table */
        pgd_t *pgd;
        phys_addr_t pgd_phys;
+       unsigned long pgd_levels;
 
        /* Guest Timer */
        struct kvm_guest_timer timer;
index ffec3e5ddcafffcbed9f404210d7a1a34c5d31ac..bf7e54af2c7cf9fc2be046fba2ed8722633d8df5 100644 (file)
 #include <asm/kvm_gstage.h>
 
 #ifdef CONFIG_64BIT
-unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV39X4;
-unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 3;
+unsigned long kvm_riscv_gstage_max_pgd_levels __ro_after_init = 3;
 #else
-unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV32X4;
-unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 2;
+unsigned long kvm_riscv_gstage_max_pgd_levels __ro_after_init = 2;
 #endif
 
 #define gstage_pte_leaf(__ptep)        \
        (pte_val(*(__ptep)) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC))
 
-static inline unsigned long gstage_pte_index(gpa_t addr, u32 level)
+static inline unsigned long gstage_pte_index(struct kvm_gstage *gstage,
+                                            gpa_t addr, u32 level)
 {
        unsigned long mask;
        unsigned long shift = HGATP_PAGE_SHIFT + (kvm_riscv_gstage_index_bits * level);
 
-       if (level == (kvm_riscv_gstage_pgd_levels - 1))
+       if (level == gstage->kvm->arch.pgd_levels - 1)
                mask = (PTRS_PER_PTE * (1UL << kvm_riscv_gstage_pgd_xbits)) - 1;
        else
                mask = PTRS_PER_PTE - 1;
@@ -40,12 +39,13 @@ static inline unsigned long gstage_pte_page_vaddr(pte_t pte)
        return (unsigned long)pfn_to_virt(__page_val_to_pfn(pte_val(pte)));
 }
 
-static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level)
+static int gstage_page_size_to_level(struct kvm_gstage *gstage, unsigned long page_size,
+                                    u32 *out_level)
 {
        u32 i;
        unsigned long psz = 1UL << 12;
 
-       for (i = 0; i < kvm_riscv_gstage_pgd_levels; i++) {
+       for (i = 0; i < gstage->kvm->arch.pgd_levels; i++) {
                if (page_size == (psz << (i * kvm_riscv_gstage_index_bits))) {
                        *out_level = i;
                        return 0;
@@ -55,21 +55,23 @@ static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level)
        return -EINVAL;
 }
 
-static int gstage_level_to_page_order(u32 level, unsigned long *out_pgorder)
+static int gstage_level_to_page_order(struct kvm_gstage *gstage, u32 level,
+                                     unsigned long *out_pgorder)
 {
-       if (kvm_riscv_gstage_pgd_levels < level)
+       if (gstage->kvm->arch.pgd_levels < level)
                return -EINVAL;
 
        *out_pgorder = 12 + (level * kvm_riscv_gstage_index_bits);
        return 0;
 }
 
-static int gstage_level_to_page_size(u32 level, unsigned long *out_pgsize)
+static int gstage_level_to_page_size(struct kvm_gstage *gstage, u32 level,
+                                    unsigned long *out_pgsize)
 {
        int rc;
        unsigned long page_order = PAGE_SHIFT;
 
-       rc = gstage_level_to_page_order(level, &page_order);
+       rc = gstage_level_to_page_order(gstage, level, &page_order);
        if (rc)
                return rc;
 
@@ -81,11 +83,11 @@ bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
                               pte_t **ptepp, u32 *ptep_level)
 {
        pte_t *ptep;
-       u32 current_level = kvm_riscv_gstage_pgd_levels - 1;
+       u32 current_level = gstage->kvm->arch.pgd_levels - 1;
 
        *ptep_level = current_level;
        ptep = (pte_t *)gstage->pgd;
-       ptep = &ptep[gstage_pte_index(addr, current_level)];
+       ptep = &ptep[gstage_pte_index(gstage, addr, current_level)];
        while (ptep && pte_val(ptep_get(ptep))) {
                if (gstage_pte_leaf(ptep)) {
                        *ptep_level = current_level;
@@ -97,7 +99,7 @@ bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
                        current_level--;
                        *ptep_level = current_level;
                        ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
-                       ptep = &ptep[gstage_pte_index(addr, current_level)];
+                       ptep = &ptep[gstage_pte_index(gstage, addr, current_level)];
                } else {
                        ptep = NULL;
                }
@@ -110,7 +112,7 @@ static void gstage_tlb_flush(struct kvm_gstage *gstage, u32 level, gpa_t addr)
 {
        unsigned long order = PAGE_SHIFT;
 
-       if (gstage_level_to_page_order(level, &order))
+       if (gstage_level_to_page_order(gstage, level, &order))
                return;
        addr &= ~(BIT(order) - 1);
 
@@ -125,9 +127,9 @@ int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage,
                             struct kvm_mmu_memory_cache *pcache,
                             const struct kvm_gstage_mapping *map)
 {
-       u32 current_level = kvm_riscv_gstage_pgd_levels - 1;
+       u32 current_level = gstage->kvm->arch.pgd_levels - 1;
        pte_t *next_ptep = (pte_t *)gstage->pgd;
-       pte_t *ptep = &next_ptep[gstage_pte_index(map->addr, current_level)];
+       pte_t *ptep = &next_ptep[gstage_pte_index(gstage, map->addr, current_level)];
 
        if (current_level < map->level)
                return -EINVAL;
@@ -151,7 +153,7 @@ int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage,
                }
 
                current_level--;
-               ptep = &next_ptep[gstage_pte_index(map->addr, current_level)];
+               ptep = &next_ptep[gstage_pte_index(gstage, map->addr, current_level)];
        }
 
        if (pte_val(*ptep) != pte_val(map->pte)) {
@@ -194,7 +196,7 @@ int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
        out_map->addr = gpa;
        out_map->level = 0;
 
-       ret = gstage_page_size_to_level(page_size, &out_map->level);
+       ret = gstage_page_size_to_level(gstage, page_size, &out_map->level);
        if (ret)
                return ret;
 
@@ -286,7 +288,7 @@ int kvm_riscv_gstage_split_huge(struct kvm_gstage *gstage,
                                struct kvm_mmu_memory_cache *pcache,
                                gpa_t addr, u32 target_level, bool flush)
 {
-       u32 current_level = kvm_riscv_gstage_pgd_levels - 1;
+       u32 current_level = gstage->kvm->arch.pgd_levels - 1;
        pte_t *next_ptep = (pte_t *)gstage->pgd;
        unsigned long huge_pte, child_pte;
        unsigned long child_page_size;
@@ -297,7 +299,7 @@ int kvm_riscv_gstage_split_huge(struct kvm_gstage *gstage,
                return -ENOMEM;
 
        while(current_level > target_level) {
-               ptep = (pte_t *)&next_ptep[gstage_pte_index(addr, current_level)];
+               ptep = (pte_t *)&next_ptep[gstage_pte_index(gstage, addr, current_level)];
 
                if (!pte_val(ptep_get(ptep)))
                        break;
@@ -310,7 +312,7 @@ int kvm_riscv_gstage_split_huge(struct kvm_gstage *gstage,
 
                huge_pte = pte_val(ptep_get(ptep));
 
-               ret = gstage_level_to_page_size(current_level - 1, &child_page_size);
+               ret = gstage_level_to_page_size(gstage, current_level - 1, &child_page_size);
                if (ret)
                        return ret;
 
@@ -343,7 +345,7 @@ void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
        u32 next_ptep_level;
        unsigned long next_page_size, page_size;
 
-       ret = gstage_level_to_page_size(ptep_level, &page_size);
+       ret = gstage_level_to_page_size(gstage, ptep_level, &page_size);
        if (ret)
                return;
 
@@ -355,7 +357,7 @@ void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
        if (ptep_level && !gstage_pte_leaf(ptep)) {
                next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
                next_ptep_level = ptep_level - 1;
-               ret = gstage_level_to_page_size(next_ptep_level, &next_page_size);
+               ret = gstage_level_to_page_size(gstage, next_ptep_level, &next_page_size);
                if (ret)
                        return;
 
@@ -389,7 +391,7 @@ void kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
 
        while (addr < end) {
                found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level);
-               ret = gstage_level_to_page_size(ptep_level, &page_size);
+               ret = gstage_level_to_page_size(gstage, ptep_level, &page_size);
                if (ret)
                        break;
 
@@ -423,7 +425,7 @@ void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end
 
        while (addr < end) {
                found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level);
-               ret = gstage_level_to_page_size(ptep_level, &page_size);
+               ret = gstage_level_to_page_size(gstage, ptep_level, &page_size);
                if (ret)
                        break;
 
@@ -444,39 +446,34 @@ void __init kvm_riscv_gstage_mode_detect(void)
        /* Try Sv57x4 G-stage mode */
        csr_write(CSR_HGATP, HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT);
        if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV57X4) {
-               kvm_riscv_gstage_mode = HGATP_MODE_SV57X4;
-               kvm_riscv_gstage_pgd_levels = 5;
+               kvm_riscv_gstage_max_pgd_levels = 5;
                goto done;
        }
 
        /* Try Sv48x4 G-stage mode */
        csr_write(CSR_HGATP, HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT);
        if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV48X4) {
-               kvm_riscv_gstage_mode = HGATP_MODE_SV48X4;
-               kvm_riscv_gstage_pgd_levels = 4;
+               kvm_riscv_gstage_max_pgd_levels = 4;
                goto done;
        }
 
        /* Try Sv39x4 G-stage mode */
        csr_write(CSR_HGATP, HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT);
        if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV39X4) {
-               kvm_riscv_gstage_mode = HGATP_MODE_SV39X4;
-               kvm_riscv_gstage_pgd_levels = 3;
+               kvm_riscv_gstage_max_pgd_levels = 3;
                goto done;
        }
 #else /* CONFIG_32BIT */
        /* Try Sv32x4 G-stage mode */
        csr_write(CSR_HGATP, HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT);
        if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV32X4) {
-               kvm_riscv_gstage_mode = HGATP_MODE_SV32X4;
-               kvm_riscv_gstage_pgd_levels = 2;
+               kvm_riscv_gstage_max_pgd_levels = 2;
                goto done;
        }
 #endif
 
        /* KVM depends on !HGATP_MODE_OFF */
-       kvm_riscv_gstage_mode = HGATP_MODE_OFF;
-       kvm_riscv_gstage_pgd_levels = 0;
+       kvm_riscv_gstage_max_pgd_levels = 0;
 
 done:
        csr_write(CSR_HGATP, 0);
index 5399c3b4071d527c3c6b312682a48d341eba3c36..cb8a65273c1f04a4aa154a34f315b9612e5d1e23 100644 (file)
@@ -105,17 +105,17 @@ static int __init riscv_kvm_init(void)
                return rc;
 
        kvm_riscv_gstage_mode_detect();
-       switch (kvm_riscv_gstage_mode) {
-       case HGATP_MODE_SV32X4:
+       switch (kvm_riscv_gstage_max_pgd_levels) {
+       case 2:
                str = "Sv32x4";
                break;
-       case HGATP_MODE_SV39X4:
+       case 3:
                str = "Sv39x4";
                break;
-       case HGATP_MODE_SV48X4:
+       case 4:
                str = "Sv48x4";
                break;
-       case HGATP_MODE_SV57X4:
+       case 5:
                str = "Sv57x4";
                break;
        default:
@@ -164,7 +164,7 @@ static int __init riscv_kvm_init(void)
                         (rc) ? slist : "no features");
        }
 
-       kvm_info("using %s G-stage page table format\n", str);
+       kvm_info("highest G-stage page table mode is %s\n", str);
 
        kvm_info("VMID %ld bits available\n", kvm_riscv_gstage_vmid_bits());
 
index 088d33ba90edaf4c417b11b04c89de2d6a50988b..fbcdd75cb9af048a54073573ab646af6f67aa087 100644 (file)
@@ -67,7 +67,7 @@ int kvm_riscv_mmu_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
                if (!writable)
                        map.pte = pte_wrprotect(map.pte);
 
-               ret = kvm_mmu_topup_memory_cache(&pcache, kvm_riscv_gstage_pgd_levels);
+               ret = kvm_mmu_topup_memory_cache(&pcache, kvm->arch.pgd_levels);
                if (ret)
                        goto out;
 
@@ -186,7 +186,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
         * space addressable by the KVM guest GPA space.
         */
        if ((new->base_gfn + new->npages) >=
-           (kvm_riscv_gstage_gpa_size >> PAGE_SHIFT))
+            kvm_riscv_gstage_gpa_size(kvm->arch.pgd_levels) >> PAGE_SHIFT)
                return -EFAULT;
 
        hva = new->userspace_addr;
@@ -472,7 +472,7 @@ int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
        memset(out_map, 0, sizeof(*out_map));
 
        /* We need minimum second+third level pages */
-       ret = kvm_mmu_topup_memory_cache(pcache, kvm_riscv_gstage_pgd_levels);
+       ret = kvm_mmu_topup_memory_cache(pcache, kvm->arch.pgd_levels);
        if (ret) {
                kvm_err("Failed to topup G-stage cache\n");
                return ret;
@@ -575,6 +575,7 @@ int kvm_riscv_mmu_alloc_pgd(struct kvm *kvm)
                return -ENOMEM;
        kvm->arch.pgd = page_to_virt(pgd_page);
        kvm->arch.pgd_phys = page_to_phys(pgd_page);
+       kvm->arch.pgd_levels = kvm_riscv_gstage_max_pgd_levels;
 
        return 0;
 }
@@ -590,10 +591,12 @@ void kvm_riscv_mmu_free_pgd(struct kvm *kvm)
                gstage.flags = 0;
                gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
                gstage.pgd = kvm->arch.pgd;
-               kvm_riscv_gstage_unmap_range(&gstage, 0UL, kvm_riscv_gstage_gpa_size, false);
+               kvm_riscv_gstage_unmap_range(&gstage, 0UL,
+                       kvm_riscv_gstage_gpa_size(kvm->arch.pgd_levels), false);
                pgd = READ_ONCE(kvm->arch.pgd);
                kvm->arch.pgd = NULL;
                kvm->arch.pgd_phys = 0;
+               kvm->arch.pgd_levels = 0;
        }
        spin_unlock(&kvm->mmu_lock);
 
@@ -603,11 +606,12 @@ void kvm_riscv_mmu_free_pgd(struct kvm *kvm)
 
 void kvm_riscv_mmu_update_hgatp(struct kvm_vcpu *vcpu)
 {
-       unsigned long hgatp = kvm_riscv_gstage_mode << HGATP_MODE_SHIFT;
-       struct kvm_arch *k = &vcpu->kvm->arch;
+       struct kvm_arch *ka = &vcpu->kvm->arch;
+       unsigned long hgatp = kvm_riscv_gstage_mode(ka->pgd_levels)
+                             << HGATP_MODE_SHIFT;
 
-       hgatp |= (READ_ONCE(k->vmid.vmid) << HGATP_VMID_SHIFT) & HGATP_VMID;
-       hgatp |= (k->pgd_phys >> PAGE_SHIFT) & HGATP_PPN;
+       hgatp |= (READ_ONCE(ka->vmid.vmid) << HGATP_VMID_SHIFT) & HGATP_VMID;
+       hgatp |= (ka->pgd_phys >> PAGE_SHIFT) & HGATP_PPN;
 
        ncsr_write(CSR_HGATP, hgatp);
 
index 13c63ae1a78b25cf3354a2442126fb7e6fd7a4b5..fb7c4e07961ff9a280799670c7c379b89f0d09ba 100644 (file)
@@ -199,7 +199,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                r = KVM_USER_MEM_SLOTS;
                break;
        case KVM_CAP_VM_GPA_BITS:
-               r = kvm_riscv_gstage_gpa_bits;
+               if (!kvm)
+                       r = kvm_riscv_gstage_gpa_bits(kvm_riscv_gstage_max_pgd_levels);
+               else
+                       r = kvm_riscv_gstage_gpa_bits(kvm->arch.pgd_levels);
                break;
        default:
                r = 0;
index cf34d448289d79cb6b1f3832d97ee6997f27de5c..c15bdb1dd8bef0d305a90b65ad48751a0a2a7913 100644 (file)
@@ -26,7 +26,8 @@ static DEFINE_SPINLOCK(vmid_lock);
 void __init kvm_riscv_gstage_vmid_detect(void)
 {
        /* Figure-out number of VMID bits in HW */
-       csr_write(CSR_HGATP, (kvm_riscv_gstage_mode << HGATP_MODE_SHIFT) | HGATP_VMID);
+       csr_write(CSR_HGATP, (kvm_riscv_gstage_mode(kvm_riscv_gstage_max_pgd_levels) <<
+                             HGATP_MODE_SHIFT) | HGATP_VMID);
        vmid_bits = csr_read(CSR_HGATP);
        vmid_bits = (vmid_bits & HGATP_VMID) >> HGATP_VMID_SHIFT;
        vmid_bits = fls_long(vmid_bits);