return 0;
}
+static void kvm_riscv_gstage_update_pte_prot(struct kvm_gstage *gstage, u32 level,
+ gpa_t addr, pte_t *ptep, pgprot_t prot)
+{
+ /*
+ * Read the live PTE exactly once so the prot comparison and the
+ * pfn we re-encode below are guaranteed to come from the same
+ * snapshot of the entry.
+ */
+ pte_t old_pte = ptep_get(ptep);
+ pte_t new_pte;
+
+ /* Nothing to do if the required protection is already in place. */
+ if (pgprot_val(pte_pgprot(old_pte)) == pgprot_val(prot))
+ return;
+
+ /* Keep the existing PFN, swap only the protection bits. */
+ new_pte = pfn_pte(pte_pfn(old_pte), prot);
+ new_pte = pte_mkdirty(new_pte);
+
+ set_pte(ptep, new_pte);
+
+ /* The old translation may still be cached; flush it at this level. */
+ gstage_tlb_flush(gstage, level, addr);
+}
+
int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
struct kvm_mmu_memory_cache *pcache,
gpa_t gpa, phys_addr_t hpa, unsigned long page_size,
bool page_rdonly, bool page_exec,
struct kvm_gstage_mapping *out_map)
{
+ bool found_leaf;
+ u32 ptep_level;
pgprot_t prot;
+ pte_t *ptep;
int ret;
out_map->addr = gpa;
else
prot = PAGE_WRITE;
}
+
+ /* Fast path: if a leaf already maps this gpa, try to reuse it below. */
+ found_leaf = kvm_riscv_gstage_get_leaf(gstage, gpa, &ptep, &ptep_level);
+ if (found_leaf) {
+ /*
+ * ptep_level is the current gstage mapping level of addr, out_map->level
+ * is the required mapping level during fault handling.
+ *
+ * 1) ptep_level > out_map->level
+ * This happens when dirty logging is enabled and huge pages are used.
+ * KVM must track the pages at 4K level, and split the huge mapping
+ * into 4K mappings.
+ *
+ * 2) ptep_level < out_map->level
+ * This happens when dirty logging is disabled and huge pages are used.
+ * The gstage is split into 4K mappings, but the out_map level is now
+ * back to the huge page level. Ignore the out_map level this time, and
+ * just update the pte prot here. Otherwise, we would fall back to mapping
+ * the gstage at huge page level in `kvm_riscv_gstage_set_pte`, with the
+ * overhead of freeing the page tables(not support now), which would slow
+ * down the vCPUs' performance.
+ *
+ * It is better to recover the huge page mapping in the ioctl context when
+ * disabling dirty logging.
+ *
+ * 3) ptep_level == out_map->level
+ * We already have the ptep, just update the pte prot if the pfn not change.
+ * There is no need to invoke `kvm_riscv_gstage_set_pte` again.
+ */
+ if (ptep_level > out_map->level) {
+ kvm_riscv_gstage_split_huge(gstage, pcache, gpa,
+ out_map->level, true);
+ } else if (ALIGN_DOWN(PFN_PHYS(pte_pfn(ptep_get(ptep))), page_size) == hpa) {
+ /* Same backing page: only the protection bits need updating. */
+ /* NOTE(review): this assumes hpa is page_size-aligned -- confirm at callers. */
+ kvm_riscv_gstage_update_pte_prot(gstage, ptep_level, gpa, ptep, prot);
+ return 0;
+ }
+ }
+
out_map->pte = pfn_pte(PFN_DOWN(hpa), prot);
out_map->pte = pte_mkdirty(out_map->pte);
return kvm_riscv_gstage_set_pte(gstage, pcache, out_map);
}
+static inline unsigned long make_child_pte(unsigned long huge_pte, int index,
+ unsigned long child_page_size)
+{
+ /*
+ * A huge leaf PTE already carries the base PFN of the whole huge
+ * page, so each child entry only needs the offset of its slot at
+ * the next lower level folded in. Compute that offset in units of
+ * base pages and OR it into a copy of the parent entry.
+ */
+ unsigned long pages_per_child = child_page_size / PAGE_SIZE;
+ unsigned long slot_offset = index * pages_per_child;
+
+ return huge_pte | pte_val(pfn_pte(slot_offset, __pgprot(0)));
+}
+
+int kvm_riscv_gstage_split_huge(struct kvm_gstage *gstage,
+ struct kvm_mmu_memory_cache *pcache,
+ gpa_t addr, u32 target_level, bool flush)
+{
+ u32 current_level = kvm_riscv_gstage_pgd_levels - 1;
+ pte_t *next_ptep = (pte_t *)gstage->pgd;
+ unsigned long huge_pte, child_pte;
+ unsigned long child_page_size;
+ pte_t *ptep;
+ int i, ret;
+
+ /* A pre-filled page-table cache is required to back the split tables. */
+ if (!pcache)
+ return -ENOMEM;
+
+ /* Walk from the PGD toward target_level, splitting leaves on the way. */
+ while (current_level > target_level) {
+ ptep = (pte_t *)&next_ptep[gstage_pte_index(addr, current_level)];
+
+ /* Nothing mapped at this level, so there is nothing to split. */
+ if (!pte_val(ptep_get(ptep)))
+ break;
+
+ if (!gstage_pte_leaf(ptep)) {
+ /* Non-leaf entry: descend one level and keep walking. */
+ next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
+ current_level--;
+ continue;
+ }
+
+ huge_pte = pte_val(ptep_get(ptep));
+
+ ret = gstage_level_to_page_size(current_level - 1, &child_page_size);
+ if (ret)
+ return ret;
+
+ next_ptep = kvm_mmu_memory_cache_alloc(pcache);
+ if (!next_ptep)
+ return -ENOMEM;
+
+ /* Fill the new table with smaller leaves covering the huge page. */
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ child_pte = make_child_pte(huge_pte, i, child_page_size);
+ set_pte((pte_t *)&next_ptep[i], __pte(child_pte));
+ }
+
+ /* Replace the huge leaf with a pointer to the new child table. */
+ set_pte(ptep, pfn_pte(PFN_DOWN(__pa(next_ptep)),
+ __pgprot(_PAGE_TABLE)));
+
+ if (flush)
+ gstage_tlb_flush(gstage, current_level, addr);
+
+ current_level--;
+ }
+
+ return 0;
+}
+
void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
pte_t *ptep, u32 ptep_level, enum kvm_riscv_gstage_op op)
{