Merge branch kvm-arm64/pkvm-protected-guest into kvmarm-master/next
author Marc Zyngier <maz@kernel.org>
Wed, 8 Apr 2026 11:25:39 +0000 (12:25 +0100)
committer Marc Zyngier <maz@kernel.org>
Wed, 8 Apr 2026 11:25:39 +0000 (12:25 +0100)
* kvm-arm64/pkvm-protected-guest: (41 commits)
  : .
  : pKVM support for protected guests, implementing the long-awaited
  : support for anonymous memory, as the elusive guestmem has failed
  : to deliver on its promises despite a multi-year effort. Patches
  : courtesy of Will Deacon. From the initial cover letter:
  :
  : "[...] this patch series implements support for protected guest
  : memory with pKVM, where pages are unmapped from the host as they are
  : faulted into the guest and can be shared back from the guest using pKVM
  : hypercalls. Protected guests are created using a new machine type
  : identifier and can be booted to a shell using the kvmtool patches
  : available at [2], which finally means that we are able to test the pVM
  : logic in pKVM. Since this is an incremental step towards full isolation
  : from the host (for example, the CPU register state and DMA accesses are
  : not yet isolated), creating a pVM requires a developer Kconfig option to
  : be enabled in addition to booting with 'kvm-arm.mode=protected' and
  : results in a kernel taint."
  : .
  KVM: arm64: Don't hold 'vm_table_lock' across guest page reclaim
  KVM: arm64: Allow get_pkvm_hyp_vm() to take a reference to a dying VM
  KVM: arm64: Prevent teardown finalisation of referenced 'hyp_vm'
  drivers/virt: pkvm: Add Kconfig dependency on DMA_RESTRICTED_POOL
  KVM: arm64: Rename PKVM_PAGE_STATE_MASK
  KVM: arm64: Extend pKVM page ownership selftests to cover guest hvcs
  KVM: arm64: Extend pKVM page ownership selftests to cover forced reclaim
  KVM: arm64: Register 'selftest_vm' in the VM table
  KVM: arm64: Extend pKVM page ownership selftests to cover guest donation
  KVM: arm64: Add some initial documentation for pKVM
  KVM: arm64: Allow userspace to create protected VMs when pKVM is enabled
  KVM: arm64: Implement the MEM_UNSHARE hypercall for protected VMs
  KVM: arm64: Implement the MEM_SHARE hypercall for protected VMs
  KVM: arm64: Add hvc handler at EL2 for hypercalls from protected VMs
  KVM: arm64: Return -EFAULT from VCPU_RUN on access to a poisoned pte
  KVM: arm64: Reclaim faulting page from pKVM in spurious fault handler
  KVM: arm64: Introduce hypercall to force reclaim of a protected page
  KVM: arm64: Annotate guest donations with handle and gfn in host stage-2
  KVM: arm64: Change 'pkvm_handle_t' to u16
  KVM: arm64: Introduce host_stage2_set_owner_metadata_locked()
  ...

Signed-off-by: Marc Zyngier <maz@kernel.org>
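As a sketch of the userspace-facing side: protected guests are created by
passing a dedicated machine-type flag to KVM_CREATE_VM. The flag name below
(KVM_VM_TYPE_ARM_PROTECTED) is an assumption based on the cover letter's "new
machine type identifier" and the kvmtool patches; check the uapi header from
this series for the exact macro.

	/* Hedged sketch: create a protected VM, assuming a
	 * KVM_VM_TYPE_ARM_PROTECTED machine-type flag (hypothetical name). */
	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	#ifndef KVM_VM_TYPE_ARM_PROTECTED
	#define KVM_VM_TYPE_ARM_PROTECTED	(1UL << 31)	/* assumed value */
	#endif

	int main(void)
	{
		int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);
		if (kvm < 0)
			return 1;

		/* IPA size 0 selects the default; OR in the protected flag. */
		int vm = ioctl(kvm, KVM_CREATE_VM,
			       KVM_VM_TYPE_ARM_IPA_SIZE(0) | KVM_VM_TYPE_ARM_PROTECTED);
		if (vm < 0)
			perror("KVM_CREATE_VM");	/* rejected unless kvm-arm.mode=protected */
		return vm < 0;
	}

Per the cover letter, this additionally requires the developer Kconfig option
and results in a kernel taint.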
arch/arm64/include/asm/kvm_asm.h
arch/arm64/include/asm/kvm_host.h
arch/arm64/kvm/arm.c
arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
arch/arm64/kvm/hyp/nvhe/hyp-main.c
arch/arm64/kvm/hyp/nvhe/switch.c
arch/arm64/kvm/hyp/nvhe/sys_regs.c
arch/arm64/kvm/mmu.c
include/uapi/linux/kvm.h

index 93371b6138d1d5a78c21e9bef906d47367254d4e,6c79f7504d80f5c65ce84e7f77f73e8da8dae3b2..37414440cee7f1ce2e31e9e6a3d8263e97eb5cd3
@@@ -81,8 -74,18 +74,20 @@@ enum __kvm_host_smccc_func 
        __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff,
        __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs,
        __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_vmcr_aprs,
 -      __KVM_HOST_SMCCC_FUNC_MAX_NO_PKVM = __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_vmcr_aprs,
 +      __KVM_HOST_SMCCC_FUNC___vgic_v5_save_apr,
 +      __KVM_HOST_SMCCC_FUNC___vgic_v5_restore_vmcr_apr,
++      __KVM_HOST_SMCCC_FUNC_MAX_NO_PKVM = __KVM_HOST_SMCCC_FUNC___vgic_v5_restore_vmcr_apr,
+       /* Hypercalls that are available only when pKVM has finalised. */
+       __KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp,
+       __KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_hyp,
+       __KVM_HOST_SMCCC_FUNC___pkvm_host_donate_guest,
+       __KVM_HOST_SMCCC_FUNC___pkvm_host_share_guest,
+       __KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_guest,
+       __KVM_HOST_SMCCC_FUNC___pkvm_host_relax_perms_guest,
+       __KVM_HOST_SMCCC_FUNC___pkvm_host_wrprotect_guest,
+       __KVM_HOST_SMCCC_FUNC___pkvm_host_test_clear_young_guest,
+       __KVM_HOST_SMCCC_FUNC___pkvm_host_mkyoung_guest,
        __KVM_HOST_SMCCC_FUNC___pkvm_reserve_vm,
        __KVM_HOST_SMCCC_FUNC___pkvm_unreserve_vm,
        __KVM_HOST_SMCCC_FUNC___pkvm_init_vm,
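The __KVM_HOST_SMCCC_FUNC_MAX_NO_PKVM sentinel splits the enum so the EL2
dispatcher can refuse the pKVM-only hypercalls until pKVM has finalised. A
minimal sketch of such a guard, assuming a boolean finalisation flag; the
helper name is illustrative, not the series' actual dispatcher code:

	/* Illustrative guard built on the sentinel above; pkvm_finalised
	 * stands in for whatever state the real dispatcher consults. */
	static bool host_hcall_allowed(unsigned long id, bool pkvm_finalised)
	{
		/* pKVM-only hypercalls are rejected until finalisation. */
		if (!pkvm_finalised)
			return id <= __KVM_HOST_SMCCC_FUNC_MAX_NO_PKVM;
		return true;
	}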
Simple merge
Simple merge
index f8a7b8c04c490fb0e0ed3718323f5e210eb65bb5,5031879ccb87badd306ea5fe9ef5f1e597c2ece4..3cbfae0e3dda13c5f802629de5545b0d4a6f37a7
@@@ -27,11 -27,15 +27,13 @@@ extern struct host_mmu host_mmu
  enum pkvm_component_id {
        PKVM_ID_HOST,
        PKVM_ID_HYP,
-       PKVM_ID_FFA,
+       PKVM_ID_GUEST,
  };
  
 -extern unsigned long hyp_nr_cpus;
 -
  int __pkvm_prot_finalize(void);
  int __pkvm_host_share_hyp(u64 pfn);
+ int __pkvm_guest_share_host(struct pkvm_hyp_vcpu *vcpu, u64 gfn);
+ int __pkvm_guest_unshare_host(struct pkvm_hyp_vcpu *vcpu, u64 gfn);
  int __pkvm_host_unshare_hyp(u64 pfn);
  int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages);
  int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages);
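PKVM_ID_GUEST joins the component IDs so page ownership can be tracked per
guest, and __pkvm_guest_share_host()/__pkvm_guest_unshare_host() back the
MEM_SHARE/MEM_UNSHARE hypercalls issued from inside the protected guest. On
the guest side the call is a vendor SMCCC hypercall, as used by the
pkvm-guest driver under drivers/virt; a hedged in-guest sketch, with the
single-page granule and error mapping as assumptions:

	#include <linux/arm-smccc.h>
	#include <linux/errno.h>
	#include <linux/types.h>

	/* Share one page of guest memory back with the host. */
	static int pkvm_guest_share_page(phys_addr_t phys)
	{
		struct arm_smccc_res res;

		arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID,
				     phys, 0, 0, &res);
		return res.a0 == SMCCC_RET_SUCCESS ? 0 : -EPERM;
	}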
index 31f080307d959422f66f2d96a657518c8441f510,90e3b14fe287962541d8aab173a206e0271d7443..fd391eda8fca954d0201906f0dbf6cd9bc6cf30d
@@@ -588,82 -601,16 +603,89 @@@ static void handle___pkvm_start_teardow
  {
        DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
  
-       cpu_reg(host_ctxt, 1) = __pkvm_teardown_vm(handle);
+       cpu_reg(host_ctxt, 1) = __pkvm_start_teardown_vm(handle);
+ }
+
+ static void handle___pkvm_finalize_teardown_vm(struct kvm_cpu_context *host_ctxt)
+ {
+       DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
+
+       cpu_reg(host_ctxt, 1) = __pkvm_finalize_teardown_vm(handle);
  }
  
 +static void handle___tracing_load(struct kvm_cpu_context *host_ctxt)
 +{
 +      DECLARE_REG(unsigned long, desc_hva, host_ctxt, 1);
 +      DECLARE_REG(size_t, desc_size, host_ctxt, 2);
 +
 +      cpu_reg(host_ctxt, 1) = __tracing_load(desc_hva, desc_size);
 +}
 +
 +static void handle___tracing_unload(struct kvm_cpu_context *host_ctxt)
 +{
 +      __tracing_unload();
 +}
 +
 +static void handle___tracing_enable(struct kvm_cpu_context *host_ctxt)
 +{
 +      DECLARE_REG(bool, enable, host_ctxt, 1);
 +
 +      cpu_reg(host_ctxt, 1) = __tracing_enable(enable);
 +}
 +
 +static void handle___tracing_swap_reader(struct kvm_cpu_context *host_ctxt)
 +{
 +      DECLARE_REG(unsigned int, cpu, host_ctxt, 1);
 +
 +      cpu_reg(host_ctxt, 1) = __tracing_swap_reader(cpu);
 +}
 +
 +static void handle___tracing_update_clock(struct kvm_cpu_context *host_ctxt)
 +{
 +      DECLARE_REG(u32, mult, host_ctxt, 1);
 +      DECLARE_REG(u32, shift, host_ctxt, 2);
 +      DECLARE_REG(u64, epoch_ns, host_ctxt, 3);
 +      DECLARE_REG(u64, epoch_cyc, host_ctxt, 4);
 +
 +      __tracing_update_clock(mult, shift, epoch_ns, epoch_cyc);
 +}
 +
 +static void handle___tracing_reset(struct kvm_cpu_context *host_ctxt)
 +{
 +      DECLARE_REG(unsigned int, cpu, host_ctxt, 1);
 +
 +      cpu_reg(host_ctxt, 1) = __tracing_reset(cpu);
 +}
 +
 +static void handle___tracing_enable_event(struct kvm_cpu_context *host_ctxt)
 +{
 +      DECLARE_REG(unsigned short, id, host_ctxt, 1);
 +      DECLARE_REG(bool, enable, host_ctxt, 2);
 +
 +      cpu_reg(host_ctxt, 1) = __tracing_enable_event(id, enable);
 +}
 +
 +static void handle___tracing_write_event(struct kvm_cpu_context *host_ctxt)
 +{
 +      DECLARE_REG(u64, id, host_ctxt, 1);
 +
 +      trace_selftest(id);
 +}
 +
 +static void handle___vgic_v5_save_apr(struct kvm_cpu_context *host_ctxt)
 +{
 +      DECLARE_REG(struct vgic_v5_cpu_if *, cpu_if, host_ctxt, 1);
 +
 +      __vgic_v5_save_apr(kern_hyp_va(cpu_if));
 +}
 +
 +static void handle___vgic_v5_restore_vmcr_apr(struct kvm_cpu_context *host_ctxt)
 +{
 +      DECLARE_REG(struct vgic_v5_cpu_if *, cpu_if, host_ctxt, 1);
 +
 +      __vgic_v5_restore_vmcr_apr(kern_hyp_va(cpu_if));
 +}
 +
  typedef void (*hcall_t)(struct kvm_cpu_context *);
  
  #define HANDLE_FUNC(x)        [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x
@@@ -697,8 -636,16 +711,18 @@@ static const hcall_t host_hcall[] = 
        HANDLE_FUNC(__kvm_timer_set_cntvoff),
        HANDLE_FUNC(__vgic_v3_save_aprs),
        HANDLE_FUNC(__vgic_v3_restore_vmcr_aprs),
 +      HANDLE_FUNC(__vgic_v5_save_apr),
 +      HANDLE_FUNC(__vgic_v5_restore_vmcr_apr),
+       HANDLE_FUNC(__pkvm_host_share_hyp),
+       HANDLE_FUNC(__pkvm_host_unshare_hyp),
+       HANDLE_FUNC(__pkvm_host_donate_guest),
+       HANDLE_FUNC(__pkvm_host_share_guest),
+       HANDLE_FUNC(__pkvm_host_unshare_guest),
+       HANDLE_FUNC(__pkvm_host_relax_perms_guest),
+       HANDLE_FUNC(__pkvm_host_wrprotect_guest),
+       HANDLE_FUNC(__pkvm_host_test_clear_young_guest),
+       HANDLE_FUNC(__pkvm_host_mkyoung_guest),
        HANDLE_FUNC(__pkvm_reserve_vm),
        HANDLE_FUNC(__pkvm_unreserve_vm),
        HANDLE_FUNC(__pkvm_init_vm),
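For reference, host_hcall[] is a designated-initializer table indexed by the
__kvm_host_smccc_func enum, so dispatch is a bounds check plus an indirect
call. A simplified restatement of that lookup (the real handle_host_hcall()
also masks SMCCC call hints and enforces a post-finalisation minimum ID):

	static void dispatch_host_hcall(unsigned long id,
					struct kvm_cpu_context *host_ctxt)
	{
		hcall_t hfn;

		id -= KVM_HOST_SMCCC_ID(0);	/* function IDs map to table indices */
		if (id >= ARRAY_SIZE(host_hcall) || !(hfn = host_hcall[id])) {
			cpu_reg(host_ctxt, 0) = SMCCC_RET_NOT_SUPPORTED;
			return;
		}
		cpu_reg(host_ctxt, 0) = SMCCC_RET_SUCCESS;
		hfn(host_ctxt);
	}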
Simple merge
Simple merge
index 03e1f389339c71089e176efd1662df11cab5a544,45358ae8a300827955f13bc17124bc9c42245ccb..c2e12d256e43f4f4c1df6906f9c78148bf07acd1
@@@ -1644,34 -1639,134 +1644,103 @@@ out_unlock
        return ret != -EAGAIN ? ret : 0;
  }
  
 -static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 -              struct kvm_memory_slot *memslot, unsigned long hva)
 +struct kvm_s2_fault_vma_info {
 +      unsigned long   mmu_seq;
 +      long            vma_pagesize;
 +      vm_flags_t      vm_flags;
 +      unsigned long   max_map_size;
 +      struct page     *page;
 +      kvm_pfn_t       pfn;
 +      gfn_t           gfn;
 +      bool            device;
 +      bool            mte_allowed;
 +      bool            is_vma_cacheable;
 +      bool            map_writable;
 +      bool            map_non_cacheable;
 +};
 +
++static int pkvm_mem_abort(const struct kvm_s2_fault_desc *s2fd)
+ {
+       unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE;
++      struct kvm_vcpu *vcpu = s2fd->vcpu;
+       struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt;
+       struct mm_struct *mm = current->mm;
+       struct kvm *kvm = vcpu->kvm;
+       void *hyp_memcache;
+       struct page *page;
+       int ret;
+
 -      ret = prepare_mmu_memcache(vcpu, true, &hyp_memcache);
++      hyp_memcache = get_mmu_memcache(vcpu);
++      ret = topup_mmu_memcache(vcpu, hyp_memcache);
+       if (ret)
+               return -ENOMEM;
+
+       ret = account_locked_vm(mm, 1, true);
+       if (ret)
+               return ret;
+
+       mmap_read_lock(mm);
 -      ret = pin_user_pages(hva, 1, flags, &page);
++      ret = pin_user_pages(s2fd->hva, 1, flags, &page);
+       mmap_read_unlock(mm);
+       if (ret == -EHWPOISON) {
 -              kvm_send_hwpoison_signal(hva, PAGE_SHIFT);
++              kvm_send_hwpoison_signal(s2fd->hva, PAGE_SHIFT);
+               ret = 0;
+               goto dec_account;
+       } else if (ret != 1) {
+               ret = -EFAULT;
+               goto dec_account;
+       } else if (!folio_test_swapbacked(page_folio(page))) {
+               /*
+                * We really can't deal with page-cache pages returned by GUP
+                * because (a) we may trigger writeback of a page for which we
+                * no longer have access and (b) page_mkclean() won't find the
+                * stage-2 mapping in the rmap so we can get out-of-whack with
+                * the filesystem when marking the page dirty during unpinning
+                * (see cc5095747edf ("ext4: don't BUG if someone dirty pages
+                * without asking ext4 first")).
+                *
+                * Ideally we'd just restrict ourselves to anonymous pages, but
+                * we also want to allow memfd (i.e. shmem) pages, so check for
+                * pages backed by swap in the knowledge that the GUP pin will
+                * prevent try_to_unmap() from succeeding.
+                */
+               ret = -EIO;
+               goto unpin;
+       }
+
+       write_lock(&kvm->mmu_lock);
 -      ret = pkvm_pgtable_stage2_map(pgt, fault_ipa, PAGE_SIZE,
++      ret = pkvm_pgtable_stage2_map(pgt, s2fd->fault_ipa, PAGE_SIZE,
+                                     page_to_phys(page), KVM_PGTABLE_PROT_RWX,
+                                     hyp_memcache, 0);
+       write_unlock(&kvm->mmu_lock);
+       if (ret) {
+               if (ret == -EAGAIN)
+                       ret = 0;
+               goto unpin;
+       }
+
+       return 0;
+
+ unpin:
+       unpin_user_pages(&page, 1);
+ dec_account:
+       account_locked_vm(mm, 1, false);
+       return ret;
+ }
 -static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 -                        struct kvm_s2_trans *nested,
 -                        struct kvm_memory_slot *memslot, unsigned long hva,
 -                        bool fault_is_perm)
 +static short kvm_s2_resolve_vma_size(const struct kvm_s2_fault_desc *s2fd,
 +                                   struct kvm_s2_fault_vma_info *s2vi,
 +                                   struct vm_area_struct *vma)
  {
 -      int ret = 0;
 -      bool topup_memcache;
 -      bool write_fault, writable;
 -      bool exec_fault, mte_allowed, is_vma_cacheable;
 -      bool s2_force_noncacheable = false, vfio_allow_any_uc = false;
 -      unsigned long mmu_seq;
 -      phys_addr_t ipa = fault_ipa;
 -      struct kvm *kvm = vcpu->kvm;
 -      struct vm_area_struct *vma;
        short vma_shift;
 -      void *memcache;
 -      gfn_t gfn;
 -      kvm_pfn_t pfn;
 -      bool logging_active = memslot_is_logging(memslot);
 -      bool force_pte = logging_active;
 -      long vma_pagesize, fault_granule;
 -      enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
 -      struct kvm_pgtable *pgt;
 -      struct page *page;
 -      vm_flags_t vm_flags;
 -      enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED;
 -
 -      if (fault_is_perm)
 -              fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
 -      write_fault = kvm_is_write_fault(vcpu);
 -      exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
 -      VM_WARN_ON_ONCE(write_fault && exec_fault);
 -
 -      /*
 -       * Permission faults just need to update the existing leaf entry,
 -       * and so normally don't require allocations from the memcache. The
 -       * only exception to this is when dirty logging is enabled at runtime
 -       * and a write fault needs to collapse a block entry into a table.
 -       */
 -      topup_memcache = !fault_is_perm || (logging_active && write_fault);
 -      ret = prepare_mmu_memcache(vcpu, topup_memcache, &memcache);
 -      if (ret)
 -              return ret;
 -
 -      /*
 -       * Let's check if we will get back a huge page backed by hugetlbfs, or
 -       * get block mapping for device MMIO region.
 -       */
 -      mmap_read_lock(current->mm);
 -      vma = vma_lookup(current->mm, hva);
 -      if (unlikely(!vma)) {
 -              kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
 -              mmap_read_unlock(current->mm);
 -              return -EFAULT;
 -      }
  
 -      if (force_pte)
 +      if (memslot_is_logging(s2fd->memslot)) {
 +              s2vi->max_map_size = PAGE_SIZE;
                vma_shift = PAGE_SHIFT;
 -      else
 -              vma_shift = get_vma_page_shift(vma, hva);
 +      } else {
 +              s2vi->max_map_size = PUD_SIZE;
 +              vma_shift = get_vma_page_shift(vma, s2fd->hva);
 +      }
  
        switch (vma_shift) {
  #ifndef __PAGETABLE_PMD_FOLDED
@@@ -2285,22 -2270,20 +2354,27 @@@ int kvm_handle_guest_abort(struct kvm_v
                goto out_unlock;
        }
  
-       VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) &&
-                       !write_fault && !kvm_vcpu_trap_is_exec_fault(vcpu));
 +      const struct kvm_s2_fault_desc s2fd = {
 +              .vcpu           = vcpu,
 +              .fault_ipa      = fault_ipa,
 +              .nested         = nested,
 +              .memslot        = memslot,
 +              .hva            = hva,
 +      };
 +
-       if (kvm_slot_has_gmem(memslot))
-               ret = gmem_abort(&s2fd);
-       else
-               ret = user_mem_abort(&s2fd);
+       if (kvm_vm_is_protected(vcpu->kvm)) {
 -              ret = pkvm_mem_abort(vcpu, fault_ipa, memslot, hva);
++              ret = pkvm_mem_abort(&s2fd);
+       } else {
+               VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) &&
+                               !write_fault &&
+                               !kvm_vcpu_trap_is_exec_fault(vcpu));
+               if (kvm_slot_has_gmem(memslot))
 -                      ret = gmem_abort(vcpu, fault_ipa, nested, memslot,
 -                                       esr_fsc_is_permission_fault(esr));
++                      ret = gmem_abort(&s2fd);
+               else
 -                      ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
 -                                           esr_fsc_is_permission_fault(esr));
++                      ret = user_mem_abort(&s2fd);
+       }
 +
        if (ret == 0)
                ret = 1;
  out:
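End to end, the protected-VM fault path expects plain anonymous (or
memfd/shmem) memory behind the memslot, which pkvm_mem_abort() pins and maps
on demand; page-cache-backed file mappings are refused via the swap-backed
folio check. A hedged userspace sketch of wiring such a slot up (vm_fd and
size assumed to come from earlier VM setup):

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <linux/kvm.h>

	static int add_anon_memslot(int vm_fd, uint64_t gpa, size_t size)
	{
		/* Anonymous mapping: swap-backed, so the GUP pin in
		 * pkvm_mem_abort() accepts it; a regular file mapping
		 * would be rejected with -EIO. */
		void *mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (mem == MAP_FAILED)
			return -1;

		struct kvm_userspace_memory_region region = {
			.slot = 0,
			.guest_phys_addr = gpa,
			.memory_size = size,
			.userspace_addr = (uintptr_t)mem,
		};
		return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
	}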
Simple merge