From: Marc Zyngier
Date: Wed, 8 Apr 2026 11:25:39 +0000 (+0100)
Subject: Merge branch kvm-arm64/pkvm-protected-guest into kvmarm-master/next
X-Git-Tag: v7.1-rc1~118^2~11^2~2
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=83a3980750e3cc25cb7ded90f11c157eb3f9f428;p=thirdparty%2Fkernel%2Flinux.git

Merge branch kvm-arm64/pkvm-protected-guest into kvmarm-master/next

* kvm-arm64/pkvm-protected-guest: (41 commits)
  : .
  : pKVM support for protected guests, implementing the very
  : long-awaited support for anonymous memory, as the elusive guestmem
  : has failed to deliver on its promises despite a multi-year effort.
  : Patches courtesy of Will Deacon. From the initial cover letter:
  :
  : "[...] this patch series implements support for protected guest
  : memory with pKVM, where pages are unmapped from the host as they are
  : faulted into the guest and can be shared back from the guest using
  : pKVM hypercalls. Protected guests are created using a new machine
  : type identifier and can be booted to a shell using the kvmtool
  : patches available at [2], which finally means that we are able to
  : test the pVM logic in pKVM. Since this is an incremental step
  : towards full isolation from the host (for example, the CPU register
  : state and DMA accesses are not yet isolated), creating a pVM
  : requires a developer Kconfig option to be enabled in addition to
  : booting with 'kvm-arm.mode=protected' and results in a kernel
  : taint."
  : .
  KVM: arm64: Don't hold 'vm_table_lock' across guest page reclaim
  KVM: arm64: Allow get_pkvm_hyp_vm() to take a reference to a dying VM
  KVM: arm64: Prevent teardown finalisation of referenced 'hyp_vm'
  drivers/virt: pkvm: Add Kconfig dependency on DMA_RESTRICTED_POOL
  KVM: arm64: Rename PKVM_PAGE_STATE_MASK
  KVM: arm64: Extend pKVM page ownership selftests to cover guest hvcs
  KVM: arm64: Extend pKVM page ownership selftests to cover forced reclaim
  KVM: arm64: Register 'selftest_vm' in the VM table
  KVM: arm64: Extend pKVM page ownership selftests to cover guest donation
  KVM: arm64: Add some initial documentation for pKVM
  KVM: arm64: Allow userspace to create protected VMs when pKVM is enabled
  KVM: arm64: Implement the MEM_UNSHARE hypercall for protected VMs
  KVM: arm64: Implement the MEM_SHARE hypercall for protected VMs
  KVM: arm64: Add hvc handler at EL2 for hypercalls from protected VMs
  KVM: arm64: Return -EFAULT from VCPU_RUN on access to a poisoned pte
  KVM: arm64: Reclaim faulting page from pKVM in spurious fault handler
  KVM: arm64: Introduce hypercall to force reclaim of a protected page
  KVM: arm64: Annotate guest donations with handle and gfn in host stage-2
  KVM: arm64: Change 'pkvm_handle_t' to u16
  KVM: arm64: Introduce host_stage2_set_owner_metadata_locked()
  ...

Signed-off-by: Marc Zyngier
---

83a3980750e3cc25cb7ded90f11c157eb3f9f428
diff --cc arch/arm64/include/asm/kvm_asm.h
index 93371b6138d1d,6c79f7504d80f..37414440cee7f
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@@ -81,8 -74,18 +74,20 @@@ enum __kvm_host_smccc_func
  	__KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff,
  	__KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs,
  	__KVM_HOST_SMCCC_FUNC___vgic_v3_restore_vmcr_aprs,
- 	__KVM_HOST_SMCCC_FUNC_MAX_NO_PKVM = __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_vmcr_aprs,
 +	__KVM_HOST_SMCCC_FUNC___vgic_v5_save_apr,
 +	__KVM_HOST_SMCCC_FUNC___vgic_v5_restore_vmcr_apr,
++	__KVM_HOST_SMCCC_FUNC_MAX_NO_PKVM = __KVM_HOST_SMCCC_FUNC___vgic_v5_restore_vmcr_apr,
+
+ 	/* Hypercalls that are available only when pKVM has finalised. */
+ 	__KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp,
+ 	__KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_hyp,
+ 	__KVM_HOST_SMCCC_FUNC___pkvm_host_donate_guest,
+ 	__KVM_HOST_SMCCC_FUNC___pkvm_host_share_guest,
+ 	__KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_guest,
+ 	__KVM_HOST_SMCCC_FUNC___pkvm_host_relax_perms_guest,
+ 	__KVM_HOST_SMCCC_FUNC___pkvm_host_wrprotect_guest,
+ 	__KVM_HOST_SMCCC_FUNC___pkvm_host_test_clear_young_guest,
+ 	__KVM_HOST_SMCCC_FUNC___pkvm_host_mkyoung_guest,
  	__KVM_HOST_SMCCC_FUNC___pkvm_reserve_vm,
  	__KVM_HOST_SMCCC_FUNC___pkvm_unreserve_vm,
  	__KVM_HOST_SMCCC_FUNC___pkvm_init_vm,
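The __KVM_HOST_SMCCC_FUNC_MAX_NO_PKVM sentinel splits the enum: per the
comment above, every ID past it is only usable once pKVM has finalised. The
dispatch code enforcing this is not part of this merge diff; a minimal sketch
of such a gate, where is_protected_kvm_finalised() is an invented stand-in
for whatever the hypervisor actually checks, could look like:

    /* Sketch only: is_protected_kvm_finalised() is an invented helper. */
    static bool host_hcall_id_is_valid(unsigned long id)
    {
            if (id >= ARRAY_SIZE(host_hcall))
                    return false;

            /* IDs past the sentinel are valid only after finalisation. */
            if (id > __KVM_HOST_SMCCC_FUNC_MAX_NO_PKVM &&
                !is_protected_kvm_finalised())
                    return false;

            return true;
    }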
diff --cc arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
index f8a7b8c04c490,5031879ccb87b..3cbfae0e3dda1
--- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
@@@ -27,11 -27,15 +27,13 @@@ extern struct host_mmu host_mmu
  enum pkvm_component_id {
  	PKVM_ID_HOST,
  	PKVM_ID_HYP,
- 	PKVM_ID_FFA,
+ 	PKVM_ID_GUEST,
  };

 -extern unsigned long hyp_nr_cpus;
 -
  int __pkvm_prot_finalize(void);
  int __pkvm_host_share_hyp(u64 pfn);
+ int __pkvm_guest_share_host(struct pkvm_hyp_vcpu *vcpu, u64 gfn);
+ int __pkvm_guest_unshare_host(struct pkvm_hyp_vcpu *vcpu, u64 gfn);
  int __pkvm_host_unshare_hyp(u64 pfn);
  int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages);
  int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages);
diff --cc arch/arm64/kvm/hyp/nvhe/hyp-main.c
index 31f080307d959,90e3b14fe2879..fd391eda8fca9
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@@ -588,82 -601,16 +603,89 @@@ static void handle___pkvm_start_teardow
  {
  	DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);

- 	cpu_reg(host_ctxt, 1) = __pkvm_teardown_vm(handle);
+ 	cpu_reg(host_ctxt, 1) = __pkvm_start_teardown_vm(handle);
+ }
+
+ static void handle___pkvm_finalize_teardown_vm(struct kvm_cpu_context *host_ctxt)
+ {
+ 	DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
+
+ 	cpu_reg(host_ctxt, 1) = __pkvm_finalize_teardown_vm(handle);
  }

 +static void handle___tracing_load(struct kvm_cpu_context *host_ctxt)
 +{
 +	DECLARE_REG(unsigned long, desc_hva, host_ctxt, 1);
 +	DECLARE_REG(size_t, desc_size, host_ctxt, 2);
 +
 +	cpu_reg(host_ctxt, 1) = __tracing_load(desc_hva, desc_size);
 +}
 +
 +static void handle___tracing_unload(struct kvm_cpu_context *host_ctxt)
 +{
 +	__tracing_unload();
 +}
 +
 +static void handle___tracing_enable(struct kvm_cpu_context *host_ctxt)
 +{
 +	DECLARE_REG(bool, enable, host_ctxt, 1);
 +
 +	cpu_reg(host_ctxt, 1) = __tracing_enable(enable);
 +}
 +
 +static void handle___tracing_swap_reader(struct kvm_cpu_context *host_ctxt)
 +{
 +	DECLARE_REG(unsigned int, cpu, host_ctxt, 1);
 +
 +	cpu_reg(host_ctxt, 1) = __tracing_swap_reader(cpu);
 +}
 +
 +static void handle___tracing_update_clock(struct kvm_cpu_context *host_ctxt)
 +{
 +	DECLARE_REG(u32, mult, host_ctxt, 1);
 +	DECLARE_REG(u32, shift, host_ctxt, 2);
 +	DECLARE_REG(u64, epoch_ns, host_ctxt, 3);
 +	DECLARE_REG(u64, epoch_cyc, host_ctxt, 4);
 +
 +	__tracing_update_clock(mult, shift, epoch_ns, epoch_cyc);
 +}
 +
 +static void handle___tracing_reset(struct kvm_cpu_context *host_ctxt)
 +{
 +	DECLARE_REG(unsigned int, cpu, host_ctxt, 1);
 +
 +	cpu_reg(host_ctxt, 1) = __tracing_reset(cpu);
 +}
 +
 +static void handle___tracing_enable_event(struct kvm_cpu_context *host_ctxt)
 +{
 +	DECLARE_REG(unsigned short, id, host_ctxt, 1);
 +	DECLARE_REG(bool, enable, host_ctxt, 2);
 +
 +	cpu_reg(host_ctxt, 1) = __tracing_enable_event(id, enable);
 +}
 +
 +static void handle___tracing_write_event(struct kvm_cpu_context *host_ctxt)
 +{
 +	DECLARE_REG(u64, id, host_ctxt, 1);
 +
 +	trace_selftest(id);
 +}
 +
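All of the EL2 handlers above share one calling convention: DECLARE_REG()
pulls each argument out of the host context's general-purpose registers, and
the result is written back into x1 via cpu_reg(). A sketch of the pattern for
a hypothetical hypercall (__pkvm_frob_page() is a made-up name, not part of
this series):

    /* Illustrative pattern only; __pkvm_frob_page() is invented. */
    static void handle___pkvm_frob_page(struct kvm_cpu_context *host_ctxt)
    {
            DECLARE_REG(u64, pfn, host_ctxt, 1);    /* argument from x1 */
            DECLARE_REG(u64, flags, host_ctxt, 2);  /* argument from x2 */

            /* The return value travels back to the host in x1. */
            cpu_reg(host_ctxt, 1) = __pkvm_frob_page(pfn, flags);
    }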
 +static void handle___vgic_v5_save_apr(struct kvm_cpu_context *host_ctxt)
 +{
 +	DECLARE_REG(struct vgic_v5_cpu_if *, cpu_if, host_ctxt, 1);
 +
 +	__vgic_v5_save_apr(kern_hyp_va(cpu_if));
 +}
 +
 +static void handle___vgic_v5_restore_vmcr_apr(struct kvm_cpu_context *host_ctxt)
 +{
 +	DECLARE_REG(struct vgic_v5_cpu_if *, cpu_if, host_ctxt, 1);
 +
 +	__vgic_v5_restore_vmcr_apr(kern_hyp_va(cpu_if));
 +}
 +
  typedef void (*hcall_t)(struct kvm_cpu_context *);

  #define HANDLE_FUNC(x)	[__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x

@@@ -697,8 -636,16 +711,18 @@@ static const hcall_t host_hcall[] =
  	HANDLE_FUNC(__kvm_timer_set_cntvoff),
  	HANDLE_FUNC(__vgic_v3_save_aprs),
  	HANDLE_FUNC(__vgic_v3_restore_vmcr_aprs),
 +	HANDLE_FUNC(__vgic_v5_save_apr),
 +	HANDLE_FUNC(__vgic_v5_restore_vmcr_apr),
+
+ 	HANDLE_FUNC(__pkvm_host_share_hyp),
+ 	HANDLE_FUNC(__pkvm_host_unshare_hyp),
+ 	HANDLE_FUNC(__pkvm_host_donate_guest),
+ 	HANDLE_FUNC(__pkvm_host_share_guest),
+ 	HANDLE_FUNC(__pkvm_host_unshare_guest),
+ 	HANDLE_FUNC(__pkvm_host_relax_perms_guest),
+ 	HANDLE_FUNC(__pkvm_host_wrprotect_guest),
+ 	HANDLE_FUNC(__pkvm_host_test_clear_young_guest),
+ 	HANDLE_FUNC(__pkvm_host_mkyoung_guest),
  	HANDLE_FUNC(__pkvm_reserve_vm),
  	HANDLE_FUNC(__pkvm_unreserve_vm),
  	HANDLE_FUNC(__pkvm_init_vm),
diff --cc arch/arm64/kvm/mmu.c
index 03e1f389339c7,45358ae8a3008..c2e12d256e43f
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@@ -1644,34 -1639,134 +1644,103 @@@ out_unlock
  	return ret != -EAGAIN ? ret : 0;
  }

 -static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 -			  struct kvm_memory_slot *memslot, unsigned long hva)
 +struct kvm_s2_fault_vma_info {
 +	unsigned long mmu_seq;
 +	long vma_pagesize;
 +	vm_flags_t vm_flags;
 +	unsigned long max_map_size;
 +	struct page *page;
 +	kvm_pfn_t pfn;
 +	gfn_t gfn;
 +	bool device;
 +	bool mte_allowed;
 +	bool is_vma_cacheable;
 +	bool map_writable;
 +	bool map_non_cacheable;
 +};
 +
++static int pkvm_mem_abort(const struct kvm_s2_fault_desc *s2fd)
+ {
+ 	unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE;
++	struct kvm_vcpu *vcpu = s2fd->vcpu;
+ 	struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt;
+ 	struct mm_struct *mm = current->mm;
+ 	struct kvm *kvm = vcpu->kvm;
+ 	void *hyp_memcache;
+ 	struct page *page;
+ 	int ret;
+
- 	ret = prepare_mmu_memcache(vcpu, true, &hyp_memcache);
++	hyp_memcache = get_mmu_memcache(vcpu);
++	ret = topup_mmu_memcache(vcpu, hyp_memcache);
+ 	if (ret)
+ 		return -ENOMEM;
+
+ 	ret = account_locked_vm(mm, 1, true);
+ 	if (ret)
+ 		return ret;
+
+ 	mmap_read_lock(mm);
- 	ret = pin_user_pages(hva, 1, flags, &page);
++	ret = pin_user_pages(s2fd->hva, 1, flags, &page);
+ 	mmap_read_unlock(mm);
+
+ 	if (ret == -EHWPOISON) {
- 		kvm_send_hwpoison_signal(hva, PAGE_SHIFT);
++		kvm_send_hwpoison_signal(s2fd->hva, PAGE_SHIFT);
+ 		ret = 0;
+ 		goto dec_account;
+ 	} else if (ret != 1) {
+ 		ret = -EFAULT;
+ 		goto dec_account;
+ 	} else if (!folio_test_swapbacked(page_folio(page))) {
+ 		/*
+ 		 * We really can't deal with page-cache pages returned by GUP
+ 		 * because (a) we may trigger writeback of a page for which we
+ 		 * no longer have access and (b) page_mkclean() won't find the
+ 		 * stage-2 mapping in the rmap so we can get out-of-whack with
+ 		 * the filesystem when marking the page dirty during unpinning
+ 		 * (see cc5095747edf ("ext4: don't BUG if someone dirty pages
+ 		 * without asking ext4 first")).
+ 		 *
+ 		 * Ideally we'd just restrict ourselves to anonymous pages, but
+ 		 * we also want to allow memfd (i.e. shmem) pages, so check for
+ 		 * pages backed by swap in the knowledge that the GUP pin will
+ 		 * prevent try_to_unmap() from succeeding.
+ 		 */
+ 		ret = -EIO;
+ 		goto unpin;
+ 	}
+
+ 	write_lock(&kvm->mmu_lock);
- 	ret = pkvm_pgtable_stage2_map(pgt, fault_ipa, PAGE_SIZE,
++	ret = pkvm_pgtable_stage2_map(pgt, s2fd->fault_ipa, PAGE_SIZE,
+ 				      page_to_phys(page), KVM_PGTABLE_PROT_RWX,
+ 				      hyp_memcache, 0);
+ 	write_unlock(&kvm->mmu_lock);
+ 	if (ret) {
+ 		if (ret == -EAGAIN)
+ 			ret = 0;
+ 		goto unpin;
+ 	}
+
+ 	return 0;
+ unpin:
+ 	unpin_user_pages(&page, 1);
+ dec_account:
+ 	account_locked_vm(mm, 1, false);
+ 	return ret;
+ }
+
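The folio_test_swapbacked() check is the heart of pkvm_mem_abort()'s policy:
only anonymous and shmem/memfd pages (both swap-backed) may be handed to the
guest, while page-cache pages are refused for the writeback reasons spelled
out in the comment. The same pin-and-filter idiom, isolated into a
self-contained sketch (the helper name is invented):

    /*
     * Sketch of the idiom above: long-term pin a single page and reject
     * anything that is not swap-backed, i.e. anything that is page-cache
     * rather than anonymous or shmem memory.
     */
    static struct page *pin_swapbacked_page(unsigned long hva)
    {
            unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE;
            struct page *page;
            int ret;

            mmap_read_lock(current->mm);
            ret = pin_user_pages(hva, 1, flags, &page);
            mmap_read_unlock(current->mm);

            if (ret != 1)
                    return ERR_PTR(ret < 0 ? ret : -EFAULT);

            if (!folio_test_swapbacked(page_folio(page))) {
                    /* Page-cache page: writeback could go wrong, refuse. */
                    unpin_user_pages(&page, 1);
                    return ERR_PTR(-EIO);
            }

            return page;
    }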
 -static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 -			  struct kvm_s2_trans *nested,
 -			  struct kvm_memory_slot *memslot, unsigned long hva,
 -			  bool fault_is_perm)
 +static short kvm_s2_resolve_vma_size(const struct kvm_s2_fault_desc *s2fd,
 +				     struct kvm_s2_fault_vma_info *s2vi,
 +				     struct vm_area_struct *vma)
  {
 -	int ret = 0;
 -	bool topup_memcache;
 -	bool write_fault, writable;
 -	bool exec_fault, mte_allowed, is_vma_cacheable;
 -	bool s2_force_noncacheable = false, vfio_allow_any_uc = false;
 -	unsigned long mmu_seq;
 -	phys_addr_t ipa = fault_ipa;
 -	struct kvm *kvm = vcpu->kvm;
 -	struct vm_area_struct *vma;
  	short vma_shift;
 -	void *memcache;
 -	gfn_t gfn;
 -	kvm_pfn_t pfn;
 -	bool logging_active = memslot_is_logging(memslot);
 -	bool force_pte = logging_active;
 -	long vma_pagesize, fault_granule;
 -	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
 -	struct kvm_pgtable *pgt;
 -	struct page *page;
 -	vm_flags_t vm_flags;
 -	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED;
 -
 -	if (fault_is_perm)
 -		fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
 -	write_fault = kvm_is_write_fault(vcpu);
 -	exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
 -	VM_WARN_ON_ONCE(write_fault && exec_fault);
 -
 -	/*
 -	 * Permission faults just need to update the existing leaf entry,
 -	 * and so normally don't require allocations from the memcache. The
 -	 * only exception to this is when dirty logging is enabled at runtime
 -	 * and a write fault needs to collapse a block entry into a table.
 -	 */
 -	topup_memcache = !fault_is_perm || (logging_active && write_fault);
 -	ret = prepare_mmu_memcache(vcpu, topup_memcache, &memcache);
 -	if (ret)
 -		return ret;
 -
 -	/*
 -	 * Let's check if we will get back a huge page backed by hugetlbfs, or
 -	 * get block mapping for device MMIO region.
 -	 */
 -	mmap_read_lock(current->mm);
 -	vma = vma_lookup(current->mm, hva);
 -	if (unlikely(!vma)) {
 -		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
 -		mmap_read_unlock(current->mm);
 -		return -EFAULT;
 -	}
 -
 -	if (force_pte)
 +	if (memslot_is_logging(s2fd->memslot)) {
 +		s2vi->max_map_size = PAGE_SIZE;
  		vma_shift = PAGE_SHIFT;
 -	else
 -		vma_shift = get_vma_page_shift(vma, hva);
 +	} else {
 +		s2vi->max_map_size = PUD_SIZE;
 +		vma_shift = get_vma_page_shift(vma, s2fd->hva);
 +	}

  	switch (vma_shift) {
  #ifndef __PAGETABLE_PMD_FOLDED
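This refactoring threads a single const descriptor through the abort paths in
place of user_mem_abort()'s old six-argument signature. The definition of
struct kvm_s2_fault_desc is not part of this merge diff; inferred from the
initialiser in kvm_handle_guest_abort() below, it plausibly carries at least:

    /* Inferred from usage in this diff; the real definition may differ. */
    struct kvm_s2_fault_desc {
            struct kvm_vcpu *vcpu;
            phys_addr_t fault_ipa;
            struct kvm_s2_trans *nested;
            struct kvm_memory_slot *memslot;
            unsigned long hva;
    };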
@@@ -2285,22 -2270,20 +2354,27 @@@ int kvm_handle_guest_abort(struct kvm_v
  		goto out_unlock;
  	}

 -	VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) &&
 -			!write_fault && !kvm_vcpu_trap_is_exec_fault(vcpu));
 -
 +	const struct kvm_s2_fault_desc s2fd = {
 +		.vcpu		= vcpu,
 +		.fault_ipa	= fault_ipa,
 +		.nested		= nested,
 +		.memslot	= memslot,
 +		.hva		= hva,
 +	};
 +
- 	if (kvm_slot_has_gmem(memslot))
- 		ret = gmem_abort(&s2fd);
- 	else
- 		ret = user_mem_abort(&s2fd);
+ 	if (kvm_vm_is_protected(vcpu->kvm)) {
- 		ret = pkvm_mem_abort(vcpu, fault_ipa, memslot, hva);
++		ret = pkvm_mem_abort(&s2fd);
+ 	} else {
+ 		VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) &&
+ 				!write_fault &&
+ 				!kvm_vcpu_trap_is_exec_fault(vcpu));
+
+ 		if (kvm_slot_has_gmem(memslot))
- 			ret = gmem_abort(vcpu, fault_ipa, nested, memslot,
- 					 esr_fsc_is_permission_fault(esr));
++			ret = gmem_abort(&s2fd);
+ 		else
- 			ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
- 					     esr_fsc_is_permission_fault(esr));
++			ret = user_mem_abort(&s2fd);
+ 	}
+
  	if (ret == 0)
  		ret = 1;
  out:
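For context on the "new machine type identifier" mentioned in the cover
letter: a protected VM would be requested by passing that identifier as the
machine-type argument of KVM_CREATE_VM. The constant below is a placeholder,
since the actual value is defined elsewhere in the series, and per the cover
letter the call only succeeds with kvm-arm.mode=protected plus the developer
Kconfig option, and taints the kernel:

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Placeholder: the real identifier is defined by the series. */
    #ifndef KVM_VM_TYPE_ARM_PROTECTED
    #define KVM_VM_TYPE_ARM_PROTECTED	(1UL << 31)
    #endif

    int create_protected_vm(void)
    {
            int kvm = open("/dev/kvm", O_RDWR);

            if (kvm < 0)
                    return -1;

            /* Returns a VM fd on success, -1 with errno on failure. */
            return ioctl(kvm, KVM_CREATE_VM, KVM_VM_TYPE_ARM_PROTECTED);
    }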