__KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff,
__KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs,
__KVM_HOST_SMCCC_FUNC___vgic_v3_restore_vmcr_aprs,
- __KVM_HOST_SMCCC_FUNC_MAX_NO_PKVM = __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_vmcr_aprs,
+ __KVM_HOST_SMCCC_FUNC___vgic_v5_save_apr,
+ __KVM_HOST_SMCCC_FUNC___vgic_v5_restore_vmcr_apr,
++ __KVM_HOST_SMCCC_FUNC_MAX_NO_PKVM = __KVM_HOST_SMCCC_FUNC___vgic_v5_restore_vmcr_apr,
+
+ /* Hypercalls that are only available once pKVM has been finalised. */
+ __KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp,
+ __KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_hyp,
+ __KVM_HOST_SMCCC_FUNC___pkvm_host_donate_guest,
+ __KVM_HOST_SMCCC_FUNC___pkvm_host_share_guest,
+ __KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_guest,
+ __KVM_HOST_SMCCC_FUNC___pkvm_host_relax_perms_guest,
+ __KVM_HOST_SMCCC_FUNC___pkvm_host_wrprotect_guest,
+ __KVM_HOST_SMCCC_FUNC___pkvm_host_test_clear_young_guest,
+ __KVM_HOST_SMCCC_FUNC___pkvm_host_mkyoung_guest,
__KVM_HOST_SMCCC_FUNC___pkvm_reserve_vm,
__KVM_HOST_SMCCC_FUNC___pkvm_unreserve_vm,
__KVM_HOST_SMCCC_FUNC___pkvm_init_vm,
{
DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
- cpu_reg(host_ctxt, 1) = __pkvm_teardown_vm(handle);
+ cpu_reg(host_ctxt, 1) = __pkvm_start_teardown_vm(handle);
+ }
+
+ static void handle___pkvm_finalize_teardown_vm(struct kvm_cpu_context *host_ctxt)
+ {
+ DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
+
+ cpu_reg(host_ctxt, 1) = __pkvm_finalize_teardown_vm(handle);
}
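+
+/*
+ * Tracing hypercall handlers: each one unpacks the host's arguments from
+ * the CPU context registers and forwards them to the hyp tracing code,
+ * returning any result in x1.
+ */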
+static void handle___tracing_load(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(unsigned long, desc_hva, host_ctxt, 1);
+ DECLARE_REG(size_t, desc_size, host_ctxt, 2);
+
+ cpu_reg(host_ctxt, 1) = __tracing_load(desc_hva, desc_size);
+}
+
+static void handle___tracing_unload(struct kvm_cpu_context *host_ctxt)
+{
+ __tracing_unload();
+}
+
+static void handle___tracing_enable(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(bool, enable, host_ctxt, 1);
+
+ cpu_reg(host_ctxt, 1) = __tracing_enable(enable);
+}
+
+static void handle___tracing_swap_reader(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(unsigned int, cpu, host_ctxt, 1);
+
+ cpu_reg(host_ctxt, 1) = __tracing_swap_reader(cpu);
+}
+
+static void handle___tracing_update_clock(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(u32, mult, host_ctxt, 1);
+ DECLARE_REG(u32, shift, host_ctxt, 2);
+ DECLARE_REG(u64, epoch_ns, host_ctxt, 3);
+ DECLARE_REG(u64, epoch_cyc, host_ctxt, 4);
+
+ __tracing_update_clock(mult, shift, epoch_ns, epoch_cyc);
+}
+
+static void handle___tracing_reset(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(unsigned int, cpu, host_ctxt, 1);
+
+ cpu_reg(host_ctxt, 1) = __tracing_reset(cpu);
+}
+
+static void handle___tracing_enable_event(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(unsigned short, id, host_ctxt, 1);
+ DECLARE_REG(bool, enable, host_ctxt, 2);
+
+ cpu_reg(host_ctxt, 1) = __tracing_enable_event(id, enable);
+}
+
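+/* Inject a selftest trace event carrying the given id. */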
+static void handle___tracing_write_event(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(u64, id, host_ctxt, 1);
+
+ trace_selftest(id);
+}
+
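+/*
+ * GICv5 CPU interface save/restore on behalf of the host; cpu_if is a
+ * host kernel VA and is converted with kern_hyp_va() before use.
+ */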
+static void handle___vgic_v5_save_apr(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(struct vgic_v5_cpu_if *, cpu_if, host_ctxt, 1);
+
+ __vgic_v5_save_apr(kern_hyp_va(cpu_if));
+}
+
+static void handle___vgic_v5_restore_vmcr_apr(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(struct vgic_v5_cpu_if *, cpu_if, host_ctxt, 1);
+
+ __vgic_v5_restore_vmcr_apr(kern_hyp_va(cpu_if));
+}
+
typedef void (*hcall_t)(struct kvm_cpu_context *);
#define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x
HANDLE_FUNC(__kvm_timer_set_cntvoff),
HANDLE_FUNC(__vgic_v3_save_aprs),
HANDLE_FUNC(__vgic_v3_restore_vmcr_aprs),
+ HANDLE_FUNC(__vgic_v5_save_apr),
+ HANDLE_FUNC(__vgic_v5_restore_vmcr_apr),
+
+ HANDLE_FUNC(__pkvm_host_share_hyp),
+ HANDLE_FUNC(__pkvm_host_unshare_hyp),
+ HANDLE_FUNC(__pkvm_host_donate_guest),
+ HANDLE_FUNC(__pkvm_host_share_guest),
+ HANDLE_FUNC(__pkvm_host_unshare_guest),
+ HANDLE_FUNC(__pkvm_host_relax_perms_guest),
+ HANDLE_FUNC(__pkvm_host_wrprotect_guest),
+ HANDLE_FUNC(__pkvm_host_test_clear_young_guest),
+ HANDLE_FUNC(__pkvm_host_mkyoung_guest),
HANDLE_FUNC(__pkvm_reserve_vm),
HANDLE_FUNC(__pkvm_unreserve_vm),
HANDLE_FUNC(__pkvm_init_vm),
return ret != -EAGAIN ? ret : 0;
}
-static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
- struct kvm_memory_slot *memslot, unsigned long hva)
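+/*
+ * Per-fault state describing the host VMA and the backing page behind a
+ * stage-2 fault.
+ */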
+struct kvm_s2_fault_vma_info {
+ unsigned long mmu_seq;
+ long vma_pagesize;
+ vm_flags_t vm_flags;
+ unsigned long max_map_size;
+ struct page *page;
+ kvm_pfn_t pfn;
+ gfn_t gfn;
+ bool device;
+ bool mte_allowed;
+ bool is_vma_cacheable;
+ bool map_writable;
+ bool map_non_cacheable;
+};
+
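+/*
+ * Stage-2 abort handling for protected VMs: top up the hyp MMU memcache,
+ * long-term pin the faulting page with GUP, account it against the mm's
+ * locked memory and map it RWX at PAGE_SIZE granularity in the guest's
+ * stage-2 page table.
+ */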
++static int pkvm_mem_abort(const struct kvm_s2_fault_desc *s2fd)
+ {
+ unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE;
++ struct kvm_vcpu *vcpu = s2fd->vcpu;
+ struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt;
+ struct mm_struct *mm = current->mm;
+ struct kvm *kvm = vcpu->kvm;
+ void *hyp_memcache;
+ struct page *page;
+ int ret;
+
- ret = prepare_mmu_memcache(vcpu, true, &hyp_memcache);
++ hyp_memcache = get_mmu_memcache(vcpu);
++ ret = topup_mmu_memcache(vcpu, hyp_memcache);
+ if (ret)
+ return -ENOMEM;
+
+ ret = account_locked_vm(mm, 1, true);
+ if (ret)
+ return ret;
+
+ mmap_read_lock(mm);
- ret = pin_user_pages(hva, 1, flags, &page);
++ ret = pin_user_pages(s2fd->hva, 1, flags, &page);
+ mmap_read_unlock(mm);
+
+ if (ret == -EHWPOISON) {
- kvm_send_hwpoison_signal(hva, PAGE_SHIFT);
++ kvm_send_hwpoison_signal(s2fd->hva, PAGE_SHIFT);
+ ret = 0;
+ goto dec_account;
+ } else if (ret != 1) {
+ ret = -EFAULT;
+ goto dec_account;
+ } else if (!folio_test_swapbacked(page_folio(page))) {
+ /*
+ * We really can't deal with page-cache pages returned by GUP
+ * because (a) we may trigger writeback of a page for which we
+ * no longer have access and (b) page_mkclean() won't find the
+ * stage-2 mapping in the rmap so we can get out-of-whack with
+ * the filesystem when marking the page dirty during unpinning
+ * (see cc5095747edf ("ext4: don't BUG if someone dirty pages
+ * without asking ext4 first")).
+ *
+ * Ideally we'd just restrict ourselves to anonymous pages, but
+ * we also want to allow memfd (i.e. shmem) pages, so check for
+ * pages backed by swap in the knowledge that the GUP pin will
+ * prevent try_to_unmap() from succeeding.
+ */
+ ret = -EIO;
+ goto unpin;
+ }
+
+ write_lock(&kvm->mmu_lock);
- ret = pkvm_pgtable_stage2_map(pgt, fault_ipa, PAGE_SIZE,
++ ret = pkvm_pgtable_stage2_map(pgt, s2fd->fault_ipa, PAGE_SIZE,
+ page_to_phys(page), KVM_PGTABLE_PROT_RWX,
+ hyp_memcache, 0);
+ write_unlock(&kvm->mmu_lock);
+ if (ret) {
+ if (ret == -EAGAIN)
+ ret = 0;
+ goto unpin;
+ }
+
+ return 0;
+ unpin:
+ unpin_user_pages(&page, 1);
+ dec_account:
+ account_locked_vm(mm, 1, false);
+ return ret;
+ }
+
-static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
- struct kvm_s2_trans *nested,
- struct kvm_memory_slot *memslot, unsigned long hva,
- bool fault_is_perm)
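+/*
+ * Pick the mapping granule for a stage-2 fault from the backing VMA:
+ * dirty logging forces PAGE_SIZE mappings, otherwise the VMA page size
+ * is used and the resulting mapping is capped at PUD_SIZE.
+ */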
+static short kvm_s2_resolve_vma_size(const struct kvm_s2_fault_desc *s2fd,
+ struct kvm_s2_fault_vma_info *s2vi,
+ struct vm_area_struct *vma)
{
- int ret = 0;
- bool topup_memcache;
- bool write_fault, writable;
- bool exec_fault, mte_allowed, is_vma_cacheable;
- bool s2_force_noncacheable = false, vfio_allow_any_uc = false;
- unsigned long mmu_seq;
- phys_addr_t ipa = fault_ipa;
- struct kvm *kvm = vcpu->kvm;
- struct vm_area_struct *vma;
short vma_shift;
- void *memcache;
- gfn_t gfn;
- kvm_pfn_t pfn;
- bool logging_active = memslot_is_logging(memslot);
- bool force_pte = logging_active;
- long vma_pagesize, fault_granule;
- enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
- struct kvm_pgtable *pgt;
- struct page *page;
- vm_flags_t vm_flags;
- enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED;
-
- if (fault_is_perm)
- fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
- write_fault = kvm_is_write_fault(vcpu);
- exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
- VM_WARN_ON_ONCE(write_fault && exec_fault);
-
- /*
- * Permission faults just need to update the existing leaf entry,
- * and so normally don't require allocations from the memcache. The
- * only exception to this is when dirty logging is enabled at runtime
- * and a write fault needs to collapse a block entry into a table.
- */
- topup_memcache = !fault_is_perm || (logging_active && write_fault);
- ret = prepare_mmu_memcache(vcpu, topup_memcache, &memcache);
- if (ret)
- return ret;
-
- /*
- * Let's check if we will get back a huge page backed by hugetlbfs, or
- * get block mapping for device MMIO region.
- */
- mmap_read_lock(current->mm);
- vma = vma_lookup(current->mm, hva);
- if (unlikely(!vma)) {
- kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
- mmap_read_unlock(current->mm);
- return -EFAULT;
- }
- if (force_pte)
+ if (memslot_is_logging(s2fd->memslot)) {
+ s2vi->max_map_size = PAGE_SIZE;
vma_shift = PAGE_SHIFT;
- else
- vma_shift = get_vma_page_shift(vma, hva);
+ } else {
+ s2vi->max_map_size = PUD_SIZE;
+ vma_shift = get_vma_page_shift(vma, s2fd->hva);
+ }
switch (vma_shift) {
#ifndef __PAGETABLE_PMD_FOLDED
goto out_unlock;
}
- VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) &&
- !write_fault && !kvm_vcpu_trap_is_exec_fault(vcpu));
-
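+ /* Bundle the fault parameters shared by the abort handlers below. */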
+ const struct kvm_s2_fault_desc s2fd = {
+ .vcpu = vcpu,
+ .fault_ipa = fault_ipa,
+ .nested = nested,
+ .memslot = memslot,
+ .hva = hva,
+ };
+
- if (kvm_slot_has_gmem(memslot))
- ret = gmem_abort(&s2fd);
- else
- ret = user_mem_abort(&s2fd);
+ if (kvm_vm_is_protected(vcpu->kvm)) {
- ret = pkvm_mem_abort(vcpu, fault_ipa, memslot, hva);
++ ret = pkvm_mem_abort(&s2fd);
+ } else {
+ VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) &&
+ !write_fault &&
+ !kvm_vcpu_trap_is_exec_fault(vcpu));
+
+ if (kvm_slot_has_gmem(memslot))
- ret = gmem_abort(vcpu, fault_ipa, nested, memslot,
- esr_fsc_is_permission_fault(esr));
++ ret = gmem_abort(&s2fd);
+ else
- ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
- esr_fsc_is_permission_fault(esr));
++ ret = user_mem_abort(&s2fd);
+ }
+
if (ret == 0)
ret = 1;
out: