From f5714ce6e51b5f0a7f2e09a7e2b5756562e25d50 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 13 Sep 2024 14:53:47 +0200 Subject: [PATCH] 6.6-stable patches added patches: loongarch-kvm-implement-kvm-mmu-operations.patch loongarch-use-accessors-to-page-table-entries-instead-of-direct-dereference.patch net-xilinx-axienet-fix-race-in-axienet_stop.patch --- ...rch-kvm-implement-kvm-mmu-operations.patch | 1090 +++++++++++++++++ ...ntries-instead-of-direct-dereference.patch | 345 ++++++ ...inx-axienet-fix-race-in-axienet_stop.patch | 91 ++ queue-6.6/series | 3 + 4 files changed, 1529 insertions(+) create mode 100644 queue-6.6/loongarch-kvm-implement-kvm-mmu-operations.patch create mode 100644 queue-6.6/loongarch-use-accessors-to-page-table-entries-instead-of-direct-dereference.patch create mode 100644 queue-6.6/net-xilinx-axienet-fix-race-in-axienet_stop.patch diff --git a/queue-6.6/loongarch-kvm-implement-kvm-mmu-operations.patch b/queue-6.6/loongarch-kvm-implement-kvm-mmu-operations.patch new file mode 100644 index 00000000000..ae5e2168007 --- /dev/null +++ b/queue-6.6/loongarch-kvm-implement-kvm-mmu-operations.patch @@ -0,0 +1,1090 @@ +From stable+bounces-75645-greg=kroah.com@vger.kernel.org Tue Sep 10 15:12:27 2024 +From: He Lugang +Date: Tue, 10 Sep 2024 21:11:18 +0800 +Subject: LoongArch: KVM: Implement kvm mmu operations +To: stable@vger.kernel.org +Cc: Tianrui Zhao , Bibo Mao , Huacai Chen , He Lugang +Message-ID: <091CB73198EC428B+20240910131119.18625-1-helugang@uniontech.com> + +From: Tianrui Zhao + +commit 752e2cd7b4fb412f3e008493e0195e357bab9773 upstream + +Implement LoongArch kvm mmu, it is used to switch gpa to hpa when guest +exit because of address translation exception. + +This patch implement: allocating gpa page table, searching gpa from it, +and flushing guest gpa in the table. + +Reviewed-by: Bibo Mao +Tested-by: Huacai Chen +Signed-off-by: Tianrui Zhao +Signed-off-by: Huacai Chen +Signed-off-by: He Lugang +Signed-off-by: Greg Kroah-Hartman +--- + arch/loongarch/include/asm/kvm_mmu.h | 139 +++++ + arch/loongarch/kvm/mmu.c | 914 +++++++++++++++++++++++++++++++++++ + 2 files changed, 1053 insertions(+) + create mode 100644 arch/loongarch/include/asm/kvm_mmu.h + create mode 100644 arch/loongarch/kvm/mmu.c + +--- /dev/null ++++ b/arch/loongarch/include/asm/kvm_mmu.h +@@ -0,0 +1,139 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (C) 2020-2023 Loongson Technology Corporation Limited ++ */ ++ ++#ifndef __ASM_LOONGARCH_KVM_MMU_H__ ++#define __ASM_LOONGARCH_KVM_MMU_H__ ++ ++#include ++#include ++#include ++ ++/* ++ * KVM_MMU_CACHE_MIN_PAGES is the number of GPA page table translation levels ++ * for which pages need to be cached. ++ */ ++#define KVM_MMU_CACHE_MIN_PAGES (CONFIG_PGTABLE_LEVELS - 1) ++ ++#define _KVM_FLUSH_PGTABLE 0x1 ++#define _KVM_HAS_PGMASK 0x2 ++#define kvm_pfn_pte(pfn, prot) (((pfn) << PFN_PTE_SHIFT) | pgprot_val(prot)) ++#define kvm_pte_pfn(x) ((phys_addr_t)((x & _PFN_MASK) >> PFN_PTE_SHIFT)) ++ ++typedef unsigned long kvm_pte_t; ++typedef struct kvm_ptw_ctx kvm_ptw_ctx; ++typedef int (*kvm_pte_ops)(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx); ++ ++struct kvm_ptw_ctx { ++ kvm_pte_ops ops; ++ unsigned long flag; ++ ++ /* for kvm_arch_mmu_enable_log_dirty_pt_masked use */ ++ unsigned long mask; ++ unsigned long gfn; ++ ++ /* page walk mmu info */ ++ unsigned int level; ++ unsigned long pgtable_shift; ++ unsigned long invalid_entry; ++ unsigned long *invalid_ptes; ++ unsigned int *pte_shifts; ++ void *opaque; ++ ++ /* free pte table page list */ ++ struct list_head list; ++}; ++ ++kvm_pte_t *kvm_pgd_alloc(void); ++ ++static inline void kvm_set_pte(kvm_pte_t *ptep, kvm_pte_t val) ++{ ++ WRITE_ONCE(*ptep, val); ++} ++ ++static inline int kvm_pte_write(kvm_pte_t pte) { return pte & _PAGE_WRITE; } ++static inline int kvm_pte_dirty(kvm_pte_t pte) { return pte & _PAGE_DIRTY; } ++static inline int kvm_pte_young(kvm_pte_t pte) { return pte & _PAGE_ACCESSED; } ++static inline int kvm_pte_huge(kvm_pte_t pte) { return pte & _PAGE_HUGE; } ++ ++static inline kvm_pte_t kvm_pte_mkyoung(kvm_pte_t pte) ++{ ++ return pte | _PAGE_ACCESSED; ++} ++ ++static inline kvm_pte_t kvm_pte_mkold(kvm_pte_t pte) ++{ ++ return pte & ~_PAGE_ACCESSED; ++} ++ ++static inline kvm_pte_t kvm_pte_mkdirty(kvm_pte_t pte) ++{ ++ return pte | _PAGE_DIRTY; ++} ++ ++static inline kvm_pte_t kvm_pte_mkclean(kvm_pte_t pte) ++{ ++ return pte & ~_PAGE_DIRTY; ++} ++ ++static inline kvm_pte_t kvm_pte_mkhuge(kvm_pte_t pte) ++{ ++ return pte | _PAGE_HUGE; ++} ++ ++static inline kvm_pte_t kvm_pte_mksmall(kvm_pte_t pte) ++{ ++ return pte & ~_PAGE_HUGE; ++} ++ ++static inline int kvm_need_flush(kvm_ptw_ctx *ctx) ++{ ++ return ctx->flag & _KVM_FLUSH_PGTABLE; ++} ++ ++static inline kvm_pte_t *kvm_pgtable_offset(kvm_ptw_ctx *ctx, kvm_pte_t *table, ++ phys_addr_t addr) ++{ ++ ++ return table + ((addr >> ctx->pgtable_shift) & (PTRS_PER_PTE - 1)); ++} ++ ++static inline phys_addr_t kvm_pgtable_addr_end(kvm_ptw_ctx *ctx, ++ phys_addr_t addr, phys_addr_t end) ++{ ++ phys_addr_t boundary, size; ++ ++ size = 0x1UL << ctx->pgtable_shift; ++ boundary = (addr + size) & ~(size - 1); ++ return (boundary - 1 < end - 1) ? boundary : end; ++} ++ ++static inline int kvm_pte_present(kvm_ptw_ctx *ctx, kvm_pte_t *entry) ++{ ++ if (!ctx || ctx->level == 0) ++ return !!(*entry & _PAGE_PRESENT); ++ ++ return *entry != ctx->invalid_entry; ++} ++ ++static inline int kvm_pte_none(kvm_ptw_ctx *ctx, kvm_pte_t *entry) ++{ ++ return *entry == ctx->invalid_entry; ++} ++ ++static inline void kvm_ptw_enter(kvm_ptw_ctx *ctx) ++{ ++ ctx->level--; ++ ctx->pgtable_shift = ctx->pte_shifts[ctx->level]; ++ ctx->invalid_entry = ctx->invalid_ptes[ctx->level]; ++} ++ ++static inline void kvm_ptw_exit(kvm_ptw_ctx *ctx) ++{ ++ ctx->level++; ++ ctx->pgtable_shift = ctx->pte_shifts[ctx->level]; ++ ctx->invalid_entry = ctx->invalid_ptes[ctx->level]; ++} ++ ++#endif /* __ASM_LOONGARCH_KVM_MMU_H__ */ +--- /dev/null ++++ b/arch/loongarch/kvm/mmu.c +@@ -0,0 +1,914 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Copyright (C) 2020-2023 Loongson Technology Corporation Limited ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static inline void kvm_ptw_prepare(struct kvm *kvm, kvm_ptw_ctx *ctx) ++{ ++ ctx->level = kvm->arch.root_level; ++ /* pte table */ ++ ctx->invalid_ptes = kvm->arch.invalid_ptes; ++ ctx->pte_shifts = kvm->arch.pte_shifts; ++ ctx->pgtable_shift = ctx->pte_shifts[ctx->level]; ++ ctx->invalid_entry = ctx->invalid_ptes[ctx->level]; ++ ctx->opaque = kvm; ++} ++ ++/* ++ * Mark a range of guest physical address space old (all accesses fault) in the ++ * VM's GPA page table to allow detection of commonly used pages. ++ */ ++static int kvm_mkold_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx) ++{ ++ if (kvm_pte_young(*pte)) { ++ *pte = kvm_pte_mkold(*pte); ++ return 1; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Mark a range of guest physical address space clean (writes fault) in the VM's ++ * GPA page table to allow dirty page tracking. ++ */ ++static int kvm_mkclean_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx) ++{ ++ gfn_t offset; ++ kvm_pte_t val; ++ ++ val = *pte; ++ /* ++ * For kvm_arch_mmu_enable_log_dirty_pt_masked with mask, start and end ++ * may cross hugepage, for first huge page parameter addr is equal to ++ * start, however for the second huge page addr is base address of ++ * this huge page, rather than start or end address ++ */ ++ if ((ctx->flag & _KVM_HAS_PGMASK) && !kvm_pte_huge(val)) { ++ offset = (addr >> PAGE_SHIFT) - ctx->gfn; ++ if (!(BIT(offset) & ctx->mask)) ++ return 0; ++ } ++ ++ /* ++ * Need not split huge page now, just set write-proect pte bit ++ * Split huge page until next write fault ++ */ ++ if (kvm_pte_dirty(val)) { ++ *pte = kvm_pte_mkclean(val); ++ return 1; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Clear pte entry ++ */ ++static int kvm_flush_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx) ++{ ++ struct kvm *kvm; ++ ++ kvm = ctx->opaque; ++ if (ctx->level) ++ kvm->stat.hugepages--; ++ else ++ kvm->stat.pages--; ++ ++ *pte = ctx->invalid_entry; ++ ++ return 1; ++} ++ ++/* ++ * kvm_pgd_alloc() - Allocate and initialise a KVM GPA page directory. ++ * ++ * Allocate a blank KVM GPA page directory (PGD) for representing guest physical ++ * to host physical page mappings. ++ * ++ * Returns: Pointer to new KVM GPA page directory. ++ * NULL on allocation failure. ++ */ ++kvm_pte_t *kvm_pgd_alloc(void) ++{ ++ kvm_pte_t *pgd; ++ ++ pgd = (kvm_pte_t *)__get_free_pages(GFP_KERNEL, 0); ++ if (pgd) ++ pgd_init((void *)pgd); ++ ++ return pgd; ++} ++ ++static void _kvm_pte_init(void *addr, unsigned long val) ++{ ++ unsigned long *p, *end; ++ ++ p = (unsigned long *)addr; ++ end = p + PTRS_PER_PTE; ++ do { ++ p[0] = val; ++ p[1] = val; ++ p[2] = val; ++ p[3] = val; ++ p[4] = val; ++ p += 8; ++ p[-3] = val; ++ p[-2] = val; ++ p[-1] = val; ++ } while (p != end); ++} ++ ++/* ++ * Caller must hold kvm->mm_lock ++ * ++ * Walk the page tables of kvm to find the PTE corresponding to the ++ * address @addr. If page tables don't exist for @addr, they will be created ++ * from the MMU cache if @cache is not NULL. ++ */ ++static kvm_pte_t *kvm_populate_gpa(struct kvm *kvm, ++ struct kvm_mmu_memory_cache *cache, ++ unsigned long addr, int level) ++{ ++ kvm_ptw_ctx ctx; ++ kvm_pte_t *entry, *child; ++ ++ kvm_ptw_prepare(kvm, &ctx); ++ child = kvm->arch.pgd; ++ while (ctx.level > level) { ++ entry = kvm_pgtable_offset(&ctx, child, addr); ++ if (kvm_pte_none(&ctx, entry)) { ++ if (!cache) ++ return NULL; ++ ++ child = kvm_mmu_memory_cache_alloc(cache); ++ _kvm_pte_init(child, ctx.invalid_ptes[ctx.level - 1]); ++ kvm_set_pte(entry, __pa(child)); ++ } else if (kvm_pte_huge(*entry)) { ++ return entry; ++ } else ++ child = (kvm_pte_t *)__va(PHYSADDR(*entry)); ++ kvm_ptw_enter(&ctx); ++ } ++ ++ entry = kvm_pgtable_offset(&ctx, child, addr); ++ ++ return entry; ++} ++ ++/* ++ * Page walker for VM shadow mmu at last level ++ * The last level is small pte page or huge pmd page ++ */ ++static int kvm_ptw_leaf(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx) ++{ ++ int ret; ++ phys_addr_t next, start, size; ++ struct list_head *list; ++ kvm_pte_t *entry, *child; ++ ++ ret = 0; ++ start = addr; ++ child = (kvm_pte_t *)__va(PHYSADDR(*dir)); ++ entry = kvm_pgtable_offset(ctx, child, addr); ++ do { ++ next = addr + (0x1UL << ctx->pgtable_shift); ++ if (!kvm_pte_present(ctx, entry)) ++ continue; ++ ++ ret |= ctx->ops(entry, addr, ctx); ++ } while (entry++, addr = next, addr < end); ++ ++ if (kvm_need_flush(ctx)) { ++ size = 0x1UL << (ctx->pgtable_shift + PAGE_SHIFT - 3); ++ if (start + size == end) { ++ list = (struct list_head *)child; ++ list_add_tail(list, &ctx->list); ++ *dir = ctx->invalid_ptes[ctx->level + 1]; ++ } ++ } ++ ++ return ret; ++} ++ ++/* ++ * Page walker for VM shadow mmu at page table dir level ++ */ ++static int kvm_ptw_dir(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx) ++{ ++ int ret; ++ phys_addr_t next, start, size; ++ struct list_head *list; ++ kvm_pte_t *entry, *child; ++ ++ ret = 0; ++ start = addr; ++ child = (kvm_pte_t *)__va(PHYSADDR(*dir)); ++ entry = kvm_pgtable_offset(ctx, child, addr); ++ do { ++ next = kvm_pgtable_addr_end(ctx, addr, end); ++ if (!kvm_pte_present(ctx, entry)) ++ continue; ++ ++ if (kvm_pte_huge(*entry)) { ++ ret |= ctx->ops(entry, addr, ctx); ++ continue; ++ } ++ ++ kvm_ptw_enter(ctx); ++ if (ctx->level == 0) ++ ret |= kvm_ptw_leaf(entry, addr, next, ctx); ++ else ++ ret |= kvm_ptw_dir(entry, addr, next, ctx); ++ kvm_ptw_exit(ctx); ++ } while (entry++, addr = next, addr < end); ++ ++ if (kvm_need_flush(ctx)) { ++ size = 0x1UL << (ctx->pgtable_shift + PAGE_SHIFT - 3); ++ if (start + size == end) { ++ list = (struct list_head *)child; ++ list_add_tail(list, &ctx->list); ++ *dir = ctx->invalid_ptes[ctx->level + 1]; ++ } ++ } ++ ++ return ret; ++} ++ ++/* ++ * Page walker for VM shadow mmu at page root table ++ */ ++static int kvm_ptw_top(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx) ++{ ++ int ret; ++ phys_addr_t next; ++ kvm_pte_t *entry; ++ ++ ret = 0; ++ entry = kvm_pgtable_offset(ctx, dir, addr); ++ do { ++ next = kvm_pgtable_addr_end(ctx, addr, end); ++ if (!kvm_pte_present(ctx, entry)) ++ continue; ++ ++ kvm_ptw_enter(ctx); ++ ret |= kvm_ptw_dir(entry, addr, next, ctx); ++ kvm_ptw_exit(ctx); ++ } while (entry++, addr = next, addr < end); ++ ++ return ret; ++} ++ ++/* ++ * kvm_flush_range() - Flush a range of guest physical addresses. ++ * @kvm: KVM pointer. ++ * @start_gfn: Guest frame number of first page in GPA range to flush. ++ * @end_gfn: Guest frame number of last page in GPA range to flush. ++ * @lock: Whether to hold mmu_lock or not ++ * ++ * Flushes a range of GPA mappings from the GPA page tables. ++ */ ++static void kvm_flush_range(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn, int lock) ++{ ++ int ret; ++ kvm_ptw_ctx ctx; ++ struct list_head *pos, *temp; ++ ++ ctx.ops = kvm_flush_pte; ++ ctx.flag = _KVM_FLUSH_PGTABLE; ++ kvm_ptw_prepare(kvm, &ctx); ++ INIT_LIST_HEAD(&ctx.list); ++ ++ if (lock) { ++ spin_lock(&kvm->mmu_lock); ++ ret = kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT, ++ end_gfn << PAGE_SHIFT, &ctx); ++ spin_unlock(&kvm->mmu_lock); ++ } else ++ ret = kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT, ++ end_gfn << PAGE_SHIFT, &ctx); ++ ++ /* Flush vpid for each vCPU individually */ ++ if (ret) ++ kvm_flush_remote_tlbs(kvm); ++ ++ /* ++ * free pte table page after mmu_lock ++ * the pte table page is linked together with ctx.list ++ */ ++ list_for_each_safe(pos, temp, &ctx.list) { ++ list_del(pos); ++ free_page((unsigned long)pos); ++ } ++} ++ ++/* ++ * kvm_mkclean_gpa_pt() - Make a range of guest physical addresses clean. ++ * @kvm: KVM pointer. ++ * @start_gfn: Guest frame number of first page in GPA range to flush. ++ * @end_gfn: Guest frame number of last page in GPA range to flush. ++ * ++ * Make a range of GPA mappings clean so that guest writes will fault and ++ * trigger dirty page logging. ++ * ++ * The caller must hold the @kvm->mmu_lock spinlock. ++ * ++ * Returns: Whether any GPA mappings were modified, which would require ++ * derived mappings (GVA page tables & TLB enties) to be ++ * invalidated. ++ */ ++static int kvm_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn) ++{ ++ kvm_ptw_ctx ctx; ++ ++ ctx.ops = kvm_mkclean_pte; ++ ctx.flag = 0; ++ kvm_ptw_prepare(kvm, &ctx); ++ return kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT, end_gfn << PAGE_SHIFT, &ctx); ++} ++ ++/* ++ * kvm_arch_mmu_enable_log_dirty_pt_masked() - write protect dirty pages ++ * @kvm: The KVM pointer ++ * @slot: The memory slot associated with mask ++ * @gfn_offset: The gfn offset in memory slot ++ * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory ++ * slot to be write protected ++ * ++ * Walks bits set in mask write protects the associated pte's. Caller must ++ * acquire @kvm->mmu_lock. ++ */ ++void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, ++ struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) ++{ ++ kvm_ptw_ctx ctx; ++ gfn_t base_gfn = slot->base_gfn + gfn_offset; ++ gfn_t start = base_gfn + __ffs(mask); ++ gfn_t end = base_gfn + __fls(mask) + 1; ++ ++ ctx.ops = kvm_mkclean_pte; ++ ctx.flag = _KVM_HAS_PGMASK; ++ ctx.mask = mask; ++ ctx.gfn = base_gfn; ++ kvm_ptw_prepare(kvm, &ctx); ++ ++ kvm_ptw_top(kvm->arch.pgd, start << PAGE_SHIFT, end << PAGE_SHIFT, &ctx); ++} ++ ++void kvm_arch_commit_memory_region(struct kvm *kvm, ++ struct kvm_memory_slot *old, ++ const struct kvm_memory_slot *new, ++ enum kvm_mr_change change) ++{ ++ int needs_flush; ++ ++ /* ++ * If dirty page logging is enabled, write protect all pages in the slot ++ * ready for dirty logging. ++ * ++ * There is no need to do this in any of the following cases: ++ * CREATE: No dirty mappings will already exist. ++ * MOVE/DELETE: The old mappings will already have been cleaned up by ++ * kvm_arch_flush_shadow_memslot() ++ */ ++ if (change == KVM_MR_FLAGS_ONLY && ++ (!(old->flags & KVM_MEM_LOG_DIRTY_PAGES) && ++ new->flags & KVM_MEM_LOG_DIRTY_PAGES)) { ++ spin_lock(&kvm->mmu_lock); ++ /* Write protect GPA page table entries */ ++ needs_flush = kvm_mkclean_gpa_pt(kvm, new->base_gfn, ++ new->base_gfn + new->npages); ++ spin_unlock(&kvm->mmu_lock); ++ if (needs_flush) ++ kvm_flush_remote_tlbs(kvm); ++ } ++} ++ ++void kvm_arch_flush_shadow_all(struct kvm *kvm) ++{ ++ kvm_flush_range(kvm, 0, kvm->arch.gpa_size >> PAGE_SHIFT, 0); ++} ++ ++void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) ++{ ++ /* ++ * The slot has been made invalid (ready for moving or deletion), so we ++ * need to ensure that it can no longer be accessed by any guest vCPUs. ++ */ ++ kvm_flush_range(kvm, slot->base_gfn, slot->base_gfn + slot->npages, 1); ++} ++ ++bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) ++{ ++ kvm_ptw_ctx ctx; ++ ++ ctx.flag = 0; ++ ctx.ops = kvm_flush_pte; ++ kvm_ptw_prepare(kvm, &ctx); ++ INIT_LIST_HEAD(&ctx.list); ++ ++ return kvm_ptw_top(kvm->arch.pgd, range->start << PAGE_SHIFT, ++ range->end << PAGE_SHIFT, &ctx); ++} ++ ++bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) ++{ ++ unsigned long prot_bits; ++ kvm_pte_t *ptep; ++ kvm_pfn_t pfn = pte_pfn(range->arg.pte); ++ gpa_t gpa = range->start << PAGE_SHIFT; ++ ++ ptep = kvm_populate_gpa(kvm, NULL, gpa, 0); ++ if (!ptep) ++ return false; ++ ++ /* Replacing an absent or old page doesn't need flushes */ ++ if (!kvm_pte_present(NULL, ptep) || !kvm_pte_young(*ptep)) { ++ kvm_set_pte(ptep, 0); ++ return false; ++ } ++ ++ /* Fill new pte if write protected or page migrated */ ++ prot_bits = _PAGE_PRESENT | __READABLE; ++ prot_bits |= _CACHE_MASK & pte_val(range->arg.pte); ++ ++ /* ++ * Set _PAGE_WRITE or _PAGE_DIRTY iff old and new pte both support ++ * _PAGE_WRITE for map_page_fast if next page write fault ++ * _PAGE_DIRTY since gpa has already recorded as dirty page ++ */ ++ prot_bits |= __WRITEABLE & *ptep & pte_val(range->arg.pte); ++ kvm_set_pte(ptep, kvm_pfn_pte(pfn, __pgprot(prot_bits))); ++ ++ return true; ++} ++ ++bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) ++{ ++ kvm_ptw_ctx ctx; ++ ++ ctx.flag = 0; ++ ctx.ops = kvm_mkold_pte; ++ kvm_ptw_prepare(kvm, &ctx); ++ ++ return kvm_ptw_top(kvm->arch.pgd, range->start << PAGE_SHIFT, ++ range->end << PAGE_SHIFT, &ctx); ++} ++ ++bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) ++{ ++ gpa_t gpa = range->start << PAGE_SHIFT; ++ kvm_pte_t *ptep = kvm_populate_gpa(kvm, NULL, gpa, 0); ++ ++ if (ptep && kvm_pte_present(NULL, ptep) && kvm_pte_young(*ptep)) ++ return true; ++ ++ return false; ++} ++ ++/* ++ * kvm_map_page_fast() - Fast path GPA fault handler. ++ * @vcpu: vCPU pointer. ++ * @gpa: Guest physical address of fault. ++ * @write: Whether the fault was due to a write. ++ * ++ * Perform fast path GPA fault handling, doing all that can be done without ++ * calling into KVM. This handles marking old pages young (for idle page ++ * tracking), and dirtying of clean pages (for dirty page logging). ++ * ++ * Returns: 0 on success, in which case we can update derived mappings and ++ * resume guest execution. ++ * -EFAULT on failure due to absent GPA mapping or write to ++ * read-only page, in which case KVM must be consulted. ++ */ ++static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, bool write) ++{ ++ int ret = 0; ++ kvm_pfn_t pfn = 0; ++ kvm_pte_t *ptep, changed, new; ++ gfn_t gfn = gpa >> PAGE_SHIFT; ++ struct kvm *kvm = vcpu->kvm; ++ struct kvm_memory_slot *slot; ++ ++ spin_lock(&kvm->mmu_lock); ++ ++ /* Fast path - just check GPA page table for an existing entry */ ++ ptep = kvm_populate_gpa(kvm, NULL, gpa, 0); ++ if (!ptep || !kvm_pte_present(NULL, ptep)) { ++ ret = -EFAULT; ++ goto out; ++ } ++ ++ /* Track access to pages marked old */ ++ new = *ptep; ++ if (!kvm_pte_young(new)) ++ new = kvm_pte_mkyoung(new); ++ /* call kvm_set_pfn_accessed() after unlock */ ++ ++ if (write && !kvm_pte_dirty(new)) { ++ if (!kvm_pte_write(new)) { ++ ret = -EFAULT; ++ goto out; ++ } ++ ++ if (kvm_pte_huge(new)) { ++ /* ++ * Do not set write permission when dirty logging is ++ * enabled for HugePages ++ */ ++ slot = gfn_to_memslot(kvm, gfn); ++ if (kvm_slot_dirty_track_enabled(slot)) { ++ ret = -EFAULT; ++ goto out; ++ } ++ } ++ ++ /* Track dirtying of writeable pages */ ++ new = kvm_pte_mkdirty(new); ++ } ++ ++ changed = new ^ (*ptep); ++ if (changed) { ++ kvm_set_pte(ptep, new); ++ pfn = kvm_pte_pfn(new); ++ } ++ spin_unlock(&kvm->mmu_lock); ++ ++ /* ++ * Fixme: pfn may be freed after mmu_lock ++ * kvm_try_get_pfn(pfn)/kvm_release_pfn pair to prevent this? ++ */ ++ if (kvm_pte_young(changed)) ++ kvm_set_pfn_accessed(pfn); ++ ++ if (kvm_pte_dirty(changed)) { ++ mark_page_dirty(kvm, gfn); ++ kvm_set_pfn_dirty(pfn); ++ } ++ return ret; ++out: ++ spin_unlock(&kvm->mmu_lock); ++ return ret; ++} ++ ++static bool fault_supports_huge_mapping(struct kvm_memory_slot *memslot, ++ unsigned long hva, unsigned long map_size, bool write) ++{ ++ size_t size; ++ gpa_t gpa_start; ++ hva_t uaddr_start, uaddr_end; ++ ++ /* Disable dirty logging on HugePages */ ++ if (kvm_slot_dirty_track_enabled(memslot) && write) ++ return false; ++ ++ size = memslot->npages * PAGE_SIZE; ++ gpa_start = memslot->base_gfn << PAGE_SHIFT; ++ uaddr_start = memslot->userspace_addr; ++ uaddr_end = uaddr_start + size; ++ ++ /* ++ * Pages belonging to memslots that don't have the same alignment ++ * within a PMD for userspace and GPA cannot be mapped with stage-2 ++ * PMD entries, because we'll end up mapping the wrong pages. ++ * ++ * Consider a layout like the following: ++ * ++ * memslot->userspace_addr: ++ * +-----+--------------------+--------------------+---+ ++ * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz| ++ * +-----+--------------------+--------------------+---+ ++ * ++ * memslot->base_gfn << PAGE_SIZE: ++ * +---+--------------------+--------------------+-----+ ++ * |abc|def Stage-2 block | Stage-2 block |tvxyz| ++ * +---+--------------------+--------------------+-----+ ++ * ++ * If we create those stage-2 blocks, we'll end up with this incorrect ++ * mapping: ++ * d -> f ++ * e -> g ++ * f -> h ++ */ ++ if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1))) ++ return false; ++ ++ /* ++ * Next, let's make sure we're not trying to map anything not covered ++ * by the memslot. This means we have to prohibit block size mappings ++ * for the beginning and end of a non-block aligned and non-block sized ++ * memory slot (illustrated by the head and tail parts of the ++ * userspace view above containing pages 'abcde' and 'xyz', ++ * respectively). ++ * ++ * Note that it doesn't matter if we do the check using the ++ * userspace_addr or the base_gfn, as both are equally aligned (per ++ * the check above) and equally sized. ++ */ ++ return (hva & ~(map_size - 1)) >= uaddr_start && ++ (hva & ~(map_size - 1)) + map_size <= uaddr_end; ++} ++ ++/* ++ * Lookup the mapping level for @gfn in the current mm. ++ * ++ * WARNING! Use of host_pfn_mapping_level() requires the caller and the end ++ * consumer to be tied into KVM's handlers for MMU notifier events! ++ * ++ * There are several ways to safely use this helper: ++ * ++ * - Check mmu_invalidate_retry_hva() after grabbing the mapping level, before ++ * consuming it. In this case, mmu_lock doesn't need to be held during the ++ * lookup, but it does need to be held while checking the MMU notifier. ++ * ++ * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation ++ * event for the hva. This can be done by explicit checking the MMU notifier ++ * or by ensuring that KVM already has a valid mapping that covers the hva. ++ * ++ * - Do not use the result to install new mappings, e.g. use the host mapping ++ * level only to decide whether or not to zap an entry. In this case, it's ++ * not required to hold mmu_lock (though it's highly likely the caller will ++ * want to hold mmu_lock anyways, e.g. to modify SPTEs). ++ * ++ * Note! The lookup can still race with modifications to host page tables, but ++ * the above "rules" ensure KVM will not _consume_ the result of the walk if a ++ * race with the primary MMU occurs. ++ */ ++static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, ++ const struct kvm_memory_slot *slot) ++{ ++ int level = 0; ++ unsigned long hva; ++ unsigned long flags; ++ pgd_t pgd; ++ p4d_t p4d; ++ pud_t pud; ++ pmd_t pmd; ++ ++ /* ++ * Note, using the already-retrieved memslot and __gfn_to_hva_memslot() ++ * is not solely for performance, it's also necessary to avoid the ++ * "writable" check in __gfn_to_hva_many(), which will always fail on ++ * read-only memslots due to gfn_to_hva() assuming writes. Earlier ++ * page fault steps have already verified the guest isn't writing a ++ * read-only memslot. ++ */ ++ hva = __gfn_to_hva_memslot(slot, gfn); ++ ++ /* ++ * Disable IRQs to prevent concurrent tear down of host page tables, ++ * e.g. if the primary MMU promotes a P*D to a huge page and then frees ++ * the original page table. ++ */ ++ local_irq_save(flags); ++ ++ /* ++ * Read each entry once. As above, a non-leaf entry can be promoted to ++ * a huge page _during_ this walk. Re-reading the entry could send the ++ * walk into the weeks, e.g. p*d_large() returns false (sees the old ++ * value) and then p*d_offset() walks into the target huge page instead ++ * of the old page table (sees the new value). ++ */ ++ pgd = READ_ONCE(*pgd_offset(kvm->mm, hva)); ++ if (pgd_none(pgd)) ++ goto out; ++ ++ p4d = READ_ONCE(*p4d_offset(&pgd, hva)); ++ if (p4d_none(p4d) || !p4d_present(p4d)) ++ goto out; ++ ++ pud = READ_ONCE(*pud_offset(&p4d, hva)); ++ if (pud_none(pud) || !pud_present(pud)) ++ goto out; ++ ++ pmd = READ_ONCE(*pmd_offset(&pud, hva)); ++ if (pmd_none(pmd) || !pmd_present(pmd)) ++ goto out; ++ ++ if (kvm_pte_huge(pmd_val(pmd))) ++ level = 1; ++ ++out: ++ local_irq_restore(flags); ++ return level; ++} ++ ++/* ++ * Split huge page ++ */ ++static kvm_pte_t *kvm_split_huge(struct kvm_vcpu *vcpu, kvm_pte_t *ptep, gfn_t gfn) ++{ ++ int i; ++ kvm_pte_t val, *child; ++ struct kvm *kvm = vcpu->kvm; ++ struct kvm_mmu_memory_cache *memcache; ++ ++ memcache = &vcpu->arch.mmu_page_cache; ++ child = kvm_mmu_memory_cache_alloc(memcache); ++ val = kvm_pte_mksmall(*ptep); ++ for (i = 0; i < PTRS_PER_PTE; i++) { ++ kvm_set_pte(child + i, val); ++ val += PAGE_SIZE; ++ } ++ ++ /* The later kvm_flush_tlb_gpa() will flush hugepage tlb */ ++ kvm_set_pte(ptep, __pa(child)); ++ ++ kvm->stat.hugepages--; ++ kvm->stat.pages += PTRS_PER_PTE; ++ ++ return child + (gfn & (PTRS_PER_PTE - 1)); ++} ++ ++/* ++ * kvm_map_page() - Map a guest physical page. ++ * @vcpu: vCPU pointer. ++ * @gpa: Guest physical address of fault. ++ * @write: Whether the fault was due to a write. ++ * ++ * Handle GPA faults by creating a new GPA mapping (or updating an existing ++ * one). ++ * ++ * This takes care of marking pages young or dirty (idle/dirty page tracking), ++ * asking KVM for the corresponding PFN, and creating a mapping in the GPA page ++ * tables. Derived mappings (GVA page tables and TLBs) must be handled by the ++ * caller. ++ * ++ * Returns: 0 on success ++ * -EFAULT if there is no memory region at @gpa or a write was ++ * attempted to a read-only memory region. This is usually handled ++ * as an MMIO access. ++ */ ++static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, bool write) ++{ ++ bool writeable; ++ int srcu_idx, err, retry_no = 0, level; ++ unsigned long hva, mmu_seq, prot_bits; ++ kvm_pfn_t pfn; ++ kvm_pte_t *ptep, new_pte; ++ gfn_t gfn = gpa >> PAGE_SHIFT; ++ struct kvm *kvm = vcpu->kvm; ++ struct kvm_memory_slot *memslot; ++ struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; ++ ++ /* Try the fast path to handle old / clean pages */ ++ srcu_idx = srcu_read_lock(&kvm->srcu); ++ err = kvm_map_page_fast(vcpu, gpa, write); ++ if (!err) ++ goto out; ++ ++ memslot = gfn_to_memslot(kvm, gfn); ++ hva = gfn_to_hva_memslot_prot(memslot, gfn, &writeable); ++ if (kvm_is_error_hva(hva) || (write && !writeable)) { ++ err = -EFAULT; ++ goto out; ++ } ++ ++ /* We need a minimum of cached pages ready for page table creation */ ++ err = kvm_mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES); ++ if (err) ++ goto out; ++ ++retry: ++ /* ++ * Used to check for invalidations in progress, of the pfn that is ++ * returned by pfn_to_pfn_prot below. ++ */ ++ mmu_seq = kvm->mmu_invalidate_seq; ++ /* ++ * Ensure the read of mmu_invalidate_seq isn't reordered with PTE reads in ++ * gfn_to_pfn_prot() (which calls get_user_pages()), so that we don't ++ * risk the page we get a reference to getting unmapped before we have a ++ * chance to grab the mmu_lock without mmu_invalidate_retry() noticing. ++ * ++ * This smp_rmb() pairs with the effective smp_wmb() of the combination ++ * of the pte_unmap_unlock() after the PTE is zapped, and the ++ * spin_lock() in kvm_mmu_invalidate_invalidate_() before ++ * mmu_invalidate_seq is incremented. ++ */ ++ smp_rmb(); ++ ++ /* Slow path - ask KVM core whether we can access this GPA */ ++ pfn = gfn_to_pfn_prot(kvm, gfn, write, &writeable); ++ if (is_error_noslot_pfn(pfn)) { ++ err = -EFAULT; ++ goto out; ++ } ++ ++ /* Check if an invalidation has taken place since we got pfn */ ++ spin_lock(&kvm->mmu_lock); ++ if (mmu_invalidate_retry_hva(kvm, mmu_seq, hva)) { ++ /* ++ * This can happen when mappings are changed asynchronously, but ++ * also synchronously if a COW is triggered by ++ * gfn_to_pfn_prot(). ++ */ ++ spin_unlock(&kvm->mmu_lock); ++ kvm_release_pfn_clean(pfn); ++ if (retry_no > 100) { ++ retry_no = 0; ++ schedule(); ++ } ++ retry_no++; ++ goto retry; ++ } ++ ++ /* ++ * For emulated devices such virtio device, actual cache attribute is ++ * determined by physical machine. ++ * For pass through physical device, it should be uncachable ++ */ ++ prot_bits = _PAGE_PRESENT | __READABLE; ++ if (pfn_valid(pfn)) ++ prot_bits |= _CACHE_CC; ++ else ++ prot_bits |= _CACHE_SUC; ++ ++ if (writeable) { ++ prot_bits |= _PAGE_WRITE; ++ if (write) ++ prot_bits |= __WRITEABLE; ++ } ++ ++ /* Disable dirty logging on HugePages */ ++ level = 0; ++ if (!fault_supports_huge_mapping(memslot, hva, PMD_SIZE, write)) { ++ level = 0; ++ } else { ++ level = host_pfn_mapping_level(kvm, gfn, memslot); ++ if (level == 1) { ++ gfn = gfn & ~(PTRS_PER_PTE - 1); ++ pfn = pfn & ~(PTRS_PER_PTE - 1); ++ } ++ } ++ ++ /* Ensure page tables are allocated */ ++ ptep = kvm_populate_gpa(kvm, memcache, gpa, level); ++ new_pte = kvm_pfn_pte(pfn, __pgprot(prot_bits)); ++ if (level == 1) { ++ new_pte = kvm_pte_mkhuge(new_pte); ++ /* ++ * previous pmd entry is invalid_pte_table ++ * there is invalid tlb with small page ++ * need flush these invalid tlbs for current vcpu ++ */ ++ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); ++ ++kvm->stat.hugepages; ++ } else if (kvm_pte_huge(*ptep) && write) ++ ptep = kvm_split_huge(vcpu, ptep, gfn); ++ else ++ ++kvm->stat.pages; ++ kvm_set_pte(ptep, new_pte); ++ spin_unlock(&kvm->mmu_lock); ++ ++ if (prot_bits & _PAGE_DIRTY) { ++ mark_page_dirty_in_slot(kvm, memslot, gfn); ++ kvm_set_pfn_dirty(pfn); ++ } ++ ++ kvm_set_pfn_accessed(pfn); ++ kvm_release_pfn_clean(pfn); ++out: ++ srcu_read_unlock(&kvm->srcu, srcu_idx); ++ return err; ++} ++ ++int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long gpa, bool write) ++{ ++ int ret; ++ ++ ret = kvm_map_page(vcpu, gpa, write); ++ if (ret) ++ return ret; ++ ++ /* Invalidate this entry in the TLB */ ++ kvm_flush_tlb_gpa(vcpu, gpa); ++ ++ return 0; ++} ++ ++void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) ++{ ++} ++ ++int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, ++ struct kvm_memory_slot *new, enum kvm_mr_change change) ++{ ++ return 0; ++} ++ ++void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, ++ const struct kvm_memory_slot *memslot) ++{ ++ kvm_flush_remote_tlbs(kvm); ++} diff --git a/queue-6.6/loongarch-use-accessors-to-page-table-entries-instead-of-direct-dereference.patch b/queue-6.6/loongarch-use-accessors-to-page-table-entries-instead-of-direct-dereference.patch new file mode 100644 index 00000000000..183ef6a9a30 --- /dev/null +++ b/queue-6.6/loongarch-use-accessors-to-page-table-entries-instead-of-direct-dereference.patch @@ -0,0 +1,345 @@ +From stable+bounces-75642-greg=kroah.com@vger.kernel.org Tue Sep 10 15:12:22 2024 +From: He Lugang +Date: Tue, 10 Sep 2024 21:11:19 +0800 +Subject: LoongArch: Use accessors to page table entries instead of direct dereference +To: stable@vger.kernel.org +Cc: Huacai Chen +Message-ID: <1197B2966A66F7F9+20240910131119.18625-2-helugang@uniontech.com> + +From: Huacai Chen + +commit 4574815abf43e2bf05643e1b3f7a2e5d6df894f0 upstream + +As very well explained in commit 20a004e7b017cce282 ("arm64: mm: Use +READ_ONCE/WRITE_ONCE when accessing page tables"), an architecture whose +page table walker can modify the PTE in parallel must use READ_ONCE()/ +WRITE_ONCE() macro to avoid any compiler transformation. + +So apply that to LoongArch which is such an architecture, in order to +avoid potential problems. + +Similar to commit edf955647269422e ("riscv: Use accessors to page table +entries instead of direct dereference"). + +Signed-off-by: Huacai Chen +Signed-off-by: Greg Kroah-Hartman +--- + arch/loongarch/include/asm/hugetlb.h | 4 +- + arch/loongarch/include/asm/kfence.h | 6 ++-- + arch/loongarch/include/asm/pgtable.h | 48 +++++++++++++++++++++-------------- + arch/loongarch/kvm/mmu.c | 8 ++--- + arch/loongarch/mm/hugetlbpage.c | 6 ++-- + arch/loongarch/mm/init.c | 10 +++---- + arch/loongarch/mm/kasan_init.c | 10 +++---- + arch/loongarch/mm/pgtable.c | 2 - + 8 files changed, 52 insertions(+), 42 deletions(-) + +--- a/arch/loongarch/include/asm/hugetlb.h ++++ b/arch/loongarch/include/asm/hugetlb.h +@@ -34,7 +34,7 @@ static inline pte_t huge_ptep_get_and_cl + unsigned long addr, pte_t *ptep) + { + pte_t clear; +- pte_t pte = *ptep; ++ pte_t pte = ptep_get(ptep); + + pte_val(clear) = (unsigned long)invalid_pte_table; + set_pte_at(mm, addr, ptep, clear); +@@ -65,7 +65,7 @@ static inline int huge_ptep_set_access_f + pte_t *ptep, pte_t pte, + int dirty) + { +- int changed = !pte_same(*ptep, pte); ++ int changed = !pte_same(ptep_get(ptep), pte); + + if (changed) { + set_pte_at(vma->vm_mm, addr, ptep, pte); +--- a/arch/loongarch/include/asm/kfence.h ++++ b/arch/loongarch/include/asm/kfence.h +@@ -43,13 +43,13 @@ static inline bool kfence_protect_page(u + { + pte_t *pte = virt_to_kpte(addr); + +- if (WARN_ON(!pte) || pte_none(*pte)) ++ if (WARN_ON(!pte) || pte_none(ptep_get(pte))) + return false; + + if (protect) +- set_pte(pte, __pte(pte_val(*pte) & ~(_PAGE_VALID | _PAGE_PRESENT))); ++ set_pte(pte, __pte(pte_val(ptep_get(pte)) & ~(_PAGE_VALID | _PAGE_PRESENT))); + else +- set_pte(pte, __pte(pte_val(*pte) | (_PAGE_VALID | _PAGE_PRESENT))); ++ set_pte(pte, __pte(pte_val(ptep_get(pte)) | (_PAGE_VALID | _PAGE_PRESENT))); + + preempt_disable(); + local_flush_tlb_one(addr); +--- a/arch/loongarch/include/asm/pgtable.h ++++ b/arch/loongarch/include/asm/pgtable.h +@@ -106,6 +106,9 @@ extern unsigned long empty_zero_page[PAG + #define KFENCE_AREA_START (VMEMMAP_END + 1) + #define KFENCE_AREA_END (KFENCE_AREA_START + KFENCE_AREA_SIZE - 1) + ++#define ptep_get(ptep) READ_ONCE(*(ptep)) ++#define pmdp_get(pmdp) READ_ONCE(*(pmdp)) ++ + #define pte_ERROR(e) \ + pr_err("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e)) + #ifndef __PAGETABLE_PMD_FOLDED +@@ -147,11 +150,6 @@ static inline int p4d_present(p4d_t p4d) + return p4d_val(p4d) != (unsigned long)invalid_pud_table; + } + +-static inline void p4d_clear(p4d_t *p4dp) +-{ +- p4d_val(*p4dp) = (unsigned long)invalid_pud_table; +-} +- + static inline pud_t *p4d_pgtable(p4d_t p4d) + { + return (pud_t *)p4d_val(p4d); +@@ -159,7 +157,12 @@ static inline pud_t *p4d_pgtable(p4d_t p + + static inline void set_p4d(p4d_t *p4d, p4d_t p4dval) + { +- *p4d = p4dval; ++ WRITE_ONCE(*p4d, p4dval); ++} ++ ++static inline void p4d_clear(p4d_t *p4dp) ++{ ++ set_p4d(p4dp, __p4d((unsigned long)invalid_pud_table)); + } + + #define p4d_phys(p4d) PHYSADDR(p4d_val(p4d)) +@@ -193,17 +196,20 @@ static inline int pud_present(pud_t pud) + return pud_val(pud) != (unsigned long)invalid_pmd_table; + } + +-static inline void pud_clear(pud_t *pudp) ++static inline pmd_t *pud_pgtable(pud_t pud) + { +- pud_val(*pudp) = ((unsigned long)invalid_pmd_table); ++ return (pmd_t *)pud_val(pud); + } + +-static inline pmd_t *pud_pgtable(pud_t pud) ++static inline void set_pud(pud_t *pud, pud_t pudval) + { +- return (pmd_t *)pud_val(pud); ++ WRITE_ONCE(*pud, pudval); + } + +-#define set_pud(pudptr, pudval) do { *(pudptr) = (pudval); } while (0) ++static inline void pud_clear(pud_t *pudp) ++{ ++ set_pud(pudp, __pud((unsigned long)invalid_pmd_table)); ++} + + #define pud_phys(pud) PHYSADDR(pud_val(pud)) + #define pud_page(pud) (pfn_to_page(pud_phys(pud) >> PAGE_SHIFT)) +@@ -231,12 +237,15 @@ static inline int pmd_present(pmd_t pmd) + return pmd_val(pmd) != (unsigned long)invalid_pte_table; + } + +-static inline void pmd_clear(pmd_t *pmdp) ++static inline void set_pmd(pmd_t *pmd, pmd_t pmdval) + { +- pmd_val(*pmdp) = ((unsigned long)invalid_pte_table); ++ WRITE_ONCE(*pmd, pmdval); + } + +-#define set_pmd(pmdptr, pmdval) do { *(pmdptr) = (pmdval); } while (0) ++static inline void pmd_clear(pmd_t *pmdp) ++{ ++ set_pmd(pmdp, __pmd((unsigned long)invalid_pte_table)); ++} + + #define pmd_phys(pmd) PHYSADDR(pmd_val(pmd)) + +@@ -314,7 +323,8 @@ extern void paging_init(void); + + static inline void set_pte(pte_t *ptep, pte_t pteval) + { +- *ptep = pteval; ++ WRITE_ONCE(*ptep, pteval); ++ + if (pte_val(pteval) & _PAGE_GLOBAL) { + pte_t *buddy = ptep_buddy(ptep); + /* +@@ -341,8 +351,8 @@ static inline void set_pte(pte_t *ptep, + : [buddy] "+m" (buddy->pte), [tmp] "=&r" (tmp) + : [global] "r" (page_global)); + #else /* !CONFIG_SMP */ +- if (pte_none(*buddy)) +- pte_val(*buddy) = pte_val(*buddy) | _PAGE_GLOBAL; ++ if (pte_none(ptep_get(buddy))) ++ WRITE_ONCE(*buddy, __pte(pte_val(ptep_get(buddy)) | _PAGE_GLOBAL)); + #endif /* CONFIG_SMP */ + } + } +@@ -350,7 +360,7 @@ static inline void set_pte(pte_t *ptep, + static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) + { + /* Preserve global status for the pair */ +- if (pte_val(*ptep_buddy(ptep)) & _PAGE_GLOBAL) ++ if (pte_val(ptep_get(ptep_buddy(ptep))) & _PAGE_GLOBAL) + set_pte(ptep, __pte(_PAGE_GLOBAL)); + else + set_pte(ptep, __pte(0)); +@@ -591,7 +601,7 @@ static inline pmd_t pmd_mkinvalid(pmd_t + static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long address, pmd_t *pmdp) + { +- pmd_t old = *pmdp; ++ pmd_t old = pmdp_get(pmdp); + + pmd_clear(pmdp); + +--- a/arch/loongarch/kvm/mmu.c ++++ b/arch/loongarch/kvm/mmu.c +@@ -679,19 +679,19 @@ static int host_pfn_mapping_level(struct + * value) and then p*d_offset() walks into the target huge page instead + * of the old page table (sees the new value). + */ +- pgd = READ_ONCE(*pgd_offset(kvm->mm, hva)); ++ pgd = pgdp_get(pgd_offset(kvm->mm, hva)); + if (pgd_none(pgd)) + goto out; + +- p4d = READ_ONCE(*p4d_offset(&pgd, hva)); ++ p4d = p4dp_get(p4d_offset(&pgd, hva)); + if (p4d_none(p4d) || !p4d_present(p4d)) + goto out; + +- pud = READ_ONCE(*pud_offset(&p4d, hva)); ++ pud = pudp_get(pud_offset(&p4d, hva)); + if (pud_none(pud) || !pud_present(pud)) + goto out; + +- pmd = READ_ONCE(*pmd_offset(&pud, hva)); ++ pmd = pmdp_get(pmd_offset(&pud, hva)); + if (pmd_none(pmd) || !pmd_present(pmd)) + goto out; + +--- a/arch/loongarch/mm/hugetlbpage.c ++++ b/arch/loongarch/mm/hugetlbpage.c +@@ -39,11 +39,11 @@ pte_t *huge_pte_offset(struct mm_struct + pmd_t *pmd = NULL; + + pgd = pgd_offset(mm, addr); +- if (pgd_present(*pgd)) { ++ if (pgd_present(pgdp_get(pgd))) { + p4d = p4d_offset(pgd, addr); +- if (p4d_present(*p4d)) { ++ if (p4d_present(p4dp_get(p4d))) { + pud = pud_offset(p4d, addr); +- if (pud_present(*pud)) ++ if (pud_present(pudp_get(pud))) + pmd = pmd_offset(pud, addr); + } + } +--- a/arch/loongarch/mm/init.c ++++ b/arch/loongarch/mm/init.c +@@ -140,7 +140,7 @@ void __meminit vmemmap_set_pmd(pmd_t *pm + int __meminit vmemmap_check_pmd(pmd_t *pmd, int node, + unsigned long addr, unsigned long next) + { +- int huge = pmd_val(*pmd) & _PAGE_HUGE; ++ int huge = pmd_val(pmdp_get(pmd)) & _PAGE_HUGE; + + if (huge) + vmemmap_verify((pte_t *)pmd, node, addr, next); +@@ -172,7 +172,7 @@ pte_t * __init populate_kernel_pte(unsig + pud_t *pud; + pmd_t *pmd; + +- if (p4d_none(*p4d)) { ++ if (p4d_none(p4dp_get(p4d))) { + pud = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + if (!pud) + panic("%s: Failed to allocate memory\n", __func__); +@@ -183,7 +183,7 @@ pte_t * __init populate_kernel_pte(unsig + } + + pud = pud_offset(p4d, addr); +- if (pud_none(*pud)) { ++ if (pud_none(pudp_get(pud))) { + pmd = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + if (!pmd) + panic("%s: Failed to allocate memory\n", __func__); +@@ -194,7 +194,7 @@ pte_t * __init populate_kernel_pte(unsig + } + + pmd = pmd_offset(pud, addr); +- if (!pmd_present(*pmd)) { ++ if (!pmd_present(pmdp_get(pmd))) { + pte_t *pte; + + pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE); +@@ -215,7 +215,7 @@ void __init __set_fixmap(enum fixed_addr + BUG_ON(idx <= FIX_HOLE || idx >= __end_of_fixed_addresses); + + ptep = populate_kernel_pte(addr); +- if (!pte_none(*ptep)) { ++ if (!pte_none(ptep_get(ptep))) { + pte_ERROR(*ptep); + return; + } +--- a/arch/loongarch/mm/kasan_init.c ++++ b/arch/loongarch/mm/kasan_init.c +@@ -105,7 +105,7 @@ static phys_addr_t __init kasan_alloc_ze + + static pte_t *__init kasan_pte_offset(pmd_t *pmdp, unsigned long addr, int node, bool early) + { +- if (__pmd_none(early, READ_ONCE(*pmdp))) { ++ if (__pmd_none(early, pmdp_get(pmdp))) { + phys_addr_t pte_phys = early ? + __pa_symbol(kasan_early_shadow_pte) : kasan_alloc_zeroed_page(node); + if (!early) +@@ -118,7 +118,7 @@ static pte_t *__init kasan_pte_offset(pm + + static pmd_t *__init kasan_pmd_offset(pud_t *pudp, unsigned long addr, int node, bool early) + { +- if (__pud_none(early, READ_ONCE(*pudp))) { ++ if (__pud_none(early, pudp_get(pudp))) { + phys_addr_t pmd_phys = early ? + __pa_symbol(kasan_early_shadow_pmd) : kasan_alloc_zeroed_page(node); + if (!early) +@@ -131,7 +131,7 @@ static pmd_t *__init kasan_pmd_offset(pu + + static pud_t *__init kasan_pud_offset(p4d_t *p4dp, unsigned long addr, int node, bool early) + { +- if (__p4d_none(early, READ_ONCE(*p4dp))) { ++ if (__p4d_none(early, p4dp_get(p4dp))) { + phys_addr_t pud_phys = early ? + __pa_symbol(kasan_early_shadow_pud) : kasan_alloc_zeroed_page(node); + if (!early) +@@ -154,7 +154,7 @@ static void __init kasan_pte_populate(pm + : kasan_alloc_zeroed_page(node); + next = addr + PAGE_SIZE; + set_pte(ptep, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL)); +- } while (ptep++, addr = next, addr != end && __pte_none(early, READ_ONCE(*ptep))); ++ } while (ptep++, addr = next, addr != end && __pte_none(early, ptep_get(ptep))); + } + + static void __init kasan_pmd_populate(pud_t *pudp, unsigned long addr, +@@ -166,7 +166,7 @@ static void __init kasan_pmd_populate(pu + do { + next = pmd_addr_end(addr, end); + kasan_pte_populate(pmdp, addr, next, node, early); +- } while (pmdp++, addr = next, addr != end && __pmd_none(early, READ_ONCE(*pmdp))); ++ } while (pmdp++, addr = next, addr != end && __pmd_none(early, pmdp_get(pmdp))); + } + + static void __init kasan_pud_populate(p4d_t *p4dp, unsigned long addr, +--- a/arch/loongarch/mm/pgtable.c ++++ b/arch/loongarch/mm/pgtable.c +@@ -128,7 +128,7 @@ pmd_t mk_pmd(struct page *page, pgprot_t + void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) + { +- *pmdp = pmd; ++ WRITE_ONCE(*pmdp, pmd); + flush_tlb_all(); + } + diff --git a/queue-6.6/net-xilinx-axienet-fix-race-in-axienet_stop.patch b/queue-6.6/net-xilinx-axienet-fix-race-in-axienet_stop.patch new file mode 100644 index 00000000000..71d4495aa4b --- /dev/null +++ b/queue-6.6/net-xilinx-axienet-fix-race-in-axienet_stop.patch @@ -0,0 +1,91 @@ +From 858430db28a5f5a11f8faa3a6fa805438e6f0851 Mon Sep 17 00:00:00 2001 +From: Sean Anderson +Date: Tue, 3 Sep 2024 13:51:41 -0400 +Subject: net: xilinx: axienet: Fix race in axienet_stop + +From: Sean Anderson + +commit 858430db28a5f5a11f8faa3a6fa805438e6f0851 upstream. + +axienet_dma_err_handler can race with axienet_stop in the following +manner: + +CPU 1 CPU 2 +====================== ================== +axienet_stop() + napi_disable() + axienet_dma_stop() + axienet_dma_err_handler() + napi_disable() + axienet_dma_stop() + axienet_dma_start() + napi_enable() + cancel_work_sync() + free_irq() + +Fix this by setting a flag in axienet_stop telling +axienet_dma_err_handler not to bother doing anything. I chose not to use +disable_work_sync to allow for easier backporting. + +Signed-off-by: Sean Anderson +Fixes: 8a3b7a252dca ("drivers/net/ethernet/xilinx: added Xilinx AXI Ethernet driver") +Link: https://patch.msgid.link/20240903175141.4132898-1-sean.anderson@linux.dev +Signed-off-by: Jakub Kicinski +[ Adjusted to apply before dmaengine support ] +Signed-off-by: Sean Anderson +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/xilinx/xilinx_axienet.h | 3 +++ + drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 8 ++++++++ + 2 files changed, 11 insertions(+) + +--- a/drivers/net/ethernet/xilinx/xilinx_axienet.h ++++ b/drivers/net/ethernet/xilinx/xilinx_axienet.h +@@ -419,6 +419,8 @@ struct axidma_bd { + * @tx_bytes: TX byte count for statistics + * @tx_stat_sync: Synchronization object for TX stats + * @dma_err_task: Work structure to process Axi DMA errors ++ * @stopping: Set when @dma_err_task shouldn't do anything because we are ++ * about to stop the device. + * @tx_irq: Axidma TX IRQ number + * @rx_irq: Axidma RX IRQ number + * @eth_irq: Ethernet core IRQ number +@@ -481,6 +483,7 @@ struct axienet_local { + struct u64_stats_sync tx_stat_sync; + + struct work_struct dma_err_task; ++ bool stopping; + + int tx_irq; + int rx_irq; +--- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c ++++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +@@ -1162,6 +1162,7 @@ static int axienet_open(struct net_devic + phylink_start(lp->phylink); + + /* Enable worker thread for Axi DMA error handling */ ++ lp->stopping = false; + INIT_WORK(&lp->dma_err_task, axienet_dma_err_handler); + + napi_enable(&lp->napi_rx); +@@ -1217,6 +1218,9 @@ static int axienet_stop(struct net_devic + + dev_dbg(&ndev->dev, "axienet_close()\n"); + ++ WRITE_ONCE(lp->stopping, true); ++ flush_work(&lp->dma_err_task); ++ + napi_disable(&lp->napi_tx); + napi_disable(&lp->napi_rx); + +@@ -1761,6 +1765,10 @@ static void axienet_dma_err_handler(stru + dma_err_task); + struct net_device *ndev = lp->ndev; + ++ /* Don't bother if we are going to stop anyway */ ++ if (READ_ONCE(lp->stopping)) ++ return; ++ + napi_disable(&lp->napi_tx); + napi_disable(&lp->napi_rx); + diff --git a/queue-6.6/series b/queue-6.6/series index be7d41e07ae..6b6d65f24d7 100644 --- a/queue-6.6/series +++ b/queue-6.6/series @@ -44,3 +44,6 @@ net-tighten-bad-gso-csum-offset-check-in-virtio_net_hdr.patch dm-integrity-fix-a-race-condition-when-accessing-recalc_sector.patch x86-hyperv-fix-kexec-crash-due-to-vp-assist-page-corruption.patch mm-avoid-leaving-partial-pfn-mappings-around-in-error-case.patch +net-xilinx-axienet-fix-race-in-axienet_stop.patch +loongarch-kvm-implement-kvm-mmu-operations.patch +loongarch-use-accessors-to-page-table-entries-instead-of-direct-dereference.patch -- 2.47.3