// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
 */

#include <linux/sched.h>
#include <linux/mm_types.h>
#include <linux/memblock.h>
#include <linux/memremap.h>
#include <linux/pkeys.h>
#include <linux/debugfs.h>
#include <linux/proc_fs.h>
#include <misc/cxl-base.h>

#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/trace.h>
#include <asm/powernv.h>
#include <asm/firmware.h>
#include <asm/ultravisor.h>
#include <asm/kexec.h>

#include <mm/mmu_decl.h>
#include <trace/events/thp.h>

struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
EXPORT_SYMBOL_GPL(mmu_psize_defs);

#ifdef CONFIG_SPARSEMEM_VMEMMAP
int mmu_vmemmap_psize = MMU_PAGE_4K;
#endif /* CONFIG_SPARSEMEM_VMEMMAP */

unsigned long __pmd_frag_nr;
EXPORT_SYMBOL(__pmd_frag_nr);
unsigned long __pmd_frag_size_shift;
EXPORT_SYMBOL(__pmd_frag_size_shift);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * This is called when relaxing access to a hugepage. It's also called in the
 * page fault path when we don't hit any of the major fault cases, i.e., a
 * minor update of _PAGE_ACCESSED, _PAGE_DIRTY, etc. The generic code will
 * have handled those two for us; we additionally deal with missing execute
 * permission here on some processors.
 */
int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp, pmd_t entry, int dirty)
{
	int changed;
#ifdef CONFIG_DEBUG_VM
	WARN_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
	assert_spin_locked(pmd_lockptr(vma->vm_mm, pmdp));
#endif
	changed = !pmd_same(*(pmdp), entry);
	if (changed) {
		/*
		 * We can use MMU_PAGE_2M here, because only the radix
		 * path looks at the psize.
		 */
		__ptep_set_access_flags(vma, pmdp_ptep(pmdp),
					pmd_pte(entry), address, MMU_PAGE_2M);
	}
	return changed;
}

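/*
 * Caller sketch (an assumption based on the generic THP fault path, not
 * part of this file): generic code is expected to call the above under
 * the pmd lock, roughly:
 *
 *	ptl = pmd_lock(vma->vm_mm, pmdp);
 *	entry = pmd_mkyoung(orig_pmd);
 *	if (pmdp_set_access_flags(vma, haddr, pmdp, entry, dirty))
 *		update_mmu_cache_pmd(vma, address, pmdp);
 *	spin_unlock(ptl);
 *
 * The names haddr/orig_pmd are illustrative only.
 */
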
int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
			  pud_t *pudp, pud_t entry, int dirty)
{
	int changed;
#ifdef CONFIG_DEBUG_VM
	WARN_ON(!pud_devmap(*pudp));
	assert_spin_locked(pud_lockptr(vma->vm_mm, pudp));
#endif
	changed = !pud_same(*(pudp), entry);
	if (changed) {
		/*
		 * We can use MMU_PAGE_1G here, because only the radix
		 * path looks at the psize.
		 */
		__ptep_set_access_flags(vma, pudp_ptep(pudp),
					pud_pte(entry), address, MMU_PAGE_1G);
	}
	return changed;
}

int pmdp_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long address, pmd_t *pmdp)
{
	return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
}

int pudp_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long address, pud_t *pudp)
{
	return __pudp_test_and_clear_young(vma->vm_mm, address, pudp);
}

/*
 * Set a new huge pmd. We should not be called for updating
 * an existing pmd entry. That should go via pmd_hugepage_update.
 */
void set_pmd_at(struct mm_struct *mm, unsigned long addr,
		pmd_t *pmdp, pmd_t pmd)
{
#ifdef CONFIG_DEBUG_VM
	/*
	 * Make sure hardware valid bit is not set. We don't do
	 * tlb flush for this update.
	 */

	WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp)));
	assert_spin_locked(pmd_lockptr(mm, pmdp));
	WARN_ON(!(pmd_large(pmd)));
#endif
	trace_hugepage_set_pmd(addr, pmd_val(pmd));
	return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
}

void set_pud_at(struct mm_struct *mm, unsigned long addr,
		pud_t *pudp, pud_t pud)
{
#ifdef CONFIG_DEBUG_VM
	/*
	 * Make sure hardware valid bit is not set. We don't do
	 * tlb flush for this update.
	 */

	WARN_ON(pte_hw_valid(pud_pte(*pudp)));
	assert_spin_locked(pud_lockptr(mm, pudp));
	WARN_ON(!(pud_large(pud)));
#endif
	trace_hugepage_set_pud(addr, pud_val(pud));
	return set_pte_at(mm, addr, pudp_ptep(pudp), pud_pte(pud));
}

static void do_serialize(void *arg)
{
	/* We've taken the IPI, so try to trim the mask while here */
	if (radix_enabled()) {
		struct mm_struct *mm = arg;
		exit_lazy_flush_tlb(mm, false);
	}
}

/*
 * Serialize against __find_linux_pte() which does lock-less
 * lookup in page tables with local interrupts disabled. For huge pages
 * it casts pmd_t to pte_t. Since the format of pte_t is different from
 * pmd_t we want to prevent transit from pmd pointing to page table
 * to pmd pointing to huge page (and back) while interrupts are disabled.
 * We clear pmd to possibly replace it with page table pointer in
 * different code paths. So make sure we wait for the parallel
 * __find_linux_pte() to finish.
 */
void serialize_against_pte_lookup(struct mm_struct *mm)
{
	smp_mb();
	smp_call_function_many(mm_cpumask(mm), do_serialize, mm, 1);
}

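/*
 * Usage sketch (an illustration, not from this file): a path that
 * demotes a huge pmd back to a page-table pointer would do
 *
 *	old_pmd = pmdp_huge_get_and_clear(mm, addr, pmdp);
 *	serialize_against_pte_lookup(mm);
 *	// no lockless __find_linux_pte() walker can still see old_pmd
 *	pmd_populate(mm, pmdp, pgtable);
 */
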
/*
 * We use this to invalidate a pmdp entry before switching from a
 * hugepte to regular pmd entry.
 */
pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
		      pmd_t *pmdp)
{
	unsigned long old_pmd;

	old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID);
	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
	return __pmd(old_pmd);
}

pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
				   unsigned long addr, pmd_t *pmdp, int full)
{
	pmd_t pmd;

	VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
	VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
		   !pmd_devmap(*pmdp)) || !pmd_present(*pmdp));
	pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
	/*
	 * If it is not a fullmm flush, then we can possibly end up converting
	 * this PMD pte entry to a regular level 0 PTE by a parallel page fault.
	 * Make sure we flush the tlb in this case.
	 */
	if (!full)
		flush_pmd_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
	return pmd;
}

pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma,
				   unsigned long addr, pud_t *pudp, int full)
{
	pud_t pud;

	VM_BUG_ON(addr & ~HPAGE_PUD_MASK);
	VM_BUG_ON((pud_present(*pudp) && !pud_devmap(*pudp)) ||
		  !pud_present(*pudp));
	pud = pudp_huge_get_and_clear(vma->vm_mm, addr, pudp);
	/*
	 * If it is not a fullmm flush, then we can possibly end up converting
	 * this PUD entry to a regular level 0 PTE by a parallel page fault.
	 * Make sure we flush the tlb in this case.
	 */
	if (!full)
		flush_pud_tlb_range(vma, addr, addr + HPAGE_PUD_SIZE);
	return pud;
}

static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
{
	return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
}

static pud_t pud_set_protbits(pud_t pud, pgprot_t pgprot)
{
	return __pud(pud_val(pud) | pgprot_val(pgprot));
}

/*
 * At some point we should be able to get rid of
 * pmd_mkhuge() and mk_huge_pmd() when we update all the
 * other archs to mark the pmd huge in pfn_pmd().
 */
pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
{
	unsigned long pmdv;

	pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK;

	return __pmd_mkhuge(pmd_set_protbits(__pmd(pmdv), pgprot));
}

pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot)
{
	unsigned long pudv;

	pudv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK;

	return __pud_mkhuge(pud_set_protbits(__pud(pudv), pgprot));
}

pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
{
	return pfn_pmd(page_to_pfn(page), pgprot);
}

pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{
	unsigned long pmdv;

	pmdv = pmd_val(pmd);
	pmdv &= _HPAGE_CHG_MASK;
	return pmd_set_protbits(__pmd(pmdv), newprot);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/* For use by kexec, called with MMU off */
notrace void mmu_cleanup_all(void)
{
	if (radix_enabled())
		radix__mmu_cleanup_all();
	else if (mmu_hash_ops.hpte_clear_all)
		mmu_hash_ops.hpte_clear_all();

	reset_sprs();
}

#ifdef CONFIG_MEMORY_HOTPLUG
int __meminit create_section_mapping(unsigned long start, unsigned long end,
				     int nid, pgprot_t prot)
{
	if (radix_enabled())
		return radix__create_section_mapping(start, end, nid, prot);

	return hash__create_section_mapping(start, end, nid, prot);
}

int __meminit remove_section_mapping(unsigned long start, unsigned long end)
{
	if (radix_enabled())
		return radix__remove_section_mapping(start, end);

	return hash__remove_section_mapping(start, end);
}
#endif /* CONFIG_MEMORY_HOTPLUG */

void __init mmu_partition_table_init(void)
{
	unsigned long patb_size = 1UL << PATB_SIZE_SHIFT;
	unsigned long ptcr;

	/* Initialize the Partition Table with no entries */
	partition_tb = memblock_alloc(patb_size, patb_size);
	if (!partition_tb)
		panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
		      __func__, patb_size, patb_size);

	ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12);
	set_ptcr_when_no_uv(ptcr);
	powernv_set_nmmu_ptcr(ptcr);
}

static void flush_partition(unsigned int lpid, bool radix)
{
	if (radix) {
		radix__flush_all_lpid(lpid);
		radix__flush_all_lpid_guest(lpid);
	} else {
		asm volatile("ptesync" : : : "memory");
		asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
			     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
		/* do we need fixup here ? */
		asm volatile("eieio; tlbsync; ptesync" : : : "memory");
		trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0);
	}
}

void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
				   unsigned long dw1, bool flush)
{
	unsigned long old = be64_to_cpu(partition_tb[lpid].patb0);

	/*
	 * When ultravisor is enabled, the partition table is stored in secure
	 * memory and can only be accessed doing an ultravisor call. However, we
	 * maintain a copy of the partition table in normal memory to allow Nest
	 * MMU translations to occur (for normal VMs).
	 *
	 * Therefore, here we always update partition_tb, regardless of whether
	 * we are running under an ultravisor or not.
	 */
	partition_tb[lpid].patb0 = cpu_to_be64(dw0);
	partition_tb[lpid].patb1 = cpu_to_be64(dw1);

	/*
	 * If ultravisor is enabled, we do an ultravisor call to register the
	 * partition table entry (PATE), which also does a global flush of TLBs
	 * and partition table caches for the lpid. Otherwise, just do the
	 * flush. The type of flush (hash or radix) depends on what the previous
	 * use of the partition ID was, not the new use.
	 */
	if (firmware_has_feature(FW_FEATURE_ULTRAVISOR)) {
		uv_register_pate(lpid, dw0, dw1);
		pr_info("PATE registered by ultravisor: dw0 = 0x%lx, dw1 = 0x%lx\n",
			dw0, dw1);
	} else if (flush) {
		/*
		 * Boot does not need to flush, because MMU is off and each
		 * CPU does a tlbiel_all() before switching them on, which
		 * flushes everything.
		 */
		flush_partition(lpid, (old & PATB_HR));
	}
}
EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry);

static pmd_t *get_pmd_from_cache(struct mm_struct *mm)
{
	void *pmd_frag, *ret;

	if (PMD_FRAG_NR == 1)
		return NULL;

	spin_lock(&mm->page_table_lock);
	ret = mm->context.pmd_frag;
	if (ret) {
		pmd_frag = ret + PMD_FRAG_SIZE;
		/*
		 * If we have taken up all the fragments mark PTE page NULL
		 */
		if (((unsigned long)pmd_frag & ~PAGE_MASK) == 0)
			pmd_frag = NULL;
		mm->context.pmd_frag = pmd_frag;
	}
	spin_unlock(&mm->page_table_lock);
	return (pmd_t *)ret;
}

static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm)
{
	void *ret = NULL;
	struct ptdesc *ptdesc;
	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO;

	if (mm == &init_mm)
		gfp &= ~__GFP_ACCOUNT;
	ptdesc = pagetable_alloc(gfp, 0);
	if (!ptdesc)
		return NULL;
	if (!pagetable_pmd_ctor(ptdesc)) {
		pagetable_free(ptdesc);
		return NULL;
	}

	atomic_set(&ptdesc->pt_frag_refcount, 1);

	ret = ptdesc_address(ptdesc);
	/*
	 * If we support only one fragment just return the
	 * allocated page.
	 */
	if (PMD_FRAG_NR == 1)
		return ret;

	spin_lock(&mm->page_table_lock);
	/*
	 * If we find ptdesc_page set, we return
	 * the allocated page with single fragment
	 * count.
	 */
	if (likely(!mm->context.pmd_frag)) {
		atomic_set(&ptdesc->pt_frag_refcount, PMD_FRAG_NR);
		mm->context.pmd_frag = ret + PMD_FRAG_SIZE;
	}
	spin_unlock(&mm->page_table_lock);

	return (pmd_t *)ret;
}

pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr)
{
	pmd_t *pmd;

	pmd = get_pmd_from_cache(mm);
	if (pmd)
		return pmd;

	return __alloc_for_pmdcache(mm);
}

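/*
 * Worked example (illustrative numbers, an assumption about the config):
 * with a 64K base page and a 4K PMD table (PMD_FRAG_SIZE), PMD_FRAG_NR
 * is 64K / 4K = 16, so sixteen PMD tables are carved out of one backing
 * page and handed out from mm->context.pmd_frag before a fresh page is
 * allocated.
 */
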
void pmd_fragment_free(unsigned long *pmd)
{
	struct ptdesc *ptdesc = virt_to_ptdesc(pmd);

	if (pagetable_is_reserved(ptdesc))
		return free_reserved_ptdesc(ptdesc);

	BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0);
	if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) {
		pagetable_pmd_dtor(ptdesc);
		pagetable_free(ptdesc);
	}
}

static inline void pgtable_free(void *table, int index)
{
	switch (index) {
	case PTE_INDEX:
		pte_fragment_free(table, 0);
		break;
	case PMD_INDEX:
		pmd_fragment_free(table);
		break;
	case PUD_INDEX:
		__pud_free(table);
		break;
#if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE)
		/* 16M hugepd directory at pud level */
	case HTLB_16M_INDEX:
		BUILD_BUG_ON(H_16M_CACHE_INDEX <= 0);
		kmem_cache_free(PGT_CACHE(H_16M_CACHE_INDEX), table);
		break;
		/* 16G hugepd directory at the pgd level */
	case HTLB_16G_INDEX:
		BUILD_BUG_ON(H_16G_CACHE_INDEX <= 0);
		kmem_cache_free(PGT_CACHE(H_16G_CACHE_INDEX), table);
		break;
#endif
		/* We don't free pgd table via RCU callback */
	default:
		BUG();
	}
}

void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index)
{
	unsigned long pgf = (unsigned long)table;

	BUG_ON(index > MAX_PGTABLE_INDEX_SIZE);
	pgf |= index;
	tlb_remove_table(tlb, (void *)pgf);
}

void __tlb_remove_table(void *_table)
{
	void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
	unsigned int index = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;

	return pgtable_free(table, index);
}

#ifdef CONFIG_PROC_FS
atomic_long_t direct_pages_count[MMU_PAGE_COUNT];

void arch_report_meminfo(struct seq_file *m)
{
	/*
	 * Hash maps the memory with one size mmu_linear_psize.
	 * So don't bother to print these on hash.
	 */
	if (!radix_enabled())
		return;
	seq_printf(m, "DirectMap4k: %8lu kB\n",
		   atomic_long_read(&direct_pages_count[MMU_PAGE_4K]) << 2);
	seq_printf(m, "DirectMap64k: %8lu kB\n",
		   atomic_long_read(&direct_pages_count[MMU_PAGE_64K]) << 6);
	seq_printf(m, "DirectMap2M: %8lu kB\n",
		   atomic_long_read(&direct_pages_count[MMU_PAGE_2M]) << 11);
	seq_printf(m, "DirectMap1G: %8lu kB\n",
		   atomic_long_read(&direct_pages_count[MMU_PAGE_1G]) << 20);
}

#endif /* CONFIG_PROC_FS */

pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
			     pte_t *ptep)
{
	unsigned long pte_val;

	/*
	 * Clear the _PAGE_PRESENT so that no hardware parallel update is
	 * possible. Also keep the pte_present true so that we don't take
	 * wrong fault.
	 */
	pte_val = pte_update(vma->vm_mm, addr, ptep, _PAGE_PRESENT, _PAGE_INVALID, 0);

	return __pte(pte_val);
}

void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
			     pte_t *ptep, pte_t old_pte, pte_t pte)
{
	if (radix_enabled())
		return radix__ptep_modify_prot_commit(vma, addr,
						      ptep, old_pte, pte);
	set_pte_at(vma->vm_mm, addr, ptep, pte);
}

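/*
 * Usage sketch (mirroring the generic mprotect path): callers bracket a
 * non-atomic protection change with start/commit so hardware cannot
 * update the pte in between:
 *
 *	old_pte = ptep_modify_prot_start(vma, addr, ptep);
 *	pte = pte_modify(old_pte, newprot);
 *	ptep_modify_prot_commit(vma, addr, ptep, old_pte, pte);
 */
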
/*
 * For hash translation mode, we use the deposited table to store hash slot
 * information and they are stored at PTRS_PER_PMD offset from related pmd
 * location. Hence a pmd move requires deposit and withdraw.
 *
 * For radix translation with split pmd ptl, we store the deposited table in the
 * pmd page. Hence if we have different pmd page we need to withdraw during pmd
 * move.
 *
 * With hash we use deposited table always irrespective of anon or not.
 * With radix we use deposited table only for anonymous mapping.
 */
int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
			   struct spinlock *old_pmd_ptl,
			   struct vm_area_struct *vma)
{
	if (radix_enabled())
		return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);

	return true;
}

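/*
 * Caller sketch (an assumption, modeled on the generic huge-pmd move
 * path): when this returns true, the mover relocates the deposited
 * table along with the pmd:
 *
 *	if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
 *		pgtable = pgtable_trans_huge_withdraw(mm, old_pmdp);
 *		pgtable_trans_huge_deposit(mm, new_pmdp, pgtable);
 *	}
 */
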
/*
 * Does the CPU support tlbie?
 */
bool tlbie_capable __read_mostly = true;
EXPORT_SYMBOL(tlbie_capable);

/*
 * Should tlbie be used for management of CPU TLBs, for kernel and process
 * address spaces? tlbie may still be used for nMMU accelerators, and for KVM
 * guest address spaces.
 */
bool tlbie_enabled __read_mostly = true;

static int __init setup_disable_tlbie(char *str)
{
	if (!radix_enabled()) {
		pr_err("disable_tlbie: Unable to disable TLBIE with Hash MMU.\n");
		return 1;
	}

	tlbie_capable = false;
	tlbie_enabled = false;

	return 1;
}
__setup("disable_tlbie", setup_disable_tlbie);

static int __init pgtable_debugfs_setup(void)
{
	if (!tlbie_capable)
		return 0;

	/*
	 * There is no locking vs tlb flushing when changing this value.
	 * The tlb flushers will see one value or another, and use either
	 * tlbie or tlbiel with IPIs. In both cases the TLBs will be
	 * invalidated as expected.
	 */
	debugfs_create_bool("tlbie_enabled", 0600,
			    arch_debugfs_dir, &tlbie_enabled);

	return 0;
}
arch_initcall(pgtable_debugfs_setup);

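/*
 * Usage sketch (assuming debugfs is mounted in the usual place and
 * arch_debugfs_dir is the "powerpc" directory):
 *
 *	# cat /sys/kernel/debug/powerpc/tlbie_enabled
 *	# echo 0 > /sys/kernel/debug/powerpc/tlbie_enabled
 *
 * The second command switches TLB management to tlbiel plus IPIs, per
 * the comment in pgtable_debugfs_setup().
 */
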
#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_HAS_MEMREMAP_COMPAT_ALIGN)
/*
 * Override the generic version in mm/memremap.c.
 *
 * With hash translation, the direct-map range is mapped with just one
 * page size selected by htab_init_page_sizes(). Consult
 * mmu_psize_defs[] to determine the minimum page size alignment.
 */
unsigned long memremap_compat_align(void)
{
	if (!radix_enabled()) {
		unsigned int shift = mmu_psize_defs[mmu_linear_psize].shift;

		return max(SUBSECTION_SIZE, 1UL << shift);
	}

	return SUBSECTION_SIZE;
}
EXPORT_SYMBOL_GPL(memremap_compat_align);
#endif

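/*
 * Worked example (a sketch): with hash translation and a 16M linear
 * mapping size (an mmu_linear_psize shift of 24), memremap_compat_align()
 * returns max(SUBSECTION_SIZE, 16M) = 16M, aligning device memory to the
 * direct-map page size; under radix it is just SUBSECTION_SIZE.
 */
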
pgprot_t vm_get_page_prot(unsigned long vm_flags)
{
	unsigned long prot;

	/* Radix supports execute-only, but protection_map maps X -> RX */
	if (radix_enabled() && ((vm_flags & VM_ACCESS_FLAGS) == VM_EXEC)) {
		prot = pgprot_val(PAGE_EXECONLY);
	} else {
		prot = pgprot_val(protection_map[vm_flags &
						 (VM_ACCESS_FLAGS | VM_SHARED)]);
	}

	if (vm_flags & VM_SAO)
		prot |= _PAGE_SAO;

#ifdef CONFIG_PPC_MEM_KEYS
	prot |= vmflag_to_pte_pkey_bits(vm_flags);
#endif

	return __pgprot(prot);
}
EXPORT_SYMBOL(vm_get_page_prot);
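
/*
 * Example (a sketch): an execute-only mapping such as
 *
 *	mmap(NULL, len, PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 * yields (vm_flags & VM_ACCESS_FLAGS) == VM_EXEC, so on radix it gets
 * PAGE_EXECONLY instead of the read-implying protection_map entry.
 */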