// SPDX-License-Identifier: GPL-2.0
/*
 * HugeTLB Vmemmap Optimization (HVO)
 *
 * Copyright (c) 2020, ByteDance. All rights reserved.
 *
 *     Author: Muchun Song <songmuchun@bytedance.com>
 *
 * See Documentation/mm/vmemmap_dedup.rst
 */
#define pr_fmt(fmt)	"HugeTLB: " fmt

#include <linux/pgtable.h>
#include <linux/moduleparam.h>
#include <linux/bootmem_info.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"

/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte:		called for each lowest-level entry (PTE).
 * @nr_walked:		the number of walked pte.
 * @reuse_page:		the page which is reused for the tail vmemmap pages.
 * @reuse_addr:		the virtual address of the @reuse_page page.
 * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
 *			or is mapped from.
 */
struct vmemmap_remap_walk {
	void			(*remap_pte)(pte_t *pte, unsigned long addr,
					     struct vmemmap_remap_walk *walk);
	unsigned long		nr_walked;
	struct page		*reuse_page;
	unsigned long		reuse_addr;
	struct list_head	*vmemmap_pages;
};
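
/*
 * A concrete example of what the walk achieves (assuming 4 KiB base pages
 * and a 64-byte struct page, as on typical x86_64 configurations): a 2 MiB
 * HugeTLB page has 512 struct pages, i.e. 32 KiB (8 pages) of vmemmap. HVO
 * keeps only the first vmemmap page and remaps the remaining 7 read-only to
 * it, so those 7 pages can be returned to the allocator. For a 1 GiB page
 * the saving is 4095 of 4096 vmemmap pages (roughly 16 MiB per HugeTLB page).
 */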

static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
{
	pmd_t __pmd;
	int i;
	unsigned long addr = start;
	struct page *head;
	pte_t *pgtable;

	spin_lock(&init_mm.page_table_lock);
	head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
	spin_unlock(&init_mm.page_table_lock);

	if (!head)
		return 0;

	pgtable = pte_alloc_one_kernel(&init_mm);
	if (!pgtable)
		return -ENOMEM;

	pmd_populate_kernel(&init_mm, &__pmd, pgtable);

	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
		pte_t entry, *pte;
		pgprot_t pgprot = PAGE_KERNEL;

		entry = mk_pte(head + i, pgprot);
		pte = pte_offset_kernel(&__pmd, addr);
		set_pte_at(&init_mm, addr, pte, entry);
	}

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_leaf(*pmd))) {
		/*
		 * Higher order allocations from buddy allocator must be able to
		 * be treated as independent small pages (as they can be freed
		 * individually).
		 */
		if (!PageReserved(head))
			split_page(head, get_order(PMD_SIZE));

		/* Make pte visible before pmd. See comment in pmd_install(). */
		smp_wmb();
		pmd_populate_kernel(&init_mm, pmd, pgtable);
		flush_tlb_kernel_range(start, start + PMD_SIZE);
	} else {
		pte_free_kernel(&init_mm, pgtable);
	}
	spin_unlock(&init_mm.page_table_lock);

	return 0;
}
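
/*
 * The helpers below walk init_mm's kernel page tables for a vmemmap range
 * (pgd -> p4d -> pud -> pmd -> pte). Any PMD-mapped huge vmemmap area is
 * split by split_vmemmap_huge_pmd() first, so that the walk always reaches
 * individual PTEs which walk->remap_pte can rewrite one base page at a time.
 */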

static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
			      unsigned long end,
			      struct vmemmap_remap_walk *walk)
{
	pte_t *pte = pte_offset_kernel(pmd, addr);

	/*
	 * The reuse_page is found 'first' in table walk before we start
	 * remapping (which is calling @walk->remap_pte).
	 */
	if (!walk->reuse_page) {
		walk->reuse_page = pte_page(ptep_get(pte));
		/*
		 * Because the reuse address is part of the range that we are
		 * walking, skip the reuse address range.
		 */
		addr += PAGE_SIZE;
		pte++;
		walk->nr_walked++;
	}

	for (; addr != end; addr += PAGE_SIZE, pte++) {
		walk->remap_pte(pte, addr, walk);
		walk->nr_walked++;
	}
}
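
/*
 * Note that walk->nr_walked counts every PTE visited above, including the
 * skipped reuse PTE, so that vmemmap_remap_free() can work out how far the
 * walk progressed (and therefore what needs to be restored) if a later
 * split_vmemmap_huge_pmd() fails with -ENOMEM partway through the range.
 */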

static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
			     unsigned long end,
			     struct vmemmap_remap_walk *walk)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		int ret;

		ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
		if (ret)
			return ret;

		next = pmd_addr_end(addr, end);
		vmemmap_pte_range(pmd, addr, next, walk);
	} while (pmd++, addr = next, addr != end);

	return 0;
}

static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
			     unsigned long end,
			     struct vmemmap_remap_walk *walk)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(p4d, addr);
	do {
		int ret;

		next = pud_addr_end(addr, end);
		ret = vmemmap_pmd_range(pud, addr, next, walk);
		if (ret)
			return ret;
	} while (pud++, addr = next, addr != end);

	return 0;
}

static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
			     unsigned long end,
			     struct vmemmap_remap_walk *walk)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_offset(pgd, addr);
	do {
		int ret;

		next = p4d_addr_end(addr, end);
		ret = vmemmap_pud_range(p4d, addr, next, walk);
		if (ret)
			return ret;
	} while (p4d++, addr = next, addr != end);

	return 0;
}

static int vmemmap_remap_range(unsigned long start, unsigned long end,
			       struct vmemmap_remap_walk *walk)
{
	unsigned long addr = start;
	unsigned long next;
	pgd_t *pgd;

	VM_BUG_ON(!PAGE_ALIGNED(start));
	VM_BUG_ON(!PAGE_ALIGNED(end));

	pgd = pgd_offset_k(addr);
	do {
		int ret;

		next = pgd_addr_end(addr, end);
		ret = vmemmap_p4d_range(pgd, addr, next, walk);
		if (ret)
			return ret;
	} while (pgd++, addr = next, addr != end);

	flush_tlb_kernel_range(start, end);

	return 0;
}

/*
 * Free a vmemmap page. A vmemmap page can be allocated from the memblock
 * allocator or the buddy allocator. If the PG_reserved flag is set, the page
 * was allocated from the memblock allocator and must be freed via
 * free_bootmem_page(); otherwise, use __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
	if (PageReserved(page))
		free_bootmem_page(page);
	else
		__free_page(page);
}

/* Free a list of the vmemmap pages */
static void free_vmemmap_page_list(struct list_head *list)
{
	struct page *page, *next;

	list_for_each_entry_safe(page, next, list, lru)
		free_vmemmap_page(page);
}
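
/*
 * The two remap_pte callbacks used by the walk: vmemmap_remap_pte() points a
 * tail PTE at the shared reuse page and queues the old backing page on
 * walk->vmemmap_pages for freeing; vmemmap_restore_pte() does the reverse,
 * taking a freshly allocated page off the list, copying the shared contents
 * into it and mapping it back in read/write.
 */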

static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
			      struct vmemmap_remap_walk *walk)
{
	/*
	 * Remap the tail pages as read-only to catch illegal write operations
	 * to the tail pages.
	 */
	pgprot_t pgprot = PAGE_KERNEL_RO;
	struct page *page = pte_page(ptep_get(pte));
	pte_t entry;

	/* Remapping the head page requires r/w */
	if (unlikely(addr == walk->reuse_addr)) {
		pgprot = PAGE_KERNEL;
		list_del(&walk->reuse_page->lru);

		/*
		 * Makes sure that preceding stores to the page contents from
		 * vmemmap_remap_free() become visible before the set_pte_at()
		 * write.
		 */
		smp_wmb();
	}

	entry = mk_pte(walk->reuse_page, pgprot);
	list_add_tail(&page->lru, walk->vmemmap_pages);
	set_pte_at(&init_mm, addr, pte, entry);
}

/*
 * How many struct page structs need to be reset. When we reuse the head
 * struct page, the special metadata (e.g. page->flags or page->mapping)
 * cannot be copied to the tail struct page structs. Invalid values there
 * would be caught by free_tail_page_prepare() and reported as "corrupted
 * mapping in tail page", so we need to reset at least 3 struct pages (one
 * head struct page and two tail struct pages).
 */
#define NR_RESET_STRUCT_PAGE	3
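
/*
 * For example, with 4 KiB pages and a 64-byte struct page there are 64
 * struct pages per vmemmap page. The BUILD_BUG_ON() below guarantees that
 * both the destination (start) and the source (start + NR_RESET_STRUCT_PAGE)
 * of the memcpy() in reset_struct_pages() lie within the single page that
 * copy_page() in vmemmap_restore_pte() has just filled.
 */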

static inline void reset_struct_pages(struct page *start)
{
	struct page *from = start + NR_RESET_STRUCT_PAGE;

	BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
	memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
}

static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
				struct vmemmap_remap_walk *walk)
{
	pgprot_t pgprot = PAGE_KERNEL;
	struct page *page;
	void *to;

	BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);

	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
	list_del(&page->lru);
	to = page_to_virt(page);
	copy_page(to, (void *)walk->reuse_addr);
	reset_struct_pages(to);

	/*
	 * Makes sure that preceding stores to the page contents become visible
	 * before the set_pte_at() write.
	 */
	smp_wmb();
	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}

/**
 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 *			to the page which @reuse is mapped to, then free the
 *			vmemmap pages which the range is mapped to.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_free(unsigned long start, unsigned long end,
			      unsigned long reuse)
{
	int ret;
	LIST_HEAD(vmemmap_pages);
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_remap_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= &vmemmap_pages,
	};
	int nid = page_to_nid((struct page *)start);
	gfp_t gfp_mask = GFP_KERNEL | __GFP_THISNODE | __GFP_NORETRY |
			 __GFP_NOWARN;

	/*
	 * Allocate a new head vmemmap page to avoid breaking a contiguous
	 * block of struct page memory when freeing it back to the page
	 * allocator in free_vmemmap_page_list(). This will allow the likely
	 * contiguous struct page backing memory to be kept contiguous and
	 * allow more allocations of hugepages. Fall back to the currently
	 * mapped head page should the allocation fail.
	 */
	walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
	if (walk.reuse_page) {
		copy_page(page_to_virt(walk.reuse_page),
			  (void *)walk.reuse_addr);
		list_add(&walk.reuse_page->lru, &vmemmap_pages);
	}

	/*
	 * In order to make the remapping routine most efficient for huge pages,
	 * the vmemmap page table walking routine obeys the following rules
	 * (see more details in vmemmap_pte_range()):
	 *
	 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
	 *   should be continuous.
	 * - The @reuse address is part of the range [@reuse, @end) that we are
	 *   walking which is passed to vmemmap_remap_range().
	 * - The @reuse address is the first in the complete range.
	 *
	 * So we need to make sure that @start and @reuse meet the above rules.
	 */
	BUG_ON(start - reuse != PAGE_SIZE);

	mmap_read_lock(&init_mm);
	ret = vmemmap_remap_range(reuse, end, &walk);
	if (ret && walk.nr_walked) {
		end = reuse + walk.nr_walked * PAGE_SIZE;
		/*
		 * vmemmap_pages contains pages from the previous
		 * vmemmap_remap_range call which failed. These
		 * are pages which were removed from the vmemmap.
		 * They will be restored in the following call.
		 */
		walk = (struct vmemmap_remap_walk) {
			.remap_pte	= vmemmap_restore_pte,
			.reuse_addr	= reuse,
			.vmemmap_pages	= &vmemmap_pages,
		};

		vmemmap_remap_range(reuse, end, &walk);
	}
	mmap_read_unlock(&init_mm);

	free_vmemmap_page_list(&vmemmap_pages);

	return ret;
}

static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
				   struct list_head *list)
{
	gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_THISNODE;
	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
	int nid = page_to_nid((struct page *)start);
	struct page *page, *next;
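
	/*
	 * Allocate on the node that holds the existing struct pages
	 * (__GFP_THISNODE) and try fairly hard, but let the allocation fail
	 * (__GFP_RETRY_MAYFAIL) so that an unsuccessful restore is reported
	 * to the caller instead of invoking the OOM killer.
	 */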
	while (nr_pages--) {
		page = alloc_pages_node(nid, gfp_mask, 0);
		if (!page)
			goto out;
		list_add_tail(&page->lru, list);
	}

	return 0;
out:
	list_for_each_entry_safe(page, next, list, lru)
		__free_pages(page, 0);
	return -ENOMEM;
}

/**
 * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, @end)
 *			 to the pages which are from the @vmemmap_pages list.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
			       unsigned long reuse)
{
	LIST_HEAD(vmemmap_pages);
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_restore_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= &vmemmap_pages,
	};

	/* See the comment in vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
		return -ENOMEM;

	mmap_read_lock(&init_mm);
	vmemmap_remap_range(reuse, end, &walk);
	mmap_read_unlock(&init_mm);

	return 0;
}

DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
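
/*
 * This static key is checked by page_fixed_fake_head() (see
 * include/linux/page-flags.h): while at least one HugeTLB page has its
 * vmemmap optimized, compound_head() readers must cope with tail struct
 * pages that are read-only aliases of the head vmemmap page.
 */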

static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);
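
/*
 * vmemmap_optimize_enabled can be toggled at boot time with the
 * "hugetlb_free_vmemmap=on/off" kernel command-line parameter (core_param()
 * above) and at run time through the vm.hugetlb_optimize_vmemmap sysctl
 * registered below. Toggling it only affects HugeTLB pages optimized or
 * restored after the change.
 */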

/**
 * hugetlb_vmemmap_restore - restore previously optimized (by
 *			     hugetlb_vmemmap_optimize()) vmemmap pages which
 *			     will be reallocated and remapped.
 * @h:		struct hstate.
 * @head:	the head page whose vmemmap pages will be restored.
 *
 * Return: %0 if @head's vmemmap pages have been reallocated and remapped,
 * negative error code otherwise.
 */
int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
{
	int ret;
	unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
	unsigned long vmemmap_reuse;

	if (!HPageVmemmapOptimized(head))
		return 0;

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;
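
	/*
	 * For example (assuming a 2 MiB HugeTLB page, 4 KiB base pages and a
	 * 64-byte struct page): hugetlb_vmemmap_size() is 32 KiB and
	 * HUGETLB_VMEMMAP_RESERVE_SIZE is one page, so @vmemmap_reuse is the
	 * first vmemmap page and [@vmemmap_start, @vmemmap_end) covers the
	 * remaining 7 vmemmap pages that get reallocated here.
	 */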

	/*
	 * The pages which the vmemmap virtual address range [@vmemmap_start,
	 * @vmemmap_end) are mapped to are freed to the buddy allocator, and
	 * the range is mapped to the page which @vmemmap_reuse is mapped to.
	 * When a HugeTLB page is freed to the buddy allocator, the previously
	 * discarded vmemmap pages must be allocated and remapped first.
	 */
	ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse);
	if (!ret) {
		ClearHPageVmemmapOptimized(head);
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
	}

	return ret;
}

/* Return true iff a HugeTLB page's vmemmap should and can be optimized. */
static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
{
	if (!READ_ONCE(vmemmap_optimize_enabled))
		return false;

	if (!hugetlb_vmemmap_optimizable(h))
		return false;

	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
		pmd_t *pmdp, pmd;
		struct page *vmemmap_page;
		unsigned long vaddr = (unsigned long)head;

		/*
		 * Only the vmemmap page's vmemmap page can be self-hosted.
		 * Walk the page tables to find the backing page of the
		 * vmemmap page.
		 */
		pmdp = pmd_off_k(vaddr);
		/*
		 * The READ_ONCE() is used to stabilize *pmdp in a register or
		 * on the stack so that it will stop changing under the code.
		 * The only concurrent operation where it can be changed is
		 * split_vmemmap_huge_pmd() (*pmdp will be stable after this
		 * operation).
		 */
		pmd = READ_ONCE(*pmdp);
		if (pmd_leaf(pmd))
			vmemmap_page = pmd_page(pmd) + pte_index(vaddr);
		else
			vmemmap_page = pte_page(*pte_offset_kernel(pmdp, vaddr));
		/*
		 * Due to HugeTLB alignment requirements and the vmemmap pages
		 * being at the start of the hotplugged memory region in the
		 * memory_hotplug.memmap_on_memory case, checking whether any
		 * one vmemmap page's vmemmap page is marked VmemmapSelfHosted
		 * is sufficient.
		 *
		 * [                  hotplugged memory                  ]
		 * [        section        ][...][        section        ]
		 * [ vmemmap ][              usable memory               ]
		 *   ^   |          |                                   |
		 *   +---+          |                                   |
		 *     ^            |                                   |
		 *     +------------+                                   |
		 *       ^                                              |
		 *       +----------------------------------------------+
		 */
		if (PageVmemmapSelfHosted(vmemmap_page))
			return false;
	}

	return true;
}

/**
 * hugetlb_vmemmap_optimize - optimize @head page's vmemmap pages.
 * @h:		struct hstate.
 * @head:	the head page whose vmemmap pages will be optimized.
 *
 * This function only tries to optimize @head's vmemmap pages and does not
 * guarantee that the optimization will succeed after it returns. The caller
 * can use HPageVmemmapOptimized(@head) to detect if @head's vmemmap pages
 * have been optimized.
 */
void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
{
	unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
	unsigned long vmemmap_reuse;

	if (!vmemmap_should_optimize(h, head))
		return;

	static_branch_inc(&hugetlb_optimize_vmemmap_key);
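	/*
	 * Note the ordering: the static key is raised before any tail struct
	 * page can become a read-only alias of the head vmemmap page (and is
	 * only lowered again after a failed remap here or a successful
	 * restore), so page_fixed_fake_head() users never observe an
	 * optimized page while the key is off.
	 */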

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
	 * to the page which @vmemmap_reuse is mapped to, then free the pages
	 * which the range [@vmemmap_start, @vmemmap_end) is mapped to.
	 */
	if (vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse))
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
	else
		SetHPageVmemmapOptimized(head);
}

static struct ctl_table hugetlb_vmemmap_sysctls[] = {
	{
		.procname	= "hugetlb_optimize_vmemmap",
		.data		= &vmemmap_optimize_enabled,
		.maxlen		= sizeof(vmemmap_optimize_enabled),
		.mode		= 0644,
		.proc_handler	= proc_dobool,
	},
	{ }
};

static int __init hugetlb_vmemmap_init(void)
{
	const struct hstate *h;

	/* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
	BUILD_BUG_ON(__NR_USED_SUBPAGE * sizeof(struct page) > HUGETLB_VMEMMAP_RESERVE_SIZE);

	for_each_hstate(h) {
		if (hugetlb_vmemmap_optimizable(h)) {
			register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
			break;
		}
	}
	return 0;
}
late_initcall(hugetlb_vmemmap_init);