]>
Commit | Line | Data |
---|---|---|
f41f2ed4 MS |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* | |
5981611d | 3 | * Optimize vmemmap pages associated with HugeTLB |
f41f2ed4 MS |
4 | * |
5 | * Copyright (c) 2020, Bytedance. All rights reserved. | |
6 | * | |
7 | * Author: Muchun Song <songmuchun@bytedance.com> | |
8 | * | |
60a427db | 9 | * See Documentation/vm/vmemmap_dedup.rst |
f41f2ed4 | 10 | */ |
e9fdff87 MS |
11 | #define pr_fmt(fmt) "HugeTLB: " fmt |
12 | ||
78f39084 | 13 | #include <linux/memory_hotplug.h> |
f41f2ed4 MS |
14 | #include "hugetlb_vmemmap.h" |
15 | ||
/*
 * Every HugeTLB page is described by many struct page structures. All of the
 * tail page structures carry the same compound_head value, so the vmemmap
 * page holding the head page structures can stand in for them: the virtual
 * addresses of the vmemmap pages that hold tail page structures are remapped
 * to that first vmemmap page and the page frames that used to back them are
 * freed. Consequently exactly one vmemmap page has to stay reserved per
 * HugeTLB page.
 */
#define RESERVE_VMEMMAP_NR		1U
/* Size in bytes of the reserved vmemmap area. */
#define RESERVE_VMEMMAP_SIZE		(RESERVE_VMEMMAP_NR << PAGE_SHIFT)
25 | ||
78f39084 MS |
/*
 * Runtime switch for the vmemmap optimization. The numeric values are spelled
 * out because they are exchanged as plain integers: the sysctl table in this
 * file bounds the accepted input with SYSCTL_ZERO/SYSCTL_ONE, and the initial
 * mode is taken directly from an IS_ENABLED() expression.
 */
enum vmemmap_optimize_mode {
	VMEMMAP_OPTIMIZE_OFF = 0,
	VMEMMAP_OPTIMIZE_ON = 1,
};
30 | ||
47010c04 | 31 | DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, |
f10f1442 MS |
32 | hugetlb_optimize_vmemmap_key); |
33 | EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); | |
e9fdff87 | 34 | |
78f39084 | 35 | static enum vmemmap_optimize_mode vmemmap_optimize_mode = |
0111def9 | 36 | IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); |
78f39084 MS |
37 | |
38 | static void vmemmap_optimize_mode_switch(enum vmemmap_optimize_mode to) | |
39 | { | |
40 | if (vmemmap_optimize_mode == to) | |
41 | return; | |
42 | ||
43 | if (to == VMEMMAP_OPTIMIZE_OFF) | |
44 | static_branch_dec(&hugetlb_optimize_vmemmap_key); | |
45 | else | |
46 | static_branch_inc(&hugetlb_optimize_vmemmap_key); | |
47 | WRITE_ONCE(vmemmap_optimize_mode, to); | |
48 | } | |
49 | ||
5981611d | 50 | static int __init hugetlb_vmemmap_early_param(char *buf) |
e9fdff87 | 51 | { |
9c54c522 | 52 | bool enable; |
78f39084 | 53 | enum vmemmap_optimize_mode mode; |
9c54c522 MS |
54 | |
55 | if (kstrtobool(buf, &enable)) | |
e9fdff87 MS |
56 | return -EINVAL; |
57 | ||
78f39084 MS |
58 | mode = enable ? VMEMMAP_OPTIMIZE_ON : VMEMMAP_OPTIMIZE_OFF; |
59 | vmemmap_optimize_mode_switch(mode); | |
e9fdff87 MS |
60 | |
61 | return 0; | |
62 | } | |
5981611d | 63 | early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_early_param); |
f41f2ed4 | 64 | |
ad2fa371 MS |
65 | /* |
66 | * Previously discarded vmemmap pages will be allocated and remapping | |
67 | * after this function returns zero. | |
68 | */ | |
5981611d | 69 | int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) |
ad2fa371 MS |
70 | { |
71 | int ret; | |
72 | unsigned long vmemmap_addr = (unsigned long)head; | |
5981611d | 73 | unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages; |
ad2fa371 MS |
74 | |
75 | if (!HPageVmemmapOptimized(head)) | |
76 | return 0; | |
77 | ||
5981611d MS |
78 | vmemmap_addr += RESERVE_VMEMMAP_SIZE; |
79 | vmemmap_pages = hugetlb_optimize_vmemmap_pages(h); | |
80 | vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT); | |
81 | vmemmap_reuse = vmemmap_addr - PAGE_SIZE; | |
82 | ||
ad2fa371 MS |
83 | /* |
84 | * The pages which the vmemmap virtual address range [@vmemmap_addr, | |
85 | * @vmemmap_end) are mapped to are freed to the buddy allocator, and | |
86 | * the range is mapped to the page which @vmemmap_reuse is mapped to. | |
87 | * When a HugeTLB page is freed to the buddy allocator, previously | |
88 | * discarded vmemmap pages must be allocated and remapping. | |
89 | */ | |
90 | ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse, | |
91 | GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE); | |
78f39084 | 92 | if (!ret) { |
ad2fa371 | 93 | ClearHPageVmemmapOptimized(head); |
78f39084 MS |
94 | static_branch_dec(&hugetlb_optimize_vmemmap_key); |
95 | } | |
ad2fa371 MS |
96 | |
97 | return ret; | |
98 | } | |
99 | ||
5981611d | 100 | void hugetlb_vmemmap_free(struct hstate *h, struct page *head) |
f41f2ed4 MS |
101 | { |
102 | unsigned long vmemmap_addr = (unsigned long)head; | |
5981611d | 103 | unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages; |
f41f2ed4 | 104 | |
5981611d MS |
105 | vmemmap_pages = hugetlb_optimize_vmemmap_pages(h); |
106 | if (!vmemmap_pages) | |
f41f2ed4 MS |
107 | return; |
108 | ||
78f39084 MS |
109 | if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF) |
110 | return; | |
111 | ||
112 | static_branch_inc(&hugetlb_optimize_vmemmap_key); | |
113 | ||
5981611d MS |
114 | vmemmap_addr += RESERVE_VMEMMAP_SIZE; |
115 | vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT); | |
116 | vmemmap_reuse = vmemmap_addr - PAGE_SIZE; | |
f41f2ed4 MS |
117 | |
118 | /* | |
119 | * Remap the vmemmap virtual address range [@vmemmap_addr, @vmemmap_end) | |
120 | * to the page which @vmemmap_reuse is mapped to, then free the pages | |
121 | * which the range [@vmemmap_addr, @vmemmap_end] is mapped to. | |
122 | */ | |
78f39084 MS |
123 | if (vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse)) |
124 | static_branch_dec(&hugetlb_optimize_vmemmap_key); | |
125 | else | |
3bc2b6a7 | 126 | SetHPageVmemmapOptimized(head); |
f41f2ed4 | 127 | } |
77490587 MS |
128 | |
129 | void __init hugetlb_vmemmap_init(struct hstate *h) | |
130 | { | |
131 | unsigned int nr_pages = pages_per_huge_page(h); | |
132 | unsigned int vmemmap_pages; | |
133 | ||
134 | /* | |
135 | * There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct | |
47010c04 | 136 | * page structs that can be used when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP, |
77490587 MS |
137 | * so add a BUILD_BUG_ON to catch invalid usage of the tail struct page. |
138 | */ | |
139 | BUILD_BUG_ON(__NR_USED_SUBPAGE >= | |
140 | RESERVE_VMEMMAP_SIZE / sizeof(struct page)); | |
141 | ||
0effdf46 MS |
142 | if (!is_power_of_2(sizeof(struct page))) { |
143 | pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n"); | |
144 | static_branch_disable(&hugetlb_optimize_vmemmap_key); | |
145 | return; | |
146 | } | |
147 | ||
77490587 MS |
148 | vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT; |
149 | /* | |
e7d32485 MS |
150 | * The head page is not to be freed to buddy allocator, the other tail |
151 | * pages will map to the head page, so they can be freed. | |
77490587 MS |
152 | * |
153 | * Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? It is true | |
154 | * on some architectures (e.g. aarch64). See Documentation/arm64/ | |
155 | * hugetlbpage.rst for more details. | |
156 | */ | |
157 | if (likely(vmemmap_pages > RESERVE_VMEMMAP_NR)) | |
5981611d | 158 | h->optimize_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR; |
77490587 | 159 | |
5981611d MS |
160 | pr_info("can optimize %d vmemmap pages for %s\n", |
161 | h->optimize_vmemmap_pages, h->name); | |
77490587 | 162 | } |
78f39084 MS |
163 | |
164 | #ifdef CONFIG_PROC_SYSCTL | |
165 | static int hugetlb_optimize_vmemmap_handler(struct ctl_table *table, int write, | |
166 | void *buffer, size_t *length, | |
167 | loff_t *ppos) | |
168 | { | |
169 | int ret; | |
170 | enum vmemmap_optimize_mode mode; | |
171 | static DEFINE_MUTEX(sysctl_mutex); | |
172 | ||
173 | if (write && !capable(CAP_SYS_ADMIN)) | |
174 | return -EPERM; | |
175 | ||
176 | mutex_lock(&sysctl_mutex); | |
177 | mode = vmemmap_optimize_mode; | |
178 | table->data = &mode; | |
179 | ret = proc_dointvec_minmax(table, write, buffer, length, ppos); | |
180 | if (write && !ret) | |
181 | vmemmap_optimize_mode_switch(mode); | |
182 | mutex_unlock(&sysctl_mutex); | |
183 | ||
184 | return ret; | |
185 | } | |
186 | ||
187 | static struct ctl_table hugetlb_vmemmap_sysctls[] = { | |
188 | { | |
189 | .procname = "hugetlb_optimize_vmemmap", | |
190 | .maxlen = sizeof(enum vmemmap_optimize_mode), | |
191 | .mode = 0644, | |
192 | .proc_handler = hugetlb_optimize_vmemmap_handler, | |
193 | .extra1 = SYSCTL_ZERO, | |
194 | .extra2 = SYSCTL_ONE, | |
195 | }, | |
196 | { } | |
197 | }; | |
198 | ||
199 | static __init int hugetlb_vmemmap_sysctls_init(void) | |
200 | { | |
201 | /* | |
202 | * If "memory_hotplug.memmap_on_memory" is enabled or "struct page" | |
203 | * crosses page boundaries, the vmemmap pages cannot be optimized. | |
204 | */ | |
205 | if (!mhp_memmap_on_memory() && is_power_of_2(sizeof(struct page))) | |
206 | register_sysctl_init("vm", hugetlb_vmemmap_sysctls); | |
207 | ||
208 | return 0; | |
209 | } | |
210 | late_initcall(hugetlb_vmemmap_sysctls_init); | |
211 | #endif /* CONFIG_PROC_SYSCTL */ |