// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2009 Red Hat, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/swapops.h>
#include <linux/backing-dev.h>
#include <linux/dax.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/pfn_t.h>
#include <linux/mman.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/page_owner.h>
#include <linux/sched/sysctl.h>
#include <linux/memory-tiers.h>
#include <linux/compat.h>
#include <linux/pgalloc_tag.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
#include "swap.h"

#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>

/*
 * By default, transparent hugepage support is disabled in order to avoid
 * risking an increased memory footprint for applications that are not
 * guaranteed to benefit from it. When transparent hugepage support is
 * enabled, it is for all mappings, and khugepaged scans all mappings.
 * Defrag is invoked by khugepaged hugepage allocations and by page faults
 * for all hugepage allocations.
 */
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);

static struct shrinker *deferred_split_shrinker;
static unsigned long deferred_split_count(struct shrinker *shrink,
					  struct shrink_control *sc);
static unsigned long deferred_split_scan(struct shrinker *shrink,
					 struct shrink_control *sc);

static atomic_t huge_zero_refcount;
struct folio *huge_zero_folio __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;
unsigned long huge_anon_orders_always __read_mostly;
unsigned long huge_anon_orders_madvise __read_mostly;
unsigned long huge_anon_orders_inherit __read_mostly;

unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
					 unsigned long vm_flags,
					 unsigned long tva_flags,
					 unsigned long orders)
{
	bool smaps = tva_flags & TVA_SMAPS;
	bool in_pf = tva_flags & TVA_IN_PF;
	bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
	/* Check the intersection of requested and supported orders. */
	orders &= vma_is_anonymous(vma) ?
			THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
	if (!orders)
		return 0;

	if (!vma->vm_mm)		/* vdso */
		return 0;

	/*
	 * Explicitly disabled through madvise or prctl, or some
	 * architectures may disable THP for some mappings, for
	 * example, s390 kvm.
	 */
	if ((vm_flags & VM_NOHUGEPAGE) ||
	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
		return 0;
	/*
	 * If the hardware/firmware marked hugepage support disabled.
	 */
	if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
		return 0;

	/* khugepaged doesn't collapse DAX vma, but page fault is fine. */
	if (vma_is_dax(vma))
		return in_pf ? orders : 0;

	/*
	 * khugepaged special VMA and hugetlb VMA.
	 * Must be checked after dax since some dax mappings may have
	 * VM_MIXEDMAP set.
	 */
	if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED))
		return 0;

	/*
	 * Check alignment for file vma and size for both file and anon vma by
	 * filtering out the unsuitable orders.
	 *
	 * Skip the check for page fault. Huge fault does the check in fault
	 * handlers.
	 */
	if (!in_pf) {
		int order = highest_order(orders);
		unsigned long addr;

		while (orders) {
			addr = vma->vm_end - (PAGE_SIZE << order);
			if (thp_vma_suitable_order(vma, addr, order))
				break;
			order = next_order(&orders, order);
		}

		if (!orders)
			return 0;
	}

	/*
	 * Enabled via shmem mount options or sysfs settings.
	 * Must be done before hugepage flags check since shmem has its
	 * own flags.
	 */
	if (!in_pf && shmem_file(vma->vm_file))
		return shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
				     !enforce_sysfs, vma->vm_mm, vm_flags)
			? orders : 0;

	if (!vma_is_anonymous(vma)) {
		/*
		 * Enforce sysfs THP requirements as necessary. Anonymous vmas
		 * were already handled in thp_vma_allowable_orders().
		 */
		if (enforce_sysfs &&
		    (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
						    !hugepage_global_always())))
			return 0;

		/*
		 * Trust that ->huge_fault() handlers know what they are doing
		 * in fault path.
		 */
		if ((in_pf || smaps) && vma->vm_ops->huge_fault)
			return orders;
		/* Only regular file is valid in collapse path */
		if ((!in_pf || smaps) && file_thp_enabled(vma))
			return orders;
		return 0;
	}

	if (vma_is_temporary_stack(vma))
		return 0;

	/*
	 * THPeligible bit of smaps should show 1 for proper VMAs even
	 * though anon_vma is not initialized yet.
	 *
	 * Allow page fault since anon_vma may be not initialized until
	 * the first page fault.
	 */
	if (!vma->anon_vma)
		return (smaps || in_pf) ? orders : 0;

	return orders;
}

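/*
 * The huge zero page is allocated lazily on first use and is refcounted.
 * A shrinker (registered in thp_shrinker_init()) releases it once only
 * the extra reference taken below remains.
 */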
static bool get_huge_zero_page(void)
{
	struct folio *zero_folio;
retry:
	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
		return true;

	zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
			HPAGE_PMD_ORDER);
	if (!zero_folio) {
		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
		return false;
	}
	preempt_disable();
	if (cmpxchg(&huge_zero_folio, NULL, zero_folio)) {
		preempt_enable();
		folio_put(zero_folio);
		goto retry;
	}
	WRITE_ONCE(huge_zero_pfn, folio_pfn(zero_folio));

	/* We take additional reference here. It will be put back by shrinker */
	atomic_set(&huge_zero_refcount, 2);
	preempt_enable();
	count_vm_event(THP_ZERO_PAGE_ALLOC);
	return true;
}

static void put_huge_zero_page(void)
{
	/*
	 * Counter should never go to zero here. Only shrinker can put
	 * last reference.
	 */
	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}

struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		return READ_ONCE(huge_zero_folio);

	if (!get_huge_zero_page())
		return NULL;

	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();

	return READ_ONCE(huge_zero_folio);
}

void mm_put_huge_zero_folio(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();
}

static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
					struct shrink_control *sc)
{
	/* we can free zero page only if last reference remains */
	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
}

static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
				       struct shrink_control *sc)
{
	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
		struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
		BUG_ON(zero_folio == NULL);
		WRITE_ONCE(huge_zero_pfn, ~0UL);
		folio_put(zero_folio);
		return HPAGE_PMD_NR;
	}

	return 0;
}

static struct shrinker *huge_zero_page_shrinker;

#ifdef CONFIG_SYSFS
static ssize_t enabled_show(struct kobject *kobj,
			    struct kobj_attribute *attr, char *buf)
{
	const char *output;

	if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
		output = "[always] madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always [madvise] never";
	else
		output = "always madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}

static ssize_t enabled_store(struct kobject *kobj,
			     struct kobj_attribute *attr,
			     const char *buf, size_t count)
{
	ssize_t ret = count;

	if (sysfs_streq(buf, "always")) {
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else
		ret = -EINVAL;

	if (ret > 0) {
		int err = start_stop_khugepaged();
		if (err)
			ret = err;
	}
	return ret;
}

static struct kobj_attribute enabled_attr = __ATTR_RW(enabled);

ssize_t single_hugepage_flag_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf,
				  enum transparent_hugepage_flag flag)
{
	return sysfs_emit(buf, "%d\n",
			  !!test_bit(flag, &transparent_hugepage_flags));
}

ssize_t single_hugepage_flag_store(struct kobject *kobj,
				 struct kobj_attribute *attr,
				 const char *buf, size_t count,
				 enum transparent_hugepage_flag flag)
{
	unsigned long value;
	int ret;

	ret = kstrtoul(buf, 10, &value);
	if (ret < 0)
		return ret;
	if (value > 1)
		return -EINVAL;

	if (value)
		set_bit(flag, &transparent_hugepage_flags);
	else
		clear_bit(flag, &transparent_hugepage_flags);

	return count;
}

static ssize_t defrag_show(struct kobject *kobj,
			   struct kobj_attribute *attr, char *buf)
{
	const char *output;

	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
		     &transparent_hugepage_flags))
		output = "[always] defer defer+madvise madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
			  &transparent_hugepage_flags))
		output = "always [defer] defer+madvise madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always defer [defer+madvise] madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always defer defer+madvise [madvise] never";
	else
		output = "always defer defer+madvise madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}

static ssize_t defrag_store(struct kobject *kobj,
			    struct kobj_attribute *attr,
			    const char *buf, size_t count)
{
	if (sysfs_streq(buf, "always")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "defer+madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "defer")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else
		return -EINVAL;

	return count;
}
static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);

static ssize_t use_zero_page_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return single_hugepage_flag_show(kobj, attr, buf,
					 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static ssize_t use_zero_page_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	return single_hugepage_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page);

static ssize_t hpage_pmd_size_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE);
}
static struct kobj_attribute hpage_pmd_size_attr =
	__ATTR_RO(hpage_pmd_size);

static struct attribute *hugepage_attr[] = {
	&enabled_attr.attr,
	&defrag_attr.attr,
	&use_zero_page_attr.attr,
	&hpage_pmd_size_attr.attr,
#ifdef CONFIG_SHMEM
	&shmem_enabled_attr.attr,
#endif
	NULL,
};

static const struct attribute_group hugepage_attr_group = {
	.attrs = hugepage_attr,
};

static void hugepage_exit_sysfs(struct kobject *hugepage_kobj);
static void thpsize_release(struct kobject *kobj);
static DEFINE_SPINLOCK(huge_anon_orders_lock);
static LIST_HEAD(thpsize_list);

struct thpsize {
	struct kobject kobj;
	struct list_head node;
	int order;
};

#define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj)

static ssize_t thpsize_enabled_show(struct kobject *kobj,
				    struct kobj_attribute *attr, char *buf)
{
	int order = to_thpsize(kobj)->order;
	const char *output;

	if (test_bit(order, &huge_anon_orders_always))
		output = "[always] inherit madvise never";
	else if (test_bit(order, &huge_anon_orders_inherit))
		output = "always [inherit] madvise never";
	else if (test_bit(order, &huge_anon_orders_madvise))
		output = "always inherit [madvise] never";
	else
		output = "always inherit madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}

static ssize_t thpsize_enabled_store(struct kobject *kobj,
				     struct kobj_attribute *attr,
				     const char *buf, size_t count)
{
	int order = to_thpsize(kobj)->order;
	ssize_t ret = count;

	if (sysfs_streq(buf, "always")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_inherit);
		clear_bit(order, &huge_anon_orders_madvise);
		set_bit(order, &huge_anon_orders_always);
		spin_unlock(&huge_anon_orders_lock);
	} else if (sysfs_streq(buf, "inherit")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_always);
		clear_bit(order, &huge_anon_orders_madvise);
		set_bit(order, &huge_anon_orders_inherit);
		spin_unlock(&huge_anon_orders_lock);
	} else if (sysfs_streq(buf, "madvise")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_always);
		clear_bit(order, &huge_anon_orders_inherit);
		set_bit(order, &huge_anon_orders_madvise);
		spin_unlock(&huge_anon_orders_lock);
	} else if (sysfs_streq(buf, "never")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_always);
		clear_bit(order, &huge_anon_orders_inherit);
		clear_bit(order, &huge_anon_orders_madvise);
		spin_unlock(&huge_anon_orders_lock);
	} else
		ret = -EINVAL;

	return ret;
}

static struct kobj_attribute thpsize_enabled_attr =
	__ATTR(enabled, 0644, thpsize_enabled_show, thpsize_enabled_store);

static struct attribute *thpsize_attrs[] = {
	&thpsize_enabled_attr.attr,
	NULL,
};

static const struct attribute_group thpsize_attr_group = {
	.attrs = thpsize_attrs,
};

static const struct kobj_type thpsize_ktype = {
	.release = &thpsize_release,
	.sysfs_ops = &kobj_sysfs_ops,
};

DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}};

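/* Sum one per-order mTHP statistic across all possible CPUs. */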
static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
{
	unsigned long sum = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct mthp_stat *this = &per_cpu(mthp_stats, cpu);

		sum += this->stats[order][item];
	}

	return sum;
}

#define DEFINE_MTHP_STAT_ATTR(_name, _index)				\
static ssize_t _name##_show(struct kobject *kobj,			\
			struct kobj_attribute *attr, char *buf)		\
{									\
	int order = to_thpsize(kobj)->order;				\
									\
	return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index));	\
}									\
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)

DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
DEFINE_MTHP_STAT_ATTR(anon_swpout, MTHP_STAT_ANON_SWPOUT);
DEFINE_MTHP_STAT_ATTR(anon_swpout_fallback, MTHP_STAT_ANON_SWPOUT_FALLBACK);

static struct attribute *stats_attrs[] = {
	&anon_fault_alloc_attr.attr,
	&anon_fault_fallback_attr.attr,
	&anon_fault_fallback_charge_attr.attr,
	&anon_swpout_attr.attr,
	&anon_swpout_fallback_attr.attr,
	NULL,
};

static struct attribute_group stats_attr_group = {
	.name = "stats",
	.attrs = stats_attrs,
};

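/*
 * Create the sysfs directory for one THP order under the
 * transparent_hugepage kobject (hugepages-<size>kB), including its
 * "stats" subgroup.
 */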
static struct thpsize *thpsize_create(int order, struct kobject *parent)
{
	unsigned long size = (PAGE_SIZE << order) / SZ_1K;
	struct thpsize *thpsize;
	int ret;

	thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL);
	if (!thpsize)
		return ERR_PTR(-ENOMEM);

	ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent,
				   "hugepages-%lukB", size);
	if (ret) {
		kfree(thpsize);
		return ERR_PTR(ret);
	}

	ret = sysfs_create_group(&thpsize->kobj, &thpsize_attr_group);
	if (ret) {
		kobject_put(&thpsize->kobj);
		return ERR_PTR(ret);
	}

	ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group);
	if (ret) {
		kobject_put(&thpsize->kobj);
		return ERR_PTR(ret);
	}

	thpsize->order = order;
	return thpsize;
}

static void thpsize_release(struct kobject *kobj)
{
	kfree(to_thpsize(kobj));
}

static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	int err;
	struct thpsize *thpsize;
	unsigned long orders;
	int order;

	/*
	 * Default to setting PMD-sized THP to inherit the global setting and
	 * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time
	 * constant so we have to do this here.
	 */
	huge_anon_orders_inherit = BIT(PMD_ORDER);

	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
	if (unlikely(!*hugepage_kobj)) {
		pr_err("failed to create transparent hugepage kobject\n");
		return -ENOMEM;
	}

	err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto delete_obj;
	}

	err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto remove_hp_group;
	}

	orders = THP_ORDERS_ALL_ANON;
	order = highest_order(orders);
	while (orders) {
		thpsize = thpsize_create(order, *hugepage_kobj);
		if (IS_ERR(thpsize)) {
			pr_err("failed to create thpsize for order %d\n", order);
			err = PTR_ERR(thpsize);
			goto remove_all;
		}
		list_add(&thpsize->node, &thpsize_list);
		order = next_order(&orders, order);
	}

	return 0;

remove_all:
	hugepage_exit_sysfs(*hugepage_kobj);
	return err;
remove_hp_group:
	sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
delete_obj:
	kobject_put(*hugepage_kobj);
	return err;
}

static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
	struct thpsize *thpsize, *tmp;

	list_for_each_entry_safe(thpsize, tmp, &thpsize_list, node) {
		list_del(&thpsize->node);
		kobject_put(&thpsize->kobj);
	}

	sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
	sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
	kobject_put(hugepage_kobj);
}
#else
static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	return 0;
}

static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
}
#endif /* CONFIG_SYSFS */

static int __init thp_shrinker_init(void)
{
	huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
	if (!huge_zero_page_shrinker)
		return -ENOMEM;

	deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
						 SHRINKER_MEMCG_AWARE |
						 SHRINKER_NONSLAB,
						 "thp-deferred_split");
	if (!deferred_split_shrinker) {
		shrinker_free(huge_zero_page_shrinker);
		return -ENOMEM;
	}

	huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
	huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
	shrinker_register(huge_zero_page_shrinker);

	deferred_split_shrinker->count_objects = deferred_split_count;
	deferred_split_shrinker->scan_objects = deferred_split_scan;
	shrinker_register(deferred_split_shrinker);

	return 0;
}

static void __init thp_shrinker_exit(void)
{
	shrinker_free(huge_zero_page_shrinker);
	shrinker_free(deferred_split_shrinker);
}

static int __init hugepage_init(void)
{
	int err;
	struct kobject *hugepage_kobj;

	if (!has_transparent_hugepage()) {
		transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED;
		return -EINVAL;
	}

	/*
	 * hugepages can't be allocated by the buddy allocator
	 */
	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER);

	err = hugepage_init_sysfs(&hugepage_kobj);
	if (err)
		goto err_sysfs;

	err = khugepaged_init();
	if (err)
		goto err_slab;

	err = thp_shrinker_init();
	if (err)
		goto err_shrinker;

	/*
	 * By default disable transparent hugepages on smaller systems,
	 * where the extra memory used could hurt more than TLB overhead
	 * is likely to save. The admin can still enable it through /sys.
	 */
	if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
		transparent_hugepage_flags = 0;
		return 0;
	}

	err = start_stop_khugepaged();
	if (err)
		goto err_khugepaged;

	return 0;
err_khugepaged:
	thp_shrinker_exit();
err_shrinker:
	khugepaged_destroy();
err_slab:
	hugepage_exit_sysfs(hugepage_kobj);
err_sysfs:
	return err;
}
subsys_initcall(hugepage_init);

static int __init setup_transparent_hugepage(char *str)
{
	int ret = 0;
	if (!str)
		goto out;
	if (!strcmp(str, "always")) {
		set_bit(TRANSPARENT_HUGEPAGE_FLAG,
			&transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			&transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	}
out:
	if (!ret)
		pr_warn("transparent_hugepage= cannot parse, ignored\n");
	return ret;
}
__setup("transparent_hugepage=", setup_transparent_hugepage);

pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pmd = pmd_mkwrite(pmd, vma);
	return pmd;
}

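/*
 * Deferred-split queues live in the memcg when CONFIG_MEMCG is enabled,
 * otherwise in the folio's node pgdat.
 */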
#ifdef CONFIG_MEMCG
static inline
struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
	struct mem_cgroup *memcg = folio_memcg(folio);
	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));

	if (memcg)
		return &memcg->deferred_split_queue;
	else
		return &pgdat->deferred_split_queue;
}
#else
static inline
struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));

	return &pgdat->deferred_split_queue;
}
#endif

static inline bool is_transparent_hugepage(const struct folio *folio)
{
	if (!folio_test_large(folio))
		return false;

	return is_huge_zero_folio(folio) ||
	       folio_test_large_rmappable(folio);
}

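/*
 * Pad the requested length so the returned mapping can be aligned to a
 * boundary of @size; returning 0 makes the caller fall back to a regular
 * unpadded search.
 */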
static unsigned long __thp_get_unmapped_area(struct file *filp,
		unsigned long addr, unsigned long len,
		loff_t off, unsigned long flags, unsigned long size,
		vm_flags_t vm_flags)
{
	loff_t off_end = off + len;
	loff_t off_align = round_up(off, size);
	unsigned long len_pad, ret, off_sub;

	if (IS_ENABLED(CONFIG_32BIT) || in_compat_syscall())
		return 0;

	if (off_end <= off_align || (off_end - off_align) < size)
		return 0;

	len_pad = len + size;
	if (len_pad < len || (off + len_pad) < off)
		return 0;

	ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad,
					   off >> PAGE_SHIFT, flags, vm_flags);

	/*
	 * The failure might be due to length padding. The caller will retry
	 * without the padding.
	 */
	if (IS_ERR_VALUE(ret))
		return 0;

	/*
	 * Do not try to align to THP boundary if allocation at the address
	 * hint succeeds.
	 */
	if (ret == addr)
		return addr;

	off_sub = (off - ret) & (size - 1);

	if (test_bit(MMF_TOPDOWN, &current->mm->flags) && !off_sub)
		return ret + size;

	ret += off_sub;
	return ret;
}

unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags,
		vm_flags_t vm_flags)
{
	unsigned long ret;
	loff_t off = (loff_t)pgoff << PAGE_SHIFT;

	ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE, vm_flags);
	if (ret)
		return ret;

	return mm_get_unmapped_area_vmflags(current->mm, filp, addr, len, pgoff, flags,
					    vm_flags);
}

unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	return thp_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0);
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);

static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
			struct page *page, gfp_t gfp)
{
	struct vm_area_struct *vma = vmf->vma;
	struct folio *folio = page_folio(page);
	pgtable_t pgtable;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	vm_fault_t ret = 0;

	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);

	if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
		folio_put(folio);
		count_vm_event(THP_FAULT_FALLBACK);
		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
		return VM_FAULT_FALLBACK;
	}
	folio_throttle_swaprate(folio, gfp);

	pgtable = pte_alloc_one(vma->vm_mm);
	if (unlikely(!pgtable)) {
		ret = VM_FAULT_OOM;
		goto release;
	}

	clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * clear_huge_page writes become visible before the set_pmd_at()
	 * write.
	 */
	__folio_mark_uptodate(folio);

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_none(*vmf->pmd))) {
		goto unlock_release;
	} else {
		pmd_t entry;

		ret = check_stable_address_space(vma->vm_mm);
		if (ret)
			goto unlock_release;

		/* Deliver the page fault to userland */
		if (userfaultfd_missing(vma)) {
			spin_unlock(vmf->ptl);
			folio_put(folio);
			pte_free(vma->vm_mm, pgtable);
			ret = handle_userfault(vmf, VM_UFFD_MISSING);
			VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			return ret;
		}

		entry = mk_huge_pmd(page, vma->vm_page_prot);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		folio_add_new_anon_rmap(folio, vma, haddr);
		folio_add_lru_vma(folio, vma);
		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
		mm_inc_nr_ptes(vma->vm_mm);
		spin_unlock(vmf->ptl);
		count_vm_event(THP_FAULT_ALLOC);
		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
		count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
	}

	return 0;
unlock_release:
	spin_unlock(vmf->ptl);
release:
	if (pgtable)
		pte_free(vma->vm_mm, pgtable);
	folio_put(folio);
	return ret;

}

/*
 * always: directly stall for all thp allocations
 * defer: wake kswapd and fail if not immediately available
 * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
 *		  fail if not immediately available
 * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
 *	    available
 * never: never stall for any thp allocation
 */
gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
{
	const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);

	/* Always do synchronous compaction */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);

	/* Kick kcompactd and fail quickly */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;

	/* Synchronous compaction if madvised, otherwise kick kcompactd */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
			(vma_madvised ? __GFP_DIRECT_RECLAIM :
					__GFP_KSWAPD_RECLAIM);

	/* Only do synchronous compaction if madvised */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
		       (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);

	return GFP_TRANSHUGE_LIGHT;
}

/* Caller must hold page table lock. */
static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
		struct folio *zero_folio)
{
	pmd_t entry;
	if (!pmd_none(*pmd))
		return;
	entry = mk_pmd(&zero_folio->page, vma->vm_page_prot);
	entry = pmd_mkhuge(entry);
	pgtable_trans_huge_deposit(mm, pmd, pgtable);
	set_pmd_at(mm, haddr, pmd, entry);
	mm_inc_nr_ptes(mm);
}

vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	gfp_t gfp;
	struct folio *folio;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	vm_fault_t ret;

	if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
		return VM_FAULT_FALLBACK;
	ret = vmf_anon_prepare(vmf);
	if (ret)
		return ret;
	khugepaged_enter_vma(vma, vma->vm_flags);

	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
			!mm_forbids_zeropage(vma->vm_mm) &&
			transparent_hugepage_use_zero_page()) {
		pgtable_t pgtable;
		struct folio *zero_folio;
		vm_fault_t ret;

		pgtable = pte_alloc_one(vma->vm_mm);
		if (unlikely(!pgtable))
			return VM_FAULT_OOM;
		zero_folio = mm_get_huge_zero_folio(vma->vm_mm);
		if (unlikely(!zero_folio)) {
			pte_free(vma->vm_mm, pgtable);
			count_vm_event(THP_FAULT_FALLBACK);
			return VM_FAULT_FALLBACK;
		}
		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
		ret = 0;
		if (pmd_none(*vmf->pmd)) {
			ret = check_stable_address_space(vma->vm_mm);
			if (ret) {
				spin_unlock(vmf->ptl);
				pte_free(vma->vm_mm, pgtable);
			} else if (userfaultfd_missing(vma)) {
				spin_unlock(vmf->ptl);
				pte_free(vma->vm_mm, pgtable);
				ret = handle_userfault(vmf, VM_UFFD_MISSING);
				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			} else {
				set_huge_zero_folio(pgtable, vma->vm_mm, vma,
						    haddr, vmf->pmd, zero_folio);
				update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
				spin_unlock(vmf->ptl);
			}
		} else {
			spin_unlock(vmf->ptl);
			pte_free(vma->vm_mm, pgtable);
		}
		return ret;
	}
	gfp = vma_thp_gfp_mask(vma);
	folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
	if (unlikely(!folio)) {
		count_vm_event(THP_FAULT_FALLBACK);
		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
		return VM_FAULT_FALLBACK;
	}
	return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
}

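/*
 * Install a PMD-sized pfn mapping at @addr; an already-present mapping of
 * the same pfn is only upgraded (young/dirty/writable) when @write is set.
 */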
static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
		pgtable_t pgtable)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t entry;
	spinlock_t *ptl;

	ptl = pmd_lock(mm, pmd);
	if (!pmd_none(*pmd)) {
		if (write) {
			if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
				WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
				goto out_unlock;
			}
			entry = pmd_mkyoung(*pmd);
			entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
			if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
				update_mmu_cache_pmd(vma, addr, pmd);
		}

		goto out_unlock;
	}

	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
	if (pfn_t_devmap(pfn))
		entry = pmd_mkdevmap(entry);
	if (write) {
		entry = pmd_mkyoung(pmd_mkdirty(entry));
		entry = maybe_pmd_mkwrite(entry, vma);
	}

	if (pgtable) {
		pgtable_trans_huge_deposit(mm, pmd, pgtable);
		mm_inc_nr_ptes(mm);
		pgtable = NULL;
	}

	set_pmd_at(mm, addr, pmd, entry);
	update_mmu_cache_pmd(vma, addr, pmd);

out_unlock:
	spin_unlock(ptl);
	if (pgtable)
		pte_free(mm, pgtable);
}

/**
 * vmf_insert_pfn_pmd - insert a pmd size pfn
 * @vmf: Structure describing the fault
 * @pfn: pfn to insert
 * @write: whether it's a write fault
 *
 * Insert a pmd size pfn. See vmf_insert_pfn() for additional info.
 *
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
{
	unsigned long addr = vmf->address & PMD_MASK;
	struct vm_area_struct *vma = vmf->vma;
	pgprot_t pgprot = vma->vm_page_prot;
	pgtable_t pgtable = NULL;

	/*
	 * If we had pmd_special, we could avoid all these restrictions,
	 * but we need to be consistent with PTEs and architectures that
	 * can't support a 'special' bit.
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
			!pfn_t_devmap(pfn));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
						(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;

	if (arch_needs_pgtable_deposit()) {
		pgtable = pte_alloc_one(vma->vm_mm);
		if (!pgtable)
			return VM_FAULT_OOM;
	}

	track_pfn_insert(vma, &pgprot, pfn);

	insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pud = pud_mkwrite(pud);
	return pud;
}

static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
		pud_t *pud, pfn_t pfn, bool write)
{
	struct mm_struct *mm = vma->vm_mm;
	pgprot_t prot = vma->vm_page_prot;
	pud_t entry;
	spinlock_t *ptl;

	ptl = pud_lock(mm, pud);
	if (!pud_none(*pud)) {
		if (write) {
			if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
				WARN_ON_ONCE(!is_huge_zero_pud(*pud));
				goto out_unlock;
			}
			entry = pud_mkyoung(*pud);
			entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
			if (pudp_set_access_flags(vma, addr, pud, entry, 1))
				update_mmu_cache_pud(vma, addr, pud);
		}
		goto out_unlock;
	}

	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
	if (pfn_t_devmap(pfn))
		entry = pud_mkdevmap(entry);
	if (write) {
		entry = pud_mkyoung(pud_mkdirty(entry));
		entry = maybe_pud_mkwrite(entry, vma);
	}
	set_pud_at(mm, addr, pud, entry);
	update_mmu_cache_pud(vma, addr, pud);

out_unlock:
	spin_unlock(ptl);
}

/**
 * vmf_insert_pfn_pud - insert a pud size pfn
 * @vmf: Structure describing the fault
 * @pfn: pfn to insert
 * @write: whether it's a write fault
 *
 * Insert a pud size pfn. See vmf_insert_pfn() for additional info.
 *
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
{
	unsigned long addr = vmf->address & PUD_MASK;
	struct vm_area_struct *vma = vmf->vma;
	pgprot_t pgprot = vma->vm_page_prot;

	/*
	 * If we had pud_special, we could avoid all these restrictions,
	 * but we need to be consistent with PTEs and architectures that
	 * can't support a 'special' bit.
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
			!pfn_t_devmap(pfn));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
						(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;

	track_pfn_insert(vma, &pgprot, pfn);

	insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
	       pmd_t *pmd, bool write)
{
	pmd_t _pmd;

	_pmd = pmd_mkyoung(*pmd);
	if (write)
		_pmd = pmd_mkdirty(_pmd);
	if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
				  pmd, _pmd, write))
		update_mmu_cache_pmd(vma, addr, pmd);
}

struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
{
	unsigned long pfn = pmd_pfn(*pmd);
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;
	int ret;

	assert_spin_locked(pmd_lockptr(mm, pmd));

	if (flags & FOLL_WRITE && !pmd_write(*pmd))
		return NULL;

	if (pmd_present(*pmd) && pmd_devmap(*pmd))
		/* pass */;
	else
		return NULL;

	if (flags & FOLL_TOUCH)
		touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);

	/*
	 * device mapped pages can only be returned if the
	 * caller will manage the page reference count.
	 */
	if (!(flags & (FOLL_GET | FOLL_PIN)))
		return ERR_PTR(-EEXIST);

	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
	*pgmap = get_dev_pagemap(pfn, *pgmap);
	if (!*pgmap)
		return ERR_PTR(-EFAULT);
	page = pfn_to_page(pfn);
	ret = try_grab_page(page, flags);
	if (ret)
		page = ERR_PTR(ret);

	return page;
}

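/*
 * Copy an anonymous huge pmd at fork time. PMD migration entries are
 * copied as-is; if the source page may be pinned, the PMD is split and
 * -EAGAIN tells the caller to retry the copy at PTE granularity.
 */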
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
	spinlock_t *dst_ptl, *src_ptl;
	struct page *src_page;
	struct folio *src_folio;
	pmd_t pmd;
	pgtable_t pgtable = NULL;
	int ret = -ENOMEM;

	/* Skip if can be re-fill on fault */
	if (!vma_is_anonymous(dst_vma))
		return 0;

	pgtable = pte_alloc_one(dst_mm);
	if (unlikely(!pgtable))
		goto out;

	dst_ptl = pmd_lock(dst_mm, dst_pmd);
	src_ptl = pmd_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

	ret = -EAGAIN;
	pmd = *src_pmd;

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
	if (unlikely(is_swap_pmd(pmd))) {
		swp_entry_t entry = pmd_to_swp_entry(pmd);

		VM_BUG_ON(!is_pmd_migration_entry(pmd));
		if (!is_readable_migration_entry(entry)) {
			entry = make_readable_migration_entry(
							swp_offset(entry));
			pmd = swp_entry_to_pmd(entry);
			if (pmd_swp_soft_dirty(*src_pmd))
				pmd = pmd_swp_mksoft_dirty(pmd);
			if (pmd_swp_uffd_wp(*src_pmd))
				pmd = pmd_swp_mkuffd_wp(pmd);
			set_pmd_at(src_mm, addr, src_pmd, pmd);
		}
		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
		mm_inc_nr_ptes(dst_mm);
		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
		if (!userfaultfd_wp(dst_vma))
			pmd = pmd_swp_clear_uffd_wp(pmd);
		set_pmd_at(dst_mm, addr, dst_pmd, pmd);
		ret = 0;
		goto out_unlock;
	}
#endif

	if (unlikely(!pmd_trans_huge(pmd))) {
		pte_free(dst_mm, pgtable);
		goto out_unlock;
	}
	/*
	 * When page table lock is held, the huge zero pmd should not be
	 * under splitting since we don't split the page itself, only pmd to
	 * a page table.
	 */
	if (is_huge_zero_pmd(pmd)) {
		/*
		 * mm_get_huge_zero_folio() will never allocate a new
		 * folio here, since we already have a zero page to
		 * copy. It just takes a reference.
		 */
		mm_get_huge_zero_folio(dst_mm);
		goto out_zero_page;
	}

	src_page = pmd_page(pmd);
	VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
	src_folio = page_folio(src_page);

	folio_get(src_folio);
	if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, src_vma))) {
		/* Page maybe pinned: split and retry the fault on PTEs. */
		folio_put(src_folio);
		pte_free(dst_mm, pgtable);
		spin_unlock(src_ptl);
		spin_unlock(dst_ptl);
		__split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
		return -EAGAIN;
	}
	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
out_zero_page:
	mm_inc_nr_ptes(dst_mm);
	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
	pmdp_set_wrprotect(src_mm, addr, src_pmd);
	if (!userfaultfd_wp(dst_vma))
		pmd = pmd_clear_uffd_wp(pmd);
	pmd = pmd_mkold(pmd_wrprotect(pmd));
	set_pmd_at(dst_mm, addr, dst_pmd, pmd);

	ret = 0;
out_unlock:
	spin_unlock(src_ptl);
	spin_unlock(dst_ptl);
out:
	return ret;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
void touch_pud(struct vm_area_struct *vma, unsigned long addr,
	       pud_t *pud, bool write)
{
	pud_t _pud;

	_pud = pud_mkyoung(*pud);
	if (write)
		_pud = pud_mkdirty(_pud);
	if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
				  pud, _pud, write))
		update_mmu_cache_pud(vma, addr, pud);
}

int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
		  struct vm_area_struct *vma)
{
	spinlock_t *dst_ptl, *src_ptl;
	pud_t pud;
	int ret;

	dst_ptl = pud_lock(dst_mm, dst_pud);
	src_ptl = pud_lockptr(src_mm, src_pud);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

	ret = -EAGAIN;
	pud = *src_pud;
	if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
		goto out_unlock;

	/*
	 * When page table lock is held, the huge zero pud should not be
	 * under splitting since we don't split the page itself, only pud to
	 * a page table.
	 */
	if (is_huge_zero_pud(pud)) {
		/* No huge zero pud yet */
	}

	/*
	 * TODO: once we support anonymous pages, use
	 * folio_try_dup_anon_rmap_*() and split if duplicating fails.
	 */
	pudp_set_wrprotect(src_mm, addr, src_pud);
	pud = pud_mkold(pud_wrprotect(pud));
	set_pud_at(dst_mm, addr, dst_pud, pud);

	ret = 0;
out_unlock:
	spin_unlock(src_ptl);
	spin_unlock(dst_ptl);
	return ret;
}

void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
{
	bool write = vmf->flags & FAULT_FLAG_WRITE;

	vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
	if (unlikely(!pud_same(*vmf->pud, orig_pud)))
		goto unlock;

	touch_pud(vmf->vma, vmf->address, vmf->pud, write);
unlock:
	spin_unlock(vmf->ptl);
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

void huge_pmd_set_accessed(struct vm_fault *vmf)
{
	bool write = vmf->flags & FAULT_FLAG_WRITE;

	vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
		goto unlock;

	touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);

unlock:
	spin_unlock(vmf->ptl);
}

vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
{
	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
	struct vm_area_struct *vma = vmf->vma;
	struct folio *folio;
	struct page *page;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	pmd_t orig_pmd = vmf->orig_pmd;

	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
	VM_BUG_ON_VMA(!vma->anon_vma, vma);

	if (is_huge_zero_pmd(orig_pmd))
		goto fallback;

	spin_lock(vmf->ptl);

	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
		spin_unlock(vmf->ptl);
		return 0;
	}

	page = pmd_page(orig_pmd);
	folio = page_folio(page);
	VM_BUG_ON_PAGE(!PageHead(page), page);

	/* Early check when only holding the PT lock. */
	if (PageAnonExclusive(page))
		goto reuse;

	if (!folio_trylock(folio)) {
		folio_get(folio);
		spin_unlock(vmf->ptl);
		folio_lock(folio);
		spin_lock(vmf->ptl);
		if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
			spin_unlock(vmf->ptl);
			folio_unlock(folio);
			folio_put(folio);
			return 0;
		}
		folio_put(folio);
	}

	/* Recheck after temporarily dropping the PT lock. */
	if (PageAnonExclusive(page)) {
		folio_unlock(folio);
		goto reuse;
	}

	/*
	 * See do_wp_page(): we can only reuse the folio exclusively if
	 * there are no additional references. Note that we always drain
	 * the LRU cache immediately after adding a THP.
	 */
	if (folio_ref_count(folio) >
			1 + folio_test_swapcache(folio) * folio_nr_pages(folio))
		goto unlock_fallback;
	if (folio_test_swapcache(folio))
		folio_free_swap(folio);
	if (folio_ref_count(folio) == 1) {
		pmd_t entry;

		folio_move_anon_rmap(folio, vma);
		SetPageAnonExclusive(page);
		folio_unlock(folio);
reuse:
		if (unlikely(unshare)) {
			spin_unlock(vmf->ptl);
			return 0;
		}
		entry = pmd_mkyoung(orig_pmd);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
			update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
		spin_unlock(vmf->ptl);
		return 0;
	}

unlock_fallback:
	folio_unlock(folio);
	spin_unlock(vmf->ptl);
fallback:
	__split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
	return VM_FAULT_FALLBACK;
}

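/*
 * PMD-level counterpart of can_change_pte_writable(): decide whether a
 * protection change may safely make the mapping writable right away.
 */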
static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
					   unsigned long addr, pmd_t pmd)
{
	struct page *page;

	if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
		return false;

	/* Don't touch entries that are not even readable (NUMA hinting). */
	if (pmd_protnone(pmd))
		return false;

	/* Do we need write faults for softdirty tracking? */
	if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd))
		return false;

	/* Do we need write faults for uffd-wp tracking? */
	if (userfaultfd_huge_pmd_wp(vma, pmd))
		return false;

	if (!(vma->vm_flags & VM_SHARED)) {
		/* See can_change_pte_writable(). */
		page = vm_normal_page_pmd(vma, addr, pmd);
		return page && PageAnon(page) && PageAnonExclusive(page);
	}

	/* See can_change_pte_writable(). */
	return pmd_dirty(pmd);
}

/* NUMA hinting page fault entry point for trans huge pmds */
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	pmd_t oldpmd = vmf->orig_pmd;
	pmd_t pmd;
	struct folio *folio;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	int nid = NUMA_NO_NODE;
	int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
	bool migrated = false, writable = false;
	int flags = 0;

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
		spin_unlock(vmf->ptl);
		goto out;
	}

	pmd = pmd_modify(oldpmd, vma->vm_page_prot);

	/*
	 * Detect now whether the PMD could be writable; this information
	 * is only valid while holding the PT lock.
	 */
	writable = pmd_write(pmd);
	if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
	    can_change_pmd_writable(vma, vmf->address, pmd))
		writable = true;

	folio = vm_normal_folio_pmd(vma, haddr, pmd);
	if (!folio)
		goto out_map;

	/* See similar comment in do_numa_page for explanation */
	if (!writable)
		flags |= TNF_NO_GROUP;

	nid = folio_nid(folio);
	/*
	 * For memory tiering mode, cpupid of slow memory page is used
	 * to record page access time. So use default value.
	 */
	if (node_is_toptier(nid))
		last_cpupid = folio_last_cpupid(folio);
	target_nid = numa_migrate_prep(folio, vmf, haddr, nid, &flags);
	if (target_nid == NUMA_NO_NODE) {
		folio_put(folio);
		goto out_map;
	}

	spin_unlock(vmf->ptl);
	writable = false;

	migrated = migrate_misplaced_folio(folio, vma, target_nid);
	if (migrated) {
		flags |= TNF_MIGRATED;
		nid = target_nid;
	} else {
		flags |= TNF_MIGRATE_FAIL;
		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
		if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
			spin_unlock(vmf->ptl);
			goto out;
		}
		goto out_map;
	}

out:
	if (nid != NUMA_NO_NODE)
		task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);

	return 0;

out_map:
	/* Restore the PMD */
	pmd = pmd_modify(oldpmd, vma->vm_page_prot);
	pmd = pmd_mkyoung(pmd);
	if (writable)
		pmd = pmd_mkwrite(pmd, vma);
c5b5a3dd
YS
1724 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
1725 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1726 spin_unlock(vmf->ptl);
1727 goto out;
d10e63f2
MG
1728}
1729
319904ad
HY
1730/*
1731 * Return true if we do MADV_FREE successfully on entire pmd page.
1732 * Otherwise, return false.
1733 */
1734bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
b8d3c4c3 1735 pmd_t *pmd, unsigned long addr, unsigned long next)
b8d3c4c3
MK
1736{
1737 spinlock_t *ptl;
1738 pmd_t orig_pmd;
fc986a38 1739 struct folio *folio;
b8d3c4c3 1740 struct mm_struct *mm = tlb->mm;
319904ad 1741 bool ret = false;
b8d3c4c3 1742
ed6a7935 1743 tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
07e32661 1744
b6ec57f4
KS
1745 ptl = pmd_trans_huge_lock(pmd, vma);
1746 if (!ptl)
25eedabe 1747 goto out_unlocked;
b8d3c4c3
MK
1748
1749 orig_pmd = *pmd;
319904ad 1750 if (is_huge_zero_pmd(orig_pmd))
b8d3c4c3 1751 goto out;
b8d3c4c3 1752
84c3fc4e
ZY
1753 if (unlikely(!pmd_present(orig_pmd))) {
1754 VM_BUG_ON(thp_migration_supported() &&
1755 !is_pmd_migration_entry(orig_pmd));
1756 goto out;
1757 }
1758
e06d03d5 1759 folio = pmd_folio(orig_pmd);
b8d3c4c3 1760 /*
fc986a38
KW
1761 * If other processes are mapping this folio, we cannot discard
1762 * the folio unless they all do MADV_FREE, so let's skip the folio.
b8d3c4c3 1763 */
ebb34f78 1764 if (folio_likely_mapped_shared(folio))
b8d3c4c3
MK
1765 goto out;
1766
fc986a38 1767 if (!folio_trylock(folio))
b8d3c4c3
MK
1768 goto out;
1769
1770 /*
1771 * If the user wants to discard part of the THP's pages, split it so
1772 * MADV_FREE will deactivate only them.
1773 */
1774 if (next - addr != HPAGE_PMD_SIZE) {
fc986a38 1775 folio_get(folio);
b8d3c4c3 1776 spin_unlock(ptl);
fc986a38
KW
1777 split_folio(folio);
1778 folio_unlock(folio);
1779 folio_put(folio);
b8d3c4c3
MK
1780 goto out_unlocked;
1781 }
1782
fc986a38
KW
1783 if (folio_test_dirty(folio))
1784 folio_clear_dirty(folio);
1785 folio_unlock(folio);
b8d3c4c3 1786
b8d3c4c3 1787 if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
58ceeb6b 1788 pmdp_invalidate(vma, addr, pmd);
b8d3c4c3
MK
1789 orig_pmd = pmd_mkold(orig_pmd);
1790 orig_pmd = pmd_mkclean(orig_pmd);
1791
1792 set_pmd_at(mm, addr, pmd, orig_pmd);
1793 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1794 }
802a3a92 1795
6a6fe9eb 1796 folio_mark_lazyfree(folio);
319904ad 1797 ret = true;
b8d3c4c3
MK
1798out:
1799 spin_unlock(ptl);
1800out_unlocked:
1801 return ret;
1802}
1803
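A hedged companion sketch (editorial addition, not part of huge_memory.c): when MADV_FREE covers the whole PMD, madvise_free_huge_pmd() above clears dirty/young on the huge entry and marks the folio lazyfree without splitting it; a partial range takes the split path instead. The helper assumes p and pmd_size describe a PMD-aligned, THP-backed anonymous range such as the one set up in the earlier fork sketch.

#include <sys/mman.h>

/* Mark a PMD-aligned, PMD-sized anonymous range lazily freeable. */
static int lazyfree_pmd_range(void *p, size_t pmd_size)
{
	return madvise(p, pmd_size, MADV_FREE);
}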
953c66c2
AK
1804static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
1805{
1806 pgtable_t pgtable;
1807
1808 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
1809 pte_free(mm, pgtable);
c4812909 1810 mm_dec_nr_ptes(mm);
953c66c2
AK
1811}
1812
71e3aac0 1813int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
f21760b1 1814 pmd_t *pmd, unsigned long addr)
71e3aac0 1815{
da146769 1816 pmd_t orig_pmd;
bf929152 1817 spinlock_t *ptl;
71e3aac0 1818
ed6a7935 1819 tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
07e32661 1820
b6ec57f4
KS
1821 ptl = __pmd_trans_huge_lock(pmd, vma);
1822 if (!ptl)
da146769
KS
1823 return 0;
1824 /*
1825 * For architectures like ppc64 we look at deposited pgtable
1826 * when calling pmdp_huge_get_and_clear. So do the
1827 * pgtable_trans_huge_withdraw after finishing pmdp related
1828 * operations.
1829 */
93a98695
AK
1830 orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
1831 tlb->fullmm);
e5136e87 1832 arch_check_zapped_pmd(vma, orig_pmd);
da146769 1833 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
2484ca9b 1834 if (vma_is_special_huge(vma)) {
3b6521f5
OH
1835 if (arch_needs_pgtable_deposit())
1836 zap_deposited_table(tlb->mm, pmd);
da146769 1837 spin_unlock(ptl);
da146769 1838 } else if (is_huge_zero_pmd(orig_pmd)) {
c14a6eb4 1839 zap_deposited_table(tlb->mm, pmd);
da146769 1840 spin_unlock(ptl);
da146769 1841 } else {
0103b27a 1842 struct folio *folio = NULL;
616b8371
ZY
1843 int flush_needed = 1;
1844
1845 if (pmd_present(orig_pmd)) {
0103b27a
KW
1846 struct page *page = pmd_page(orig_pmd);
1847
1848 folio = page_folio(page);
1849 folio_remove_rmap_pmd(folio, page, vma);
0a7bda48 1850 WARN_ON_ONCE(folio_mapcount(folio) < 0);
616b8371
ZY
1851 VM_BUG_ON_PAGE(!PageHead(page), page);
1852 } else if (thp_migration_supported()) {
1853 swp_entry_t entry;
1854
1855 VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
1856 entry = pmd_to_swp_entry(orig_pmd);
0103b27a 1857 folio = pfn_swap_entry_folio(entry);
616b8371
ZY
1858 flush_needed = 0;
1859 } else
1860 WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
1861
0103b27a 1862 if (folio_test_anon(folio)) {
c14a6eb4 1863 zap_deposited_table(tlb->mm, pmd);
b5072380
KS
1864 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1865 } else {
953c66c2
AK
1866 if (arch_needs_pgtable_deposit())
1867 zap_deposited_table(tlb->mm, pmd);
6b27cc6c 1868 add_mm_counter(tlb->mm, mm_counter_file(folio),
0103b27a 1869 -HPAGE_PMD_NR);
b5072380 1870 }
616b8371 1871
da146769 1872 spin_unlock(ptl);
616b8371 1873 if (flush_needed)
0103b27a 1874 tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
025c5b24 1875 }
da146769 1876 return 1;
71e3aac0
AA
1877}
1878
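Similarly, a whole-PMD MADV_DONTNEED (or munmap) reaches zap_huge_pmd() above and tears down the huge entry in one operation. A hedged one-function sketch (editorial addition, not part of huge_memory.c), with the same assumptions as before:

#include <sys/mman.h>

/* Discard a PMD-aligned, PMD-sized anonymous range in one shot. */
static int drop_pmd_range(void *p, size_t pmd_size)
{
	return madvise(p, pmd_size, MADV_DONTNEED);
}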
1dd38b6c
AK
1879#ifndef pmd_move_must_withdraw
1880static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
1881 spinlock_t *old_pmd_ptl,
1882 struct vm_area_struct *vma)
1883{
1884 /*
1885 * With the split pmd lock we also need to move the preallocated
1886 * PTE page table if new_pmd is on a different PMD page table.
1887 *
1888 * We also don't deposit and withdraw tables for file pages.
1889 */
1890 return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
1891}
1892#endif
1893
ab6e3d09
NH
1894static pmd_t move_soft_dirty_pmd(pmd_t pmd)
1895{
1896#ifdef CONFIG_MEM_SOFT_DIRTY
1897 if (unlikely(is_pmd_migration_entry(pmd)))
1898 pmd = pmd_swp_mksoft_dirty(pmd);
1899 else if (pmd_present(pmd))
1900 pmd = pmd_mksoft_dirty(pmd);
1901#endif
1902 return pmd;
1903}
1904
bf8616d5 1905bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
b8aa9d9d 1906 unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
37a1c49a 1907{
bf929152 1908 spinlock_t *old_ptl, *new_ptl;
37a1c49a 1909 pmd_t pmd;
37a1c49a 1910 struct mm_struct *mm = vma->vm_mm;
5d190420 1911 bool force_flush = false;
37a1c49a 1912
37a1c49a
AA
1913 /*
1914 * The destination pmd shouldn't be established, free_pgtables()
a5be621e
HD
1915 * should have released it; but move_page_tables() might have already
1916 * inserted a page table, if racing against shmem/file collapse.
37a1c49a 1917 */
a5be621e 1918 if (!pmd_none(*new_pmd)) {
37a1c49a 1919 VM_BUG_ON(pmd_trans_huge(*new_pmd));
4b471e88 1920 return false;
37a1c49a
AA
1921 }
1922
bf929152
KS
1923 /*
1924 * We don't have to worry about the ordering of src and dst
c1e8d7c6 1925 * ptlocks because exclusive mmap_lock prevents deadlock.
bf929152 1926 */
b6ec57f4
KS
1927 old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
1928 if (old_ptl) {
bf929152
KS
1929 new_ptl = pmd_lockptr(mm, new_pmd);
1930 if (new_ptl != old_ptl)
1931 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
8809aa2d 1932 pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
eb66ae03 1933 if (pmd_present(pmd))
a2ce2666 1934 force_flush = true;
025c5b24 1935 VM_BUG_ON(!pmd_none(*new_pmd));
3592806c 1936
1dd38b6c 1937 if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
b3084f4d 1938 pgtable_t pgtable;
3592806c
KS
1939 pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
1940 pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
3592806c 1941 }
ab6e3d09
NH
1942 pmd = move_soft_dirty_pmd(pmd);
1943 set_pmd_at(mm, new_addr, new_pmd, pmd);
5d190420 1944 if (force_flush)
7c38f181 1945 flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
eb66ae03
LT
1946 if (new_ptl != old_ptl)
1947 spin_unlock(new_ptl);
bf929152 1948 spin_unlock(old_ptl);
4b471e88 1949 return true;
37a1c49a 1950 }
4b471e88 1951 return false;
37a1c49a
AA
1952}
1953
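move_huge_pmd() above is reached from mremap() when both the source and the destination are PMD-aligned, letting the kernel relocate the huge entry instead of splitting it. A hedged sketch (editorial addition, not part of huge_memory.c); MREMAP_MAYMOVE lets the kernel pick the new address, and alignment of that address is not guaranteed:

#define _GNU_SOURCE
#include <sys/mman.h>

/* Move a PMD-sized anonymous range; may relocate the huge pmd entry itself. */
static void *move_pmd_range(void *p, size_t pmd_size)
{
	return mremap(p, pmd_size, pmd_size, MREMAP_MAYMOVE);
}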
f123d74a
MG
1954/*
1955 * Returns
1956 * - 0 if PMD could not be locked
f0953a1b 1957 * - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
e346e668 1958 * or if prot_numa but THP migration is not supported
f0953a1b 1959 * - HPAGE_PMD_NR if protections changed and TLB flush necessary
f123d74a 1960 */
4a18419f
NA
1961int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1962 pmd_t *pmd, unsigned long addr, pgprot_t newprot,
1963 unsigned long cp_flags)
cd7548ab
JW
1964{
1965 struct mm_struct *mm = vma->vm_mm;
bf929152 1966 spinlock_t *ptl;
c9fe6656 1967 pmd_t oldpmd, entry;
58705444 1968 bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
292924b2
PX
1969 bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
1970 bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
6a56ccbc 1971 int ret = 1;
cd7548ab 1972
4a18419f
NA
1973 tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
1974
e346e668
YS
1975 if (prot_numa && !thp_migration_supported())
1976 return 1;
1977
b6ec57f4 1978 ptl = __pmd_trans_huge_lock(pmd, vma);
0a85e51d
KS
1979 if (!ptl)
1980 return 0;
e944fd67 1981
84c3fc4e
ZY
1982#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
1983 if (is_swap_pmd(*pmd)) {
1984 swp_entry_t entry = pmd_to_swp_entry(*pmd);
5662400a 1985 struct folio *folio = pfn_swap_entry_folio(entry);
24bf08c4 1986 pmd_t newpmd;
84c3fc4e
ZY
1987
1988 VM_BUG_ON(!is_pmd_migration_entry(*pmd));
4dd845b5 1989 if (is_writable_migration_entry(entry)) {
84c3fc4e
ZY
1990 /*
1991 * A protection check is difficult so
1992 * just be safe and disable write
1993 */
d986ba2b 1994 if (folio_test_anon(folio))
6c287605
DH
1995 entry = make_readable_exclusive_migration_entry(swp_offset(entry));
1996 else
1997 entry = make_readable_migration_entry(swp_offset(entry));
84c3fc4e 1998 newpmd = swp_entry_to_pmd(entry);
ab6e3d09
NH
1999 if (pmd_swp_soft_dirty(*pmd))
2000 newpmd = pmd_swp_mksoft_dirty(newpmd);
24bf08c4
DH
2001 } else {
2002 newpmd = *pmd;
84c3fc4e 2003 }
24bf08c4
DH
2004
2005 if (uffd_wp)
2006 newpmd = pmd_swp_mkuffd_wp(newpmd);
2007 else if (uffd_wp_resolve)
2008 newpmd = pmd_swp_clear_uffd_wp(newpmd);
2009 if (!pmd_same(*pmd, newpmd))
2010 set_pmd_at(mm, addr, pmd, newpmd);
84c3fc4e
ZY
2011 goto unlock;
2012 }
2013#endif
2014
a1a3a2fc 2015 if (prot_numa) {
d986ba2b 2016 struct folio *folio;
33024536 2017 bool toptier;
a1a3a2fc
HY
2018 /*
2019 * Avoid trapping faults against the zero page. The read-only
2020 * data is likely to be read-cached on the local CPU and
2021 * local/remote hits to the zero page are not interesting.
2022 */
2023 if (is_huge_zero_pmd(*pmd))
2024 goto unlock;
025c5b24 2025
a1a3a2fc
HY
2026 if (pmd_protnone(*pmd))
2027 goto unlock;
0a85e51d 2028
e06d03d5 2029 folio = pmd_folio(*pmd);
d986ba2b 2030 toptier = node_is_toptier(folio_nid(folio));
a1a3a2fc
HY
2031 /*
2032 * Skip scanning top tier node if normal numa
2033 * balancing is disabled
2034 */
2035 if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
33024536 2036 toptier)
a1a3a2fc 2037 goto unlock;
33024536
HY
2038
2039 if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
2040 !toptier)
d986ba2b
KW
2041 folio_xchg_access_time(folio,
2042 jiffies_to_msecs(jiffies));
a1a3a2fc 2043 }
ced10803 2044 /*
3e4e28c5 2045 * In case prot_numa, we are under mmap_read_lock(mm). It's critical
ced10803 2046 * to not clear pmd intermittently to avoid race with MADV_DONTNEED
3e4e28c5 2047 * which is also under mmap_read_lock(mm):
ced10803
KS
2048 *
2049 * CPU0: CPU1:
2050 * change_huge_pmd(prot_numa=1)
2051 * pmdp_huge_get_and_clear_notify()
2052 * madvise_dontneed()
2053 * zap_pmd_range()
2054 * pmd_trans_huge(*pmd) == 0 (without ptl)
2055 * // skip the pmd
2056 * set_pmd_at();
2057 * // pmd is re-established
2058 *
2059 * The race makes MADV_DONTNEED miss the huge pmd and not clear it,
2060 * which may break userspace.
2061 *
4f831457 2062 * pmdp_invalidate_ad() is required to make sure we don't miss
ced10803
KS
2063 * dirty/young flags set by hardware.
2064 */
4f831457 2065 oldpmd = pmdp_invalidate_ad(vma, addr, pmd);
ced10803 2066
c9fe6656 2067 entry = pmd_modify(oldpmd, newprot);
f1eb1bac 2068 if (uffd_wp)
292924b2 2069 entry = pmd_mkuffd_wp(entry);
f1eb1bac 2070 else if (uffd_wp_resolve)
292924b2
PX
2071 /*
2072 * Leave the write bit to be handled by the page fault
2073 * handler, so that things like COW can be handled
2074 * properly.
2075 */
2076 entry = pmd_clear_uffd_wp(entry);
c27f479e
DH
2077
2078 /* See change_pte_range(). */
2079 if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
2080 can_change_pmd_writable(vma, addr, entry))
161e393c 2081 entry = pmd_mkwrite(entry, vma);
c27f479e 2082
0a85e51d
KS
2083 ret = HPAGE_PMD_NR;
2084 set_pmd_at(mm, addr, pmd, entry);
4a18419f 2085
c9fe6656
NA
2086 if (huge_pmd_needs_flush(oldpmd, entry))
2087 tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
0a85e51d
KS
2088unlock:
2089 spin_unlock(ptl);
025c5b24
NH
2090 return ret;
2091}
2092
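change_huge_pmd() above is the PMD-level leg of change_protection(), so an mprotect() that covers a THP-backed PMD updates one huge entry rather than 512 PTEs. A hedged sketch (editorial addition, not part of huge_memory.c):

#include <sys/mman.h>

/* Make a PMD-aligned, PMD-sized, THP-backed range read-only. */
static int make_pmd_range_readonly(void *p, size_t pmd_size)
{
	return mprotect(p, pmd_size, PROT_READ);
}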
adef4406
AA
2093#ifdef CONFIG_USERFAULTFD
2094/*
867a43a3 2095 * The PT lock for src_pmd and dst_vma/src_vma (for reading) are locked by
adef4406
AA
2096 * the caller, but it must return after releasing the page_table_lock.
2097 * Just move the page from src_pmd to dst_pmd if possible.
2098 * Return zero if succeeded in moving the page, -EAGAIN if it needs to be
2099 * repeated by the caller, or other errors in case of failure.
2100 */
2101int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
2102 struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
2103 unsigned long dst_addr, unsigned long src_addr)
2104{
2105 pmd_t _dst_pmd, src_pmdval;
2106 struct page *src_page;
2107 struct folio *src_folio;
2108 struct anon_vma *src_anon_vma;
2109 spinlock_t *src_ptl, *dst_ptl;
2110 pgtable_t src_pgtable;
2111 struct mmu_notifier_range range;
2112 int err = 0;
2113
2114 src_pmdval = *src_pmd;
2115 src_ptl = pmd_lockptr(mm, src_pmd);
2116
2117 lockdep_assert_held(src_ptl);
867a43a3
LG
2118 vma_assert_locked(src_vma);
2119 vma_assert_locked(dst_vma);
adef4406
AA
2120
2121 /* Sanity checks before the operation */
2122 if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) ||
2123 WARN_ON_ONCE(dst_addr & ~HPAGE_PMD_MASK)) {
2124 spin_unlock(src_ptl);
2125 return -EINVAL;
2126 }
2127
2128 if (!pmd_trans_huge(src_pmdval)) {
2129 spin_unlock(src_ptl);
2130 if (is_pmd_migration_entry(src_pmdval)) {
2131 pmd_migration_entry_wait(mm, &src_pmdval);
2132 return -EAGAIN;
2133 }
2134 return -ENOENT;
2135 }
2136
2137 src_page = pmd_page(src_pmdval);
adef4406 2138
eb1521da
SB
2139 if (!is_huge_zero_pmd(src_pmdval)) {
2140 if (unlikely(!PageAnonExclusive(src_page))) {
2141 spin_unlock(src_ptl);
2142 return -EBUSY;
2143 }
2144
2145 src_folio = page_folio(src_page);
2146 folio_get(src_folio);
2147 } else
2148 src_folio = NULL;
2149
adef4406
AA
2150 spin_unlock(src_ptl);
2151
2152 flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE);
2153 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, src_addr,
2154 src_addr + HPAGE_PMD_SIZE);
2155 mmu_notifier_invalidate_range_start(&range);
2156
eb1521da
SB
2157 if (src_folio) {
2158 folio_lock(src_folio);
adef4406 2159
eb1521da
SB
2160 /*
2161 * split_huge_page walks the anon_vma chain without the page
2162 * lock. Serialize against it with the anon_vma lock; the page
2163 * lock is not enough.
2164 */
2165 src_anon_vma = folio_get_anon_vma(src_folio);
2166 if (!src_anon_vma) {
2167 err = -EAGAIN;
2168 goto unlock_folio;
2169 }
2170 anon_vma_lock_write(src_anon_vma);
2171 } else
2172 src_anon_vma = NULL;
adef4406
AA
2173
2174 dst_ptl = pmd_lockptr(mm, dst_pmd);
2175 double_pt_lock(src_ptl, dst_ptl);
2176 if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
2177 !pmd_same(*dst_pmd, dst_pmdval))) {
2178 err = -EAGAIN;
2179 goto unlock_ptls;
2180 }
eb1521da
SB
2181 if (src_folio) {
2182 if (folio_maybe_dma_pinned(src_folio) ||
2183 !PageAnonExclusive(&src_folio->page)) {
2184 err = -EBUSY;
2185 goto unlock_ptls;
2186 }
adef4406 2187
eb1521da
SB
2188 if (WARN_ON_ONCE(!folio_test_head(src_folio)) ||
2189 WARN_ON_ONCE(!folio_test_anon(src_folio))) {
2190 err = -EBUSY;
2191 goto unlock_ptls;
2192 }
adef4406 2193
eb1521da
SB
2194 src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
2195 /* Folio got pinned from under us. Put it back and fail the move. */
2196 if (folio_maybe_dma_pinned(src_folio)) {
2197 set_pmd_at(mm, src_addr, src_pmd, src_pmdval);
2198 err = -EBUSY;
2199 goto unlock_ptls;
2200 }
adef4406 2201
c0205eaf 2202 folio_move_anon_rmap(src_folio, dst_vma);
b5ba3a64 2203 src_folio->index = linear_page_index(dst_vma, dst_addr);
c0205eaf 2204
eb1521da
SB
2205 _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
2206 /* Follow mremap() behavior and treat the entry dirty after the move */
2207 _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
2208 } else {
2209 src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
2210 _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot);
2211 }
adef4406
AA
2212 set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd);
2213
2214 src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd);
2215 pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable);
2216unlock_ptls:
2217 double_pt_unlock(src_ptl, dst_ptl);
eb1521da
SB
2218 if (src_anon_vma) {
2219 anon_vma_unlock_write(src_anon_vma);
2220 put_anon_vma(src_anon_vma);
2221 }
adef4406
AA
2222unlock_folio:
2223 /* unblock rmap walks */
eb1521da
SB
2224 if (src_folio)
2225 folio_unlock(src_folio);
adef4406 2226 mmu_notifier_invalidate_range_end(&range);
eb1521da
SB
2227 if (src_folio)
2228 folio_put(src_folio);
adef4406
AA
2229 return err;
2230}
2231#endif /* CONFIG_USERFAULTFD */
2232
025c5b24 2233/*
8f19b0c0 2234 * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
025c5b24 2235 *
8f19b0c0
HY
2236 * Note that if it returns page table lock pointer, this routine returns without
2237 * unlocking page table lock. So callers must unlock it.
025c5b24 2238 */
b6ec57f4 2239spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
025c5b24 2240{
b6ec57f4
KS
2241 spinlock_t *ptl;
2242 ptl = pmd_lock(vma->vm_mm, pmd);
84c3fc4e
ZY
2243 if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
2244 pmd_devmap(*pmd)))
b6ec57f4
KS
2245 return ptl;
2246 spin_unlock(ptl);
2247 return NULL;
cd7548ab
JW
2248}
2249
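The usual caller pattern for this helper, mirroring madvise_free_huge_pmd() and zap_huge_pmd() earlier in this file, is sketched below (editorial sketch, not part of huge_memory.c; example_walk_one_pmd() is a hypothetical caller and pmd_trans_huge_lock() is the wrapper most callers use):

static void example_walk_one_pmd(struct vm_area_struct *vma, pmd_t *pmd)
{
	spinlock_t *ptl;

	ptl = pmd_trans_huge_lock(pmd, vma);	/* wraps __pmd_trans_huge_lock() */
	if (!ptl)
		return;		/* not a huge entry: fall back to the PTE path */

	/* *pmd is stable here: it cannot be split or zapped under us. */

	spin_unlock(ptl);
}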
a00cc7d9 2250/*
d965e390 2251 * Returns page table lock pointer if a given pud maps a thp, NULL otherwise.
a00cc7d9 2252 *
d965e390
ML
2253 * Note that if it returns page table lock pointer, this routine returns without
2254 * unlocking page table lock. So callers must unlock it.
a00cc7d9
MW
2255 */
2256spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
2257{
2258 spinlock_t *ptl;
2259
2260 ptl = pud_lock(vma->vm_mm, pud);
2261 if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
2262 return ptl;
2263 spin_unlock(ptl);
2264 return NULL;
2265}
2266
2267#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
2268int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
2269 pud_t *pud, unsigned long addr)
2270{
a00cc7d9
MW
2271 spinlock_t *ptl;
2272
2273 ptl = __pud_trans_huge_lock(pud, vma);
2274 if (!ptl)
2275 return 0;
74929079 2276
f32928ab 2277 pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
a00cc7d9 2278 tlb_remove_pud_tlb_entry(tlb, pud, addr);
2484ca9b 2279 if (vma_is_special_huge(vma)) {
a00cc7d9
MW
2280 spin_unlock(ptl);
2281 /* No zero page support yet */
2282 } else {
2283 /* No support for anonymous PUD pages yet */
2284 BUG();
2285 }
2286 return 1;
2287}
2288
2289static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
2290 unsigned long haddr)
2291{
2292 VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
2293 VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2294 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
2295 VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
2296
ce9311cf 2297 count_vm_event(THP_SPLIT_PUD);
a00cc7d9 2298
ec8832d0 2299 pudp_huge_clear_flush(vma, haddr, pud);
a00cc7d9
MW
2300}
2301
2302void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
2303 unsigned long address)
2304{
2305 spinlock_t *ptl;
ac46d4f3 2306 struct mmu_notifier_range range;
a00cc7d9 2307
7d4a8be0 2308 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
6f4f13e8 2309 address & HPAGE_PUD_MASK,
ac46d4f3
JG
2310 (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
2311 mmu_notifier_invalidate_range_start(&range);
2312 ptl = pud_lock(vma->vm_mm, pud);
a00cc7d9
MW
2313 if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
2314 goto out;
ac46d4f3 2315 __split_huge_pud_locked(vma, pud, range.start);
a00cc7d9
MW
2316
2317out:
2318 spin_unlock(ptl);
ec8832d0 2319 mmu_notifier_invalidate_range_end(&range);
a00cc7d9
MW
2320}
2321#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
2322
eef1b3ba
KS
2323static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2324 unsigned long haddr, pmd_t *pmd)
2325{
2326 struct mm_struct *mm = vma->vm_mm;
2327 pgtable_t pgtable;
42b2af2c 2328 pmd_t _pmd, old_pmd;
c9c1ee20
HD
2329 unsigned long addr;
2330 pte_t *pte;
eef1b3ba
KS
2331 int i;
2332
0f10851e
JG
2333 /*
2334 * Leave pmd empty until pte is filled; note that it is fine to delay
2335 * notification until mmu_notifier_invalidate_range_end() as we are
2336 * replacing a zero pmd write protected page with a zero pte write
2337 * protected page.
2338 *
ee65728e 2339 * See Documentation/mm/mmu_notifier.rst
0f10851e 2340 */
42b2af2c 2341 old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
eef1b3ba
KS
2342
2343 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2344 pmd_populate(mm, &_pmd, pgtable);
2345
c9c1ee20
HD
2346 pte = pte_offset_map(&_pmd, haddr);
2347 VM_BUG_ON(!pte);
2348 for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
2349 pte_t entry;
2350
2351 entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot);
eef1b3ba 2352 entry = pte_mkspecial(entry);
42b2af2c
DH
2353 if (pmd_uffd_wp(old_pmd))
2354 entry = pte_mkuffd_wp(entry);
c33c7948 2355 VM_BUG_ON(!pte_none(ptep_get(pte)));
c9c1ee20
HD
2356 set_pte_at(mm, addr, pte, entry);
2357 pte++;
eef1b3ba 2358 }
c9c1ee20 2359 pte_unmap(pte - 1);
eef1b3ba
KS
2360 smp_wmb(); /* make pte visible before pmd */
2361 pmd_populate(mm, pmd, pgtable);
eef1b3ba
KS
2362}
2363
2364static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
ba988280 2365 unsigned long haddr, bool freeze)
eef1b3ba
KS
2366{
2367 struct mm_struct *mm = vma->vm_mm;
91b2978a 2368 struct folio *folio;
eef1b3ba
KS
2369 struct page *page;
2370 pgtable_t pgtable;
423ac9af 2371 pmd_t old_pmd, _pmd;
292924b2 2372 bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
0ccf7f16 2373 bool anon_exclusive = false, dirty = false;
2ac015e2 2374 unsigned long addr;
c9c1ee20 2375 pte_t *pte;
eef1b3ba
KS
2376 int i;
2377
2378 VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
2379 VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2380 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
84c3fc4e
ZY
2381 VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
2382 && !pmd_devmap(*pmd));
eef1b3ba
KS
2383
2384 count_vm_event(THP_SPLIT_PMD);
2385
d21b9e57 2386 if (!vma_is_anonymous(vma)) {
ec8832d0 2387 old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
953c66c2
AK
2388 /*
2389 * We are going to unmap this huge page. So
2390 * just go ahead and zap it
2391 */
2392 if (arch_needs_pgtable_deposit())
2393 zap_deposited_table(mm, pmd);
2484ca9b 2394 if (vma_is_special_huge(vma))
d21b9e57 2395 return;
99fa8a48
HD
2396 if (unlikely(is_pmd_migration_entry(old_pmd))) {
2397 swp_entry_t entry;
2398
2399 entry = pmd_to_swp_entry(old_pmd);
439992ff 2400 folio = pfn_swap_entry_folio(entry);
99fa8a48
HD
2401 } else {
2402 page = pmd_page(old_pmd);
a8e61d58
DH
2403 folio = page_folio(page);
2404 if (!folio_test_dirty(folio) && pmd_dirty(old_pmd))
db44c658 2405 folio_mark_dirty(folio);
a8e61d58
DH
2406 if (!folio_test_referenced(folio) && pmd_young(old_pmd))
2407 folio_set_referenced(folio);
2408 folio_remove_rmap_pmd(folio, page, vma);
2409 folio_put(folio);
99fa8a48 2410 }
6b27cc6c 2411 add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR);
eef1b3ba 2412 return;
99fa8a48
HD
2413 }
2414
3b77e8c8 2415 if (is_huge_zero_pmd(*pmd)) {
4645b9fe
JG
2416 /*
2417 * FIXME: Do we want to invalidate secondary mmu by calling
1af5a810
AP
2418 * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below
2419 * inside __split_huge_pmd() ?
4645b9fe
JG
2420 *
2421 * We are going from a zero huge page write protected to zero
2422 * small page also write protected, so it does not seem useful
2423 * to invalidate secondary mmu at this time.
2424 */
eef1b3ba
KS
2425 return __split_huge_zero_page_pmd(vma, haddr, pmd);
2426 }
2427
3a5a8d34 2428 pmd_migration = is_pmd_migration_entry(*pmd);
2e83ee1d 2429 if (unlikely(pmd_migration)) {
84c3fc4e
ZY
2430 swp_entry_t entry;
2431
3a5a8d34 2432 old_pmd = *pmd;
423ac9af 2433 entry = pmd_to_swp_entry(old_pmd);
af5cdaf8 2434 page = pfn_swap_entry_to_page(entry);
4dd845b5 2435 write = is_writable_migration_entry(entry);
6c287605
DH
2436 if (PageAnon(page))
2437 anon_exclusive = is_readable_exclusive_migration_entry(entry);
2e346877
PX
2438 young = is_migration_entry_young(entry);
2439 dirty = is_migration_entry_dirty(entry);
2e83ee1d 2440 soft_dirty = pmd_swp_soft_dirty(old_pmd);
f45ec5ff 2441 uffd_wp = pmd_swp_uffd_wp(old_pmd);
2e83ee1d 2442 } else {
3a5a8d34
RR
2443 /*
2444 * Up to this point the pmd is present and huge and userland has
2445 * the whole access to the hugepage during the split (which
2446 * happens in place). If we overwrite the pmd with the not-huge
2447 * version pointing to the pte here (which of course we could if
2448 * all CPUs were bug free), userland could trigger a small page
2449 * size TLB miss on the small sized TLB while the hugepage TLB
2450 * entry is still established in the huge TLB. Some CPUs don't
2451 * like that. See
2452 * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
2453 * 383 on page 105. Intel should be safe but also warns that
2454 * it's only safe if the permission and cache attributes of the
2455 * two entries loaded in the two TLBs are identical (which should
2456 * be the case here). But it is generally safer to never allow
2457 * small and huge TLB entries for the same virtual address to be
2458 * loaded simultaneously. So instead of doing "pmd_populate();
2459 * flush_pmd_tlb_range();" we first mark the current pmd
2460 * notpresent (atomically because here the pmd_trans_huge must
2461 * remain set at all times on the pmd until the split is
2462 * complete for this pmd), then we flush the SMP TLB and finally
2463 * we write the non-huge version of the pmd entry with
2464 * pmd_populate.
2465 */
2466 old_pmd = pmdp_invalidate(vma, haddr, pmd);
423ac9af 2467 page = pmd_page(old_pmd);
91b2978a 2468 folio = page_folio(page);
0ccf7f16
PX
2469 if (pmd_dirty(old_pmd)) {
2470 dirty = true;
91b2978a 2471 folio_set_dirty(folio);
0ccf7f16 2472 }
2e83ee1d
PX
2473 write = pmd_write(old_pmd);
2474 young = pmd_young(old_pmd);
2475 soft_dirty = pmd_soft_dirty(old_pmd);
292924b2 2476 uffd_wp = pmd_uffd_wp(old_pmd);
6c287605 2477
91b2978a
DH
2478 VM_WARN_ON_FOLIO(!folio_ref_count(folio), folio);
2479 VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
6c287605
DH
2480
2481 /*
2482 * Without "freeze", we'll simply split the PMD, propagating the
2483 * PageAnonExclusive() flag for each PTE by setting it for
2484 * each subpage -- no need to (temporarily) clear.
2485 *
2486 * With "freeze" we want to replace mapped pages by
2487 * migration entries right away. This is only possible if we
2488 * managed to clear PageAnonExclusive() -- see
2489 * set_pmd_migration_entry().
2490 *
2491 * In case we cannot clear PageAnonExclusive(), split the PMD
2492 * only and let try_to_migrate_one() fail later.
088b8aa5 2493 *
e3b4b137 2494 * See folio_try_share_anon_rmap_pmd(): invalidate PMD first.
6c287605 2495 */
91b2978a 2496 anon_exclusive = PageAnonExclusive(page);
e3b4b137
DH
2497 if (freeze && anon_exclusive &&
2498 folio_try_share_anon_rmap_pmd(folio, page))
6c287605 2499 freeze = false;
91b2978a
DH
2500 if (!freeze) {
2501 rmap_t rmap_flags = RMAP_NONE;
2502
2503 folio_ref_add(folio, HPAGE_PMD_NR - 1);
2504 if (anon_exclusive)
2505 rmap_flags |= RMAP_EXCLUSIVE;
2506 folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
2507 vma, haddr, rmap_flags);
2508 }
2e83ee1d 2509 }
eef1b3ba 2510
423ac9af
AK
2511 /*
2512 * Withdraw the table only after we mark the pmd entry invalid.
2513 * This is critical for some architectures (Power).
2514 */
eef1b3ba
KS
2515 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2516 pmd_populate(mm, &_pmd, pgtable);
2517
c9c1ee20
HD
2518 pte = pte_offset_map(&_pmd, haddr);
2519 VM_BUG_ON(!pte);
2bdba986
RR
2520
2521 /*
2522 * Note that NUMA hinting access restrictions are not transferred to
2523 * avoid any possibility of altering permissions across VMAs.
2524 */
2525 if (freeze || pmd_migration) {
2526 for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
2527 pte_t entry;
ba988280 2528 swp_entry_t swp_entry;
2bdba986 2529
4dd845b5
AP
2530 if (write)
2531 swp_entry = make_writable_migration_entry(
2532 page_to_pfn(page + i));
6c287605
DH
2533 else if (anon_exclusive)
2534 swp_entry = make_readable_exclusive_migration_entry(
2535 page_to_pfn(page + i));
4dd845b5
AP
2536 else
2537 swp_entry = make_readable_migration_entry(
2538 page_to_pfn(page + i));
2e346877
PX
2539 if (young)
2540 swp_entry = make_migration_entry_young(swp_entry);
2541 if (dirty)
2542 swp_entry = make_migration_entry_dirty(swp_entry);
ba988280 2543 entry = swp_entry_to_pte(swp_entry);
804dd150
AA
2544 if (soft_dirty)
2545 entry = pte_swp_mksoft_dirty(entry);
f45ec5ff
PX
2546 if (uffd_wp)
2547 entry = pte_swp_mkuffd_wp(entry);
2bdba986
RR
2548
2549 VM_WARN_ON(!pte_none(ptep_get(pte + i)));
2550 set_pte_at(mm, addr, pte + i, entry);
ba988280 2551 }
2bdba986
RR
2552 } else {
2553 pte_t entry;
2554
2555 entry = mk_pte(page, READ_ONCE(vma->vm_page_prot));
2556 if (write)
2557 entry = pte_mkwrite(entry, vma);
2558 if (!young)
2559 entry = pte_mkold(entry);
2560 /* NOTE: this may set soft-dirty too on some archs */
2561 if (dirty)
2562 entry = pte_mkdirty(entry);
2563 if (soft_dirty)
2564 entry = pte_mksoft_dirty(entry);
2565 if (uffd_wp)
2566 entry = pte_mkuffd_wp(entry);
2567
2568 for (i = 0; i < HPAGE_PMD_NR; i++)
2569 VM_WARN_ON(!pte_none(ptep_get(pte + i)));
2570
2571 set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR);
eef1b3ba 2572 }
2bdba986 2573 pte_unmap(pte);
eef1b3ba 2574
cb67f428 2575 if (!pmd_migration)
a8e61d58 2576 folio_remove_rmap_pmd(folio, page, vma);
96d82deb
HD
2577 if (freeze)
2578 put_page(page);
eef1b3ba
KS
2579
2580 smp_wmb(); /* make pte visible before pmd */
2581 pmd_populate(mm, pmd, pgtable);
2582}
2583
2584void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
af28a988 2585 unsigned long address, bool freeze, struct folio *folio)
eef1b3ba
KS
2586{
2587 spinlock_t *ptl;
ac46d4f3 2588 struct mmu_notifier_range range;
eef1b3ba 2589
7d4a8be0 2590 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
6f4f13e8 2591 address & HPAGE_PMD_MASK,
ac46d4f3
JG
2592 (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
2593 mmu_notifier_invalidate_range_start(&range);
2594 ptl = pmd_lock(vma->vm_mm, pmd);
33f4751e
NH
2595
2596 /*
af28a988
MWO
2597 * If caller asks to setup a migration entry, we need a folio to check
2598 * pmd against. Otherwise we can end up replacing wrong folio.
33f4751e 2599 */
af28a988 2600 VM_BUG_ON(freeze && !folio);
83a8441f 2601 VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));
33f4751e 2602
7f760917 2603 if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
83a8441f 2604 is_pmd_migration_entry(*pmd)) {
cea33328
ML
2605 /*
2606 * It's safe to call pmd_page when folio is set because it's
2607 * guaranteed that pmd is present.
2608 */
e06d03d5 2609 if (folio && folio != pmd_folio(*pmd))
83a8441f 2610 goto out;
7f760917 2611 __split_huge_pmd_locked(vma, pmd, range.start, freeze);
83a8441f 2612 }
7f760917 2613
e90309c9 2614out:
eef1b3ba 2615 spin_unlock(ptl);
ec8832d0 2616 mmu_notifier_invalidate_range_end(&range);
eef1b3ba
KS
2617}
2618
fec89c10 2619void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
af28a988 2620 bool freeze, struct folio *folio)
94fcc585 2621{
50722804 2622 pmd_t *pmd = mm_find_pmd(vma->vm_mm, address);
94fcc585 2623
50722804 2624 if (!pmd)
f72e7dcd
HD
2625 return;
2626
af28a988 2627 __split_huge_pmd(vma, pmd, address, freeze, folio);
94fcc585
AA
2628}
2629
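For reference, a hedged editorial sketch (not part of huge_memory.c) of how callers that already hold the pmd request a split; split_huge_pmd() is the macro form built on __split_huge_pmd(), and example_force_pte_mapping() is hypothetical:

static void example_force_pte_mapping(struct vm_area_struct *vma,
				      pmd_t *pmd, unsigned long addr)
{
	/* Remap the huge pmd with a regular page table; no memory is freed. */
	split_huge_pmd(vma, pmd, addr);
}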
71f9e58e
ML
2630static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
2631{
2632 /*
2633 * If the new address isn't hpage aligned and it could previously
2634 * contain a hugepage: check if we need to split a huge pmd.
2635 */
2636 if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
2637 range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
2638 ALIGN(address, HPAGE_PMD_SIZE)))
2639 split_huge_pmd_address(vma, address, false, NULL);
2640}
2641
e1b9996b 2642void vma_adjust_trans_huge(struct vm_area_struct *vma,
94fcc585
AA
2643 unsigned long start,
2644 unsigned long end,
2645 long adjust_next)
2646{
71f9e58e
ML
2647 /* Check if we need to split start first. */
2648 split_huge_pmd_if_needed(vma, start);
94fcc585 2649
71f9e58e
ML
2650 /* Check if we need to split end next. */
2651 split_huge_pmd_if_needed(vma, end);
94fcc585
AA
2652
2653 /*
68540502 2654 * If we're also updating the next vma vm_start,
71f9e58e 2655 * check if we need to split it.
94fcc585
AA
2656 */
2657 if (adjust_next > 0) {
68540502 2658 struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end);
94fcc585 2659 unsigned long nstart = next->vm_start;
f9d86a60 2660 nstart += adjust_next;
71f9e58e 2661 split_huge_pmd_if_needed(next, nstart);
94fcc585
AA
2662 }
2663}
e9b61f19 2664
684555aa 2665static void unmap_folio(struct folio *folio)
e9b61f19 2666{
319a624e
ZY
2667 enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SYNC |
2668 TTU_BATCH_FLUSH;
e9b61f19 2669
684555aa 2670 VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
e9b61f19 2671
319a624e
ZY
2672 if (folio_test_pmd_mappable(folio))
2673 ttu_flags |= TTU_SPLIT_HUGE_PMD;
2674
a98a2f0c
AP
2675 /*
2676 * Anon pages need migration entries to preserve them, but file
2677 * pages can simply be left unmapped, then faulted back on demand.
2678 * If that is ever changed (perhaps for mlock), update remap_page().
2679 */
4b8554c5
MWO
2680 if (folio_test_anon(folio))
2681 try_to_migrate(folio, ttu_flags);
a98a2f0c 2682 else
869f7ee6 2683 try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
3027c6f8
BW
2684
2685 try_to_unmap_flush();
e9b61f19
KS
2686}
2687
4eecb8b9 2688static void remap_page(struct folio *folio, unsigned long nr)
e9b61f19 2689{
4eecb8b9 2690 int i = 0;
ab02c252 2691
684555aa 2692 /* If unmap_folio() uses try_to_migrate() on file, remove this check */
4eecb8b9 2693 if (!folio_test_anon(folio))
ab02c252 2694 return;
4eecb8b9
MWO
2695 for (;;) {
2696 remove_migration_ptes(folio, folio, true);
2697 i += folio_nr_pages(folio);
2698 if (i >= nr)
2699 break;
2700 folio = folio_next(folio);
ace71a19 2701 }
e9b61f19
KS
2702}
2703
94866635 2704static void lru_add_page_tail(struct page *head, struct page *tail,
88dcb9a3
AS
2705 struct lruvec *lruvec, struct list_head *list)
2706{
94866635 2707 VM_BUG_ON_PAGE(!PageHead(head), head);
94866635 2708 VM_BUG_ON_PAGE(PageLRU(tail), head);
6168d0da 2709 lockdep_assert_held(&lruvec->lru_lock);
88dcb9a3 2710
6dbb5741 2711 if (list) {
88dcb9a3 2712 /* page reclaim is reclaiming a huge page */
6dbb5741 2713 VM_WARN_ON(PageLRU(head));
94866635
AS
2714 get_page(tail);
2715 list_add_tail(&tail->lru, list);
88dcb9a3 2716 } else {
6dbb5741
AS
2717 /* head is still on lru (and we have it frozen) */
2718 VM_WARN_ON(!PageLRU(head));
07ca7606
HD
2719 if (PageUnevictable(tail))
2720 tail->mlock_count = 0;
2721 else
2722 list_add_tail(&tail->lru, &head->lru);
6dbb5741 2723 SetPageLRU(tail);
88dcb9a3
AS
2724 }
2725}
2726
07e09c48 2727static void __split_huge_page_tail(struct folio *folio, int tail,
c010d47f
ZY
2728 struct lruvec *lruvec, struct list_head *list,
2729 unsigned int new_order)
e9b61f19 2730{
07e09c48 2731 struct page *head = &folio->page;
e9b61f19 2732 struct page *page_tail = head + tail;
07e09c48
DH
2733 /*
2734 * Careful: new_folio is not a "real" folio before we cleared PageTail.
2735 * Don't pass it around before clear_compound_head().
2736 */
2737 struct folio *new_folio = (struct folio *)page_tail;
e9b61f19 2738
8df651c7 2739 VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
e9b61f19
KS
2740
2741 /*
605ca5ed
KK
2742 * Clone page flags before unfreezing refcount.
2743 *
2744 * After a successful get_page_unless_zero(), flags may change,
8958b249 2745 * for example via lock_page(), which sets PG_waiters.
6c287605
DH
2746 *
2747 * Note that for mapped sub-pages of an anonymous THP,
684555aa 2748 * PG_anon_exclusive has been cleared in unmap_folio() and is stored in
6c287605
DH
2749 * the migration entry instead from where remap_page() will restore it.
2750 * We can still have PG_anon_exclusive set on effectively unmapped and
2751 * unreferenced sub-pages of an anonymous THP: we can simply drop
2752 * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
e9b61f19 2753 */
e9b61f19
KS
2754 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
2755 page_tail->flags |= (head->flags &
2756 ((1L << PG_referenced) |
2757 (1L << PG_swapbacked) |
38d8b4e6 2758 (1L << PG_swapcache) |
e9b61f19
KS
2759 (1L << PG_mlocked) |
2760 (1L << PG_uptodate) |
2761 (1L << PG_active) |
1899ad18 2762 (1L << PG_workingset) |
e9b61f19 2763 (1L << PG_locked) |
b8d3c4c3 2764 (1L << PG_unevictable) |
b0284cd2 2765#ifdef CONFIG_ARCH_USES_PG_ARCH_X
72e6afa0 2766 (1L << PG_arch_2) |
ef6458b1 2767 (1L << PG_arch_3) |
72e6afa0 2768#endif
ec1c86b2
YZ
2769 (1L << PG_dirty) |
2770 LRU_GEN_MASK | LRU_REFS_MASK));
e9b61f19 2771
cb67f428 2772 /* ->mapping in first and second tail page is replaced by other uses */
173d9d9f
HD
2773 VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
2774 page_tail);
2775 page_tail->mapping = head->mapping;
2776 page_tail->index = head->index + tail;
71e2d666
MG
2777
2778 /*
cfeed8ff
DH
2779 * page->private should not be set in tail pages. Fix up and warn once
2780 * if private is unexpectedly set.
71e2d666 2781 */
cfeed8ff
DH
2782 if (unlikely(page_tail->private)) {
2783 VM_WARN_ON_ONCE_PAGE(true, page_tail);
71e2d666
MG
2784 page_tail->private = 0;
2785 }
07e09c48
DH
2786 if (folio_test_swapcache(folio))
2787 new_folio->swap.val = folio->swap.val + tail;
173d9d9f 2788
605ca5ed 2789 /* Page flags must be visible before we make the page non-compound. */
e9b61f19
KS
2790 smp_wmb();
2791
605ca5ed
KK
2792 /*
2793 * Clear PageTail before unfreezing page refcount.
2794 *
2795 * After a successful get_page_unless_zero(), a put_page() may follow,
2796 * which needs a correct compound_head().
2797 */
e9b61f19 2798 clear_compound_head(page_tail);
c010d47f
ZY
2799 if (new_order) {
2800 prep_compound_page(page_tail, new_order);
85edc15a 2801 folio_set_large_rmappable(new_folio);
c010d47f 2802 }
e9b61f19 2803
605ca5ed 2804 /* Finally unfreeze refcount. Additional reference from page cache. */
c010d47f
ZY
2805 page_ref_unfreeze(page_tail,
2806 1 + ((!folio_test_anon(folio) || folio_test_swapcache(folio)) ?
2807 folio_nr_pages(new_folio) : 0));
605ca5ed 2808
b7542769
KW
2809 if (folio_test_young(folio))
2810 folio_set_young(new_folio);
2811 if (folio_test_idle(folio))
2812 folio_set_idle(new_folio);
e9b61f19 2813
c8253011 2814 folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio));
94723aaf
MH
2815
2816 /*
2817 * always add to the tail because some iterators expect new
2818 * pages to show after the currently processed elements - e.g.
2819 * migrate_pages
2820 */
e9b61f19 2821 lru_add_page_tail(head, page_tail, lruvec, list);
e9b61f19
KS
2822}
2823
baa355fd 2824static void __split_huge_page(struct page *page, struct list_head *list,
c010d47f 2825 pgoff_t end, unsigned int new_order)
e9b61f19 2826{
e809c3fe
MWO
2827 struct folio *folio = page_folio(page);
2828 struct page *head = &folio->page;
e9b61f19 2829 struct lruvec *lruvec;
4101196b
MWO
2830 struct address_space *swap_cache = NULL;
2831 unsigned long offset = 0;
509f0069 2832 int i, nr_dropped = 0;
c010d47f 2833 unsigned int new_nr = 1 << new_order;
502003bb
ZY
2834 int order = folio_order(folio);
2835 unsigned int nr = 1 << order;
e9b61f19 2836
e9b61f19 2837 /* complete memcg works before add pages to LRU */
c010d47f 2838 split_page_memcg(head, order, new_order);
e9b61f19 2839
07e09c48
DH
2840 if (folio_test_anon(folio) && folio_test_swapcache(folio)) {
2841 offset = swp_offset(folio->swap);
2842 swap_cache = swap_address_space(folio->swap);
4101196b
MWO
2843 xa_lock(&swap_cache->i_pages);
2844 }
2845
f0953a1b 2846 /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
e809c3fe 2847 lruvec = folio_lruvec_lock(folio);
b6769834 2848
eac96c3e
YS
2849 ClearPageHasHWPoisoned(head);
2850
c010d47f
ZY
2851 for (i = nr - new_nr; i >= new_nr; i -= new_nr) {
2852 __split_huge_page_tail(folio, i, lruvec, list, new_order);
d144bf62 2853 /* Some pages can be beyond EOF: drop them from page cache */
baa355fd 2854 if (head[i].index >= end) {
fb5c2029
MWO
2855 struct folio *tail = page_folio(head + i);
2856
435a7554 2857 if (shmem_mapping(folio->mapping))
509f0069 2858 nr_dropped++;
fb5c2029
MWO
2859 else if (folio_test_clear_dirty(tail))
2860 folio_account_cleaned(tail,
2861 inode_to_wb(folio->mapping->host));
2862 __filemap_remove_folio(tail, NULL);
2863 folio_put(tail);
4101196b 2864 } else if (!PageAnon(page)) {
435a7554 2865 __xa_store(&folio->mapping->i_pages, head[i].index,
4101196b
MWO
2866 head + i, 0);
2867 } else if (swap_cache) {
2868 __xa_store(&swap_cache->i_pages, offset + i,
2869 head + i, 0);
baa355fd
KS
2870 }
2871 }
e9b61f19 2872
c010d47f
ZY
2873 if (!new_order)
2874 ClearPageCompound(head);
2875 else {
2876 struct folio *new_folio = (struct folio *)head;
2877
2878 folio_set_order(new_folio, new_order);
2879 }
6168d0da 2880 unlock_page_lruvec(lruvec);
b6769834 2881 /* Caller disabled irqs, so they are still disabled here */
f7da677b 2882
c010d47f 2883 split_page_owner(head, order, new_order);
be25d1d4 2884 pgalloc_tag_split(head, 1 << order);
f7da677b 2885
baa355fd 2886 /* See comment in __split_huge_page_tail() */
435a7554 2887 if (folio_test_anon(folio)) {
aa5dc07f 2888 /* Additional pin to swap cache */
435a7554
MWO
2889 if (folio_test_swapcache(folio)) {
2890 folio_ref_add(folio, 1 + new_nr);
4101196b
MWO
2891 xa_unlock(&swap_cache->i_pages);
2892 } else {
435a7554 2893 folio_ref_inc(folio);
4101196b 2894 }
baa355fd 2895 } else {
aa5dc07f 2896 /* Additional pin to page cache */
435a7554
MWO
2897 folio_ref_add(folio, 1 + new_nr);
2898 xa_unlock(&folio->mapping->i_pages);
baa355fd 2899 }
b6769834 2900 local_irq_enable();
e9b61f19 2901
509f0069 2902 if (nr_dropped)
435a7554 2903 shmem_uncharge(folio->mapping->host, nr_dropped);
4eecb8b9 2904 remap_page(folio, nr);
e9b61f19 2905
c010d47f
ZY
2906 /*
2907 * set page to its compound_head when split to non order-0 pages, so
2908 * we can skip unlocking it below, since PG_locked is transferred to
2909 * the compound_head of the page and the caller will unlock it.
2910 */
2911 if (new_order)
2912 page = compound_head(page);
2913
2914 for (i = 0; i < nr; i += new_nr) {
e9b61f19 2915 struct page *subpage = head + i;
435a7554 2916 struct folio *new_folio = page_folio(subpage);
e9b61f19
KS
2917 if (subpage == page)
2918 continue;
435a7554 2919 folio_unlock(new_folio);
e9b61f19
KS
2920
2921 /*
2922 * Subpages may be freed if there wasn't any mapping left,
2923 * e.g. if add_to_swap() is running on an LRU page that
2924 * had its mapping zapped. Freeing these pages requires
2925 * taking the lru_lock, so we do the put_page of the
2926 * tail pages after the split is complete.
2927 */
0b175468 2928 free_page_and_swap_cache(subpage);
e9b61f19
KS
2929 }
2930}
2931
b8f593cd 2932/* Racy check whether the huge page can be split */
d4b4084a 2933bool can_split_folio(struct folio *folio, int *pextra_pins)
b8f593cd
HY
2934{
2935 int extra_pins;
2936
aa5dc07f 2937 /* Additional pins from page cache */
d4b4084a
MWO
2938 if (folio_test_anon(folio))
2939 extra_pins = folio_test_swapcache(folio) ?
2940 folio_nr_pages(folio) : 0;
b8f593cd 2941 else
d4b4084a 2942 extra_pins = folio_nr_pages(folio);
b8f593cd
HY
2943 if (pextra_pins)
2944 *pextra_pins = extra_pins;
d4b4084a 2945 return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1;
b8f593cd
HY
2946}
2947
e9b61f19 2948/*
a8353dc9
JH
2949 * This function splits a large folio into smaller folios of order @new_order.
2950 * @page can point to any page of the large folio to split. The split operation
2951 * does not change the position of @page.
c010d47f 2952 *
a8353dc9 2953 * Prerequisites:
e9b61f19 2954 *
a8353dc9
JH
2955 * 1) The caller must hold a reference on the @page's owning folio, also known
2956 * as the large folio.
2957 *
2958 * 2) The large folio must be locked.
2959 *
2960 * 3) The folio must not be pinned. Any unexpected folio references, including
2961 * GUP pins, will result in the folio not getting split; instead, the caller
d21f996b 2962 * will receive an -EAGAIN.
a8353dc9
JH
2963 *
2964 * 4) @new_order > 1, usually. Splitting to order-1 anonymous folios is not
2965 * supported for non-file-backed folios, because folio->_deferred_list, which
2966 * is used by partially mapped folios, is stored in subpage 2, but an order-1
2967 * folio only has subpages 0 and 1. File-backed order-1 folios are supported,
2968 * since they do not use _deferred_list.
2969 *
2970 * After splitting, the caller's folio reference will be transferred to @page,
2971 * resulting in a raised refcount of @page after this call. The other pages may
2972 * be freed if they are not mapped.
e9b61f19
KS
2973 *
2974 * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
2975 *
a8353dc9
JH
2976 * Pages in @new_order will inherit the mapping, flags, and so on from the
2977 * huge page.
e9b61f19 2978 *
a8353dc9 2979 * Returns 0 if the huge page was split successfully.
e9b61f19 2980 *
d21f996b
DH
2981 * Returns -EAGAIN if the folio has unexpected reference (e.g., GUP) or if
2982 * the folio was concurrently removed from the page cache.
2983 *
2984 * Returns -EBUSY when trying to split the huge zeropage, if the folio is
2985 * under writeback, if fs-specific folio metadata cannot currently be
2986 * released, or if some unexpected race happened (e.g., anon VMA disappeared,
2987 * truncation).
2988 *
2989 * Returns -EINVAL when trying to split to an order that is incompatible
2990 * with the folio. Splitting to order 0 is compatible with all folios.
e9b61f19 2991 */
c010d47f
ZY
2992int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
2993 unsigned int new_order)
e9b61f19 2994{
4eecb8b9 2995 struct folio *folio = page_folio(page);
f8baa6be 2996 struct deferred_split *ds_queue = get_deferred_split_queue(folio);
c010d47f
ZY
2997 /* reset xarray order to new order after split */
2998 XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
baa355fd
KS
2999 struct anon_vma *anon_vma = NULL;
3000 struct address_space *mapping = NULL;
835c3a25 3001 bool is_thp = folio_test_pmd_mappable(folio);
504e070d 3002 int extra_pins, ret;
006d3ff2 3003 pgoff_t end;
478d134e 3004 bool is_hzp;
e9b61f19 3005
3e9a13da
MWO
3006 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
3007 VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
e9b61f19 3008
1412ecb3
ZY
3009 if (new_order >= folio_order(folio))
3010 return -EINVAL;
3011
c010d47f
ZY
3012 /* Cannot split anonymous THP to order-1 */
3013 if (new_order == 1 && folio_test_anon(folio)) {
3014 VM_WARN_ONCE(1, "Cannot split to order-1 folio");
3015 return -EINVAL;
3016 }
3017
3018 if (new_order) {
3019 /* Only swapping a whole PMD-mapped folio is supported */
3020 if (folio_test_swapcache(folio))
3021 return -EINVAL;
3022 /* Split shmem folio to non-zero order not supported */
3023 if (shmem_mapping(folio->mapping)) {
3024 VM_WARN_ONCE(1,
3025 "Cannot split shmem folio to non-0 order");
3026 return -EINVAL;
3027 }
3028 /* No split if the file system does not support large folio */
3029 if (!mapping_large_folio_support(folio->mapping)) {
3030 VM_WARN_ONCE(1,
3031 "Cannot split file folio to non-0 order");
3032 return -EINVAL;
3033 }
3034 }
3035
3036
5beaee54 3037 is_hzp = is_huge_zero_folio(folio);
4737edbb
NH
3038 if (is_hzp) {
3039 pr_warn_ratelimited("Called split_huge_page for huge zero page\n");
478d134e 3040 return -EBUSY;
4737edbb 3041 }
478d134e 3042
3e9a13da 3043 if (folio_test_writeback(folio))
59807685
HY
3044 return -EBUSY;
3045
3e9a13da 3046 if (folio_test_anon(folio)) {
baa355fd 3047 /*
c1e8d7c6 3048 * The caller does not necessarily hold an mmap_lock that would
baa355fd
KS
3049 * prevent the anon_vma disappearing, so we first take a
3050 * reference to it and then lock the anon_vma for write. This
2f031c6f 3051 * is similar to folio_lock_anon_vma_read except the write lock
baa355fd
KS
3052 * is taken to serialise against parallel split or collapse
3053 * operations.
3054 */
29eea9b5 3055 anon_vma = folio_get_anon_vma(folio);
baa355fd
KS
3056 if (!anon_vma) {
3057 ret = -EBUSY;
3058 goto out;
3059 }
006d3ff2 3060 end = -1;
baa355fd
KS
3061 mapping = NULL;
3062 anon_vma_lock_write(anon_vma);
3063 } else {
6a3edd29
YF
3064 gfp_t gfp;
3065
3e9a13da 3066 mapping = folio->mapping;
baa355fd
KS
3067
3068 /* Truncated ? */
3069 if (!mapping) {
3070 ret = -EBUSY;
3071 goto out;
3072 }
3073
6a3edd29
YF
3074 gfp = current_gfp_context(mapping_gfp_mask(mapping) &
3075 GFP_RECLAIM_MASK);
3076
0201ebf2 3077 if (!filemap_release_folio(folio, gfp)) {
6a3edd29
YF
3078 ret = -EBUSY;
3079 goto out;
3080 }
3081
3e9a13da 3082 xas_split_alloc(&xas, folio, folio_order(folio), gfp);
6b24ca4a
MWO
3083 if (xas_error(&xas)) {
3084 ret = xas_error(&xas);
3085 goto out;
3086 }
3087
baa355fd
KS
3088 anon_vma = NULL;
3089 i_mmap_lock_read(mapping);
006d3ff2
HD
3090
3091 /*
3092 *__split_huge_page() may need to trim off pages beyond EOF:
3093 * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
3094 * which cannot be nested inside the page tree lock. So note
3095 * end now: i_size itself may be changed at any moment, but
3e9a13da 3096 * folio lock is good enough to serialize the trimming.
006d3ff2
HD
3097 */
3098 end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
d144bf62
HD
3099 if (shmem_mapping(mapping))
3100 end = shmem_fallocend(mapping->host, end);
e9b61f19 3101 }
e9b61f19
KS
3102
3103 /*
684555aa 3104 * Racy check if we can split the page, before unmap_folio() will
e9b61f19
KS
3105 * split PMDs
3106 */
d4b4084a 3107 if (!can_split_folio(folio, &extra_pins)) {
fd4a7ac3 3108 ret = -EAGAIN;
e9b61f19
KS
3109 goto out_unlock;
3110 }
3111
684555aa 3112 unmap_folio(folio);
e9b61f19 3113
b6769834
AS
3114 /* block interrupt reentry in xa_lock and spinlock */
3115 local_irq_disable();
baa355fd 3116 if (mapping) {
baa355fd 3117 /*
3e9a13da
MWO
3118 * Check if the folio is present in page cache.
3119 * We assume all tail are present too, if folio is there.
baa355fd 3120 */
6b24ca4a
MWO
3121 xas_lock(&xas);
3122 xas_reset(&xas);
3e9a13da 3123 if (xas_load(&xas) != folio)
baa355fd
KS
3124 goto fail;
3125 }
3126
0139aa7b 3127 /* Prevent deferred_split_scan() touching ->_refcount */
364c1eeb 3128 spin_lock(&ds_queue->split_queue_lock);
3e9a13da 3129 if (folio_ref_freeze(folio, 1 + extra_pins)) {
8897277a
MWO
3130 if (folio_order(folio) > 1 &&
3131 !list_empty(&folio->_deferred_list)) {
364c1eeb 3132 ds_queue->split_queue_len--;
c010d47f
ZY
3133 /*
3134 * Reinitialize page_deferred_list after removing the
3135 * page from the split_queue, otherwise a subsequent
3136 * split will see list corruption when checking the
3137 * page_deferred_list.
3138 */
3139 list_del_init(&folio->_deferred_list);
9a982250 3140 }
afb97172 3141 spin_unlock(&ds_queue->split_queue_lock);
06d3eff6 3142 if (mapping) {
3e9a13da 3143 int nr = folio_nr_pages(folio);
bf9ecead 3144
3e9a13da 3145 xas_split(&xas, folio, folio_order(folio));
c010d47f
ZY
3146 if (folio_test_pmd_mappable(folio) &&
3147 new_order < HPAGE_PMD_ORDER) {
a48d5bdc
SR
3148 if (folio_test_swapbacked(folio)) {
3149 __lruvec_stat_mod_folio(folio,
3150 NR_SHMEM_THPS, -nr);
3151 } else {
3152 __lruvec_stat_mod_folio(folio,
3153 NR_FILE_THPS, -nr);
3154 filemap_nr_thps_dec(mapping);
3155 }
1ca7554d 3156 }
06d3eff6
KS
3157 }
3158
c010d47f 3159 __split_huge_page(page, list, end, new_order);
c4f9c701 3160 ret = 0;
e9b61f19 3161 } else {
364c1eeb 3162 spin_unlock(&ds_queue->split_queue_lock);
504e070d
YS
3163fail:
3164 if (mapping)
6b24ca4a 3165 xas_unlock(&xas);
b6769834 3166 local_irq_enable();
4eecb8b9 3167 remap_page(folio, folio_nr_pages(folio));
fd4a7ac3 3168 ret = -EAGAIN;
e9b61f19
KS
3169 }
3170
3171out_unlock:
baa355fd
KS
3172 if (anon_vma) {
3173 anon_vma_unlock_write(anon_vma);
3174 put_anon_vma(anon_vma);
3175 }
3176 if (mapping)
3177 i_mmap_unlock_read(mapping);
e9b61f19 3178out:
69a37a8b 3179 xas_destroy(&xas);
835c3a25
BW
3180 if (is_thp)
3181 count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
e9b61f19
KS
3182 return ret;
3183}
9a982250 3184
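A hedged editorial sketch (not part of huge_memory.c) of the caller contract documented above: hold a folio reference, lock the folio, then request the split. split_huge_page() is the order-0 convenience wrapper; example_try_split() is hypothetical and does not distinguish the individual error codes.

static int example_try_split(struct folio *folio)
{
	int err;

	/* Prerequisite 1: the caller already holds a reference on the folio. */
	if (!folio_trylock(folio))
		return -EAGAIN;
	err = split_huge_page(&folio->page);
	folio_unlock(folio);
	return err;
}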
8dc4a8f1 3185void folio_undo_large_rmappable(struct folio *folio)
9a982250 3186{
8dc4a8f1 3187 struct deferred_split *ds_queue;
3188 unsigned long flags;
3189
3190 if (folio_order(folio) <= 1)
3191 return;
3192
3193 /*
3194 * At this point, there is no one trying to add the folio to
3195 * deferred_list. If folio is not in deferred_list, it's safe
3196 * to check without acquiring the split_queue_lock.
3197 */
3198 if (data_race(list_empty(&folio->_deferred_list)))
3199 return;
3200
3201 ds_queue = get_deferred_split_queue(folio);
3202 spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
3203 if (!list_empty(&folio->_deferred_list)) {
3204 ds_queue->split_queue_len--;
9bcef597 3205 list_del_init(&folio->_deferred_list);
9a982250 3206 }
8dc4a8f1 3207 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
3208}
3209
f158ed61 3210void deferred_split_folio(struct folio *folio)
9a982250 3211{
f8baa6be 3212 struct deferred_split *ds_queue = get_deferred_split_queue(folio);
87eaceb3 3213#ifdef CONFIG_MEMCG
8991de90 3214 struct mem_cgroup *memcg = folio_memcg(folio);
87eaceb3 3215#endif
3216 unsigned long flags;
3217
3218 /*
3219 * Order 1 folios have no space for a deferred list, but we also
3220 * won't waste much memory by not adding them to the deferred list.
3221 */
3222 if (folio_order(folio) <= 1)
3223 return;
9a982250 3224
3225 /*
3226	 * The try_to_unmap() in the page reclaim path might reach here too;
3227	 * that may cause a race condition which corrupts the deferred split queue.
8991de90 3228 * And, if page reclaim is already handling the same folio, it is
3229 * unnecessary to handle it again in shrinker.
3230 *
3231 * Check the swapcache flag to determine if the folio is being
3232 * handled by page reclaim since THP swap would add the folio into
3233 * swap cache before calling try_to_unmap().
3234 */
8991de90 3235 if (folio_test_swapcache(folio))
3236 return;
3237
8991de90 3238 if (!list_empty(&folio->_deferred_list))
3239 return;
3240
364c1eeb 3241 spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
8991de90 3242 if (list_empty(&folio->_deferred_list)) {
3243 if (folio_test_pmd_mappable(folio))
3244 count_vm_event(THP_DEFERRED_SPLIT_PAGE);
8991de90 3245 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
364c1eeb 3246 ds_queue->split_queue_len++;
3247#ifdef CONFIG_MEMCG
3248 if (memcg)
8991de90 3249 set_shrinker_bit(memcg, folio_nid(folio),
54d91729 3250 deferred_split_shrinker->id);
87eaceb3 3251#endif
9a982250 3252 }
364c1eeb 3253 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
3254}
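/*
 * For reference, a sketch of the queue that deferred_split_folio() and the
 * shrinker callbacks below operate on, reconstructed from the fields used
 * in this file (the authoritative definition lives in the core mm headers):
 *
 *	struct deferred_split {
 *		spinlock_t split_queue_lock;
 *		struct list_head split_queue;
 *		unsigned long split_queue_len;
 *	};
 *
 * There is one queue per NUMA node (pglist_data), and with CONFIG_MEMCG an
 * additional one per memory cgroup, which is why deferred_split_count() and
 * deferred_split_scan() below prefer sc->memcg's queue when it is set.
 */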
3255
3256static unsigned long deferred_split_count(struct shrinker *shrink,
3257 struct shrink_control *sc)
3258{
a3d0a918 3259 struct pglist_data *pgdata = NODE_DATA(sc->nid);
364c1eeb 3260 struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
3261
3262#ifdef CONFIG_MEMCG
3263 if (sc->memcg)
3264 ds_queue = &sc->memcg->deferred_split_queue;
3265#endif
364c1eeb 3266 return READ_ONCE(ds_queue->split_queue_len);
3267}
3268
3269static unsigned long deferred_split_scan(struct shrinker *shrink,
3270 struct shrink_control *sc)
3271{
a3d0a918 3272 struct pglist_data *pgdata = NODE_DATA(sc->nid);
364c1eeb 3273 struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
9a982250 3274 unsigned long flags;
3275 LIST_HEAD(list);
3276 struct folio *folio, *next;
3277 int split = 0;
3278
3279#ifdef CONFIG_MEMCG
3280 if (sc->memcg)
3281 ds_queue = &sc->memcg->deferred_split_queue;
3282#endif
3283
364c1eeb 3284 spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
9a982250 3285 /* Take pin on all head pages to avoid freeing them under us */
3286 list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
3287 _deferred_list) {
3288 if (folio_try_get(folio)) {
3289 list_move(&folio->_deferred_list, &list);
e3ae1953 3290 } else {
3291 /* We lost race with folio_put() */
3292 list_del_init(&folio->_deferred_list);
364c1eeb 3293 ds_queue->split_queue_len--;
9a982250 3294 }
3295 if (!--sc->nr_to_scan)
3296 break;
9a982250 3297 }
364c1eeb 3298 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
9a982250 3299
3300 list_for_each_entry_safe(folio, next, &list, _deferred_list) {
3301 if (!folio_trylock(folio))
fa41b900 3302 goto next;
9a982250 3303 /* split_huge_page() removes page from list on success */
4375a553 3304 if (!split_folio(folio))
9a982250 3305 split++;
4375a553 3306 folio_unlock(folio);
fa41b900 3307next:
4375a553 3308 folio_put(folio);
3309 }
3310
3311 spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
3312 list_splice_tail(&list, &ds_queue->split_queue);
3313 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
9a982250 3314
3315 /*
3316	 * Stop the shrinker if we didn't split any page and the queue is empty.
3317 * This can happen if pages were freed under us.
3318 */
364c1eeb 3319 if (!split && list_empty(&ds_queue->split_queue))
3320 return SHRINK_STOP;
3321 return split;
3322}
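/*
 * A sketch of how the two callbacks above are typically wired up with the
 * shrinker API. The actual registration happens elsewhere in this file at
 * init time; the flags and the name string below are assumptions, not a
 * copy of that code.
 */
#if 0	/* example only */
static int __init example_deferred_split_shrinker_init(void)
{
	deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
						 SHRINKER_MEMCG_AWARE,
						 "thp-deferred_split");
	if (!deferred_split_shrinker)
		return -ENOMEM;

	deferred_split_shrinker->count_objects = deferred_split_count;
	deferred_split_shrinker->scan_objects = deferred_split_scan;
	shrinker_register(deferred_split_shrinker);

	return 0;
}
#endif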
3323
49071d43 3324#ifdef CONFIG_DEBUG_FS
fa6c0231 3325static void split_huge_pages_all(void)
3326{
3327 struct zone *zone;
3328 struct page *page;
630e7c5e 3329 struct folio *folio;
3330 unsigned long pfn, max_zone_pfn;
3331 unsigned long total = 0, split = 0;
3332
fa6c0231 3333 pr_debug("Split all THPs\n");
3334 for_each_zone(zone) {
3335 if (!managed_zone(zone))
3336 continue;
3337 max_zone_pfn = zone_end_pfn(zone);
3338 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
a17206da 3339 int nr_pages;
49071d43 3340
2b7aa91b 3341 page = pfn_to_online_page(pfn);
3342 if (!page || PageTail(page))
3343 continue;
3344 folio = page_folio(page);
3345 if (!folio_try_get(folio))
3346 continue;
3347
630e7c5e 3348 if (unlikely(page_folio(page) != folio))
3349 goto next;
3350
630e7c5e 3351 if (zone != folio_zone(folio))
3352 goto next;
3353
3354 if (!folio_test_large(folio)
3355 || folio_test_hugetlb(folio)
3356 || !folio_test_lru(folio))
3357 goto next;
3358
3359 total++;
3360 folio_lock(folio);
3361 nr_pages = folio_nr_pages(folio);
3362 if (!split_folio(folio))
49071d43 3363 split++;
a17206da 3364 pfn += nr_pages - 1;
630e7c5e 3365 folio_unlock(folio);
49071d43 3366next:
630e7c5e 3367 folio_put(folio);
fa6c0231 3368 cond_resched();
3369 }
3370 }
3371
3372 pr_debug("%lu of %lu THP split\n", split, total);
3373}
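/*
 * split_huge_pages_all() is reached from the debugfs write handler below
 * when a bare "1" is written to the control file, e.g. (assuming debugfs is
 * mounted in the usual place):
 *
 *	echo 1 > /sys/kernel/debug/split_huge_pages
 *
 * which walks every managed zone and tries to split each large LRU folio.
 */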
49071d43 3374
3375static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
3376{
3377 return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) ||
3378 is_vm_hugetlb_page(vma);
3379}
3380
3381static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
fc4d1823 3382 unsigned long vaddr_end, unsigned int new_order)
3383{
3384 int ret = 0;
3385 struct task_struct *task;
3386 struct mm_struct *mm;
3387 unsigned long total = 0, split = 0;
3388 unsigned long addr;
3389
3390 vaddr_start &= PAGE_MASK;
3391 vaddr_end &= PAGE_MASK;
3392
3393 /* Find the task_struct from pid */
3394 rcu_read_lock();
3395 task = find_task_by_vpid(pid);
3396 if (!task) {
3397 rcu_read_unlock();
3398 ret = -ESRCH;
3399 goto out;
3400 }
3401 get_task_struct(task);
3402 rcu_read_unlock();
3403
3404 /* Find the mm_struct */
3405 mm = get_task_mm(task);
3406 put_task_struct(task);
3407
3408 if (!mm) {
3409 ret = -EINVAL;
3410 goto out;
3411 }
3412
3413 pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
3414 pid, vaddr_start, vaddr_end);
3415
3416 mmap_read_lock(mm);
3417 /*
3418 * always increase addr by PAGE_SIZE, since we could have a PTE page
3419 * table filled with PTE-mapped THPs, each of which is distinct.
3420 */
3421 for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
74ba2b38 3422 struct vm_area_struct *vma = vma_lookup(mm, addr);
fa6c0231 3423 struct page *page;
a644b0ab 3424 struct folio *folio;
fa6c0231 3425
74ba2b38 3426 if (!vma)
3427 break;
3428
3429 /* skip special VMA and hugetlb VMA */
3430 if (vma_not_suitable_for_thp_split(vma)) {
3431 addr = vma->vm_end;
3432 continue;
3433 }
3434
3435 /* FOLL_DUMP to ignore special (like zero) pages */
87d2762e 3436 page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
fa6c0231 3437
f7091ed6 3438 if (IS_ERR_OR_NULL(page))
3439 continue;
3440
3441 folio = page_folio(page);
3442 if (!is_transparent_hugepage(folio))
3443 goto next;
3444
3445 if (new_order >= folio_order(folio))
3446 goto next;
3447
fa6c0231 3448 total++;
3449 /*
3450 * For folios with private, split_huge_page_to_list_to_order()
3451 * will try to drop it before split and then check if the folio
3452 * can be split or not. So skip the check here.
3453 */
3454 if (!folio_test_private(folio) &&
3455 !can_split_folio(folio, NULL))
3456 goto next;
3457
a644b0ab 3458 if (!folio_trylock(folio))
3459 goto next;
3460
fc4d1823 3461 if (!split_folio_to_order(folio, new_order))
3462 split++;
3463
a644b0ab 3464 folio_unlock(folio);
fa6c0231 3465next:
a644b0ab 3466 folio_put(folio);
3467 cond_resched();
3468 }
3469 mmap_read_unlock(mm);
3470 mmput(mm);
3471
3472 pr_debug("%lu of %lu THP split\n", split, total);
3473
3474out:
3475 return ret;
49071d43 3476}
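/*
 * An illustrative userspace sketch (not part of this file) of driving the
 * per-process interface above. The input format follows the sscanf() in
 * split_huge_pages_write() below, "<pid>,0x<vaddr_start>,0x<vaddr_end>[,<new_order>]";
 * the /sys/kernel/debug mount point and the helper name are assumptions.
 */
#if 0	/* example only, userspace code */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int split_thp_range(int pid, unsigned long start, unsigned long end)
{
	char buf[96];
	int fd, len;
	ssize_t written;

	fd = open("/sys/kernel/debug/split_huge_pages", O_WRONLY);
	if (fd < 0)
		return -1;
	len = snprintf(buf, sizeof(buf), "%d,0x%lx,0x%lx", pid, start, end);
	written = write(fd, buf, len);
	close(fd);

	return written == len ? 0 : -1;
}
#endif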
fa6c0231 3477
fbe37501 3478static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
fc4d1823 3479 pgoff_t off_end, unsigned int new_order)
3480{
3481 struct filename *file;
3482 struct file *candidate;
3483 struct address_space *mapping;
3484 int ret = -EINVAL;
3485 pgoff_t index;
3486 int nr_pages = 1;
3487 unsigned long total = 0, split = 0;
3488
3489 file = getname_kernel(file_path);
3490 if (IS_ERR(file))
3491 return ret;
3492
3493 candidate = file_open_name(file, O_RDONLY, 0);
3494 if (IS_ERR(candidate))
3495 goto out;
3496
3497 pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
3498 file_path, off_start, off_end);
3499
3500 mapping = candidate->f_mapping;
3501
3502 for (index = off_start; index < off_end; index += nr_pages) {
1fb130b2 3503 struct folio *folio = filemap_get_folio(mapping, index);
3504
3505 nr_pages = 1;
66dabbb6 3506 if (IS_ERR(folio))
3507 continue;
3508
9ee2c086 3509 if (!folio_test_large(folio))
3510 goto next;
3511
3512 total++;
9ee2c086 3513 nr_pages = folio_nr_pages(folio);
fbe37501 3514
3515 if (new_order >= folio_order(folio))
3516 goto next;
3517
9ee2c086 3518 if (!folio_trylock(folio))
3519 goto next;
3520
fc4d1823 3521 if (!split_folio_to_order(folio, new_order))
3522 split++;
3523
9ee2c086 3524 folio_unlock(folio);
fbe37501 3525next:
9ee2c086 3526 folio_put(folio);
3527 cond_resched();
3528 }
3529
3530 filp_close(candidate, NULL);
3531 ret = 0;
3532
3533 pr_debug("%lu of %lu file-backed THP split\n", split, total);
3534out:
3535 putname(file);
3536 return ret;
3537}
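/*
 * The file-backed variant above is selected by the write handler below when
 * the input starts with '/'. The offsets are page-cache indices (pgoff_t),
 * not byte offsets; the path and offsets below are made up for illustration:
 *
 *	echo '/mnt/data/somefile,0x0,0x200,0' > /sys/kernel/debug/split_huge_pages
 *
 * This asks for file-backed THPs covering the first 0x200 pages of the file
 * to be split, optionally to the given new order (here 0, i.e. base pages).
 */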
3538
3539#define MAX_INPUT_BUF_SZ 255
3540
3541static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
3542 size_t count, loff_t *ppops)
3543{
3544 static DEFINE_MUTEX(split_debug_mutex);
3545 ssize_t ret;
3546 /*
3547 * hold pid, start_vaddr, end_vaddr, new_order or
3548 * file_path, off_start, off_end, new_order
3549 */
fbe37501 3550 char input_buf[MAX_INPUT_BUF_SZ];
3551 int pid;
3552 unsigned long vaddr_start, vaddr_end;
fc4d1823 3553 unsigned int new_order = 0;
3554
3555 ret = mutex_lock_interruptible(&split_debug_mutex);
3556 if (ret)
3557 return ret;
3558
3559 ret = -EFAULT;
3560
3561 memset(input_buf, 0, MAX_INPUT_BUF_SZ);
3562 if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ)))
3563 goto out;
3564
3565 input_buf[MAX_INPUT_BUF_SZ - 1] = '\0';
3566
3567 if (input_buf[0] == '/') {
3568 char *tok;
3569 char *buf = input_buf;
3570 char file_path[MAX_INPUT_BUF_SZ];
3571 pgoff_t off_start = 0, off_end = 0;
3572 size_t input_len = strlen(input_buf);
3573
3574 tok = strsep(&buf, ",");
3575 if (tok) {
1212e00c 3576 strcpy(file_path, tok);
3577 } else {
3578 ret = -EINVAL;
3579 goto out;
3580 }
3581
3582 ret = sscanf(buf, "0x%lx,0x%lx,%d", &off_start, &off_end, &new_order);
3583 if (ret != 2 && ret != 3) {
3584 ret = -EINVAL;
3585 goto out;
3586 }
fc4d1823 3587 ret = split_huge_pages_in_file(file_path, off_start, off_end, new_order);
3588 if (!ret)
3589 ret = input_len;
3590
3591 goto out;
3592 }
3593
fc4d1823 3594 ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d", &pid, &vaddr_start, &vaddr_end, &new_order);
3595 if (ret == 1 && pid == 1) {
3596 split_huge_pages_all();
3597 ret = strlen(input_buf);
3598 goto out;
fc4d1823 3599 } else if (ret != 3 && ret != 4) {
3600 ret = -EINVAL;
3601 goto out;
3602 }
3603
fc4d1823 3604 ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order);
3605 if (!ret)
3606 ret = strlen(input_buf);
3607out:
3608 mutex_unlock(&split_debug_mutex);
3609 return ret;
3610
3611}
3612
3613static const struct file_operations split_huge_pages_fops = {
3614 .owner = THIS_MODULE,
3615 .write = split_huge_pages_write,
3616 .llseek = no_llseek,
3617};
3618
3619static int __init split_huge_pages_debugfs(void)
3620{
3621 debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
3622 &split_huge_pages_fops);
3623 return 0;
3624}
3625late_initcall(split_huge_pages_debugfs);
3626#endif
3627
3628#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
7f5abe60 3629int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
3630 struct page *page)
3631{
a8e61d58 3632 struct folio *folio = page_folio(page);
3633 struct vm_area_struct *vma = pvmw->vma;
3634 struct mm_struct *mm = vma->vm_mm;
3635 unsigned long address = pvmw->address;
6c287605 3636 bool anon_exclusive;
3637 pmd_t pmdval;
3638 swp_entry_t entry;
ab6e3d09 3639 pmd_t pmdswp;
3640
3641 if (!(pvmw->pmd && !pvmw->pte))
7f5abe60 3642 return 0;
616b8371 3643
616b8371 3644 flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
8a8683ad 3645 pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
6c287605 3646
e3b4b137 3647 /* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */
a8e61d58 3648 anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page);
e3b4b137 3649 if (anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) {
6c287605 3650 set_pmd_at(mm, address, pvmw->pmd, pmdval);
7f5abe60 3651 return -EBUSY;
3652 }
3653
616b8371 3654 if (pmd_dirty(pmdval))
db44c658 3655 folio_mark_dirty(folio);
3656 if (pmd_write(pmdval))
3657 entry = make_writable_migration_entry(page_to_pfn(page));
3658 else if (anon_exclusive)
3659 entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
3660 else
3661 entry = make_readable_migration_entry(page_to_pfn(page));
3662 if (pmd_young(pmdval))
3663 entry = make_migration_entry_young(entry);
3664 if (pmd_dirty(pmdval))
3665 entry = make_migration_entry_dirty(entry);
3666 pmdswp = swp_entry_to_pmd(entry);
3667 if (pmd_soft_dirty(pmdval))
3668 pmdswp = pmd_swp_mksoft_dirty(pmdswp);
3669 if (pmd_uffd_wp(pmdval))
3670 pmdswp = pmd_swp_mkuffd_wp(pmdswp);
ab6e3d09 3671 set_pmd_at(mm, address, pvmw->pmd, pmdswp);
3672 folio_remove_rmap_pmd(folio, page, vma);
3673 folio_put(folio);
283fd6fe 3674 trace_set_migration_pmd(address, pmd_val(pmdswp));
3675
3676 return 0;
3677}
3678
3679void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
3680{
14d85a6e 3681 struct folio *folio = page_folio(new);
3682 struct vm_area_struct *vma = pvmw->vma;
3683 struct mm_struct *mm = vma->vm_mm;
3684 unsigned long address = pvmw->address;
4fba8f2a 3685 unsigned long haddr = address & HPAGE_PMD_MASK;
3686 pmd_t pmde;
3687 swp_entry_t entry;
3688
3689 if (!(pvmw->pmd && !pvmw->pte))
3690 return;
3691
3692 entry = pmd_to_swp_entry(*pvmw->pmd);
14d85a6e 3693 folio_get(folio);
2e346877 3694 pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
3695 if (pmd_swp_soft_dirty(*pvmw->pmd))
3696 pmde = pmd_mksoft_dirty(pmde);
3c811f78 3697 if (is_writable_migration_entry(entry))
161e393c 3698 pmde = pmd_mkwrite(pmde, vma);
8f34f1ea 3699 if (pmd_swp_uffd_wp(*pvmw->pmd))
f1eb1bac 3700 pmde = pmd_mkuffd_wp(pmde);
3701 if (!is_migration_entry_young(entry))
3702 pmde = pmd_mkold(pmde);
3703 /* NOTE: this may contain setting soft-dirty on some archs */
14d85a6e 3704 if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
2e346877 3705 pmde = pmd_mkdirty(pmde);
616b8371 3706
14d85a6e 3707 if (folio_test_anon(folio)) {
395db7b1 3708 rmap_t rmap_flags = RMAP_NONE;
3709
3710 if (!is_readable_migration_entry(entry))
3711 rmap_flags |= RMAP_EXCLUSIVE;
3712
395db7b1 3713 folio_add_anon_rmap_pmd(folio, new, vma, haddr, rmap_flags);
6c287605 3714 } else {
14d85a6e 3715 folio_add_file_rmap_pmd(folio, new, vma);
6c287605 3716 }
14d85a6e 3717 VM_BUG_ON(pmd_write(pmde) && folio_test_anon(folio) && !PageAnonExclusive(new));
4fba8f2a 3718 set_pmd_at(mm, haddr, pvmw->pmd, pmde);
3719
3720 /* No need to invalidate - it was non-present before */
616b8371 3721 update_mmu_cache_pmd(vma, address, pvmw->pmd);
283fd6fe 3722 trace_remove_migration_pmd(address, pmd_val(pmde));
3723}
3724#endif