// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 2009  Red Hat, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/swapops.h>
#include <linux/backing-dev.h>
#include <linux/dax.h>
#include <linux/mm_types.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/pfn_t.h>
#include <linux/mman.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/page_owner.h>
#include <linux/sched/sysctl.h>
#include <linux/memory-tiers.h>
#include <linux/compat.h>
#include <linux/pgalloc_tag.h>
#include <linux/pagewalk.h>

#include <asm/pgalloc.h>

#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>
/*
 * By default, transparent hugepage support is disabled in order to avoid
 * risking an increased memory footprint for applications that are not
 * guaranteed to benefit from it. When transparent hugepage support is
 * enabled, it is for all mappings, and khugepaged scans all mappings.
 * Defrag is invoked by khugepaged hugepage allocations and by page faults
 * for all hugepage allocations.
 */
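/*
 * Illustrative userspace sketch (assuming the usual sysfs layout described
 * in Documentation/admin-guide/mm/transhuge.rst): the global policy above
 * is selected with, e.g.,
 *
 *	echo madvise > /sys/kernel/mm/transparent_hugepage/enabled
 *
 * which restricts THP to VMAs that have been marked with MADV_HUGEPAGE.
 */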
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
static struct shrinker *deferred_split_shrinker;
static unsigned long deferred_split_count(struct shrinker *shrink,
					  struct shrink_control *sc);
static unsigned long deferred_split_scan(struct shrinker *shrink,
					 struct shrink_control *sc);
static bool split_underused_thp = true;

static atomic_t huge_zero_refcount;
struct folio *huge_zero_folio __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;
unsigned long huge_anon_orders_always __read_mostly;
unsigned long huge_anon_orders_madvise __read_mostly;
unsigned long huge_anon_orders_inherit __read_mostly;
static bool anon_orders_configured __initdata;
static inline bool file_thp_enabled(struct vm_area_struct *vma)
{
	struct inode *inode;

	if (!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS))
		return false;

	if (!vma->vm_file)
		return false;

	inode = file_inode(vma->vm_file);

	return !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
}
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
					 unsigned long vm_flags,
					 unsigned long tva_flags,
					 unsigned long orders)
{
	bool smaps = tva_flags & TVA_SMAPS;
	bool in_pf = tva_flags & TVA_IN_PF;
	bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
	unsigned long supported_orders;

	/* Check the intersection of requested and supported orders. */
	if (vma_is_anonymous(vma))
		supported_orders = THP_ORDERS_ALL_ANON;
	else if (vma_is_special_huge(vma))
		supported_orders = THP_ORDERS_ALL_SPECIAL;
	else
		supported_orders = THP_ORDERS_ALL_FILE_DEFAULT;

	orders &= supported_orders;
	if (!orders)
		return 0;

	if (!vma->vm_mm)		/* vdso */
		return 0;

	if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags))
		return 0;

	/* khugepaged doesn't collapse DAX vma, but page fault is fine. */
	if (vma_is_dax(vma))
		return in_pf ? orders : 0;

	/*
	 * khugepaged special VMA and hugetlb VMA.
	 * Must be checked after dax since some dax mappings may have
	 * VM_MIXEDMAP set.
	 */
	if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED))
		return 0;

	/*
	 * Check alignment for file vma and size for both file and anon vma by
	 * filtering out the unsuitable orders.
	 *
	 * Skip the check for page fault. Huge fault does the check in fault
	 * handlers.
	 */
	if (!in_pf) {
		int order = highest_order(orders);
		unsigned long addr;

		while (orders) {
			addr = vma->vm_end - (PAGE_SIZE << order);
			if (thp_vma_suitable_order(vma, addr, order))
				break;
			order = next_order(&orders, order);
		}

		if (!orders)
			return 0;
	}

	/*
	 * Enabled via shmem mount options or sysfs settings.
	 * Must be done before hugepage flags check since shmem has its
	 * own flags.
	 */
	if (!in_pf && shmem_file(vma->vm_file))
		return shmem_allowable_huge_orders(file_inode(vma->vm_file),
						   vma, vma->vm_pgoff, 0,
						   !enforce_sysfs);

	if (!vma_is_anonymous(vma)) {
		/*
		 * Enforce sysfs THP requirements as necessary. Anonymous vmas
		 * were already handled in thp_vma_allowable_orders().
		 */
		if (enforce_sysfs &&
		    (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
						    !hugepage_global_always())))
			return 0;

		/*
		 * Trust that ->huge_fault() handlers know what they are doing
		 * in fault path.
		 */
		if (((in_pf || smaps)) && vma->vm_ops->huge_fault)
			return orders;
		/* Only regular file is valid in collapse path */
		if (((!in_pf || smaps)) && file_thp_enabled(vma))
			return orders;

		if (!in_pf && !smaps)
			return 0;
	}

	if (vma_is_temporary_stack(vma))
		return 0;

	/*
	 * THPeligible bit of smaps should show 1 for proper VMAs even
	 * though anon_vma is not initialized yet.
	 *
	 * Allow page fault since anon_vma may be not initialized until
	 * the first page fault.
	 */
	if (!vma->anon_vma)
		return (smaps || in_pf) ? orders : 0;

	return orders;
}
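/*
 * A typical caller goes through the thp_vma_allowable_orders() wrapper; a
 * minimal, illustrative page-fault-path query would look like:
 *
 *	unsigned long orders = thp_vma_allowable_orders(vma, vma->vm_flags,
 *				TVA_IN_PF | TVA_ENFORCE_SYSFS,
 *				BIT(PMD_ORDER));
 *
 * A non-zero result means at least one of the requested orders may be used
 * for this VMA.
 */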
static bool get_huge_zero_page(void)
{
	struct folio *zero_folio;
retry:
	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
		return true;

	zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
			HPAGE_PMD_ORDER);
	if (!zero_folio) {
		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
		return false;
	}
	/* Ensure zero folio won't have large_rmappable flag set. */
	folio_clear_large_rmappable(zero_folio);
	preempt_disable();
	if (cmpxchg(&huge_zero_folio, NULL, zero_folio)) {
		preempt_enable();
		folio_put(zero_folio);
		goto retry;
	}
	WRITE_ONCE(huge_zero_pfn, folio_pfn(zero_folio));

	/* We take additional reference here. It will be put back by shrinker */
	atomic_set(&huge_zero_refcount, 2);
	preempt_enable();
	count_vm_event(THP_ZERO_PAGE_ALLOC);
	return true;
}
static void put_huge_zero_page(void)
{
	/*
	 * Counter should never go to zero here. Only shrinker can put
	 * last reference.
	 */
	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}
struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		return READ_ONCE(huge_zero_folio);

	if (!get_huge_zero_page())
		return NULL;

	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();

	return READ_ONCE(huge_zero_folio);
}

void mm_put_huge_zero_folio(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();
}
static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
					struct shrink_control *sc)
{
	/* we can free zero page only if last reference remains */
	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
}

static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
				       struct shrink_control *sc)
{
	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
		struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
		BUG_ON(zero_folio == NULL);
		WRITE_ONCE(huge_zero_pfn, ~0UL);
		folio_put(zero_folio);
		return HPAGE_PMD_NR;
	}

	return 0;
}

static struct shrinker *huge_zero_page_shrinker;
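/*
 * Summary sketch of the lifetime scheme above (descriptive only): the first
 * user allocates the huge zero folio and sets the refcount to 2 - one
 * reference for the mms tracked via MMF_HUGE_ZERO_PAGE and one extra
 * reference that only the shrinker drops. shrink_huge_zero_page_scan() frees
 * the folio only when the mm users are gone, i.e. when the count has fallen
 * back to 1 and the atomic_cmpxchg(&huge_zero_refcount, 1, 0) succeeds.
 */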
#ifdef CONFIG_SYSFS

static ssize_t enabled_show(struct kobject *kobj,
			    struct kobj_attribute *attr, char *buf)
{
	const char *output;

	if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
		output = "[always] madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always [madvise] never";
	else
		output = "always madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}

static ssize_t enabled_store(struct kobject *kobj,
			     struct kobj_attribute *attr,
			     const char *buf, size_t count)
{
	ssize_t ret = count;

	if (sysfs_streq(buf, "always")) {
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else
		ret = -EINVAL;

	if (ret > 0) {
		int err = start_stop_khugepaged();

		if (err)
			ret = err;
	}
	return ret;
}

static struct kobj_attribute enabled_attr = __ATTR_RW(enabled);
ssize_t single_hugepage_flag_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf,
				  enum transparent_hugepage_flag flag)
{
	return sysfs_emit(buf, "%d\n",
			  !!test_bit(flag, &transparent_hugepage_flags));
}

ssize_t single_hugepage_flag_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count,
				   enum transparent_hugepage_flag flag)
{
	unsigned long value;
	int ret;

	ret = kstrtoul(buf, 10, &value);
	if (ret < 0)
		return ret;
	if (value > 1)
		return -EINVAL;

	if (value)
		set_bit(flag, &transparent_hugepage_flags);
	else
		clear_bit(flag, &transparent_hugepage_flags);

	return count;
}
static ssize_t defrag_show(struct kobject *kobj,
			   struct kobj_attribute *attr, char *buf)
{
	const char *output;

	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
		     &transparent_hugepage_flags))
		output = "[always] defer defer+madvise madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
			  &transparent_hugepage_flags))
		output = "always [defer] defer+madvise madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always defer [defer+madvise] madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always defer defer+madvise [madvise] never";
	else
		output = "always defer defer+madvise madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}
static ssize_t defrag_store(struct kobject *kobj,
			    struct kobj_attribute *attr,
			    const char *buf, size_t count)
{
	if (sysfs_streq(buf, "always")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "defer+madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "defer")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else
		return -EINVAL;

	return count;
}
static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);
static ssize_t use_zero_page_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return single_hugepage_flag_show(kobj, attr, buf,
					 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static ssize_t use_zero_page_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	return single_hugepage_flag_store(kobj, attr, buf, count,
					  TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page);

static ssize_t hpage_pmd_size_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE);
}
static struct kobj_attribute hpage_pmd_size_attr =
	__ATTR_RO(hpage_pmd_size);
static ssize_t split_underused_thp_show(struct kobject *kobj,
			    struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%d\n", split_underused_thp);
}

static ssize_t split_underused_thp_store(struct kobject *kobj,
			     struct kobj_attribute *attr,
			     const char *buf, size_t count)
{
	int err = kstrtobool(buf, &split_underused_thp);

	if (err < 0)
		return err;

	return count;
}

static struct kobj_attribute split_underused_thp_attr = __ATTR(
	shrink_underused, 0644, split_underused_thp_show, split_underused_thp_store);
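/*
 * Note (illustrative): because of the __ATTR() name used above, this control
 * is exposed as /sys/kernel/mm/transparent_hugepage/shrink_underused even
 * though the backing variable is called split_underused_thp.
 */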
static struct attribute *hugepage_attr[] = {
	&enabled_attr.attr,
	&defrag_attr.attr,
	&use_zero_page_attr.attr,
	&hpage_pmd_size_attr.attr,
#ifdef CONFIG_SHMEM
	&shmem_enabled_attr.attr,
#endif
	&split_underused_thp_attr.attr,
	NULL,
};

static const struct attribute_group hugepage_attr_group = {
	.attrs = hugepage_attr,
};
static void hugepage_exit_sysfs(struct kobject *hugepage_kobj);
static void thpsize_release(struct kobject *kobj);
static DEFINE_SPINLOCK(huge_anon_orders_lock);
static LIST_HEAD(thpsize_list);
static ssize_t anon_enabled_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	int order = to_thpsize(kobj)->order;
	const char *output;

	if (test_bit(order, &huge_anon_orders_always))
		output = "[always] inherit madvise never";
	else if (test_bit(order, &huge_anon_orders_inherit))
		output = "always [inherit] madvise never";
	else if (test_bit(order, &huge_anon_orders_madvise))
		output = "always inherit [madvise] never";
	else
		output = "always inherit madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}
static ssize_t anon_enabled_store(struct kobject *kobj,
				  struct kobj_attribute *attr,
				  const char *buf, size_t count)
{
	int order = to_thpsize(kobj)->order;
	ssize_t ret = count;

	if (sysfs_streq(buf, "always")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_inherit);
		clear_bit(order, &huge_anon_orders_madvise);
		set_bit(order, &huge_anon_orders_always);
		spin_unlock(&huge_anon_orders_lock);
	} else if (sysfs_streq(buf, "inherit")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_always);
		clear_bit(order, &huge_anon_orders_madvise);
		set_bit(order, &huge_anon_orders_inherit);
		spin_unlock(&huge_anon_orders_lock);
	} else if (sysfs_streq(buf, "madvise")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_always);
		clear_bit(order, &huge_anon_orders_inherit);
		set_bit(order, &huge_anon_orders_madvise);
		spin_unlock(&huge_anon_orders_lock);
	} else if (sysfs_streq(buf, "never")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_always);
		clear_bit(order, &huge_anon_orders_inherit);
		clear_bit(order, &huge_anon_orders_madvise);
		spin_unlock(&huge_anon_orders_lock);
	} else
		ret = -EINVAL;

	if (ret > 0) {
		int err;

		err = start_stop_khugepaged();
		if (err)
			ret = err;
	}
	return ret;
}

static struct kobj_attribute anon_enabled_attr =
	__ATTR(enabled, 0644, anon_enabled_show, anon_enabled_store);
static struct attribute *anon_ctrl_attrs[] = {
	&anon_enabled_attr.attr,
	NULL,
};

static const struct attribute_group anon_ctrl_attr_grp = {
	.attrs = anon_ctrl_attrs,
};

static struct attribute *file_ctrl_attrs[] = {
#ifdef CONFIG_SHMEM
	&thpsize_shmem_enabled_attr.attr,
#endif
	NULL,
};

static const struct attribute_group file_ctrl_attr_grp = {
	.attrs = file_ctrl_attrs,
};

static struct attribute *any_ctrl_attrs[] = {
	NULL,
};

static const struct attribute_group any_ctrl_attr_grp = {
	.attrs = any_ctrl_attrs,
};

static const struct kobj_type thpsize_ktype = {
	.release = &thpsize_release,
	.sysfs_ops = &kobj_sysfs_ops,
};
DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}};

static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
{
	unsigned long sum = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct mthp_stat *this = &per_cpu(mthp_stats, cpu);

		sum += this->stats[order][item];
	}

	return sum;
}
#define DEFINE_MTHP_STAT_ATTR(_name, _index)				\
static ssize_t _name##_show(struct kobject *kobj,			\
			struct kobj_attribute *attr, char *buf)		\
{									\
	int order = to_thpsize(kobj)->order;				\
									\
	return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index));	\
}									\
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
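/*
 * For example, DEFINE_MTHP_STAT_ATTR(anon_fault_alloc,
 * MTHP_STAT_ANON_FAULT_ALLOC) below expands to an anon_fault_alloc_show()
 * helper plus a read-only anon_fault_alloc_attr attribute, which ends up
 * backing the per-size sysfs counter (path shown for illustration)
 * /sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/stats/anon_fault_alloc.
 */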
DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT);
DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN);
DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK);
DEFINE_MTHP_STAT_ATTR(swpin_fallback_charge, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT);
DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK);
#ifdef CONFIG_SHMEM
DEFINE_MTHP_STAT_ATTR(shmem_alloc, MTHP_STAT_SHMEM_ALLOC);
DEFINE_MTHP_STAT_ATTR(shmem_fallback, MTHP_STAT_SHMEM_FALLBACK);
DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE);
#endif
DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);
static struct attribute *anon_stats_attrs[] = {
	&anon_fault_alloc_attr.attr,
	&anon_fault_fallback_attr.attr,
	&anon_fault_fallback_charge_attr.attr,
#ifndef CONFIG_SHMEM
	&zswpout_attr.attr,
	&swpin_attr.attr,
	&swpin_fallback_attr.attr,
	&swpin_fallback_charge_attr.attr,
	&swpout_attr.attr,
	&swpout_fallback_attr.attr,
#endif
	&split_deferred_attr.attr,
	&nr_anon_attr.attr,
	&nr_anon_partially_mapped_attr.attr,
	NULL,
};

static struct attribute_group anon_stats_attr_grp = {
	.name = "stats",
	.attrs = anon_stats_attrs,
};

static struct attribute *file_stats_attrs[] = {
#ifdef CONFIG_SHMEM
	&shmem_alloc_attr.attr,
	&shmem_fallback_attr.attr,
	&shmem_fallback_charge_attr.attr,
#endif
	NULL,
};

static struct attribute_group file_stats_attr_grp = {
	.name = "stats",
	.attrs = file_stats_attrs,
};

static struct attribute *any_stats_attrs[] = {
#ifdef CONFIG_SHMEM
	&zswpout_attr.attr,
	&swpin_attr.attr,
	&swpin_fallback_attr.attr,
	&swpin_fallback_charge_attr.attr,
	&swpout_attr.attr,
	&swpout_fallback_attr.attr,
#endif
	&split_attr.attr,
	&split_failed_attr.attr,
	NULL,
};

static struct attribute_group any_stats_attr_grp = {
	.name = "stats",
	.attrs = any_stats_attrs,
};
static int sysfs_add_group(struct kobject *kobj,
			   const struct attribute_group *grp)
{
	int ret = -ENOENT;

	/*
	 * If the group is named, try to merge first, assuming the subdirectory
	 * was already created. This avoids the warning emitted by
	 * sysfs_create_group() if the directory already exists.
	 */
	if (grp->name)
		ret = sysfs_merge_group(kobj, grp);
	if (ret)
		ret = sysfs_create_group(kobj, grp);

	return ret;
}
static struct thpsize *thpsize_create(int order, struct kobject *parent)
{
	unsigned long size = (PAGE_SIZE << order) / SZ_1K;
	struct thpsize *thpsize;
	int ret = -ENOMEM;

	thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL);
	if (!thpsize)
		goto err;

	thpsize->order = order;

	ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent,
				   "hugepages-%lukB", size);
	if (ret) {
		kfree(thpsize);
		goto err;
	}

	ret = sysfs_add_group(&thpsize->kobj, &any_ctrl_attr_grp);
	if (ret)
		goto err_put;

	ret = sysfs_add_group(&thpsize->kobj, &any_stats_attr_grp);
	if (ret)
		goto err_put;

	if (BIT(order) & THP_ORDERS_ALL_ANON) {
		ret = sysfs_add_group(&thpsize->kobj, &anon_ctrl_attr_grp);
		if (ret)
			goto err_put;

		ret = sysfs_add_group(&thpsize->kobj, &anon_stats_attr_grp);
		if (ret)
			goto err_put;
	}

	if (BIT(order) & THP_ORDERS_ALL_FILE_DEFAULT) {
		ret = sysfs_add_group(&thpsize->kobj, &file_ctrl_attr_grp);
		if (ret)
			goto err_put;

		ret = sysfs_add_group(&thpsize->kobj, &file_stats_attr_grp);
		if (ret)
			goto err_put;
	}

	return thpsize;
err_put:
	kobject_put(&thpsize->kobj);
err:
	return ERR_PTR(ret);
}
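/*
 * The result (illustrative): each supported order gets a directory such as
 * /sys/kernel/mm/transparent_hugepage/hugepages-64kB/ containing the
 * per-size "enabled" control for anonymous mTHP plus a "stats" subdirectory
 * holding the counters defined above.
 */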
static void thpsize_release(struct kobject *kobj)
{
	kfree(to_thpsize(kobj));
}
static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	int err;
	struct thpsize *thpsize;
	unsigned long orders;
	int order;

	/*
	 * Default to setting PMD-sized THP to inherit the global setting and
	 * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time
	 * constant so we have to do this here.
	 */
	if (!anon_orders_configured)
		huge_anon_orders_inherit = BIT(PMD_ORDER);

	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
	if (unlikely(!*hugepage_kobj)) {
		pr_err("failed to create transparent hugepage kobject\n");
		return -ENOMEM;
	}

	err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto delete_obj;
	}

	err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto remove_hp_group;
	}

	orders = THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DEFAULT;
	order = highest_order(orders);
	while (orders) {
		thpsize = thpsize_create(order, *hugepage_kobj);
		if (IS_ERR(thpsize)) {
			pr_err("failed to create thpsize for order %d\n", order);
			err = PTR_ERR(thpsize);
			goto remove_all;
		}
		list_add(&thpsize->node, &thpsize_list);
		order = next_order(&orders, order);
	}

	return 0;

remove_all:
	hugepage_exit_sysfs(*hugepage_kobj);
	return err;
remove_hp_group:
	sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
delete_obj:
	kobject_put(*hugepage_kobj);
	return err;
}
static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
	struct thpsize *thpsize, *tmp;

	list_for_each_entry_safe(thpsize, tmp, &thpsize_list, node) {
		list_del(&thpsize->node);
		kobject_put(&thpsize->kobj);
	}

	sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
	sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
	kobject_put(hugepage_kobj);
}
#else
static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	return 0;
}

static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
}
#endif /* CONFIG_SYSFS */
static int __init thp_shrinker_init(void)
{
	huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
	if (!huge_zero_page_shrinker)
		return -ENOMEM;

	deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
						 SHRINKER_MEMCG_AWARE |
						 SHRINKER_NONSLAB,
						 "thp-deferred_split");
	if (!deferred_split_shrinker) {
		shrinker_free(huge_zero_page_shrinker);
		return -ENOMEM;
	}

	huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
	huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
	shrinker_register(huge_zero_page_shrinker);

	deferred_split_shrinker->count_objects = deferred_split_count;
	deferred_split_shrinker->scan_objects = deferred_split_scan;
	shrinker_register(deferred_split_shrinker);

	return 0;
}

static void __init thp_shrinker_exit(void)
{
	shrinker_free(huge_zero_page_shrinker);
	shrinker_free(deferred_split_shrinker);
}
static int __init hugepage_init(void)
{
	int err;
	struct kobject *hugepage_kobj;

	if (!has_transparent_hugepage()) {
		transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED;
		return -EINVAL;
	}

	/*
	 * hugepages can't be allocated by the buddy allocator
	 */
	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER);

	err = hugepage_init_sysfs(&hugepage_kobj);
	if (err)
		goto err_sysfs;

	err = khugepaged_init();
	if (err)
		goto err_slab;

	err = thp_shrinker_init();
	if (err)
		goto err_shrinker;

	/*
	 * By default disable transparent hugepages on smaller systems,
	 * where the extra memory used could hurt more than TLB overhead
	 * is likely to save.  The admin can still enable it through /sys.
	 */
	if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
		transparent_hugepage_flags = 0;
		return 0;
	}

	err = start_stop_khugepaged();
	if (err)
		goto err_khugepaged;

	return 0;

err_khugepaged:
	thp_shrinker_exit();
err_shrinker:
	khugepaged_destroy();
err_slab:
	hugepage_exit_sysfs(hugepage_kobj);
err_sysfs:
	return err;
}
subsys_initcall(hugepage_init);
static int __init setup_transparent_hugepage(char *str)
{
	int ret = 0;

	if (!str)
		goto out;
	if (!strcmp(str, "always")) {
		set_bit(TRANSPARENT_HUGEPAGE_FLAG,
			&transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			&transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	}
out:
	if (!ret)
		pr_warn("transparent_hugepage= cannot parse, ignored\n");
	return ret;
}
__setup("transparent_hugepage=", setup_transparent_hugepage);
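/*
 * Illustrative boot command line usage (see the kernel-parameters
 * documentation for the authoritative syntax): "transparent_hugepage=madvise"
 * selects the MADV_HUGEPAGE-only policy at boot instead of via sysfs.
 */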
static char str_dup[PAGE_SIZE] __initdata;
static int __init setup_thp_anon(char *str)
{
	char *token, *range, *policy, *subtoken;
	unsigned long always, inherit, madvise;
	char *start_size, *end_size;
	int start, end, nr;
	char *p;

	if (!str || strlen(str) + 1 > PAGE_SIZE)
		goto err;
	strscpy(str_dup, str);

	always = huge_anon_orders_always;
	madvise = huge_anon_orders_madvise;
	inherit = huge_anon_orders_inherit;
	p = str_dup;
	while ((token = strsep(&p, ";")) != NULL) {
		range = strsep(&token, ":");
		policy = token;

		if (!policy)
			goto err;

		while ((subtoken = strsep(&range, ",")) != NULL) {
			if (strchr(subtoken, '-')) {
				start_size = strsep(&subtoken, "-");
				end_size = subtoken;

				start = get_order_from_str(start_size, THP_ORDERS_ALL_ANON);
				end = get_order_from_str(end_size, THP_ORDERS_ALL_ANON);
			} else {
				start_size = end_size = subtoken;
				start = end = get_order_from_str(subtoken,
								 THP_ORDERS_ALL_ANON);
			}

			if (start == -EINVAL) {
				pr_err("invalid size %s in thp_anon boot parameter\n", start_size);
				goto err;
			}

			if (end == -EINVAL) {
				pr_err("invalid size %s in thp_anon boot parameter\n", end_size);
				goto err;
			}

			if (start < 0 || end < 0 || start > end)
				goto err;

			nr = end - start + 1;
			if (!strcmp(policy, "always")) {
				bitmap_set(&always, start, nr);
				bitmap_clear(&inherit, start, nr);
				bitmap_clear(&madvise, start, nr);
			} else if (!strcmp(policy, "madvise")) {
				bitmap_set(&madvise, start, nr);
				bitmap_clear(&inherit, start, nr);
				bitmap_clear(&always, start, nr);
			} else if (!strcmp(policy, "inherit")) {
				bitmap_set(&inherit, start, nr);
				bitmap_clear(&madvise, start, nr);
				bitmap_clear(&always, start, nr);
			} else if (!strcmp(policy, "never")) {
				bitmap_clear(&inherit, start, nr);
				bitmap_clear(&madvise, start, nr);
				bitmap_clear(&always, start, nr);
			} else {
				pr_err("invalid policy %s in thp_anon boot parameter\n", policy);
				goto err;
			}
		}
	}

	huge_anon_orders_always = always;
	huge_anon_orders_madvise = madvise;
	huge_anon_orders_inherit = inherit;
	anon_orders_configured = true;
	return 1;

err:
	pr_warn("thp_anon=%s: error parsing string, ignoring setting\n", str);
	return 0;
}
__setup("thp_anon=", setup_thp_anon);
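/*
 * Illustrative example of the parameter parsed above, assuming a 4K base
 * page size where 16K-2M are valid anonymous THP sizes:
 *
 *	thp_anon=16K-64K:always;128K:madvise;2M:never
 *
 * i.e. semicolon-separated "<size>[-<size>]:<policy>" entries, where each
 * size must map to a supported anonymous THP order.
 */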
pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pmd = pmd_mkwrite(pmd, vma);
	return pmd;
}
#ifdef CONFIG_MEMCG
static inline
struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
	struct mem_cgroup *memcg = folio_memcg(folio);
	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));

	if (memcg)
		return &memcg->deferred_split_queue;
	else
		return &pgdat->deferred_split_queue;
}
#else
static inline
struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));

	return &pgdat->deferred_split_queue;
}
#endif
static inline bool is_transparent_hugepage(const struct folio *folio)
{
	if (!folio_test_large(folio))
		return false;

	return is_huge_zero_folio(folio) ||
	       folio_test_large_rmappable(folio);
}
static unsigned long __thp_get_unmapped_area(struct file *filp,
		unsigned long addr, unsigned long len,
		loff_t off, unsigned long flags, unsigned long size,
		vm_flags_t vm_flags)
{
	loff_t off_end = off + len;
	loff_t off_align = round_up(off, size);
	unsigned long len_pad, ret, off_sub;

	if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall())
		return 0;

	if (off_end <= off_align || (off_end - off_align) < size)
		return 0;

	len_pad = len + size;
	if (len_pad < len || (off + len_pad) < off)
		return 0;

	ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad,
					   off >> PAGE_SHIFT, flags, vm_flags);

	/*
	 * The failure might be due to length padding. The caller will retry
	 * without the padding.
	 */
	if (IS_ERR_VALUE(ret))
		return 0;

	/*
	 * Do not try to align to THP boundary if allocation at the address
	 * hint succeeds.
	 */
	if (ret == addr)
		return addr;

	off_sub = (off - ret) & (size - 1);

	if (test_bit(MMF_TOPDOWN, &current->mm->flags) && !off_sub)
		return ret + size;

	ret += off_sub;
	return ret;
}
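/*
 * Worked example (illustrative numbers): with size = PMD_SIZE = 2M,
 * off = 0x200000 and a padded area returned at ret = 0x7f0000100000,
 * off_sub = (off - ret) & (2M - 1) = 0x100000. Shifting the mapping by
 * off_sub makes (ret + off_sub) congruent to off modulo 2M, so the file
 * offset can be mapped with PMD-sized pages.
 */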
unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags,
		vm_flags_t vm_flags)
{
	unsigned long ret;
	loff_t off = (loff_t)pgoff << PAGE_SHIFT;

	ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE, vm_flags);
	if (ret)
		return ret;

	return mm_get_unmapped_area_vmflags(current->mm, filp, addr, len, pgoff, flags,
					    vm_flags);
}

unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	return thp_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0);
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
1158 static struct folio
*vma_alloc_anon_folio_pmd(struct vm_area_struct
*vma
,
1161 gfp_t gfp
= vma_thp_gfp_mask(vma
);
1162 const int order
= HPAGE_PMD_ORDER
;
1163 struct folio
*folio
;
1165 folio
= vma_alloc_folio(gfp
, order
, vma
, addr
& HPAGE_PMD_MASK
);
1167 if (unlikely(!folio
)) {
1168 count_vm_event(THP_FAULT_FALLBACK
);
1169 count_mthp_stat(order
, MTHP_STAT_ANON_FAULT_FALLBACK
);
1173 VM_BUG_ON_FOLIO(!folio_test_large(folio
), folio
);
1174 if (mem_cgroup_charge(folio
, vma
->vm_mm
, gfp
)) {
1176 count_vm_event(THP_FAULT_FALLBACK
);
1177 count_vm_event(THP_FAULT_FALLBACK_CHARGE
);
1178 count_mthp_stat(order
, MTHP_STAT_ANON_FAULT_FALLBACK
);
1179 count_mthp_stat(order
, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE
);
1182 folio_throttle_swaprate(folio
, gfp
);
1185 * When a folio is not zeroed during allocation (__GFP_ZERO not used)
1186 * or user folios require special handling, folio_zero_user() is used to
1187 * make sure that the page corresponding to the faulting address will be
1188 * hot in the cache after zeroing.
1190 if (user_alloc_needs_zeroing())
1191 folio_zero_user(folio
, addr
);
1193 * The memory barrier inside __folio_mark_uptodate makes sure that
1194 * folio_zero_user writes become visible before the set_pmd_at()
1197 __folio_mark_uptodate(folio
);
1201 static void map_anon_folio_pmd(struct folio
*folio
, pmd_t
*pmd
,
1202 struct vm_area_struct
*vma
, unsigned long haddr
)
1206 entry
= folio_mk_pmd(folio
, vma
->vm_page_prot
);
1207 entry
= maybe_pmd_mkwrite(pmd_mkdirty(entry
), vma
);
1208 folio_add_new_anon_rmap(folio
, vma
, haddr
, RMAP_EXCLUSIVE
);
1209 folio_add_lru_vma(folio
, vma
);
1210 set_pmd_at(vma
->vm_mm
, haddr
, pmd
, entry
);
1211 update_mmu_cache_pmd(vma
, haddr
, pmd
);
1212 add_mm_counter(vma
->vm_mm
, MM_ANONPAGES
, HPAGE_PMD_NR
);
1213 count_vm_event(THP_FAULT_ALLOC
);
1214 count_mthp_stat(HPAGE_PMD_ORDER
, MTHP_STAT_ANON_FAULT_ALLOC
);
1215 count_memcg_event_mm(vma
->vm_mm
, THP_FAULT_ALLOC
);
1218 static vm_fault_t
__do_huge_pmd_anonymous_page(struct vm_fault
*vmf
)
1220 unsigned long haddr
= vmf
->address
& HPAGE_PMD_MASK
;
1221 struct vm_area_struct
*vma
= vmf
->vma
;
1222 struct folio
*folio
;
1226 folio
= vma_alloc_anon_folio_pmd(vma
, vmf
->address
);
1227 if (unlikely(!folio
))
1228 return VM_FAULT_FALLBACK
;
1230 pgtable
= pte_alloc_one(vma
->vm_mm
);
1231 if (unlikely(!pgtable
)) {
1236 vmf
->ptl
= pmd_lock(vma
->vm_mm
, vmf
->pmd
);
1237 if (unlikely(!pmd_none(*vmf
->pmd
))) {
1238 goto unlock_release
;
1240 ret
= check_stable_address_space(vma
->vm_mm
);
1242 goto unlock_release
;
1244 /* Deliver the page fault to userland */
1245 if (userfaultfd_missing(vma
)) {
1246 spin_unlock(vmf
->ptl
);
1248 pte_free(vma
->vm_mm
, pgtable
);
1249 ret
= handle_userfault(vmf
, VM_UFFD_MISSING
);
1250 VM_BUG_ON(ret
& VM_FAULT_FALLBACK
);
1253 pgtable_trans_huge_deposit(vma
->vm_mm
, vmf
->pmd
, pgtable
);
1254 map_anon_folio_pmd(folio
, vmf
->pmd
, vma
, haddr
);
1255 mm_inc_nr_ptes(vma
->vm_mm
);
1256 deferred_split_folio(folio
, false);
1257 spin_unlock(vmf
->ptl
);
1262 spin_unlock(vmf
->ptl
);
1265 pte_free(vma
->vm_mm
, pgtable
);
/*
 * always: directly stall for all thp allocations
 * defer: wake kswapd and fail if not immediately available
 * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
 *		  fail if not immediately available
 * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
 *	    available
 * never: never stall for any thp allocation
 */
gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
{
	const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);

	/* Always do synchronous compaction */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);

	/* Kick kcompactd and fail quickly */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;

	/* Synchronous compaction if madvised, otherwise kick kcompactd */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
			(vma_madvised ? __GFP_DIRECT_RECLAIM :
					__GFP_KSWAPD_RECLAIM);

	/* Only do synchronous compaction if madvised */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
			(vma_madvised ? __GFP_DIRECT_RECLAIM : 0);

	return GFP_TRANSHUGE_LIGHT;
}
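/*
 * Illustrative usage of the corresponding control (standard sysfs path):
 *
 *	echo defer+madvise > /sys/kernel/mm/transparent_hugepage/defrag
 *
 * selects the TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG behaviour
 * implemented above.
 */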
1306 /* Caller must hold page table lock. */
1307 static void set_huge_zero_folio(pgtable_t pgtable
, struct mm_struct
*mm
,
1308 struct vm_area_struct
*vma
, unsigned long haddr
, pmd_t
*pmd
,
1309 struct folio
*zero_folio
)
1312 entry
= folio_mk_pmd(zero_folio
, vma
->vm_page_prot
);
1313 pgtable_trans_huge_deposit(mm
, pmd
, pgtable
);
1314 set_pmd_at(mm
, haddr
, pmd
, entry
);
1318 vm_fault_t
do_huge_pmd_anonymous_page(struct vm_fault
*vmf
)
1320 struct vm_area_struct
*vma
= vmf
->vma
;
1321 unsigned long haddr
= vmf
->address
& HPAGE_PMD_MASK
;
1324 if (!thp_vma_suitable_order(vma
, haddr
, PMD_ORDER
))
1325 return VM_FAULT_FALLBACK
;
1326 ret
= vmf_anon_prepare(vmf
);
1329 khugepaged_enter_vma(vma
, vma
->vm_flags
);
1331 if (!(vmf
->flags
& FAULT_FLAG_WRITE
) &&
1332 !mm_forbids_zeropage(vma
->vm_mm
) &&
1333 transparent_hugepage_use_zero_page()) {
1335 struct folio
*zero_folio
;
1338 pgtable
= pte_alloc_one(vma
->vm_mm
);
1339 if (unlikely(!pgtable
))
1340 return VM_FAULT_OOM
;
1341 zero_folio
= mm_get_huge_zero_folio(vma
->vm_mm
);
1342 if (unlikely(!zero_folio
)) {
1343 pte_free(vma
->vm_mm
, pgtable
);
1344 count_vm_event(THP_FAULT_FALLBACK
);
1345 return VM_FAULT_FALLBACK
;
1347 vmf
->ptl
= pmd_lock(vma
->vm_mm
, vmf
->pmd
);
1349 if (pmd_none(*vmf
->pmd
)) {
1350 ret
= check_stable_address_space(vma
->vm_mm
);
1352 spin_unlock(vmf
->ptl
);
1353 pte_free(vma
->vm_mm
, pgtable
);
1354 } else if (userfaultfd_missing(vma
)) {
1355 spin_unlock(vmf
->ptl
);
1356 pte_free(vma
->vm_mm
, pgtable
);
1357 ret
= handle_userfault(vmf
, VM_UFFD_MISSING
);
1358 VM_BUG_ON(ret
& VM_FAULT_FALLBACK
);
1360 set_huge_zero_folio(pgtable
, vma
->vm_mm
, vma
,
1361 haddr
, vmf
->pmd
, zero_folio
);
1362 update_mmu_cache_pmd(vma
, vmf
->address
, vmf
->pmd
);
1363 spin_unlock(vmf
->ptl
);
1366 spin_unlock(vmf
->ptl
);
1367 pte_free(vma
->vm_mm
, pgtable
);
1372 return __do_huge_pmd_anonymous_page(vmf
);
1375 static int insert_pfn_pmd(struct vm_area_struct
*vma
, unsigned long addr
,
1376 pmd_t
*pmd
, pfn_t pfn
, pgprot_t prot
, bool write
,
1379 struct mm_struct
*mm
= vma
->vm_mm
;
1382 lockdep_assert_held(pmd_lockptr(mm
, pmd
));
1384 if (!pmd_none(*pmd
)) {
1386 if (pmd_pfn(*pmd
) != pfn_t_to_pfn(pfn
)) {
1387 WARN_ON_ONCE(!is_huge_zero_pmd(*pmd
));
1390 entry
= pmd_mkyoung(*pmd
);
1391 entry
= maybe_pmd_mkwrite(pmd_mkdirty(entry
), vma
);
1392 if (pmdp_set_access_flags(vma
, addr
, pmd
, entry
, 1))
1393 update_mmu_cache_pmd(vma
, addr
, pmd
);
1399 entry
= pmd_mkhuge(pfn_t_pmd(pfn
, prot
));
1400 if (pfn_t_devmap(pfn
))
1401 entry
= pmd_mkdevmap(entry
);
1403 entry
= pmd_mkspecial(entry
);
1405 entry
= pmd_mkyoung(pmd_mkdirty(entry
));
1406 entry
= maybe_pmd_mkwrite(entry
, vma
);
1410 pgtable_trans_huge_deposit(mm
, pmd
, pgtable
);
1414 set_pmd_at(mm
, addr
, pmd
, entry
);
1415 update_mmu_cache_pmd(vma
, addr
, pmd
);
/**
 * vmf_insert_pfn_pmd - insert a pmd size pfn
 * @vmf: Structure describing the fault
 * @pfn: pfn to insert
 * @write: whether it's a write fault
 *
 * Insert a pmd size pfn. See vmf_insert_pfn() for additional info.
 *
 * Return: vm_fault_t value.
 */
1429 vm_fault_t
vmf_insert_pfn_pmd(struct vm_fault
*vmf
, pfn_t pfn
, bool write
)
1431 unsigned long addr
= vmf
->address
& PMD_MASK
;
1432 struct vm_area_struct
*vma
= vmf
->vma
;
1433 pgprot_t pgprot
= vma
->vm_page_prot
;
1434 pgtable_t pgtable
= NULL
;
1439 * If we had pmd_special, we could avoid all these restrictions,
1440 * but we need to be consistent with PTEs and architectures that
1441 * can't support a 'special' bit.
1443 BUG_ON(!(vma
->vm_flags
& (VM_PFNMAP
|VM_MIXEDMAP
)) &&
1444 !pfn_t_devmap(pfn
));
1445 BUG_ON((vma
->vm_flags
& (VM_PFNMAP
|VM_MIXEDMAP
)) ==
1446 (VM_PFNMAP
|VM_MIXEDMAP
));
1447 BUG_ON((vma
->vm_flags
& VM_PFNMAP
) && is_cow_mapping(vma
->vm_flags
));
1449 if (addr
< vma
->vm_start
|| addr
>= vma
->vm_end
)
1450 return VM_FAULT_SIGBUS
;
1452 if (arch_needs_pgtable_deposit()) {
1453 pgtable
= pte_alloc_one(vma
->vm_mm
);
1455 return VM_FAULT_OOM
;
1458 pfnmap_setup_cachemode_pfn(pfn_t_to_pfn(pfn
), &pgprot
);
1460 ptl
= pmd_lock(vma
->vm_mm
, vmf
->pmd
);
1461 error
= insert_pfn_pmd(vma
, addr
, vmf
->pmd
, pfn
, pgprot
, write
,
1464 if (error
&& pgtable
)
1465 pte_free(vma
->vm_mm
, pgtable
);
1467 return VM_FAULT_NOPAGE
;
1469 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd
);
1471 vm_fault_t
vmf_insert_folio_pmd(struct vm_fault
*vmf
, struct folio
*folio
,
1474 struct vm_area_struct
*vma
= vmf
->vma
;
1475 unsigned long addr
= vmf
->address
& PMD_MASK
;
1476 struct mm_struct
*mm
= vma
->vm_mm
;
1478 pgtable_t pgtable
= NULL
;
1481 if (addr
< vma
->vm_start
|| addr
>= vma
->vm_end
)
1482 return VM_FAULT_SIGBUS
;
1484 if (WARN_ON_ONCE(folio_order(folio
) != PMD_ORDER
))
1485 return VM_FAULT_SIGBUS
;
1487 if (arch_needs_pgtable_deposit()) {
1488 pgtable
= pte_alloc_one(vma
->vm_mm
);
1490 return VM_FAULT_OOM
;
1493 ptl
= pmd_lock(mm
, vmf
->pmd
);
1494 if (pmd_none(*vmf
->pmd
)) {
1496 folio_add_file_rmap_pmd(folio
, &folio
->page
, vma
);
1497 add_mm_counter(mm
, mm_counter_file(folio
), HPAGE_PMD_NR
);
1499 error
= insert_pfn_pmd(vma
, addr
, vmf
->pmd
,
1500 pfn_to_pfn_t(folio_pfn(folio
)), vma
->vm_page_prot
,
1503 if (error
&& pgtable
)
1504 pte_free(mm
, pgtable
);
1506 return VM_FAULT_NOPAGE
;
1508 EXPORT_SYMBOL_GPL(vmf_insert_folio_pmd
);
1510 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1511 static pud_t
maybe_pud_mkwrite(pud_t pud
, struct vm_area_struct
*vma
)
1513 if (likely(vma
->vm_flags
& VM_WRITE
))
1514 pud
= pud_mkwrite(pud
);
1518 static void insert_pfn_pud(struct vm_area_struct
*vma
, unsigned long addr
,
1519 pud_t
*pud
, pfn_t pfn
, bool write
)
1521 struct mm_struct
*mm
= vma
->vm_mm
;
1522 pgprot_t prot
= vma
->vm_page_prot
;
1525 if (!pud_none(*pud
)) {
1527 if (WARN_ON_ONCE(pud_pfn(*pud
) != pfn_t_to_pfn(pfn
)))
1529 entry
= pud_mkyoung(*pud
);
1530 entry
= maybe_pud_mkwrite(pud_mkdirty(entry
), vma
);
1531 if (pudp_set_access_flags(vma
, addr
, pud
, entry
, 1))
1532 update_mmu_cache_pud(vma
, addr
, pud
);
1537 entry
= pud_mkhuge(pfn_t_pud(pfn
, prot
));
1538 if (pfn_t_devmap(pfn
))
1539 entry
= pud_mkdevmap(entry
);
1541 entry
= pud_mkspecial(entry
);
1543 entry
= pud_mkyoung(pud_mkdirty(entry
));
1544 entry
= maybe_pud_mkwrite(entry
, vma
);
1546 set_pud_at(mm
, addr
, pud
, entry
);
1547 update_mmu_cache_pud(vma
, addr
, pud
);
/**
 * vmf_insert_pfn_pud - insert a pud size pfn
 * @vmf: Structure describing the fault
 * @pfn: pfn to insert
 * @write: whether it's a write fault
 *
 * Insert a pud size pfn. See vmf_insert_pfn() for additional info.
 *
 * Return: vm_fault_t value.
 */
1560 vm_fault_t
vmf_insert_pfn_pud(struct vm_fault
*vmf
, pfn_t pfn
, bool write
)
1562 unsigned long addr
= vmf
->address
& PUD_MASK
;
1563 struct vm_area_struct
*vma
= vmf
->vma
;
1564 pgprot_t pgprot
= vma
->vm_page_prot
;
1568 * If we had pud_special, we could avoid all these restrictions,
1569 * but we need to be consistent with PTEs and architectures that
1570 * can't support a 'special' bit.
1572 BUG_ON(!(vma
->vm_flags
& (VM_PFNMAP
|VM_MIXEDMAP
)) &&
1573 !pfn_t_devmap(pfn
));
1574 BUG_ON((vma
->vm_flags
& (VM_PFNMAP
|VM_MIXEDMAP
)) ==
1575 (VM_PFNMAP
|VM_MIXEDMAP
));
1576 BUG_ON((vma
->vm_flags
& VM_PFNMAP
) && is_cow_mapping(vma
->vm_flags
));
1578 if (addr
< vma
->vm_start
|| addr
>= vma
->vm_end
)
1579 return VM_FAULT_SIGBUS
;
1581 pfnmap_setup_cachemode_pfn(pfn_t_to_pfn(pfn
), &pgprot
);
1583 ptl
= pud_lock(vma
->vm_mm
, vmf
->pud
);
1584 insert_pfn_pud(vma
, addr
, vmf
->pud
, pfn
, write
);
1587 return VM_FAULT_NOPAGE
;
1589 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud
);
/**
 * vmf_insert_folio_pud - insert a pud size folio mapped by a pud entry
 * @vmf: Structure describing the fault
 * @folio: folio to insert
 * @write: whether it's a write fault
 *
 * Return: vm_fault_t value.
 */
1599 vm_fault_t
vmf_insert_folio_pud(struct vm_fault
*vmf
, struct folio
*folio
,
1602 struct vm_area_struct
*vma
= vmf
->vma
;
1603 unsigned long addr
= vmf
->address
& PUD_MASK
;
1604 pud_t
*pud
= vmf
->pud
;
1605 struct mm_struct
*mm
= vma
->vm_mm
;
1608 if (addr
< vma
->vm_start
|| addr
>= vma
->vm_end
)
1609 return VM_FAULT_SIGBUS
;
1611 if (WARN_ON_ONCE(folio_order(folio
) != PUD_ORDER
))
1612 return VM_FAULT_SIGBUS
;
1614 ptl
= pud_lock(mm
, pud
);
1617 * If there is already an entry present we assume the folio is
1618 * already mapped, hence no need to take another reference. We
1619 * still call insert_pfn_pud() though in case the mapping needs
1620 * upgrading to writeable.
1622 if (pud_none(*vmf
->pud
)) {
1624 folio_add_file_rmap_pud(folio
, &folio
->page
, vma
);
1625 add_mm_counter(mm
, mm_counter_file(folio
), HPAGE_PUD_NR
);
1627 insert_pfn_pud(vma
, addr
, vmf
->pud
, pfn_to_pfn_t(folio_pfn(folio
)),
1631 return VM_FAULT_NOPAGE
;
1633 EXPORT_SYMBOL_GPL(vmf_insert_folio_pud
);
1634 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
1636 void touch_pmd(struct vm_area_struct
*vma
, unsigned long addr
,
1637 pmd_t
*pmd
, bool write
)
1641 _pmd
= pmd_mkyoung(*pmd
);
1643 _pmd
= pmd_mkdirty(_pmd
);
1644 if (pmdp_set_access_flags(vma
, addr
& HPAGE_PMD_MASK
,
1646 update_mmu_cache_pmd(vma
, addr
, pmd
);
1649 struct page
*follow_devmap_pmd(struct vm_area_struct
*vma
, unsigned long addr
,
1650 pmd_t
*pmd
, int flags
, struct dev_pagemap
**pgmap
)
1652 unsigned long pfn
= pmd_pfn(*pmd
);
1653 struct mm_struct
*mm
= vma
->vm_mm
;
1657 assert_spin_locked(pmd_lockptr(mm
, pmd
));
1659 if (flags
& FOLL_WRITE
&& !pmd_write(*pmd
))
1662 if (pmd_present(*pmd
) && pmd_devmap(*pmd
))
1667 if (flags
& FOLL_TOUCH
)
1668 touch_pmd(vma
, addr
, pmd
, flags
& FOLL_WRITE
);
1671 * device mapped pages can only be returned if the
1672 * caller will manage the page reference count.
1674 if (!(flags
& (FOLL_GET
| FOLL_PIN
)))
1675 return ERR_PTR(-EEXIST
);
1677 pfn
+= (addr
& ~PMD_MASK
) >> PAGE_SHIFT
;
1678 *pgmap
= get_dev_pagemap(pfn
, *pgmap
);
1680 return ERR_PTR(-EFAULT
);
1681 page
= pfn_to_page(pfn
);
1682 ret
= try_grab_folio(page_folio(page
), 1, flags
);
1684 page
= ERR_PTR(ret
);
1689 int copy_huge_pmd(struct mm_struct
*dst_mm
, struct mm_struct
*src_mm
,
1690 pmd_t
*dst_pmd
, pmd_t
*src_pmd
, unsigned long addr
,
1691 struct vm_area_struct
*dst_vma
, struct vm_area_struct
*src_vma
)
1693 spinlock_t
*dst_ptl
, *src_ptl
;
1694 struct page
*src_page
;
1695 struct folio
*src_folio
;
1697 pgtable_t pgtable
= NULL
;
1700 pmd
= pmdp_get_lockless(src_pmd
);
1701 if (unlikely(pmd_present(pmd
) && pmd_special(pmd
))) {
1702 dst_ptl
= pmd_lock(dst_mm
, dst_pmd
);
1703 src_ptl
= pmd_lockptr(src_mm
, src_pmd
);
1704 spin_lock_nested(src_ptl
, SINGLE_DEPTH_NESTING
);
1706 * No need to recheck the pmd, it can't change with write
1707 * mmap lock held here.
1709 * Meanwhile, making sure it's not a CoW VMA with writable
1710 * mapping, otherwise it means either the anon page wrongly
1711 * applied special bit, or we made the PRIVATE mapping be
1712 * able to wrongly write to the backend MMIO.
1714 VM_WARN_ON_ONCE(is_cow_mapping(src_vma
->vm_flags
) && pmd_write(pmd
));
1718 /* Skip if can be re-fill on fault */
1719 if (!vma_is_anonymous(dst_vma
))
1722 pgtable
= pte_alloc_one(dst_mm
);
1723 if (unlikely(!pgtable
))
1726 dst_ptl
= pmd_lock(dst_mm
, dst_pmd
);
1727 src_ptl
= pmd_lockptr(src_mm
, src_pmd
);
1728 spin_lock_nested(src_ptl
, SINGLE_DEPTH_NESTING
);
1733 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
1734 if (unlikely(is_swap_pmd(pmd
))) {
1735 swp_entry_t entry
= pmd_to_swp_entry(pmd
);
1737 VM_BUG_ON(!is_pmd_migration_entry(pmd
));
1738 if (!is_readable_migration_entry(entry
)) {
1739 entry
= make_readable_migration_entry(
1741 pmd
= swp_entry_to_pmd(entry
);
1742 if (pmd_swp_soft_dirty(*src_pmd
))
1743 pmd
= pmd_swp_mksoft_dirty(pmd
);
1744 if (pmd_swp_uffd_wp(*src_pmd
))
1745 pmd
= pmd_swp_mkuffd_wp(pmd
);
1746 set_pmd_at(src_mm
, addr
, src_pmd
, pmd
);
1748 add_mm_counter(dst_mm
, MM_ANONPAGES
, HPAGE_PMD_NR
);
1749 mm_inc_nr_ptes(dst_mm
);
1750 pgtable_trans_huge_deposit(dst_mm
, dst_pmd
, pgtable
);
1751 if (!userfaultfd_wp(dst_vma
))
1752 pmd
= pmd_swp_clear_uffd_wp(pmd
);
1753 set_pmd_at(dst_mm
, addr
, dst_pmd
, pmd
);
1759 if (unlikely(!pmd_trans_huge(pmd
))) {
1760 pte_free(dst_mm
, pgtable
);
1764 * When page table lock is held, the huge zero pmd should not be
1765 * under splitting since we don't split the page itself, only pmd to
1768 if (is_huge_zero_pmd(pmd
)) {
1770 * mm_get_huge_zero_folio() will never allocate a new
1771 * folio here, since we already have a zero page to
1772 * copy. It just takes a reference.
1774 mm_get_huge_zero_folio(dst_mm
);
1778 src_page
= pmd_page(pmd
);
1779 VM_BUG_ON_PAGE(!PageHead(src_page
), src_page
);
1780 src_folio
= page_folio(src_page
);
1782 folio_get(src_folio
);
1783 if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio
, src_page
, dst_vma
, src_vma
))) {
1784 /* Page maybe pinned: split and retry the fault on PTEs. */
1785 folio_put(src_folio
);
1786 pte_free(dst_mm
, pgtable
);
1787 spin_unlock(src_ptl
);
1788 spin_unlock(dst_ptl
);
1789 __split_huge_pmd(src_vma
, src_pmd
, addr
, false);
1792 add_mm_counter(dst_mm
, MM_ANONPAGES
, HPAGE_PMD_NR
);
1794 mm_inc_nr_ptes(dst_mm
);
1795 pgtable_trans_huge_deposit(dst_mm
, dst_pmd
, pgtable
);
1796 pmdp_set_wrprotect(src_mm
, addr
, src_pmd
);
1797 if (!userfaultfd_wp(dst_vma
))
1798 pmd
= pmd_clear_uffd_wp(pmd
);
1799 pmd
= pmd_wrprotect(pmd
);
1801 pmd
= pmd_mkold(pmd
);
1802 set_pmd_at(dst_mm
, addr
, dst_pmd
, pmd
);
1806 spin_unlock(src_ptl
);
1807 spin_unlock(dst_ptl
);
1812 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1813 void touch_pud(struct vm_area_struct
*vma
, unsigned long addr
,
1814 pud_t
*pud
, bool write
)
1818 _pud
= pud_mkyoung(*pud
);
1820 _pud
= pud_mkdirty(_pud
);
1821 if (pudp_set_access_flags(vma
, addr
& HPAGE_PUD_MASK
,
1823 update_mmu_cache_pud(vma
, addr
, pud
);
1826 int copy_huge_pud(struct mm_struct
*dst_mm
, struct mm_struct
*src_mm
,
1827 pud_t
*dst_pud
, pud_t
*src_pud
, unsigned long addr
,
1828 struct vm_area_struct
*vma
)
1830 spinlock_t
*dst_ptl
, *src_ptl
;
1834 dst_ptl
= pud_lock(dst_mm
, dst_pud
);
1835 src_ptl
= pud_lockptr(src_mm
, src_pud
);
1836 spin_lock_nested(src_ptl
, SINGLE_DEPTH_NESTING
);
1840 if (unlikely(!pud_trans_huge(pud
) && !pud_devmap(pud
)))
1844 * TODO: once we support anonymous pages, use
1845 * folio_try_dup_anon_rmap_*() and split if duplicating fails.
1847 if (is_cow_mapping(vma
->vm_flags
) && pud_write(pud
)) {
1848 pudp_set_wrprotect(src_mm
, addr
, src_pud
);
1849 pud
= pud_wrprotect(pud
);
1851 pud
= pud_mkold(pud
);
1852 set_pud_at(dst_mm
, addr
, dst_pud
, pud
);
1856 spin_unlock(src_ptl
);
1857 spin_unlock(dst_ptl
);
1861 void huge_pud_set_accessed(struct vm_fault
*vmf
, pud_t orig_pud
)
1863 bool write
= vmf
->flags
& FAULT_FLAG_WRITE
;
1865 vmf
->ptl
= pud_lock(vmf
->vma
->vm_mm
, vmf
->pud
);
1866 if (unlikely(!pud_same(*vmf
->pud
, orig_pud
)))
1869 touch_pud(vmf
->vma
, vmf
->address
, vmf
->pud
, write
);
1871 spin_unlock(vmf
->ptl
);
1873 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
1875 void huge_pmd_set_accessed(struct vm_fault
*vmf
)
1877 bool write
= vmf
->flags
& FAULT_FLAG_WRITE
;
1879 vmf
->ptl
= pmd_lock(vmf
->vma
->vm_mm
, vmf
->pmd
);
1880 if (unlikely(!pmd_same(*vmf
->pmd
, vmf
->orig_pmd
)))
1883 touch_pmd(vmf
->vma
, vmf
->address
, vmf
->pmd
, write
);
1886 spin_unlock(vmf
->ptl
);
1889 static vm_fault_t
do_huge_zero_wp_pmd(struct vm_fault
*vmf
)
1891 unsigned long haddr
= vmf
->address
& HPAGE_PMD_MASK
;
1892 struct vm_area_struct
*vma
= vmf
->vma
;
1893 struct mmu_notifier_range range
;
1894 struct folio
*folio
;
1897 folio
= vma_alloc_anon_folio_pmd(vma
, vmf
->address
);
1898 if (unlikely(!folio
))
1899 return VM_FAULT_FALLBACK
;
1901 mmu_notifier_range_init(&range
, MMU_NOTIFY_CLEAR
, 0, vma
->vm_mm
, haddr
,
1902 haddr
+ HPAGE_PMD_SIZE
);
1903 mmu_notifier_invalidate_range_start(&range
);
1904 vmf
->ptl
= pmd_lock(vma
->vm_mm
, vmf
->pmd
);
1905 if (unlikely(!pmd_same(pmdp_get(vmf
->pmd
), vmf
->orig_pmd
)))
1907 ret
= check_stable_address_space(vma
->vm_mm
);
1910 (void)pmdp_huge_clear_flush(vma
, haddr
, vmf
->pmd
);
1911 map_anon_folio_pmd(folio
, vmf
->pmd
, vma
, haddr
);
1916 spin_unlock(vmf
->ptl
);
1917 mmu_notifier_invalidate_range_end(&range
);
1921 vm_fault_t
do_huge_pmd_wp_page(struct vm_fault
*vmf
)
1923 const bool unshare
= vmf
->flags
& FAULT_FLAG_UNSHARE
;
1924 struct vm_area_struct
*vma
= vmf
->vma
;
1925 struct folio
*folio
;
1927 unsigned long haddr
= vmf
->address
& HPAGE_PMD_MASK
;
1928 pmd_t orig_pmd
= vmf
->orig_pmd
;
1930 vmf
->ptl
= pmd_lockptr(vma
->vm_mm
, vmf
->pmd
);
1931 VM_BUG_ON_VMA(!vma
->anon_vma
, vma
);
1933 if (is_huge_zero_pmd(orig_pmd
)) {
1934 vm_fault_t ret
= do_huge_zero_wp_pmd(vmf
);
1936 if (!(ret
& VM_FAULT_FALLBACK
))
1939 /* Fallback to splitting PMD if THP cannot be allocated */
1943 spin_lock(vmf
->ptl
);
1945 if (unlikely(!pmd_same(*vmf
->pmd
, orig_pmd
))) {
1946 spin_unlock(vmf
->ptl
);
1950 page
= pmd_page(orig_pmd
);
1951 folio
= page_folio(page
);
1952 VM_BUG_ON_PAGE(!PageHead(page
), page
);
1954 /* Early check when only holding the PT lock. */
1955 if (PageAnonExclusive(page
))
1958 if (!folio_trylock(folio
)) {
1960 spin_unlock(vmf
->ptl
);
1962 spin_lock(vmf
->ptl
);
1963 if (unlikely(!pmd_same(*vmf
->pmd
, orig_pmd
))) {
1964 spin_unlock(vmf
->ptl
);
1965 folio_unlock(folio
);
1972 /* Recheck after temporarily dropping the PT lock. */
1973 if (PageAnonExclusive(page
)) {
1974 folio_unlock(folio
);
1979 * See do_wp_page(): we can only reuse the folio exclusively if
1980 * there are no additional references. Note that we always drain
1981 * the LRU cache immediately after adding a THP.
1983 if (folio_ref_count(folio
) >
1984 1 + folio_test_swapcache(folio
) * folio_nr_pages(folio
))
1985 goto unlock_fallback
;
1986 if (folio_test_swapcache(folio
))
1987 folio_free_swap(folio
);
1988 if (folio_ref_count(folio
) == 1) {
1991 folio_move_anon_rmap(folio
, vma
);
1992 SetPageAnonExclusive(page
);
1993 folio_unlock(folio
);
1995 if (unlikely(unshare
)) {
1996 spin_unlock(vmf
->ptl
);
1999 entry
= pmd_mkyoung(orig_pmd
);
2000 entry
= maybe_pmd_mkwrite(pmd_mkdirty(entry
), vma
);
2001 if (pmdp_set_access_flags(vma
, haddr
, vmf
->pmd
, entry
, 1))
2002 update_mmu_cache_pmd(vma
, vmf
->address
, vmf
->pmd
);
2003 spin_unlock(vmf
->ptl
);
2008 folio_unlock(folio
);
2009 spin_unlock(vmf
->ptl
);
2011 __split_huge_pmd(vma
, vmf
->pmd
, vmf
->address
, false);
2012 return VM_FAULT_FALLBACK
;
2015 static inline bool can_change_pmd_writable(struct vm_area_struct
*vma
,
2016 unsigned long addr
, pmd_t pmd
)
2020 if (WARN_ON_ONCE(!(vma
->vm_flags
& VM_WRITE
)))
2023 /* Don't touch entries that are not even readable (NUMA hinting). */
2024 if (pmd_protnone(pmd
))
2027 /* Do we need write faults for softdirty tracking? */
2028 if (pmd_needs_soft_dirty_wp(vma
, pmd
))
2031 /* Do we need write faults for uffd-wp tracking? */
2032 if (userfaultfd_huge_pmd_wp(vma
, pmd
))
2035 if (!(vma
->vm_flags
& VM_SHARED
)) {
2036 /* See can_change_pte_writable(). */
2037 page
= vm_normal_page_pmd(vma
, addr
, pmd
);
2038 return page
&& PageAnon(page
) && PageAnonExclusive(page
);
2041 /* See can_change_pte_writable(). */
2042 return pmd_dirty(pmd
);
2045 /* NUMA hinting page fault entry point for trans huge pmds */
2046 vm_fault_t
do_huge_pmd_numa_page(struct vm_fault
*vmf
)
2048 struct vm_area_struct
*vma
= vmf
->vma
;
2049 struct folio
*folio
;
2050 unsigned long haddr
= vmf
->address
& HPAGE_PMD_MASK
;
2051 int nid
= NUMA_NO_NODE
;
2052 int target_nid
, last_cpupid
;
2054 bool writable
= false;
2057 vmf
->ptl
= pmd_lock(vma
->vm_mm
, vmf
->pmd
);
2058 old_pmd
= pmdp_get(vmf
->pmd
);
2060 if (unlikely(!pmd_same(old_pmd
, vmf
->orig_pmd
))) {
2061 spin_unlock(vmf
->ptl
);
2065 pmd
= pmd_modify(old_pmd
, vma
->vm_page_prot
);
2068 * Detect now whether the PMD could be writable; this information
2069 * is only valid while holding the PT lock.
2071 writable
= pmd_write(pmd
);
2072 if (!writable
&& vma_wants_manual_pte_write_upgrade(vma
) &&
2073 can_change_pmd_writable(vma
, vmf
->address
, pmd
))
2076 folio
= vm_normal_folio_pmd(vma
, haddr
, pmd
);
2080 nid
= folio_nid(folio
);
2082 target_nid
= numa_migrate_check(folio
, vmf
, haddr
, &flags
, writable
,
2084 if (target_nid
== NUMA_NO_NODE
)
2086 if (migrate_misplaced_folio_prepare(folio
, vma
, target_nid
)) {
2087 flags
|= TNF_MIGRATE_FAIL
;
2090 /* The folio is isolated and isolation code holds a folio reference. */
2091 spin_unlock(vmf
->ptl
);
2094 if (!migrate_misplaced_folio(folio
, target_nid
)) {
2095 flags
|= TNF_MIGRATED
;
2097 task_numa_fault(last_cpupid
, nid
, HPAGE_PMD_NR
, flags
);
2101 flags
|= TNF_MIGRATE_FAIL
;
2102 vmf
->ptl
= pmd_lock(vma
->vm_mm
, vmf
->pmd
);
2103 if (unlikely(!pmd_same(pmdp_get(vmf
->pmd
), vmf
->orig_pmd
))) {
2104 spin_unlock(vmf
->ptl
);
2108 /* Restore the PMD */
2109 pmd
= pmd_modify(pmdp_get(vmf
->pmd
), vma
->vm_page_prot
);
2110 pmd
= pmd_mkyoung(pmd
);
2112 pmd
= pmd_mkwrite(pmd
, vma
);
2113 set_pmd_at(vma
->vm_mm
, haddr
, vmf
->pmd
, pmd
);
2114 update_mmu_cache_pmd(vma
, vmf
->address
, vmf
->pmd
);
2115 spin_unlock(vmf
->ptl
);
2117 if (nid
!= NUMA_NO_NODE
)
2118 task_numa_fault(last_cpupid
, nid
, HPAGE_PMD_NR
, flags
);
/*
 * Return true if we do MADV_FREE successfully on entire pmd page.
 * Otherwise, return false.
 */
2126 bool madvise_free_huge_pmd(struct mmu_gather
*tlb
, struct vm_area_struct
*vma
,
2127 pmd_t
*pmd
, unsigned long addr
, unsigned long next
)
2131 struct folio
*folio
;
2132 struct mm_struct
*mm
= tlb
->mm
;
2135 tlb_change_page_size(tlb
, HPAGE_PMD_SIZE
);
2137 ptl
= pmd_trans_huge_lock(pmd
, vma
);
2142 if (is_huge_zero_pmd(orig_pmd
))
2145 if (unlikely(!pmd_present(orig_pmd
))) {
2146 VM_BUG_ON(thp_migration_supported() &&
2147 !is_pmd_migration_entry(orig_pmd
));
2151 folio
= pmd_folio(orig_pmd
);
2153 * If other processes are mapping this folio, we couldn't discard
2154 * the folio unless they all do MADV_FREE so let's skip the folio.
2156 if (folio_maybe_mapped_shared(folio
))
2159 if (!folio_trylock(folio
))
2163 * If user want to discard part-pages of THP, split it so MADV_FREE
2164 * will deactivate only them.
2166 if (next
- addr
!= HPAGE_PMD_SIZE
) {
2170 folio_unlock(folio
);
2175 if (folio_test_dirty(folio
))
2176 folio_clear_dirty(folio
);
2177 folio_unlock(folio
);
2179 if (pmd_young(orig_pmd
) || pmd_dirty(orig_pmd
)) {
2180 pmdp_invalidate(vma
, addr
, pmd
);
2181 orig_pmd
= pmd_mkold(orig_pmd
);
2182 orig_pmd
= pmd_mkclean(orig_pmd
);
2184 set_pmd_at(mm
, addr
, pmd
, orig_pmd
);
2185 tlb_remove_pmd_tlb_entry(tlb
, pmd
, addr
);
2188 folio_mark_lazyfree(folio
);
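
/*
 * Illustrative userspace sketch (not part of this file): madvise(MADV_FREE)
 * over a whole, PMD-aligned range is what lets madvise_free_huge_pmd()
 * lazy-free the huge page in one go; a partial range takes the split path
 * above instead. The 2 MiB PMD size is an assumption of this sketch.
 */
#if 0
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

#define PMD_BYTES	(2UL << 20)	/* assumed PMD_SIZE */

int main(void)
{
	void *buf;

	if (posix_memalign(&buf, PMD_BYTES, PMD_BYTES))
		return 1;
	madvise(buf, PMD_BYTES, MADV_HUGEPAGE);
	memset(buf, 0xab, PMD_BYTES);	/* fault in, ideally as a THP */

	/* Covers next - addr == HPAGE_PMD_SIZE, so no split is needed. */
	madvise(buf, PMD_BYTES, MADV_FREE);
	return 0;
}
#endif
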
static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
{
	pgtable_t pgtable;

	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
	pte_free(mm, pgtable);
	mm_dec_nr_ptes(mm);
}
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
		 pmd_t *pmd, unsigned long addr)
{
	pmd_t orig_pmd;
	spinlock_t *ptl;

	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

	ptl = __pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		return 0;
	/*
	 * For architectures like ppc64 we look at deposited pgtable
	 * when calling pmdp_huge_get_and_clear. So do the
	 * pgtable_trans_huge_withdraw after finishing pmdp related
	 * operations.
	 */
	orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
						tlb->fullmm);
	arch_check_zapped_pmd(vma, orig_pmd);
	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
	if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
		if (arch_needs_pgtable_deposit())
			zap_deposited_table(tlb->mm, pmd);
		spin_unlock(ptl);
	} else if (is_huge_zero_pmd(orig_pmd)) {
		if (!vma_is_dax(vma) || arch_needs_pgtable_deposit())
			zap_deposited_table(tlb->mm, pmd);
		spin_unlock(ptl);
	} else {
		struct folio *folio = NULL;
		int flush_needed = 1;

		if (pmd_present(orig_pmd)) {
			struct page *page = pmd_page(orig_pmd);

			folio = page_folio(page);
			folio_remove_rmap_pmd(folio, page, vma);
			WARN_ON_ONCE(folio_mapcount(folio) < 0);
			VM_BUG_ON_PAGE(!PageHead(page), page);
		} else if (thp_migration_supported()) {
			swp_entry_t entry;

			VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
			entry = pmd_to_swp_entry(orig_pmd);
			folio = pfn_swap_entry_folio(entry);
			flush_needed = 0;
		} else
			WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");

		if (folio_test_anon(folio)) {
			zap_deposited_table(tlb->mm, pmd);
			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
		} else {
			if (arch_needs_pgtable_deposit())
				zap_deposited_table(tlb->mm, pmd);
			add_mm_counter(tlb->mm, mm_counter_file(folio),
				       -HPAGE_PMD_NR);
		}

		/*
		 * Use flush_needed to indicate whether the PMD entry
		 * is present, instead of checking pmd_present() again.
		 */
		if (flush_needed && pmd_young(orig_pmd) &&
		    likely(vma_has_recency(vma)))
			folio_mark_accessed(folio);

		spin_unlock(ptl);
		if (flush_needed)
			tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
	}
	return 1;
}
#ifndef pmd_move_must_withdraw
static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
					 spinlock_t *old_pmd_ptl,
					 struct vm_area_struct *vma)
{
	/*
	 * With split pmd lock we also need to move preallocated
	 * PTE page table if new_pmd is on different PMD page table.
	 *
	 * We also don't deposit and withdraw tables for file pages.
	 */
	return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
}
#endif

static pmd_t move_soft_dirty_pmd(pmd_t pmd)
{
#ifdef CONFIG_MEM_SOFT_DIRTY
	if (unlikely(is_pmd_migration_entry(pmd)))
		pmd = pmd_swp_mksoft_dirty(pmd);
	else if (pmd_present(pmd))
		pmd = pmd_mksoft_dirty(pmd);
#endif
	return pmd;
}

static pmd_t clear_uffd_wp_pmd(pmd_t pmd)
{
	if (pmd_present(pmd))
		pmd = pmd_clear_uffd_wp(pmd);
	else if (is_swap_pmd(pmd))
		pmd = pmd_swp_clear_uffd_wp(pmd);

	return pmd;
}
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
		   unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
{
	spinlock_t *old_ptl, *new_ptl;
	pmd_t pmd;
	struct mm_struct *mm = vma->vm_mm;
	bool force_flush = false;

	/*
	 * The destination pmd shouldn't be established, free_pgtables()
	 * should have released it; but move_page_tables() might have already
	 * inserted a page table, if racing against shmem/file collapse.
	 */
	if (!pmd_none(*new_pmd)) {
		VM_BUG_ON(pmd_trans_huge(*new_pmd));
		return false;
	}

	/*
	 * We don't have to worry about the ordering of src and dst
	 * ptlocks because exclusive mmap_lock prevents deadlock.
	 */
	old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
	if (old_ptl) {
		new_ptl = pmd_lockptr(mm, new_pmd);
		if (new_ptl != old_ptl)
			spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
		pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
		if (pmd_present(pmd))
			force_flush = true;
		VM_BUG_ON(!pmd_none(*new_pmd));

		if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
			pgtable_t pgtable;

			pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
			pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
		}
		pmd = move_soft_dirty_pmd(pmd);
		if (vma_has_uffd_without_event_remap(vma))
			pmd = clear_uffd_wp_pmd(pmd);
		set_pmd_at(mm, new_addr, new_pmd, pmd);
		if (force_flush)
			flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
		if (new_ptl != old_ptl)
			spin_unlock(new_ptl);
		spin_unlock(old_ptl);
		return true;
	}
	return false;
}
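
/*
 * Illustrative userspace sketch (not part of this file): a PMD-aligned
 * mremap() lets move_page_tables() hand whole huge PMDs to move_huge_pmd()
 * instead of splitting them. Sizes and flags below are assumptions of the
 * sketch, not a guaranteed way to obtain a THP.
 */
#if 0
#define _GNU_SOURCE
#include <string.h>
#include <sys/mman.h>

#define PMD_BYTES	(2UL << 20)	/* assumed PMD_SIZE */

int main(void)
{
	char *src, *dst;

	src = mmap(NULL, PMD_BYTES, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (src == MAP_FAILED)
		return 1;
	madvise(src, PMD_BYTES, MADV_HUGEPAGE);
	memset(src, 0xab, PMD_BYTES);	/* fault in, ideally as one PMD */

	/* Move the mapping; a huge PMD can be relocated rather than split. */
	dst = mremap(src, PMD_BYTES, PMD_BYTES, MREMAP_MAYMOVE);
	return dst == MAP_FAILED;
}
#endif
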
2369 * - 0 if PMD could not be locked
2370 * - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
2371 * or if prot_numa but THP migration is not supported
2372 * - HPAGE_PMD_NR if protections changed and TLB flush necessary
2374 int change_huge_pmd(struct mmu_gather
*tlb
, struct vm_area_struct
*vma
,
2375 pmd_t
*pmd
, unsigned long addr
, pgprot_t newprot
,
2376 unsigned long cp_flags
)
2378 struct mm_struct
*mm
= vma
->vm_mm
;
2380 pmd_t oldpmd
, entry
;
2381 bool prot_numa
= cp_flags
& MM_CP_PROT_NUMA
;
2382 bool uffd_wp
= cp_flags
& MM_CP_UFFD_WP
;
2383 bool uffd_wp_resolve
= cp_flags
& MM_CP_UFFD_WP_RESOLVE
;
2386 tlb_change_page_size(tlb
, HPAGE_PMD_SIZE
);
2388 if (prot_numa
&& !thp_migration_supported())
2391 ptl
= __pmd_trans_huge_lock(pmd
, vma
);
2395 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
2396 if (is_swap_pmd(*pmd
)) {
2397 swp_entry_t entry
= pmd_to_swp_entry(*pmd
);
2398 struct folio
*folio
= pfn_swap_entry_folio(entry
);
2401 VM_BUG_ON(!is_pmd_migration_entry(*pmd
));
2402 if (is_writable_migration_entry(entry
)) {
2404 * A protection check is difficult so
2405 * just be safe and disable write
2407 if (folio_test_anon(folio
))
2408 entry
= make_readable_exclusive_migration_entry(swp_offset(entry
));
2410 entry
= make_readable_migration_entry(swp_offset(entry
));
2411 newpmd
= swp_entry_to_pmd(entry
);
2412 if (pmd_swp_soft_dirty(*pmd
))
2413 newpmd
= pmd_swp_mksoft_dirty(newpmd
);
2419 newpmd
= pmd_swp_mkuffd_wp(newpmd
);
2420 else if (uffd_wp_resolve
)
2421 newpmd
= pmd_swp_clear_uffd_wp(newpmd
);
2422 if (!pmd_same(*pmd
, newpmd
))
2423 set_pmd_at(mm
, addr
, pmd
, newpmd
);
2429 struct folio
*folio
;
2432 * Avoid trapping faults against the zero page. The read-only
2433 * data is likely to be read-cached on the local CPU and
2434 * local/remote hits to the zero page are not interesting.
2436 if (is_huge_zero_pmd(*pmd
))
2439 if (pmd_protnone(*pmd
))
2442 folio
= pmd_folio(*pmd
);
2443 toptier
= node_is_toptier(folio_nid(folio
));
2445 * Skip scanning top tier node if normal numa
2446 * balancing is disabled
2448 if (!(sysctl_numa_balancing_mode
& NUMA_BALANCING_NORMAL
) &&
2452 if (folio_use_access_time(folio
))
2453 folio_xchg_access_time(folio
,
2454 jiffies_to_msecs(jiffies
));
	/*
	 * In case prot_numa, we are under mmap_read_lock(mm). It's critical
	 * to not clear pmd intermittently to avoid race with MADV_DONTNEED
	 * which is also under mmap_read_lock(mm):
	 *
	 *	CPU0:				CPU1:
	 *				change_huge_pmd(prot_numa=1)
	 *				 pmdp_huge_get_and_clear_notify()
	 * madvise_dontneed()
	 *  zap_pmd_range()
	 *   pmd_trans_huge(*pmd) == 0 (without ptl)
	 *   // skip the pmd
	 *				 set_pmd_at();
	 *				 // pmd is re-established
	 *
	 * The race makes MADV_DONTNEED miss the huge pmd and don't clear it
	 * which may break userspace.
	 *
	 * pmdp_invalidate_ad() is required to make sure we don't miss
	 * dirty/young flags set by hardware.
	 */
	oldpmd = pmdp_invalidate_ad(vma, addr, pmd);
2479 entry
= pmd_modify(oldpmd
, newprot
);
2481 entry
= pmd_mkuffd_wp(entry
);
2482 else if (uffd_wp_resolve
)
2484 * Leave the write bit to be handled by PF interrupt
2485 * handler, then things like COW could be properly
2488 entry
= pmd_clear_uffd_wp(entry
);
2490 /* See change_pte_range(). */
2491 if ((cp_flags
& MM_CP_TRY_CHANGE_WRITABLE
) && !pmd_write(entry
) &&
2492 can_change_pmd_writable(vma
, addr
, entry
))
2493 entry
= pmd_mkwrite(entry
, vma
);
2496 set_pmd_at(mm
, addr
, pmd
, entry
);
2498 if (huge_pmd_needs_flush(oldpmd
, entry
))
2499 tlb_flush_pmd_range(tlb
, addr
, HPAGE_PMD_SIZE
);
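
/*
 * Illustrative userspace sketch (not part of this file): mprotect() over a
 * whole PMD-sized range reaches change_huge_pmd() and updates the single
 * huge entry in place; protecting only part of the range would split the
 * PMD first. The 2 MiB size is an assumption of this sketch.
 */
#if 0
#include <string.h>
#include <sys/mman.h>

#define PMD_BYTES	(2UL << 20)	/* assumed PMD_SIZE */

int main(void)
{
	char *buf = mmap(NULL, PMD_BYTES, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;
	madvise(buf, PMD_BYTES, MADV_HUGEPAGE);
	memset(buf, 0xab, PMD_BYTES);

	/* Whole-PMD protection changes keep the mapping huge. */
	mprotect(buf, PMD_BYTES, PROT_READ);
	mprotect(buf, PMD_BYTES, PROT_READ | PROT_WRITE);
	return 0;
}
#endif
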
2508 * - 0: if pud leaf changed from under us
2509 * - 1: if pud can be skipped
2510 * - HPAGE_PUD_NR: if pud was successfully processed
2512 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
2513 int change_huge_pud(struct mmu_gather
*tlb
, struct vm_area_struct
*vma
,
2514 pud_t
*pudp
, unsigned long addr
, pgprot_t newprot
,
2515 unsigned long cp_flags
)
2517 struct mm_struct
*mm
= vma
->vm_mm
;
2518 pud_t oldpud
, entry
;
2521 tlb_change_page_size(tlb
, HPAGE_PUD_SIZE
);
2523 /* NUMA balancing doesn't apply to dax */
2524 if (cp_flags
& MM_CP_PROT_NUMA
)
2528 * Huge entries on userfault-wp only works with anonymous, while we
2529 * don't have anonymous PUDs yet.
2531 if (WARN_ON_ONCE(cp_flags
& MM_CP_UFFD_WP_ALL
))
2534 ptl
= __pud_trans_huge_lock(pudp
, vma
);
2539 * Can't clear PUD or it can race with concurrent zapping. See
2540 * change_huge_pmd().
2542 oldpud
= pudp_invalidate(vma
, addr
, pudp
);
2543 entry
= pud_modify(oldpud
, newprot
);
2544 set_pud_at(mm
, addr
, pudp
, entry
);
2545 tlb_flush_pud_range(tlb
, addr
, HPAGE_PUD_SIZE
);
2548 return HPAGE_PUD_NR
;
2552 #ifdef CONFIG_USERFAULTFD
2554 * The PT lock for src_pmd and dst_vma/src_vma (for reading) are locked by
2555 * the caller, but it must return after releasing the page_table_lock.
2556 * Just move the page from src_pmd to dst_pmd if possible.
2557 * Return zero if succeeded in moving the page, -EAGAIN if it needs to be
2558 * repeated by the caller, or other errors in case of failure.
2560 int move_pages_huge_pmd(struct mm_struct
*mm
, pmd_t
*dst_pmd
, pmd_t
*src_pmd
, pmd_t dst_pmdval
,
2561 struct vm_area_struct
*dst_vma
, struct vm_area_struct
*src_vma
,
2562 unsigned long dst_addr
, unsigned long src_addr
)
2564 pmd_t _dst_pmd
, src_pmdval
;
2565 struct page
*src_page
;
2566 struct folio
*src_folio
;
2567 struct anon_vma
*src_anon_vma
;
2568 spinlock_t
*src_ptl
, *dst_ptl
;
2569 pgtable_t src_pgtable
;
2570 struct mmu_notifier_range range
;
2573 src_pmdval
= *src_pmd
;
2574 src_ptl
= pmd_lockptr(mm
, src_pmd
);
2576 lockdep_assert_held(src_ptl
);
2577 vma_assert_locked(src_vma
);
2578 vma_assert_locked(dst_vma
);
2580 /* Sanity checks before the operation */
2581 if (WARN_ON_ONCE(!pmd_none(dst_pmdval
)) || WARN_ON_ONCE(src_addr
& ~HPAGE_PMD_MASK
) ||
2582 WARN_ON_ONCE(dst_addr
& ~HPAGE_PMD_MASK
)) {
2583 spin_unlock(src_ptl
);
2587 if (!pmd_trans_huge(src_pmdval
)) {
2588 spin_unlock(src_ptl
);
2589 if (is_pmd_migration_entry(src_pmdval
)) {
2590 pmd_migration_entry_wait(mm
, &src_pmdval
);
2596 src_page
= pmd_page(src_pmdval
);
2598 if (!is_huge_zero_pmd(src_pmdval
)) {
2599 if (unlikely(!PageAnonExclusive(src_page
))) {
2600 spin_unlock(src_ptl
);
2604 src_folio
= page_folio(src_page
);
2605 folio_get(src_folio
);
2609 spin_unlock(src_ptl
);
2611 flush_cache_range(src_vma
, src_addr
, src_addr
+ HPAGE_PMD_SIZE
);
2612 mmu_notifier_range_init(&range
, MMU_NOTIFY_CLEAR
, 0, mm
, src_addr
,
2613 src_addr
+ HPAGE_PMD_SIZE
);
2614 mmu_notifier_invalidate_range_start(&range
);
2617 folio_lock(src_folio
);
2620 * split_huge_page walks the anon_vma chain without the page
2621 * lock. Serialize against it with the anon_vma lock, the page
2622 * lock is not enough.
2624 src_anon_vma
= folio_get_anon_vma(src_folio
);
2625 if (!src_anon_vma
) {
2629 anon_vma_lock_write(src_anon_vma
);
2631 src_anon_vma
= NULL
;
2633 dst_ptl
= pmd_lockptr(mm
, dst_pmd
);
2634 double_pt_lock(src_ptl
, dst_ptl
);
2635 if (unlikely(!pmd_same(*src_pmd
, src_pmdval
) ||
2636 !pmd_same(*dst_pmd
, dst_pmdval
))) {
2641 if (folio_maybe_dma_pinned(src_folio
) ||
2642 !PageAnonExclusive(&src_folio
->page
)) {
2647 if (WARN_ON_ONCE(!folio_test_head(src_folio
)) ||
2648 WARN_ON_ONCE(!folio_test_anon(src_folio
))) {
2653 src_pmdval
= pmdp_huge_clear_flush(src_vma
, src_addr
, src_pmd
);
2654 /* Folio got pinned from under us. Put it back and fail the move. */
2655 if (folio_maybe_dma_pinned(src_folio
)) {
2656 set_pmd_at(mm
, src_addr
, src_pmd
, src_pmdval
);
2661 folio_move_anon_rmap(src_folio
, dst_vma
);
2662 src_folio
->index
= linear_page_index(dst_vma
, dst_addr
);
2664 _dst_pmd
= folio_mk_pmd(src_folio
, dst_vma
->vm_page_prot
);
2665 /* Follow mremap() behavior and treat the entry dirty after the move */
2666 _dst_pmd
= pmd_mkwrite(pmd_mkdirty(_dst_pmd
), dst_vma
);
2668 src_pmdval
= pmdp_huge_clear_flush(src_vma
, src_addr
, src_pmd
);
2669 _dst_pmd
= folio_mk_pmd(src_folio
, dst_vma
->vm_page_prot
);
2671 set_pmd_at(mm
, dst_addr
, dst_pmd
, _dst_pmd
);
2673 src_pgtable
= pgtable_trans_huge_withdraw(mm
, src_pmd
);
2674 pgtable_trans_huge_deposit(mm
, dst_pmd
, src_pgtable
);
2676 double_pt_unlock(src_ptl
, dst_ptl
);
2678 anon_vma_unlock_write(src_anon_vma
);
2679 put_anon_vma(src_anon_vma
);
2682 /* unblock rmap walks */
2684 folio_unlock(src_folio
);
2685 mmu_notifier_invalidate_range_end(&range
);
2687 folio_put(src_folio
);
2690 #endif /* CONFIG_USERFAULTFD */
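
/*
 * The double_pt_lock()/double_pt_unlock() helpers used above live in
 * mm/userfaultfd.c. A minimal sketch of the ABBA-avoidance idiom they rely
 * on is shown below (ordering the two page-table locks by address); the
 * helper name is hypothetical and this is not the upstream implementation.
 */
#if 0
static void example_double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2)
{
	/* Always take the lower-addressed lock first to avoid deadlock. */
	if (ptl1 > ptl2)
		swap(ptl1, ptl2);
	spin_lock(ptl1);
	if (ptl1 != ptl2)
		spin_lock_nested(ptl2, SINGLE_DEPTH_NESTING);
}
#endif
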
/*
 * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
 *
 * Note that if it returns page table lock pointer, this routine returns without
 * unlocking page table lock. So callers must unlock it.
 */
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
{
	spinlock_t *ptl;

	ptl = pmd_lock(vma->vm_mm, pmd);
	if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
			pmd_devmap(*pmd)))
		return ptl;
	spin_unlock(ptl);
	return NULL;
}

/*
 * Returns page table lock pointer if a given pud maps a thp, NULL otherwise.
 *
 * Note that if it returns page table lock pointer, this routine returns without
 * unlocking page table lock. So callers must unlock it.
 */
spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
{
	spinlock_t *ptl;

	ptl = pud_lock(vma->vm_mm, pud);
	if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
		return ptl;
	spin_unlock(ptl);
	return NULL;
}
2726 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
2727 int zap_huge_pud(struct mmu_gather
*tlb
, struct vm_area_struct
*vma
,
2728 pud_t
*pud
, unsigned long addr
)
2733 ptl
= __pud_trans_huge_lock(pud
, vma
);
2737 orig_pud
= pudp_huge_get_and_clear_full(vma
, addr
, pud
, tlb
->fullmm
);
2738 arch_check_zapped_pud(vma
, orig_pud
);
2739 tlb_remove_pud_tlb_entry(tlb
, pud
, addr
);
2740 if (!vma_is_dax(vma
) && vma_is_special_huge(vma
)) {
2742 /* No zero page support yet */
2744 struct page
*page
= NULL
;
2745 struct folio
*folio
;
2747 /* No support for anonymous PUD pages or migration yet */
2748 VM_WARN_ON_ONCE(vma_is_anonymous(vma
) ||
2749 !pud_present(orig_pud
));
2751 page
= pud_page(orig_pud
);
2752 folio
= page_folio(page
);
2753 folio_remove_rmap_pud(folio
, page
, vma
);
2754 add_mm_counter(tlb
->mm
, mm_counter_file(folio
), -HPAGE_PUD_NR
);
2757 tlb_remove_page_size(tlb
, page
, HPAGE_PUD_SIZE
);
2762 static void __split_huge_pud_locked(struct vm_area_struct
*vma
, pud_t
*pud
,
2763 unsigned long haddr
)
2765 struct folio
*folio
;
2769 VM_BUG_ON(haddr
& ~HPAGE_PUD_MASK
);
2770 VM_BUG_ON_VMA(vma
->vm_start
> haddr
, vma
);
2771 VM_BUG_ON_VMA(vma
->vm_end
< haddr
+ HPAGE_PUD_SIZE
, vma
);
2772 VM_BUG_ON(!pud_trans_huge(*pud
) && !pud_devmap(*pud
));
2774 count_vm_event(THP_SPLIT_PUD
);
2776 old_pud
= pudp_huge_clear_flush(vma
, haddr
, pud
);
2778 if (!vma_is_dax(vma
))
2781 page
= pud_page(old_pud
);
2782 folio
= page_folio(page
);
2784 if (!folio_test_dirty(folio
) && pud_dirty(old_pud
))
2785 folio_mark_dirty(folio
);
2786 if (!folio_test_referenced(folio
) && pud_young(old_pud
))
2787 folio_set_referenced(folio
);
2788 folio_remove_rmap_pud(folio
, page
, vma
);
2790 add_mm_counter(vma
->vm_mm
, mm_counter_file(folio
),
2794 void __split_huge_pud(struct vm_area_struct
*vma
, pud_t
*pud
,
2795 unsigned long address
)
2798 struct mmu_notifier_range range
;
2800 mmu_notifier_range_init(&range
, MMU_NOTIFY_CLEAR
, 0, vma
->vm_mm
,
2801 address
& HPAGE_PUD_MASK
,
2802 (address
& HPAGE_PUD_MASK
) + HPAGE_PUD_SIZE
);
2803 mmu_notifier_invalidate_range_start(&range
);
2804 ptl
= pud_lock(vma
->vm_mm
, pud
);
2805 if (unlikely(!pud_trans_huge(*pud
) && !pud_devmap(*pud
)))
2807 __split_huge_pud_locked(vma
, pud
, range
.start
);
2811 mmu_notifier_invalidate_range_end(&range
);
2814 void __split_huge_pud(struct vm_area_struct
*vma
, pud_t
*pud
,
2815 unsigned long address
)
2818 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
		unsigned long haddr, pmd_t *pmd)
{
	struct mm_struct *mm = vma->vm_mm;
	pgtable_t pgtable;
	pmd_t _pmd, old_pmd;
	unsigned long addr;
	pte_t *pte;
	int i;

	/*
	 * Leave pmd empty until pte is filled note that it is fine to delay
	 * notification until mmu_notifier_invalidate_range_end() as we are
	 * replacing a zero pmd write protected page with a zero pte write
	 * protected page.
	 *
	 * See Documentation/mm/mmu_notifier.rst
	 */
	old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);

	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
	pmd_populate(mm, &_pmd, pgtable);

	pte = pte_offset_map(&_pmd, haddr);
	VM_BUG_ON(!pte);
	for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
		pte_t entry;

		entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot);
		entry = pte_mkspecial(entry);
		if (pmd_uffd_wp(old_pmd))
			entry = pte_mkuffd_wp(entry);
		VM_BUG_ON(!pte_none(ptep_get(pte)));
		set_pte_at(mm, addr, pte, entry);
		pte++;
	}
	pte_unmap(pte - 1);
	smp_wmb(); /* make pte visible before pmd */
	pmd_populate(mm, pmd, pgtable);
}
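
/*
 * Illustrative userspace sketch (not part of this file): a read-only touch
 * of an untouched anonymous THP range can map the huge zero page; zapping a
 * single 4 KiB page out of it then forces the zero PMD to be split by
 * __split_huge_zero_page_pmd() into per-page zero PTEs. Whether the huge
 * zero page is used depends on the use_zero_page setting; sizes below are
 * assumptions of the sketch.
 */
#if 0
#include <sys/mman.h>

#define PMD_BYTES	(2UL << 20)	/* assumed PMD_SIZE */

int main(void)
{
	volatile char sink;
	char *buf = mmap(NULL, PMD_BYTES, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;
	madvise(buf, PMD_BYTES, MADV_HUGEPAGE);
	sink = buf[0];			/* read fault: huge zero page */
	(void)sink;

	/* Partial zap of the zero PMD triggers the split above. */
	madvise(buf + 4096, 4096, MADV_DONTNEED);
	return 0;
}
#endif
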
2861 static void __split_huge_pmd_locked(struct vm_area_struct
*vma
, pmd_t
*pmd
,
2862 unsigned long haddr
, bool freeze
)
2864 struct mm_struct
*mm
= vma
->vm_mm
;
2865 struct folio
*folio
;
2868 pmd_t old_pmd
, _pmd
;
2869 bool young
, write
, soft_dirty
, pmd_migration
= false, uffd_wp
= false;
2870 bool anon_exclusive
= false, dirty
= false;
2875 VM_BUG_ON(haddr
& ~HPAGE_PMD_MASK
);
2876 VM_BUG_ON_VMA(vma
->vm_start
> haddr
, vma
);
2877 VM_BUG_ON_VMA(vma
->vm_end
< haddr
+ HPAGE_PMD_SIZE
, vma
);
2878 VM_BUG_ON(!is_pmd_migration_entry(*pmd
) && !pmd_trans_huge(*pmd
)
2879 && !pmd_devmap(*pmd
));
2881 count_vm_event(THP_SPLIT_PMD
);
2883 if (!vma_is_anonymous(vma
)) {
2884 old_pmd
= pmdp_huge_clear_flush(vma
, haddr
, pmd
);
2886 * We are going to unmap this huge page. So
2887 * just go ahead and zap it
2889 if (arch_needs_pgtable_deposit())
2890 zap_deposited_table(mm
, pmd
);
2891 if (!vma_is_dax(vma
) && vma_is_special_huge(vma
))
2893 if (unlikely(is_pmd_migration_entry(old_pmd
))) {
2896 entry
= pmd_to_swp_entry(old_pmd
);
2897 folio
= pfn_swap_entry_folio(entry
);
2898 } else if (is_huge_zero_pmd(old_pmd
)) {
2901 page
= pmd_page(old_pmd
);
2902 folio
= page_folio(page
);
2903 if (!folio_test_dirty(folio
) && pmd_dirty(old_pmd
))
2904 folio_mark_dirty(folio
);
2905 if (!folio_test_referenced(folio
) && pmd_young(old_pmd
))
2906 folio_set_referenced(folio
);
2907 folio_remove_rmap_pmd(folio
, page
, vma
);
2910 add_mm_counter(mm
, mm_counter_file(folio
), -HPAGE_PMD_NR
);
2914 if (is_huge_zero_pmd(*pmd
)) {
2916 * FIXME: Do we want to invalidate secondary mmu by calling
2917 * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below
2918 * inside __split_huge_pmd() ?
2920 * We are going from a zero huge page write protected to zero
2921 * small page also write protected so it does not seems useful
2922 * to invalidate secondary mmu at this time.
2924 return __split_huge_zero_page_pmd(vma
, haddr
, pmd
);
2927 pmd_migration
= is_pmd_migration_entry(*pmd
);
2928 if (unlikely(pmd_migration
)) {
2932 entry
= pmd_to_swp_entry(old_pmd
);
2933 page
= pfn_swap_entry_to_page(entry
);
2934 write
= is_writable_migration_entry(entry
);
2936 anon_exclusive
= is_readable_exclusive_migration_entry(entry
);
2937 young
= is_migration_entry_young(entry
);
2938 dirty
= is_migration_entry_dirty(entry
);
2939 soft_dirty
= pmd_swp_soft_dirty(old_pmd
);
2940 uffd_wp
= pmd_swp_uffd_wp(old_pmd
);
	/*
	 * Up to this point the pmd is present and huge and userland has
	 * the whole access to the hugepage during the split (which
	 * happens in place). If we overwrite the pmd with the not-huge
	 * version pointing to the pte here (which of course we could if
	 * all CPUs were bug free), userland could trigger a small page
	 * size TLB miss on the small sized TLB while the hugepage TLB
	 * entry is still established in the huge TLB. Some CPUs don't
	 * like that. See
	 * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
	 * 383 on page 105. Intel should be safe, but it also warns that
	 * it's only safe if the permission and cache attributes of the
	 * two entries loaded in the two TLBs are identical (which should
	 * be the case here). But it is generally safer to never allow
	 * small and huge TLB entries for the same virtual address to be
	 * loaded simultaneously. So instead of doing "pmd_populate();
	 * flush_pmd_tlb_range();" we first mark the current pmd
	 * notpresent (atomically because here the pmd_trans_huge must
	 * remain set at all times on the pmd until the split is
	 * complete for this pmd), then we flush the SMP TLB and finally
	 * we write the non-huge version of the pmd entry with
	 * pmd_populate.
	 */
	old_pmd = pmdp_invalidate(vma, haddr, pmd);
2966 page
= pmd_page(old_pmd
);
2967 folio
= page_folio(page
);
2968 if (pmd_dirty(old_pmd
)) {
2970 folio_set_dirty(folio
);
2972 write
= pmd_write(old_pmd
);
2973 young
= pmd_young(old_pmd
);
2974 soft_dirty
= pmd_soft_dirty(old_pmd
);
2975 uffd_wp
= pmd_uffd_wp(old_pmd
);
2977 VM_WARN_ON_FOLIO(!folio_ref_count(folio
), folio
);
2978 VM_WARN_ON_FOLIO(!folio_test_anon(folio
), folio
);
2981 * Without "freeze", we'll simply split the PMD, propagating the
2982 * PageAnonExclusive() flag for each PTE by setting it for
2983 * each subpage -- no need to (temporarily) clear.
2985 * With "freeze" we want to replace mapped pages by
2986 * migration entries right away. This is only possible if we
2987 * managed to clear PageAnonExclusive() -- see
2988 * set_pmd_migration_entry().
2990 * In case we cannot clear PageAnonExclusive(), split the PMD
2991 * only and let try_to_migrate_one() fail later.
2993 * See folio_try_share_anon_rmap_pmd(): invalidate PMD first.
2995 anon_exclusive
= PageAnonExclusive(page
);
2996 if (freeze
&& anon_exclusive
&&
2997 folio_try_share_anon_rmap_pmd(folio
, page
))
3000 rmap_t rmap_flags
= RMAP_NONE
;
3002 folio_ref_add(folio
, HPAGE_PMD_NR
- 1);
3004 rmap_flags
|= RMAP_EXCLUSIVE
;
3005 folio_add_anon_rmap_ptes(folio
, page
, HPAGE_PMD_NR
,
3006 vma
, haddr
, rmap_flags
);
3011 * Withdraw the table only after we mark the pmd entry invalid.
3012 * This's critical for some architectures (Power).
3014 pgtable
= pgtable_trans_huge_withdraw(mm
, pmd
);
3015 pmd_populate(mm
, &_pmd
, pgtable
);
3017 pte
= pte_offset_map(&_pmd
, haddr
);
3021 * Note that NUMA hinting access restrictions are not transferred to
3022 * avoid any possibility of altering permissions across VMAs.
3024 if (freeze
|| pmd_migration
) {
3025 for (i
= 0, addr
= haddr
; i
< HPAGE_PMD_NR
; i
++, addr
+= PAGE_SIZE
) {
3027 swp_entry_t swp_entry
;
3030 swp_entry
= make_writable_migration_entry(
3031 page_to_pfn(page
+ i
));
3032 else if (anon_exclusive
)
3033 swp_entry
= make_readable_exclusive_migration_entry(
3034 page_to_pfn(page
+ i
));
3036 swp_entry
= make_readable_migration_entry(
3037 page_to_pfn(page
+ i
));
3039 swp_entry
= make_migration_entry_young(swp_entry
);
3041 swp_entry
= make_migration_entry_dirty(swp_entry
);
3042 entry
= swp_entry_to_pte(swp_entry
);
3044 entry
= pte_swp_mksoft_dirty(entry
);
3046 entry
= pte_swp_mkuffd_wp(entry
);
3048 VM_WARN_ON(!pte_none(ptep_get(pte
+ i
)));
3049 set_pte_at(mm
, addr
, pte
+ i
, entry
);
3054 entry
= mk_pte(page
, READ_ONCE(vma
->vm_page_prot
));
3056 entry
= pte_mkwrite(entry
, vma
);
3058 entry
= pte_mkold(entry
);
3059 /* NOTE: this may set soft-dirty too on some archs */
3061 entry
= pte_mkdirty(entry
);
3063 entry
= pte_mksoft_dirty(entry
);
3065 entry
= pte_mkuffd_wp(entry
);
3067 for (i
= 0; i
< HPAGE_PMD_NR
; i
++)
3068 VM_WARN_ON(!pte_none(ptep_get(pte
+ i
)));
3070 set_ptes(mm
, haddr
, pte
, entry
, HPAGE_PMD_NR
);
3075 folio_remove_rmap_pmd(folio
, page
, vma
);
3079 smp_wmb(); /* make pte visible before pmd */
3080 pmd_populate(mm
, pmd
, pgtable
);
void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
			   pmd_t *pmd, bool freeze)
{
	VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
	if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
	    is_pmd_migration_entry(*pmd))
		__split_huge_pmd_locked(vma, pmd, address, freeze);
}

void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long address, bool freeze)
{
	spinlock_t *ptl;
	struct mmu_notifier_range range;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
				address & HPAGE_PMD_MASK,
				(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);
	ptl = pmd_lock(vma->vm_mm, pmd);
	split_huge_pmd_locked(vma, range.start, pmd, freeze);
	spin_unlock(ptl);
	mmu_notifier_invalidate_range_end(&range);
}

void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
			    bool freeze)
{
	pmd_t *pmd = mm_find_pmd(vma->vm_mm, address);

	if (!pmd)
		return;

	__split_huge_pmd(vma, pmd, address, freeze);
}

static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
{
	/*
	 * If the new address isn't hpage aligned and it could previously
	 * contain an hugepage: check if we need to split an huge pmd.
	 */
	if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
	    range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
			 ALIGN(address, HPAGE_PMD_SIZE)))
		split_huge_pmd_address(vma, address, false);
}

void vma_adjust_trans_huge(struct vm_area_struct *vma,
			   unsigned long start,
			   unsigned long end,
			   struct vm_area_struct *next)
{
	/* Check if we need to split start first. */
	split_huge_pmd_if_needed(vma, start);

	/* Check if we need to split end next. */
	split_huge_pmd_if_needed(vma, end);

	/* If we're incrementing next->vm_start, we might need to split it. */
	if (next)
		split_huge_pmd_if_needed(next, end);
}
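
/*
 * Illustrative userspace sketch (not part of this file): unmapping a range
 * whose edges are not PMD-aligned creates new VMA boundaries inside huge
 * PMDs, so vma_adjust_trans_huge()/split_huge_pmd_address() split them
 * before the hole is punched. Sizes are assumptions of the sketch.
 */
#if 0
#include <string.h>
#include <sys/mman.h>

#define PMD_BYTES	(2UL << 20)	/* assumed PMD_SIZE */

int main(void)
{
	char *buf = mmap(NULL, 2 * PMD_BYTES, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;
	madvise(buf, 2 * PMD_BYTES, MADV_HUGEPAGE);
	memset(buf, 0xab, 2 * PMD_BYTES);

	/* Unaligned hole: the PMDs straddling its edges must be split. */
	munmap(buf + PMD_BYTES - 4096, 2 * 4096);
	return 0;
}
#endif
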
static void unmap_folio(struct folio *folio)
{
	enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SYNC |
		TTU_BATCH_FLUSH;

	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);

	if (folio_test_pmd_mappable(folio))
		ttu_flags |= TTU_SPLIT_HUGE_PMD;

	/*
	 * Anon pages need migration entries to preserve them, but file
	 * pages can simply be left unmapped, then faulted back on demand.
	 * If that is ever changed (perhaps for mlock), update remap_page().
	 */
	if (folio_test_anon(folio))
		try_to_migrate(folio, ttu_flags);
	else
		try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);

	try_to_unmap_flush();
}
3170 static bool __discard_anon_folio_pmd_locked(struct vm_area_struct
*vma
,
3171 unsigned long addr
, pmd_t
*pmdp
,
3172 struct folio
*folio
)
3174 struct mm_struct
*mm
= vma
->vm_mm
;
3175 int ref_count
, map_count
;
3176 pmd_t orig_pmd
= *pmdp
;
3178 if (pmd_dirty(orig_pmd
))
3179 folio_set_dirty(folio
);
3180 if (folio_test_dirty(folio
) && !(vma
->vm_flags
& VM_DROPPABLE
)) {
3181 folio_set_swapbacked(folio
);
3185 orig_pmd
= pmdp_huge_clear_flush(vma
, addr
, pmdp
);
3188 * Syncing against concurrent GUP-fast:
3189 * - clear PMD; barrier; read refcount
3190 * - inc refcount; barrier; read PMD
3194 ref_count
= folio_ref_count(folio
);
3195 map_count
= folio_mapcount(folio
);
3198 * Order reads for folio refcount and dirty flag
3199 * (see comments in __remove_mapping()).
3204 * If the folio or its PMD is redirtied at this point, or if there
3205 * are unexpected references, we will give up to discard this folio
3208 * The only folio refs must be one from isolation plus the rmap(s).
3210 if (pmd_dirty(orig_pmd
))
3211 folio_set_dirty(folio
);
3212 if (folio_test_dirty(folio
) && !(vma
->vm_flags
& VM_DROPPABLE
)) {
3213 folio_set_swapbacked(folio
);
3214 set_pmd_at(mm
, addr
, pmdp
, orig_pmd
);
3218 if (ref_count
!= map_count
+ 1) {
3219 set_pmd_at(mm
, addr
, pmdp
, orig_pmd
);
3223 folio_remove_rmap_pmd(folio
, pmd_page(orig_pmd
), vma
);
3224 zap_deposited_table(mm
, pmdp
);
3225 add_mm_counter(mm
, MM_ANONPAGES
, -HPAGE_PMD_NR
);
3226 if (vma
->vm_flags
& VM_LOCKED
)
3227 mlock_drain_local();
3233 bool unmap_huge_pmd_locked(struct vm_area_struct
*vma
, unsigned long addr
,
3234 pmd_t
*pmdp
, struct folio
*folio
)
3236 VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio
), folio
);
3237 VM_WARN_ON_FOLIO(!folio_test_locked(folio
), folio
);
3238 VM_WARN_ON_FOLIO(!folio_test_anon(folio
), folio
);
3239 VM_WARN_ON_FOLIO(folio_test_swapbacked(folio
), folio
);
3240 VM_WARN_ON_ONCE(!IS_ALIGNED(addr
, HPAGE_PMD_SIZE
));
3242 return __discard_anon_folio_pmd_locked(vma
, addr
, pmdp
, folio
);
3245 static void remap_page(struct folio
*folio
, unsigned long nr
, int flags
)
3249 /* If unmap_folio() uses try_to_migrate() on file, remove this check */
3250 if (!folio_test_anon(folio
))
3253 remove_migration_ptes(folio
, folio
, RMP_LOCKED
| flags
);
3254 i
+= folio_nr_pages(folio
);
3257 folio
= folio_next(folio
);
3261 static void lru_add_split_folio(struct folio
*folio
, struct folio
*new_folio
,
3262 struct lruvec
*lruvec
, struct list_head
*list
)
3264 VM_BUG_ON_FOLIO(folio_test_lru(new_folio
), folio
);
3265 lockdep_assert_held(&lruvec
->lru_lock
);
3268 /* page reclaim is reclaiming a huge page */
3269 VM_WARN_ON(folio_test_lru(folio
));
3270 folio_get(new_folio
);
3271 list_add_tail(&new_folio
->lru
, list
);
3273 /* head is still on lru (and we have it frozen) */
3274 VM_WARN_ON(!folio_test_lru(folio
));
3275 if (folio_test_unevictable(folio
))
3276 new_folio
->mlock_count
= 0;
3278 list_add_tail(&new_folio
->lru
, &folio
->lru
);
3279 folio_set_lru(new_folio
);
3283 /* Racy check whether the huge page can be split */
3284 bool can_split_folio(struct folio
*folio
, int caller_pins
, int *pextra_pins
)
3288 /* Additional pins from page cache */
3289 if (folio_test_anon(folio
))
3290 extra_pins
= folio_test_swapcache(folio
) ?
3291 folio_nr_pages(folio
) : 0;
3293 extra_pins
= folio_nr_pages(folio
);
3295 *pextra_pins
= extra_pins
;
3296 return folio_mapcount(folio
) == folio_ref_count(folio
) - extra_pins
-
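
/*
 * A sketch of the reference-count arithmetic behind can_split_folio(),
 * spelled out with a hypothetical helper: every mapping contributes one
 * reference, the page/swap cache contributes extra_pins, and the caller
 * holds caller_pins; any additional reference is an unexpected pin (e.g.
 * GUP) and the split must be refused.
 */
#if 0
#include <stdbool.h>

static bool example_refs_allow_split(int refcount, int mapcount,
				     int extra_pins, int caller_pins)
{
	return refcount == mapcount + extra_pins + caller_pins;
}
#endif
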
3301 * It splits @folio into @new_order folios and copies the @folio metadata to
3302 * all the resulting folios.
3304 static void __split_folio_to_order(struct folio
*folio
, int old_order
,
3307 long new_nr_pages
= 1 << new_order
;
3308 long nr_pages
= 1 << old_order
;
3312 * Skip the first new_nr_pages, since the new folio from them have all
3313 * the flags from the original folio.
3315 for (i
= new_nr_pages
; i
< nr_pages
; i
+= new_nr_pages
) {
3316 struct page
*new_head
= &folio
->page
+ i
;
3319 * Careful: new_folio is not a "real" folio before we cleared PageTail.
3320 * Don't pass it around before clear_compound_head().
3322 struct folio
*new_folio
= (struct folio
*)new_head
;
3324 VM_BUG_ON_PAGE(atomic_read(&new_folio
->_mapcount
) != -1, new_head
);
3327 * Clone page flags before unfreezing refcount.
3329 * After successful get_page_unless_zero() might follow flags change,
3330 * for example lock_page() which set PG_waiters.
3332 * Note that for mapped sub-pages of an anonymous THP,
3333 * PG_anon_exclusive has been cleared in unmap_folio() and is stored in
3334 * the migration entry instead from where remap_page() will restore it.
3335 * We can still have PG_anon_exclusive set on effectively unmapped and
3336 * unreferenced sub-pages of an anonymous THP: we can simply drop
3337 * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
3339 new_folio
->flags
&= ~PAGE_FLAGS_CHECK_AT_PREP
;
3340 new_folio
->flags
|= (folio
->flags
&
3341 ((1L << PG_referenced
) |
3342 (1L << PG_swapbacked
) |
3343 (1L << PG_swapcache
) |
3344 (1L << PG_mlocked
) |
3345 (1L << PG_uptodate
) |
3347 (1L << PG_workingset
) |
3349 (1L << PG_unevictable
) |
3350 #ifdef CONFIG_ARCH_USES_PG_ARCH_2
3353 #ifdef CONFIG_ARCH_USES_PG_ARCH_3
3357 LRU_GEN_MASK
| LRU_REFS_MASK
));
3359 new_folio
->mapping
= folio
->mapping
;
3360 new_folio
->index
= folio
->index
+ i
;
3363 * page->private should not be set in tail pages. Fix up and warn once
3364 * if private is unexpectedly set.
3366 if (unlikely(new_folio
->private)) {
3367 VM_WARN_ON_ONCE_PAGE(true, new_head
);
3368 new_folio
->private = NULL
;
3371 if (folio_test_swapcache(folio
))
3372 new_folio
->swap
.val
= folio
->swap
.val
+ i
;
3374 /* Page flags must be visible before we make the page non-compound. */
3378 * Clear PageTail before unfreezing page refcount.
3380 * After successful get_page_unless_zero() might follow put_page()
3381 * which needs correct compound_head().
3383 clear_compound_head(new_head
);
3385 prep_compound_page(new_head
, new_order
);
3386 folio_set_large_rmappable(new_folio
);
3389 if (folio_test_young(folio
))
3390 folio_set_young(new_folio
);
3391 if (folio_test_idle(folio
))
3392 folio_set_idle(new_folio
);
3394 new_folio
->memcg_data
= folio
->memcg_data
;
3397 folio_xchg_last_cpupid(new_folio
, folio_last_cpupid(folio
));
3401 folio_set_order(folio
, new_order
);
3403 ClearPageCompound(&folio
->page
);
3407 * It splits an unmapped @folio to lower order smaller folios in two ways.
3408 * @folio: the to-be-split folio
3409 * @new_order: the smallest order of the after split folios (since buddy
3410 * allocator like split generates folios with orders from @folio's
3411 * order - 1 to new_order).
3412 * @split_at: in buddy allocator like split, the folio containing @split_at
3413 * will be split until its order becomes @new_order.
3414 * @lock_at: the folio containing @lock_at is left locked for caller.
3415 * @list: the after split folios will be added to @list if it is not NULL,
3416 * otherwise to LRU lists.
3417 * @end: the end of the file @folio maps to. -1 if @folio is anonymous memory.
3418 * @xas: xa_state pointing to folio->mapping->i_pages and locked by caller
3419 * @mapping: @folio->mapping
3420 * @uniform_split: if the split is uniform or not (buddy allocator like split)
3423 * 1. uniform split: the given @folio into multiple @new_order small folios,
3424 * where all small folios have the same order. This is done when
3425 * uniform_split is true.
3426 * 2. buddy allocator like (non-uniform) split: the given @folio is split into
3427 * half and one of the half (containing the given page) is split into half
3428 * until the given @page's order becomes @new_order. This is done when
3429 * uniform_split is false.
3431 * The high level flow for these two methods are:
3432 * 1. uniform split: a single __split_folio_to_order() is called to split the
3433 * @folio into @new_order, then we traverse all the resulting folios one by
3434 * one in PFN ascending order and perform stats, unfreeze, adding to list,
3435 * and file mapping index operations.
3436 * 2. non-uniform split: in general, folio_order - @new_order calls to
3437 * __split_folio_to_order() are made in a for loop to split the @folio
3438 * to one lower order at a time. The resulting small folios are processed
3439 * like what is done during the traversal in 1, except the one containing
3440 * @page, which is split in next for loop.
3442 * After splitting, the caller's folio reference will be transferred to the
3443 * folio containing @page. The other folios may be freed if they are not mapped.
3445 * In terms of locking, after splitting,
3446 * 1. uniform split leaves @page (or the folio contains it) locked;
3447 * 2. buddy allocator like (non-uniform) split leaves @folio locked.
3450 * For !uniform_split, when -ENOMEM is returned, the original folio might be
3451 * split. The caller needs to check the input folio.
3453 static int __split_unmapped_folio(struct folio
*folio
, int new_order
,
3454 struct page
*split_at
, struct page
*lock_at
,
3455 struct list_head
*list
, pgoff_t end
,
3456 struct xa_state
*xas
, struct address_space
*mapping
,
3459 struct lruvec
*lruvec
;
3460 struct address_space
*swap_cache
= NULL
;
3461 struct folio
*origin_folio
= folio
;
3462 struct folio
*next_folio
= folio_next(folio
);
3463 struct folio
*new_folio
;
3465 int order
= folio_order(folio
);
3467 int start_order
= uniform_split
? new_order
: order
- 1;
3470 bool stop_split
= false;
3472 if (folio_test_swapcache(folio
)) {
3475 /* a swapcache folio can only be uniformly split to order-0 */
3476 if (!uniform_split
|| new_order
!= 0)
3479 swap_cache
= swap_address_space(folio
->swap
);
3480 xa_lock(&swap_cache
->i_pages
);
3483 if (folio_test_anon(folio
))
3484 mod_mthp_stat(order
, MTHP_STAT_NR_ANON
, -1);
3486 /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
3487 lruvec
= folio_lruvec_lock(folio
);
3489 folio_clear_has_hwpoisoned(folio
);
3492 * split to new_order one order at a time. For uniform split,
3493 * folio is split to new_order directly.
3495 for (split_order
= start_order
;
3496 split_order
>= new_order
&& !stop_split
;
3498 int old_order
= folio_order(folio
);
3499 struct folio
*release
;
3500 struct folio
*end_folio
= folio_next(folio
);
3502 /* order-1 anonymous folio is not supported */
3503 if (folio_test_anon(folio
) && split_order
== 1)
3505 if (uniform_split
&& split_order
!= new_order
)
3510 * uniform split has xas_split_alloc() called before
3511 * irq is disabled to allocate enough memory, whereas
3512 * non-uniform split can handle ENOMEM.
3515 xas_split(xas
, folio
, old_order
);
3517 xas_set_order(xas
, folio
->index
, split_order
);
3518 xas_try_split(xas
, folio
, old_order
);
3519 if (xas_error(xas
)) {
3520 ret
= xas_error(xas
);
3527 folio_split_memcg_refs(folio
, old_order
, split_order
);
3528 split_page_owner(&folio
->page
, old_order
, split_order
);
3529 pgalloc_tag_split(folio
, old_order
, split_order
);
3531 __split_folio_to_order(folio
, old_order
, split_order
);
3535 * Iterate through after-split folios and perform related
3536 * operations. But in buddy allocator like split, the folio
3537 * containing the specified page is skipped until its order
3538 * is new_order, since the folio will be worked on in next
3541 for (release
= folio
; release
!= end_folio
; release
= next
) {
3542 next
= folio_next(release
);
3544 * for buddy allocator like split, the folio containing
3545 * page will be split next and should not be released,
3546 * until the folio's order is new_order or stop_split
3547 * is set to true by the above xas_split() failure.
3549 if (release
== page_folio(split_at
)) {
3551 if (split_order
!= new_order
&& !stop_split
)
3554 if (folio_test_anon(release
)) {
3555 mod_mthp_stat(folio_order(release
),
3556 MTHP_STAT_NR_ANON
, 1);
3560 * origin_folio should be kept frozon until page cache
3561 * entries are updated with all the other after-split
3562 * folios to prevent others seeing stale page cache
3565 if (release
== origin_folio
)
3568 folio_ref_unfreeze(release
, 1 +
3569 ((mapping
|| swap_cache
) ?
3570 folio_nr_pages(release
) : 0));
3572 lru_add_split_folio(origin_folio
, release
, lruvec
,
3575 /* Some pages can be beyond EOF: drop them from cache */
3576 if (release
->index
>= end
) {
3577 if (shmem_mapping(mapping
))
3578 nr_dropped
+= folio_nr_pages(release
);
3579 else if (folio_test_clear_dirty(release
))
3580 folio_account_cleaned(release
,
3581 inode_to_wb(mapping
->host
));
3582 __filemap_remove_folio(release
, NULL
);
3583 folio_put_refs(release
, folio_nr_pages(release
));
3584 } else if (mapping
) {
3585 __xa_store(&mapping
->i_pages
,
3586 release
->index
, release
, 0);
3587 } else if (swap_cache
) {
3588 __xa_store(&swap_cache
->i_pages
,
3589 swap_cache_index(release
->swap
),
3596 * Unfreeze origin_folio only after all page cache entries, which used
3597 * to point to it, have been updated with new folios. Otherwise,
3598 * a parallel folio_try_get() can grab origin_folio and its caller can
3599 * see stale page cache entries.
3601 folio_ref_unfreeze(origin_folio
, 1 +
3602 ((mapping
|| swap_cache
) ? folio_nr_pages(origin_folio
) : 0));
3604 unlock_page_lruvec(lruvec
);
3607 xa_unlock(&swap_cache
->i_pages
);
3609 xa_unlock(&mapping
->i_pages
);
3611 /* Caller disabled irqs, so they are still disabled here */
3615 shmem_uncharge(mapping
->host
, nr_dropped
);
3617 remap_page(origin_folio
, 1 << order
,
3618 folio_test_anon(origin_folio
) ?
3619 RMP_USE_SHARED_ZEROPAGE
: 0);
3622 * At this point, folio should contain the specified page.
3623 * For uniform split, it is left for caller to unlock.
3624 * For buddy allocator like split, the first after-split folio is left
3625 * for caller to unlock.
3627 for (new_folio
= origin_folio
; new_folio
!= next_folio
; new_folio
= next
) {
3628 next
= folio_next(new_folio
);
3629 if (new_folio
== page_folio(lock_at
))
3632 folio_unlock(new_folio
);
3634 * Subpages may be freed if there wasn't any mapping
3635 * like if add_to_swap() is running on a lru page that
3636 * had its mapping zapped. And freeing these pages
3637 * requires taking the lru_lock so we do the put_page
3638 * of the tail pages after the split is complete.
3640 free_folio_and_swap_cache(new_folio
);
3645 bool non_uniform_split_supported(struct folio
*folio
, unsigned int new_order
,
3648 if (folio_test_anon(folio
)) {
3649 /* order-1 is not supported for anonymous THP. */
3650 VM_WARN_ONCE(warns
&& new_order
== 1,
3651 "Cannot split to order-1 folio");
3652 return new_order
!= 1;
3653 } else if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS
) &&
3654 !mapping_large_folio_support(folio
->mapping
)) {
3656 * No split if the file system does not support large folio.
3657 * Note that we might still have THPs in such mappings due to
3658 * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping
3659 * does not actually support large folios properly.
3662 "Cannot split file folio to non-0 order");
3666 /* Only swapping a whole PMD-mapped folio is supported */
3667 if (folio_test_swapcache(folio
)) {
3669 "Cannot split swapcache folio to non-0 order");
3676 /* See comments in non_uniform_split_supported() */
3677 bool uniform_split_supported(struct folio
*folio
, unsigned int new_order
,
3680 if (folio_test_anon(folio
)) {
3681 VM_WARN_ONCE(warns
&& new_order
== 1,
3682 "Cannot split to order-1 folio");
3683 return new_order
!= 1;
3684 } else if (new_order
) {
3685 if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS
) &&
3686 !mapping_large_folio_support(folio
->mapping
)) {
3688 "Cannot split file folio to non-0 order");
3693 if (new_order
&& folio_test_swapcache(folio
)) {
3695 "Cannot split swapcache folio to non-0 order");
3703 * __folio_split: split a folio at @split_at to a @new_order folio
3704 * @folio: folio to split
3705 * @new_order: the order of the new folio
3706 * @split_at: a page within the new folio
3707 * @lock_at: a page within @folio to be left locked to caller
3708 * @list: after-split folios will be put on it if non NULL
3709 * @uniform_split: perform uniform split or not (non-uniform split)
3711 * It calls __split_unmapped_folio() to perform uniform and non-uniform split.
3712 * It is in charge of checking whether the split is supported or not and
3713 * preparing @folio for __split_unmapped_folio().
3715 * return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be
3716 * split but not to @new_order, the caller needs to check)
3718 static int __folio_split(struct folio
*folio
, unsigned int new_order
,
3719 struct page
*split_at
, struct page
*lock_at
,
3720 struct list_head
*list
, bool uniform_split
)
3722 struct deferred_split
*ds_queue
= get_deferred_split_queue(folio
);
3723 XA_STATE(xas
, &folio
->mapping
->i_pages
, folio
->index
);
3724 bool is_anon
= folio_test_anon(folio
);
3725 struct address_space
*mapping
= NULL
;
3726 struct anon_vma
*anon_vma
= NULL
;
3727 int order
= folio_order(folio
);
3728 int extra_pins
, ret
;
3732 VM_BUG_ON_FOLIO(!folio_test_locked(folio
), folio
);
3733 VM_BUG_ON_FOLIO(!folio_test_large(folio
), folio
);
3735 if (folio
!= page_folio(split_at
) || folio
!= page_folio(lock_at
))
3738 if (new_order
>= folio_order(folio
))
3741 if (uniform_split
&& !uniform_split_supported(folio
, new_order
, true))
3744 if (!uniform_split
&&
3745 !non_uniform_split_supported(folio
, new_order
, true))
3748 is_hzp
= is_huge_zero_folio(folio
);
3750 pr_warn_ratelimited("Called split_huge_page for huge zero page\n");
3754 if (folio_test_writeback(folio
))
3759 * The caller does not necessarily hold an mmap_lock that would
3760 * prevent the anon_vma disappearing so we first we take a
3761 * reference to it and then lock the anon_vma for write. This
3762 * is similar to folio_lock_anon_vma_read except the write lock
3763 * is taken to serialise against parallel split or collapse
3766 anon_vma
= folio_get_anon_vma(folio
);
3773 anon_vma_lock_write(anon_vma
);
3775 unsigned int min_order
;
3778 mapping
= folio
->mapping
;
3782 * TODO: add support for large shmem folio in swap cache.
3783 * When shmem is in swap cache, mapping is NULL and
3784 * folio_test_swapcache() is true.
3791 min_order
= mapping_min_folio_order(folio
->mapping
);
3792 if (new_order
< min_order
) {
3793 VM_WARN_ONCE(1, "Cannot split mapped folio below min-order: %u",
3799 gfp
= current_gfp_context(mapping_gfp_mask(mapping
) &
3802 if (!filemap_release_folio(folio
, gfp
)) {
3807 if (uniform_split
) {
3808 xas_set_order(&xas
, folio
->index
, new_order
);
3809 xas_split_alloc(&xas
, folio
, folio_order(folio
), gfp
);
3810 if (xas_error(&xas
)) {
3811 ret
= xas_error(&xas
);
3817 i_mmap_lock_read(mapping
);
3820 *__split_unmapped_folio() may need to trim off pages beyond
3821 * EOF: but on 32-bit, i_size_read() takes an irq-unsafe
3822 * seqlock, which cannot be nested inside the page tree lock.
3823 * So note end now: i_size itself may be changed at any moment,
3824 * but folio lock is good enough to serialize the trimming.
3826 end
= DIV_ROUND_UP(i_size_read(mapping
->host
), PAGE_SIZE
);
3827 if (shmem_mapping(mapping
))
3828 end
= shmem_fallocend(mapping
->host
, end
);
3832 * Racy check if we can split the page, before unmap_folio() will
3835 if (!can_split_folio(folio
, 1, &extra_pins
)) {
3842 /* block interrupt reentry in xa_lock and spinlock */
3843 local_irq_disable();
3846 * Check if the folio is present in page cache.
3847 * We assume all tail are present too, if folio is there.
3851 if (xas_load(&xas
) != folio
)
3855 /* Prevent deferred_split_scan() touching ->_refcount */
3856 spin_lock(&ds_queue
->split_queue_lock
);
3857 if (folio_ref_freeze(folio
, 1 + extra_pins
)) {
3858 if (folio_order(folio
) > 1 &&
3859 !list_empty(&folio
->_deferred_list
)) {
3860 ds_queue
->split_queue_len
--;
3861 if (folio_test_partially_mapped(folio
)) {
3862 folio_clear_partially_mapped(folio
);
3863 mod_mthp_stat(folio_order(folio
),
3864 MTHP_STAT_NR_ANON_PARTIALLY_MAPPED
, -1);
3867 * Reinitialize page_deferred_list after removing the
3868 * page from the split_queue, otherwise a subsequent
3869 * split will see list corruption when checking the
3870 * page_deferred_list.
3872 list_del_init(&folio
->_deferred_list
);
3874 spin_unlock(&ds_queue
->split_queue_lock
);
3876 int nr
= folio_nr_pages(folio
);
3878 if (folio_test_pmd_mappable(folio
) &&
3879 new_order
< HPAGE_PMD_ORDER
) {
3880 if (folio_test_swapbacked(folio
)) {
3881 __lruvec_stat_mod_folio(folio
,
3882 NR_SHMEM_THPS
, -nr
);
3884 __lruvec_stat_mod_folio(folio
,
3886 filemap_nr_thps_dec(mapping
);
3891 ret
= __split_unmapped_folio(folio
, new_order
,
3892 split_at
, lock_at
, list
, end
, &xas
, mapping
,
3895 spin_unlock(&ds_queue
->split_queue_lock
);
3900 remap_page(folio
, folio_nr_pages(folio
), 0);
3906 anon_vma_unlock_write(anon_vma
);
3907 put_anon_vma(anon_vma
);
3910 i_mmap_unlock_read(mapping
);
3913 if (order
== HPAGE_PMD_ORDER
)
3914 count_vm_event(!ret
? THP_SPLIT_PAGE
: THP_SPLIT_PAGE_FAILED
);
3915 count_mthp_stat(order
, !ret
? MTHP_STAT_SPLIT
: MTHP_STAT_SPLIT_FAILED
);
3920 * This function splits a large folio into smaller folios of order @new_order.
3921 * @page can point to any page of the large folio to split. The split operation
3922 * does not change the position of @page.
3926 * 1) The caller must hold a reference on the @page's owning folio, also known
3927 * as the large folio.
3929 * 2) The large folio must be locked.
3931 * 3) The folio must not be pinned. Any unexpected folio references, including
3932 * GUP pins, will result in the folio not getting split; instead, the caller
3933 * will receive an -EAGAIN.
3935 * 4) @new_order > 1, usually. Splitting to order-1 anonymous folios is not
3936 * supported for non-file-backed folios, because folio->_deferred_list, which
3937 * is used by partially mapped folios, is stored in subpage 2, but an order-1
3938 * folio only has subpages 0 and 1. File-backed order-1 folios are supported,
3939 * since they do not use _deferred_list.
3941 * After splitting, the caller's folio reference will be transferred to @page,
3942 * resulting in a raised refcount of @page after this call. The other pages may
3943 * be freed if they are not mapped.
3945 * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
3947 * Pages in @new_order will inherit the mapping, flags, and so on from the
3950 * Returns 0 if the huge page was split successfully.
3952 * Returns -EAGAIN if the folio has unexpected reference (e.g., GUP) or if
3953 * the folio was concurrently removed from the page cache.
3955 * Returns -EBUSY when trying to split the huge zeropage, if the folio is
3956 * under writeback, if fs-specific folio metadata cannot currently be
3957 * released, or if some unexpected race happened (e.g., anon VMA disappeared,
3960 * Callers should ensure that the order respects the address space mapping
3961 * min-order if one is set for non-anonymous folios.
3963 * Returns -EINVAL when trying to split to an order that is incompatible
3964 * with the folio. Splitting to order 0 is compatible with all folios.
3966 int split_huge_page_to_list_to_order(struct page
*page
, struct list_head
*list
,
3967 unsigned int new_order
)
3969 struct folio
*folio
= page_folio(page
);
3971 return __folio_split(folio
, new_order
, &folio
->page
, page
, list
, true);
3975 * folio_split: split a folio at @split_at to a @new_order folio
3976 * @folio: folio to split
3977 * @new_order: the order of the new folio
3978 * @split_at: a page within the new folio
3980 * return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be
3981 * split but not to @new_order, the caller needs to check)
3983 * It has the same prerequisites and returns as
3984 * split_huge_page_to_list_to_order().
3986 * Split a folio at @split_at to a new_order folio, leave the
3987 * remaining subpages of the original folio as large as possible. For example,
3988 * in the case of splitting an order-9 folio at its third order-3 subpages to
3989 * an order-3 folio, there are 2^(9-3)=64 order-3 subpages in the order-9 folio.
3990 * After the split, there will be a group of folios with different orders and
3991 * the new folio containing @split_at is marked in bracket:
3992 * [order-4, {order-3}, order-3, order-5, order-6, order-7, order-8].
3994 * After split, folio is left locked for caller.
3996 int folio_split(struct folio
*folio
, unsigned int new_order
,
3997 struct page
*split_at
, struct list_head
*list
)
3999 return __folio_split(folio
, new_order
, split_at
, &folio
->page
, list
,
4003 int min_order_for_split(struct folio
*folio
)
4005 if (folio_test_anon(folio
))
4008 if (!folio
->mapping
) {
4009 if (folio_test_pmd_mappable(folio
))
4010 count_vm_event(THP_SPLIT_PAGE_FAILED
);
4014 return mapping_min_folio_order(folio
->mapping
);
4017 int split_folio_to_list(struct folio
*folio
, struct list_head
*list
)
4019 int ret
= min_order_for_split(folio
);
4024 return split_huge_page_to_list_to_order(&folio
->page
, list
, ret
);
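
/*
 * Worked example (not part of this file) for the non-uniform split
 * described above folio_split(): given the original order, the target
 * order and the index of the page that must end up in the target folio,
 * print the folios the buddy-allocator-like split produces. For
 * split_orders(9, 3, 16) this prints the same set as the comment's
 * example: offsets 256/128/64/32 at orders 8/7/6/5, offset 0 at order 4,
 * offset 24 at order 3, and the target at offset 16, order 3. The helper
 * name is hypothetical.
 */
#if 0
#include <stdio.h>

static void split_orders(int old_order, int new_order, long target)
{
	long base = 0;	/* first page of the folio still being split */
	int order;

	for (order = old_order - 1; order >= new_order; order--) {
		long half = 1L << order;

		if (target < base + half) {
			/* target in lower half: upper half becomes a folio */
			printf("off %ld: order %d\n", base + half, order);
		} else {
			/* target in upper half: lower half becomes a folio */
			printf("off %ld: order %d\n", base, order);
			base += half;
		}
	}
	printf("off %ld: order %d (target)\n", base, new_order);
}
#endif
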
4028 * __folio_unqueue_deferred_split() is not to be called directly:
4029 * the folio_unqueue_deferred_split() inline wrapper in mm/internal.h
4030 * limits its calls to those folios which may have a _deferred_list for
4031 * queueing THP splits, and that list is (racily observed to be) non-empty.
4033 * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is
4034 * zero: because even when split_queue_lock is held, a non-empty _deferred_list
4035 * might be in use on deferred_split_scan()'s unlocked on-stack list.
4037 * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is
4038 * therefore important to unqueue deferred split before changing folio memcg.
4040 bool __folio_unqueue_deferred_split(struct folio
*folio
)
4042 struct deferred_split
*ds_queue
;
4043 unsigned long flags
;
4044 bool unqueued
= false;
4046 WARN_ON_ONCE(folio_ref_count(folio
));
4047 WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio
));
4049 ds_queue
= get_deferred_split_queue(folio
);
4050 spin_lock_irqsave(&ds_queue
->split_queue_lock
, flags
);
4051 if (!list_empty(&folio
->_deferred_list
)) {
4052 ds_queue
->split_queue_len
--;
4053 if (folio_test_partially_mapped(folio
)) {
4054 folio_clear_partially_mapped(folio
);
4055 mod_mthp_stat(folio_order(folio
),
4056 MTHP_STAT_NR_ANON_PARTIALLY_MAPPED
, -1);
4058 list_del_init(&folio
->_deferred_list
);
4061 spin_unlock_irqrestore(&ds_queue
->split_queue_lock
, flags
);
4063 return unqueued
; /* useful for debug warnings */
4066 /* partially_mapped=false won't clear PG_partially_mapped folio flag */
4067 void deferred_split_folio(struct folio
*folio
, bool partially_mapped
)
4069 struct deferred_split
*ds_queue
= get_deferred_split_queue(folio
);
4071 struct mem_cgroup
*memcg
= folio_memcg(folio
);
4073 unsigned long flags
;
4076 * Order 1 folios have no space for a deferred list, but we also
4077 * won't waste much memory by not adding them to the deferred list.
4079 if (folio_order(folio
) <= 1)
4082 if (!partially_mapped
&& !split_underused_thp
)
4086 * Exclude swapcache: originally to avoid a corrupt deferred split
4087 * queue. Nowadays that is fully prevented by memcg1_swapout();
4088 * but if page reclaim is already handling the same folio, it is
4089 * unnecessary to handle it again in the shrinker, so excluding
4090 * swapcache here may still be a useful optimization.
4092 if (folio_test_swapcache(folio
))
4095 spin_lock_irqsave(&ds_queue
->split_queue_lock
, flags
);
4096 if (partially_mapped
) {
4097 if (!folio_test_partially_mapped(folio
)) {
4098 folio_set_partially_mapped(folio
);
4099 if (folio_test_pmd_mappable(folio
))
4100 count_vm_event(THP_DEFERRED_SPLIT_PAGE
);
4101 count_mthp_stat(folio_order(folio
), MTHP_STAT_SPLIT_DEFERRED
);
4102 mod_mthp_stat(folio_order(folio
), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED
, 1);
4106 /* partially mapped folios cannot become non-partially mapped */
4107 VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio
), folio
);
4109 if (list_empty(&folio
->_deferred_list
)) {
4110 list_add_tail(&folio
->_deferred_list
, &ds_queue
->split_queue
);
4111 ds_queue
->split_queue_len
++;
4114 set_shrinker_bit(memcg
, folio_nid(folio
),
4115 deferred_split_shrinker
->id
);
4118 spin_unlock_irqrestore(&ds_queue
->split_queue_lock
, flags
);
4121 static unsigned long deferred_split_count(struct shrinker
*shrink
,
4122 struct shrink_control
*sc
)
4124 struct pglist_data
*pgdata
= NODE_DATA(sc
->nid
);
4125 struct deferred_split
*ds_queue
= &pgdata
->deferred_split_queue
;
4129 ds_queue
= &sc
->memcg
->deferred_split_queue
;
4131 return READ_ONCE(ds_queue
->split_queue_len
);
4134 static bool thp_underused(struct folio
*folio
)
4136 int num_zero_pages
= 0, num_filled_pages
= 0;
4140 if (khugepaged_max_ptes_none
== HPAGE_PMD_NR
- 1)
4143 for (i
= 0; i
< folio_nr_pages(folio
); i
++) {
4144 kaddr
= kmap_local_folio(folio
, i
* PAGE_SIZE
);
4145 if (!memchr_inv(kaddr
, 0, PAGE_SIZE
)) {
4147 if (num_zero_pages
> khugepaged_max_ptes_none
) {
4148 kunmap_local(kaddr
);
4153 * Another path for early exit once the number
4154 * of non-zero filled pages exceeds threshold.
4157 if (num_filled_pages
>= HPAGE_PMD_NR
- khugepaged_max_ptes_none
) {
4158 kunmap_local(kaddr
);
4162 kunmap_local(kaddr
);
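
/*
 * Illustrative userspace analogue (not part of this file) of the
 * memchr_inv() scan in thp_underused(): count how many 4 KiB pages of a
 * buffer are entirely zero. The page size and helper name are assumptions
 * of this sketch.
 */
#if 0
#include <stddef.h>
#include <string.h>

static size_t count_zero_pages(const char *buf, size_t nr_pages)
{
	static const char zero[4096];
	size_t i, zeroes = 0;

	for (i = 0; i < nr_pages; i++)
		if (!memcmp(buf + i * 4096, zero, sizeof(zero)))
			zeroes++;
	return zeroes;
}
#endif
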
static unsigned long deferred_split_scan(struct shrinker *shrink,
		struct shrink_control *sc)
{
	struct pglist_data *pgdata = NODE_DATA(sc->nid);
	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
	unsigned long flags;
	LIST_HEAD(list);
	struct folio *folio, *next, *prev = NULL;
	int split = 0, removed = 0;

#ifdef CONFIG_MEMCG
	if (sc->memcg)
		ds_queue = &sc->memcg->deferred_split_queue;
#endif

	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	/* Take pin on all head pages to avoid freeing them under us */
	list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
				 _deferred_list) {
		if (folio_try_get(folio)) {
			list_move(&folio->_deferred_list, &list);
		} else {
			/* We lost race with folio_put() */
			if (folio_test_partially_mapped(folio)) {
				folio_clear_partially_mapped(folio);
				mod_mthp_stat(folio_order(folio),
					      MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
			}
			list_del_init(&folio->_deferred_list);
			ds_queue->split_queue_len--;
		}
		if (!--sc->nr_to_scan)
			break;
	}
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);

	list_for_each_entry_safe(folio, next, &list, _deferred_list) {
		bool did_split = false;
		bool underused = false;

		if (!folio_test_partially_mapped(folio)) {
			underused = thp_underused(folio);
			if (!underused)
				goto next;
		}
		if (!folio_trylock(folio))
			goto next;
		if (!split_folio(folio)) {
			did_split = true;
			if (underused)
				count_vm_event(THP_UNDERUSED_SPLIT_PAGE);
			split++;
		}
		folio_unlock(folio);
next:
		/*
		 * split_folio() removes folio from list on success.
		 * Only add back to the queue if folio is partially mapped.
		 * If thp_underused returns false, or if split_folio fails
		 * in the case it was underused, then consider it used and
		 * don't add it back to split_queue.
		 */
		if (did_split) {
			; /* folio already removed from list */
		} else if (!folio_test_partially_mapped(folio)) {
			list_del_init(&folio->_deferred_list);
			removed++;
		} else {
			/*
			 * That unlocked list_del_init() above would be unsafe,
			 * unless its folio is separated from any earlier folios
			 * left on the list (which may be concurrently unqueued)
			 * by one safe folio with refcount still raised.
			 */
			swap(folio, prev);
		}
		if (folio)
			folio_put(folio);
	}

	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	list_splice_tail(&list, &ds_queue->split_queue);
	ds_queue->split_queue_len -= removed;
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);

	if (prev)
		folio_put(prev);

	/*
	 * Stop shrinker if we didn't split any page, but the queue is empty.
	 * This can happen if pages were freed under us.
	 */
	if (!split && list_empty(&ds_queue->split_queue))
		return SHRINK_STOP;
	return split;
}
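/*
 * The scan above works in two phases: first, up to sc->nr_to_scan folios are
 * moved from split_queue onto a private list with an extra reference held,
 * under split_queue_lock; then each one is split (or skipped) without the
 * lock.  Whatever survives and is still partially mapped is spliced back onto
 * split_queue at the end, and split_queue_len is corrected by the number of
 * folios removed for good.
 */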
#ifdef CONFIG_DEBUG_FS
static void split_huge_pages_all(void)
{
	struct zone *zone;
	struct page *page;
	struct folio *folio;
	unsigned long pfn, max_zone_pfn;
	unsigned long total = 0, split = 0;

	pr_debug("Split all THPs\n");
	for_each_zone(zone) {
		if (!managed_zone(zone))
			continue;
		max_zone_pfn = zone_end_pfn(zone);
		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
			int nr_pages;

			page = pfn_to_online_page(pfn);
			if (!page || PageTail(page))
				continue;
			folio = page_folio(page);
			if (!folio_try_get(folio))
				continue;

			if (unlikely(page_folio(page) != folio))
				goto next;

			if (zone != folio_zone(folio))
				goto next;

			if (!folio_test_large(folio)
				|| folio_test_hugetlb(folio)
				|| !folio_test_lru(folio))
				goto next;

			total++;
			folio_lock(folio);
			nr_pages = folio_nr_pages(folio);
			if (!split_folio(folio))
				split++;
			pfn += nr_pages - 1;
			folio_unlock(folio);
next:
			folio_put(folio);
			cond_resched();
		}
	}

	pr_debug("%lu of %lu THP split\n", split, total);
}
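/*
 * split_huge_pages_all() is reached from split_huge_pages_write() below when
 * the single value "1" is written to the debugfs file: it walks every online
 * pfn of every managed zone and tries to split each large LRU folio it finds.
 */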
static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
{
	return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) ||
	       is_vm_hugetlb_page(vma);
}
static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
				unsigned long vaddr_end, unsigned int new_order,
				long in_folio_offset)
{
	int ret = 0;
	struct task_struct *task;
	struct mm_struct *mm;
	unsigned long total = 0, split = 0;
	unsigned long addr;

	vaddr_start &= PAGE_MASK;
	vaddr_end &= PAGE_MASK;

	task = find_get_task_by_vpid(pid);
	if (!task) {
		ret = -ESRCH;
		goto out;
	}

	/* Find the mm_struct */
	mm = get_task_mm(task);
	put_task_struct(task);

	if (!mm) {
		ret = -EINVAL;
		goto out;
	}

	pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
		 pid, vaddr_start, vaddr_end);

	mmap_read_lock(mm);
	/*
	 * always increase addr by PAGE_SIZE, since we could have a PTE page
	 * table filled with PTE-mapped THPs, each of which is distinct.
	 */
	for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
		struct vm_area_struct *vma = vma_lookup(mm, addr);
		struct folio_walk fw;
		struct folio *folio;
		struct address_space *mapping;
		unsigned int target_order = new_order;

		if (!vma)
			break;

		/* skip special VMA and hugetlb VMA */
		if (vma_not_suitable_for_thp_split(vma)) {
			addr = vma->vm_end;
			continue;
		}

		folio = folio_walk_start(&fw, vma, addr, 0);
		if (!folio)
			continue;

		if (!is_transparent_hugepage(folio))
			goto next;

		if (!folio_test_anon(folio)) {
			mapping = folio->mapping;
			target_order = max(new_order,
					   mapping_min_folio_order(mapping));
		}

		if (target_order >= folio_order(folio))
			goto next;

		total++;
		/*
		 * For folios with private, split_huge_page_to_list_to_order()
		 * will try to drop it before split and then check if the folio
		 * can be split or not. So skip the check here.
		 */
		if (!folio_test_private(folio) &&
		    !can_split_folio(folio, 0, NULL))
			goto next;

		if (!folio_trylock(folio))
			goto next;
		folio_get(folio);
		folio_walk_end(&fw, vma);

		if (!folio_test_anon(folio) && folio->mapping != mapping)
			goto unlock;

		if (in_folio_offset < 0 ||
		    in_folio_offset >= folio_nr_pages(folio)) {
			if (!split_folio_to_order(folio, target_order))
				split++;
		} else {
			struct page *split_at = folio_page(folio,
							   in_folio_offset);
			if (!folio_split(folio, target_order, split_at, NULL))
				split++;
		}

unlock:
		folio_unlock(folio);
		folio_put(folio);

		cond_resched();
		continue;
next:
		folio_walk_end(&fw, vma);
		cond_resched();
	}
	mmap_read_unlock(mm);
	mmput(mm);

	pr_debug("%lu of %lu THP split\n", split, total);

out:
	return ret;
}
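/*
 * Each iteration above keeps the folio pinned (folio_walk_start() plus
 * folio_get()) before the page-table walk is dropped, so the split never
 * races with the folio being freed.  With a valid in_folio_offset the
 * non-uniform folio_split() is used, so that the subpage at that offset ends
 * up in a folio of target_order; otherwise split_folio_to_order() splits the
 * whole folio uniformly.
 */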
static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
				pgoff_t off_end, unsigned int new_order,
				long in_folio_offset)
{
	struct filename *file;
	struct file *candidate;
	struct address_space *mapping;
	int ret = -EINVAL;
	pgoff_t index;
	int nr_pages = 1;
	unsigned long total = 0, split = 0;
	unsigned int min_order;
	unsigned int target_order;

	file = getname_kernel(file_path);
	if (IS_ERR(file))
		return ret;

	candidate = file_open_name(file, O_RDONLY, 0);
	if (IS_ERR(candidate))
		goto out;

	pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
		 file_path, off_start, off_end);

	mapping = candidate->f_mapping;
	min_order = mapping_min_folio_order(mapping);
	target_order = max(new_order, min_order);

	for (index = off_start; index < off_end; index += nr_pages) {
		struct folio *folio = filemap_get_folio(mapping, index);

		nr_pages = 1;
		if (IS_ERR(folio))
			continue;

		if (!folio_test_large(folio))
			goto next;

		total++;
		nr_pages = folio_nr_pages(folio);

		if (target_order >= folio_order(folio))
			goto next;

		if (!folio_trylock(folio))
			goto next;

		if (folio->mapping != mapping)
			goto unlock;

		if (in_folio_offset < 0 || in_folio_offset >= nr_pages) {
			if (!split_folio_to_order(folio, target_order))
				split++;
		} else {
			struct page *split_at = folio_page(folio,
							   in_folio_offset);
			if (!folio_split(folio, target_order, split_at, NULL))
				split++;
		}

unlock:
		folio_unlock(folio);
next:
		folio_put(folio);
		cond_resched();
	}

	filp_close(candidate, NULL);
	ret = 0;

	pr_debug("%lu of %lu file-backed THP split\n", split, total);
out:
	putname(file);
	return ret;
}
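/*
 * Note that target_order never drops below mapping_min_folio_order(mapping),
 * so a mapping that requires a minimum folio order (for example because the
 * filesystem block size exceeds the page size) is never split smaller than
 * the filesystem can handle.
 */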
#define MAX_INPUT_BUF_SZ 255

static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
				size_t count, loff_t *ppops)
{
	static DEFINE_MUTEX(split_debug_mutex);
	ssize_t ret;
	/*
	 * hold pid, start_vaddr, end_vaddr, new_order or
	 * file_path, off_start, off_end, new_order
	 */
	char input_buf[MAX_INPUT_BUF_SZ];
	int pid;
	unsigned long vaddr_start, vaddr_end;
	unsigned int new_order = 0;
	long in_folio_offset = -1;

	ret = mutex_lock_interruptible(&split_debug_mutex);
	if (ret)
		return ret;

	ret = -EFAULT;

	memset(input_buf, 0, MAX_INPUT_BUF_SZ);
	if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ)))
		goto out;

	input_buf[MAX_INPUT_BUF_SZ - 1] = '\0';

	if (input_buf[0] == '/') {
		char *tok;
		char *tok_buf = input_buf;
		char file_path[MAX_INPUT_BUF_SZ];
		pgoff_t off_start = 0, off_end = 0;
		size_t input_len = strlen(input_buf);

		tok = strsep(&tok_buf, ",");
		if (tok && tok_buf) {
			strscpy(file_path, tok);
		} else {
			ret = -EINVAL;
			goto out;
		}

		ret = sscanf(tok_buf, "0x%lx,0x%lx,%d,%ld", &off_start, &off_end,
			     &new_order, &in_folio_offset);
		if (ret != 2 && ret != 3 && ret != 4) {
			ret = -EINVAL;
			goto out;
		}
		ret = split_huge_pages_in_file(file_path, off_start, off_end,
					       new_order, in_folio_offset);
		if (!ret)
			ret = input_len;

		goto out;
	}

	ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d,%ld", &pid, &vaddr_start,
		     &vaddr_end, &new_order, &in_folio_offset);
	if (ret == 1 && pid == 1) {
		split_huge_pages_all();
		ret = strlen(input_buf);
		goto out;
	} else if (ret != 3 && ret != 4 && ret != 5) {
		ret = -EINVAL;
		goto out;
	}

	ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order,
				   in_folio_offset);
	if (!ret)
		ret = strlen(input_buf);
out:
	mutex_unlock(&split_debug_mutex);
	return ret;
}
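/*
 * Illustrative usage of the two input formats parsed above (assuming debugfs
 * is mounted at /sys/kernel/debug):
 *
 *	# split THPs of pid 1234 mapped in [0x7f0000000000, 0x7f0000200000)
 *	echo "1234,0x7f0000000000,0x7f0000200000" > \
 *		/sys/kernel/debug/split_huge_pages
 *
 *	# split file-backed THPs of /mnt/file at page offsets [0x0, 0x100),
 *	# down to order 2
 *	echo "/mnt/file,0x0,0x100,2" > /sys/kernel/debug/split_huge_pages
 *
 *	# split every THP in the system
 *	echo 1 > /sys/kernel/debug/split_huge_pages
 */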
static const struct file_operations split_huge_pages_fops = {
	.owner	 = THIS_MODULE,
	.write	 = split_huge_pages_write,
};

static int __init split_huge_pages_debugfs(void)
{
	debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
			    &split_huge_pages_fops);
	return 0;
}
late_initcall(split_huge_pages_debugfs);
#endif /* CONFIG_DEBUG_FS */
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
		struct page *page)
{
	struct folio *folio = page_folio(page);
	struct vm_area_struct *vma = pvmw->vma;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address = pvmw->address;
	bool anon_exclusive;
	pmd_t pmdval;
	swp_entry_t entry;
	pmd_t pmdswp;

	if (!(pvmw->pmd && !pvmw->pte))
		return 0;

	flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
	pmdval = pmdp_invalidate(vma, address, pvmw->pmd);

	/* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */
	anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page);
	if (anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) {
		set_pmd_at(mm, address, pvmw->pmd, pmdval);
		return -EBUSY;
	}

	if (pmd_dirty(pmdval))
		folio_mark_dirty(folio);
	if (pmd_write(pmdval))
		entry = make_writable_migration_entry(page_to_pfn(page));
	else if (anon_exclusive)
		entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
	else
		entry = make_readable_migration_entry(page_to_pfn(page));
	if (pmd_young(pmdval))
		entry = make_migration_entry_young(entry);
	if (pmd_dirty(pmdval))
		entry = make_migration_entry_dirty(entry);
	pmdswp = swp_entry_to_pmd(entry);
	if (pmd_soft_dirty(pmdval))
		pmdswp = pmd_swp_mksoft_dirty(pmdswp);
	if (pmd_uffd_wp(pmdval))
		pmdswp = pmd_swp_mkuffd_wp(pmdswp);
	set_pmd_at(mm, address, pvmw->pmd, pmdswp);
	folio_remove_rmap_pmd(folio, page, vma);
	folio_put(folio);
	trace_set_migration_pmd(address, pmd_val(pmdswp));

	return 0;
}
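/*
 * On success the present PMD has been replaced by a PMD-sized migration swap
 * entry that preserves the writable/exclusive, young, dirty, soft-dirty and
 * uffd-wp state of the original mapping, and the mapping's rmap and folio
 * reference have been dropped.  If the anon-exclusive page cannot be shared,
 * the old PMD value is restored and -EBUSY tells the caller to give up on
 * migrating this folio.
 */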
void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
{
	struct folio *folio = page_folio(new);
	struct vm_area_struct *vma = pvmw->vma;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address = pvmw->address;
	unsigned long haddr = address & HPAGE_PMD_MASK;
	pmd_t pmde;
	swp_entry_t entry;

	if (!(pvmw->pmd && !pvmw->pte))
		return;

	entry = pmd_to_swp_entry(*pvmw->pmd);
	folio_get(folio);
	pmde = folio_mk_pmd(folio, READ_ONCE(vma->vm_page_prot));
	if (pmd_swp_soft_dirty(*pvmw->pmd))
		pmde = pmd_mksoft_dirty(pmde);
	if (is_writable_migration_entry(entry))
		pmde = pmd_mkwrite(pmde, vma);
	if (pmd_swp_uffd_wp(*pvmw->pmd))
		pmde = pmd_mkuffd_wp(pmde);
	if (!is_migration_entry_young(entry))
		pmde = pmd_mkold(pmde);
	/* NOTE: this may contain setting soft-dirty on some archs */
	if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
		pmde = pmd_mkdirty(pmde);

	if (folio_test_anon(folio)) {
		rmap_t rmap_flags = RMAP_NONE;

		if (!is_readable_migration_entry(entry))
			rmap_flags |= RMAP_EXCLUSIVE;

		folio_add_anon_rmap_pmd(folio, new, vma, haddr, rmap_flags);
	} else {
		folio_add_file_rmap_pmd(folio, new, vma);
	}
	VM_BUG_ON(pmd_write(pmde) && folio_test_anon(folio) && !PageAnonExclusive(new));
	set_pmd_at(mm, haddr, pvmw->pmd, pmde);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache_pmd(vma, address, pvmw->pmd);
	trace_remove_migration_pmd(address, pmd_val(pmde));
}
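/*
 * remove_migration_pmd() is the inverse of set_pmd_migration_entry(): it
 * rebuilds a present PMD for the target page, re-applying the dirty, young,
 * soft-dirty, uffd-wp and writable/exclusive state recorded in the migration
 * entry, and re-establishes the anon or file rmap before the new PMD is made
 * visible with set_pmd_at().
 */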