]> git.ipfire.org Git - thirdparty/linux.git/blame - mm/khugepaged.c
arm64: tegra: Remove current-speed for SBSA UART
[thirdparty/linux.git] / mm / khugepaged.c
CommitLineData
b2441318 1// SPDX-License-Identifier: GPL-2.0
b46e756f
KS
2#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
3
4#include <linux/mm.h>
5#include <linux/sched.h>
6e84f315 6#include <linux/sched/mm.h>
f7ccbae4 7#include <linux/sched/coredump.h>
b46e756f
KS
8#include <linux/mmu_notifier.h>
9#include <linux/rmap.h>
10#include <linux/swap.h>
11#include <linux/mm_inline.h>
12#include <linux/kthread.h>
13#include <linux/khugepaged.h>
14#include <linux/freezer.h>
15#include <linux/mman.h>
16#include <linux/hashtable.h>
17#include <linux/userfaultfd_k.h>
18#include <linux/page_idle.h>
80110bbf 19#include <linux/page_table_check.h>
b46e756f 20#include <linux/swapops.h>
f3f0e1d2 21#include <linux/shmem_fs.h>
b46e756f
KS
22
23#include <asm/tlb.h>
24#include <asm/pgalloc.h>
25#include "internal.h"
b26e2701 26#include "mm_slot.h"
b46e756f
KS
27
/*
 * Result codes shared by the scan and collapse helpers in this file.
 * The helpers return SCAN_SUCCEED when they complete; any other value
 * identifies the reason the scan or collapse attempt stopped.
 */
enum scan_result {
	SCAN_FAIL,
	SCAN_SUCCEED,
	SCAN_PMD_NULL,
	SCAN_PMD_NONE,
	SCAN_PMD_MAPPED,
	SCAN_EXCEED_NONE_PTE,
	SCAN_EXCEED_SWAP_PTE,
	SCAN_EXCEED_SHARED_PTE,
	SCAN_PTE_NON_PRESENT,
	SCAN_PTE_UFFD_WP,
	SCAN_PTE_MAPPED_HUGEPAGE,
	SCAN_PAGE_RO,
	SCAN_LACK_REFERENCED_PAGE,
	SCAN_PAGE_NULL,
	SCAN_SCAN_ABORT,
	SCAN_PAGE_COUNT,
	SCAN_PAGE_LRU,
	SCAN_PAGE_LOCK,
	SCAN_PAGE_ANON,
	SCAN_PAGE_COMPOUND,
	SCAN_ANY_PROCESS,
	SCAN_VMA_NULL,
	SCAN_VMA_CHECK,
	SCAN_ADDRESS_RANGE,
	SCAN_DEL_PAGE_LRU,
	SCAN_ALLOC_HUGE_PAGE_FAIL,
	SCAN_CGROUP_CHARGE_FAIL,
	SCAN_TRUNCATED,
	SCAN_PAGE_HAS_PRIVATE,
	SCAN_STORE_FAILED,
	SCAN_COPY_MC,
	SCAN_PAGE_FILLED,
};
62
63#define CREATE_TRACE_POINTS
64#include <trace/events/huge_memory.h>
65
4aab2be0
VB
66static struct task_struct *khugepaged_thread __read_mostly;
67static DEFINE_MUTEX(khugepaged_mutex);
68
b46e756f
KS
69/* default scan 8*512 pte (or vmas) every 30 second */
70static unsigned int khugepaged_pages_to_scan __read_mostly;
71static unsigned int khugepaged_pages_collapsed;
72static unsigned int khugepaged_full_scans;
73static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
74/* during fragmentation poll the hugepage allocator once every minute */
75static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
76static unsigned long khugepaged_sleep_expire;
77static DEFINE_SPINLOCK(khugepaged_mm_lock);
78static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
79/*
80 * default collapse hugepages if there is at least one pte mapped like
81 * it would have happened if the vma was large enough during page
82 * fault.
d8ea7cc8
ZK
83 *
84 * Note that these are only respected if collapse was initiated by khugepaged.
b46e756f
KS
85 */
86static unsigned int khugepaged_max_ptes_none __read_mostly;
87static unsigned int khugepaged_max_ptes_swap __read_mostly;
71a2c112 88static unsigned int khugepaged_max_ptes_shared __read_mostly;
b46e756f
KS
89
90#define MM_SLOTS_HASH_BITS 10
e1ad3e66 91static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
b46e756f
KS
92
93static struct kmem_cache *mm_slot_cache __read_mostly;
94
27e1f827
SL
95#define MAX_PTE_MAPPED_THP 8
96
34d6b470 97struct collapse_control {
d8ea7cc8
ZK
98 bool is_khugepaged;
99
34d6b470
ZK
100 /* Num pages scanned per node */
101 u32 node_load[MAX_NUMNODES];
102
e031ff96
YS
103 /* nodemask for allocation fallback */
104 nodemask_t alloc_nmask;
34d6b470
ZK
105};
106
b46e756f 107/**
b26e2701
QZ
108 * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned
109 * @slot: hash lookup from mm to mm_slot
336e6b53
AS
110 * @nr_pte_mapped_thp: number of pte mapped THP
111 * @pte_mapped_thp: address array corresponding pte mapped THP
b46e756f 112 */
b26e2701
QZ
113struct khugepaged_mm_slot {
114 struct mm_slot slot;
27e1f827
SL
115
116 /* pte-mapped THP in this mm */
117 int nr_pte_mapped_thp;
118 unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP];
b46e756f
KS
119};
120
121/**
122 * struct khugepaged_scan - cursor for scanning
123 * @mm_head: the head of the mm list to scan
124 * @mm_slot: the current mm_slot we are scanning
125 * @address: the next address inside that to be scanned
126 *
127 * There is only the one khugepaged_scan instance of this cursor structure.
128 */
129struct khugepaged_scan {
130 struct list_head mm_head;
b26e2701 131 struct khugepaged_mm_slot *mm_slot;
b46e756f
KS
132 unsigned long address;
133};
134
135static struct khugepaged_scan khugepaged_scan = {
136 .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
137};
138
e1465d12 139#ifdef CONFIG_SYSFS
b46e756f
KS
140static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
141 struct kobj_attribute *attr,
142 char *buf)
143{
ae7a927d 144 return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs);
b46e756f
KS
145}
146
147static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
148 struct kobj_attribute *attr,
149 const char *buf, size_t count)
150{
dfefd226 151 unsigned int msecs;
b46e756f
KS
152 int err;
153
dfefd226
AD
154 err = kstrtouint(buf, 10, &msecs);
155 if (err)
b46e756f
KS
156 return -EINVAL;
157
158 khugepaged_scan_sleep_millisecs = msecs;
159 khugepaged_sleep_expire = 0;
160 wake_up_interruptible(&khugepaged_wait);
161
162 return count;
163}
164static struct kobj_attribute scan_sleep_millisecs_attr =
6dcdc94d 165 __ATTR_RW(scan_sleep_millisecs);
b46e756f
KS
166
167static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
168 struct kobj_attribute *attr,
169 char *buf)
170{
ae7a927d 171 return sysfs_emit(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
b46e756f
KS
172}
173
174static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
175 struct kobj_attribute *attr,
176 const char *buf, size_t count)
177{
dfefd226 178 unsigned int msecs;
b46e756f
KS
179 int err;
180
dfefd226
AD
181 err = kstrtouint(buf, 10, &msecs);
182 if (err)
b46e756f
KS
183 return -EINVAL;
184
185 khugepaged_alloc_sleep_millisecs = msecs;
186 khugepaged_sleep_expire = 0;
187 wake_up_interruptible(&khugepaged_wait);
188
189 return count;
190}
191static struct kobj_attribute alloc_sleep_millisecs_attr =
6dcdc94d 192 __ATTR_RW(alloc_sleep_millisecs);
b46e756f
KS
193
194static ssize_t pages_to_scan_show(struct kobject *kobj,
195 struct kobj_attribute *attr,
196 char *buf)
197{
ae7a927d 198 return sysfs_emit(buf, "%u\n", khugepaged_pages_to_scan);
b46e756f
KS
199}
200static ssize_t pages_to_scan_store(struct kobject *kobj,
201 struct kobj_attribute *attr,
202 const char *buf, size_t count)
203{
dfefd226 204 unsigned int pages;
b46e756f 205 int err;
b46e756f 206
dfefd226
AD
207 err = kstrtouint(buf, 10, &pages);
208 if (err || !pages)
b46e756f
KS
209 return -EINVAL;
210
211 khugepaged_pages_to_scan = pages;
212
213 return count;
214}
215static struct kobj_attribute pages_to_scan_attr =
6dcdc94d 216 __ATTR_RW(pages_to_scan);
b46e756f
KS
217
218static ssize_t pages_collapsed_show(struct kobject *kobj,
219 struct kobj_attribute *attr,
220 char *buf)
221{
ae7a927d 222 return sysfs_emit(buf, "%u\n", khugepaged_pages_collapsed);
b46e756f
KS
223}
224static struct kobj_attribute pages_collapsed_attr =
225 __ATTR_RO(pages_collapsed);
226
227static ssize_t full_scans_show(struct kobject *kobj,
228 struct kobj_attribute *attr,
229 char *buf)
230{
ae7a927d 231 return sysfs_emit(buf, "%u\n", khugepaged_full_scans);
b46e756f
KS
232}
233static struct kobj_attribute full_scans_attr =
234 __ATTR_RO(full_scans);
235
6dcdc94d
ML
236static ssize_t defrag_show(struct kobject *kobj,
237 struct kobj_attribute *attr, char *buf)
b46e756f
KS
238{
239 return single_hugepage_flag_show(kobj, attr, buf,
ae7a927d 240 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
b46e756f 241}
6dcdc94d
ML
242static ssize_t defrag_store(struct kobject *kobj,
243 struct kobj_attribute *attr,
244 const char *buf, size_t count)
b46e756f
KS
245{
246 return single_hugepage_flag_store(kobj, attr, buf, count,
247 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
248}
249static struct kobj_attribute khugepaged_defrag_attr =
6dcdc94d 250 __ATTR_RW(defrag);
b46e756f
KS
251
252/*
253 * max_ptes_none controls if khugepaged should collapse hugepages over
254 * any unmapped ptes in turn potentially increasing the memory
255 * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
256 * reduce the available free memory in the system as it
257 * runs. Increasing max_ptes_none will instead potentially reduce the
258 * free memory in the system during the khugepaged scan.
259 */
6dcdc94d
ML
260static ssize_t max_ptes_none_show(struct kobject *kobj,
261 struct kobj_attribute *attr,
262 char *buf)
b46e756f 263{
ae7a927d 264 return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none);
b46e756f 265}
6dcdc94d
ML
266static ssize_t max_ptes_none_store(struct kobject *kobj,
267 struct kobj_attribute *attr,
268 const char *buf, size_t count)
b46e756f
KS
269{
270 int err;
271 unsigned long max_ptes_none;
272
273 err = kstrtoul(buf, 10, &max_ptes_none);
36ee2c78 274 if (err || max_ptes_none > HPAGE_PMD_NR - 1)
b46e756f
KS
275 return -EINVAL;
276
277 khugepaged_max_ptes_none = max_ptes_none;
278
279 return count;
280}
281static struct kobj_attribute khugepaged_max_ptes_none_attr =
6dcdc94d 282 __ATTR_RW(max_ptes_none);
b46e756f 283
6dcdc94d
ML
284static ssize_t max_ptes_swap_show(struct kobject *kobj,
285 struct kobj_attribute *attr,
286 char *buf)
b46e756f 287{
ae7a927d 288 return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap);
b46e756f
KS
289}
290
6dcdc94d
ML
291static ssize_t max_ptes_swap_store(struct kobject *kobj,
292 struct kobj_attribute *attr,
293 const char *buf, size_t count)
b46e756f
KS
294{
295 int err;
296 unsigned long max_ptes_swap;
297
298 err = kstrtoul(buf, 10, &max_ptes_swap);
36ee2c78 299 if (err || max_ptes_swap > HPAGE_PMD_NR - 1)
b46e756f
KS
300 return -EINVAL;
301
302 khugepaged_max_ptes_swap = max_ptes_swap;
303
304 return count;
305}
306
307static struct kobj_attribute khugepaged_max_ptes_swap_attr =
6dcdc94d 308 __ATTR_RW(max_ptes_swap);
b46e756f 309
6dcdc94d
ML
310static ssize_t max_ptes_shared_show(struct kobject *kobj,
311 struct kobj_attribute *attr,
312 char *buf)
71a2c112 313{
ae7a927d 314 return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared);
71a2c112
KS
315}
316
6dcdc94d
ML
317static ssize_t max_ptes_shared_store(struct kobject *kobj,
318 struct kobj_attribute *attr,
319 const char *buf, size_t count)
71a2c112
KS
320{
321 int err;
322 unsigned long max_ptes_shared;
323
324 err = kstrtoul(buf, 10, &max_ptes_shared);
36ee2c78 325 if (err || max_ptes_shared > HPAGE_PMD_NR - 1)
71a2c112
KS
326 return -EINVAL;
327
328 khugepaged_max_ptes_shared = max_ptes_shared;
329
330 return count;
331}
332
333static struct kobj_attribute khugepaged_max_ptes_shared_attr =
6dcdc94d 334 __ATTR_RW(max_ptes_shared);
71a2c112 335
b46e756f
KS
336static struct attribute *khugepaged_attr[] = {
337 &khugepaged_defrag_attr.attr,
338 &khugepaged_max_ptes_none_attr.attr,
71a2c112
KS
339 &khugepaged_max_ptes_swap_attr.attr,
340 &khugepaged_max_ptes_shared_attr.attr,
b46e756f
KS
341 &pages_to_scan_attr.attr,
342 &pages_collapsed_attr.attr,
343 &full_scans_attr.attr,
344 &scan_sleep_millisecs_attr.attr,
345 &alloc_sleep_millisecs_attr.attr,
b46e756f
KS
346 NULL,
347};
348
349struct attribute_group khugepaged_attr_group = {
350 .attrs = khugepaged_attr,
351 .name = "khugepaged",
352};
e1465d12 353#endif /* CONFIG_SYSFS */
b46e756f 354
b46e756f
KS
355int hugepage_madvise(struct vm_area_struct *vma,
356 unsigned long *vm_flags, int advice)
357{
358 switch (advice) {
359 case MADV_HUGEPAGE:
360#ifdef CONFIG_S390
361 /*
362 * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
363 * can't handle this properly after s390_enable_sie, so we simply
364 * ignore the madvise to prevent qemu from causing a SIGSEGV.
365 */
366 if (mm_has_pgste(vma->vm_mm))
367 return 0;
368#endif
369 *vm_flags &= ~VM_NOHUGEPAGE;
370 *vm_flags |= VM_HUGEPAGE;
371 /*
372 * If the vma become good for khugepaged to scan,
373 * register it here without waiting a page fault that
374 * may not happen any time soon.
375 */
c791576c 376 khugepaged_enter_vma(vma, *vm_flags);
b46e756f
KS
377 break;
378 case MADV_NOHUGEPAGE:
379 *vm_flags &= ~VM_HUGEPAGE;
380 *vm_flags |= VM_NOHUGEPAGE;
381 /*
382 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
383 * this vma even if we leave the mm registered in khugepaged if
384 * it got registered before VM_NOHUGEPAGE was set.
385 */
386 break;
387 }
388
389 return 0;
390}
391
392int __init khugepaged_init(void)
393{
394 mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
b26e2701
QZ
395 sizeof(struct khugepaged_mm_slot),
396 __alignof__(struct khugepaged_mm_slot),
397 0, NULL);
b46e756f
KS
398 if (!mm_slot_cache)
399 return -ENOMEM;
400
401 khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
402 khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
403 khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
71a2c112 404 khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;
b46e756f
KS
405
406 return 0;
407}
408
409void __init khugepaged_destroy(void)
410{
411 kmem_cache_destroy(mm_slot_cache);
412}
413
7d2c4385 414static inline int hpage_collapse_test_exit(struct mm_struct *mm)
b46e756f 415{
4d45e75a 416 return atomic_read(&mm->mm_users) == 0;
b46e756f
KS
417}
418
d2081b2b 419void __khugepaged_enter(struct mm_struct *mm)
b46e756f 420{
b26e2701
QZ
421 struct khugepaged_mm_slot *mm_slot;
422 struct mm_slot *slot;
b46e756f
KS
423 int wakeup;
424
16618670
XH
425 /* __khugepaged_exit() must not run from under us */
426 VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
427 if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags)))
428 return;
429
b26e2701 430 mm_slot = mm_slot_alloc(mm_slot_cache);
b46e756f 431 if (!mm_slot)
d2081b2b 432 return;
b46e756f 433
b26e2701
QZ
434 slot = &mm_slot->slot;
435
b46e756f 436 spin_lock(&khugepaged_mm_lock);
b26e2701 437 mm_slot_insert(mm_slots_hash, mm, slot);
b46e756f
KS
438 /*
439 * Insert just behind the scanning cursor, to let the area settle
440 * down a little.
441 */
442 wakeup = list_empty(&khugepaged_scan.mm_head);
b26e2701 443 list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head);
b46e756f
KS
444 spin_unlock(&khugepaged_mm_lock);
445
f1f10076 446 mmgrab(mm);
b46e756f
KS
447 if (wakeup)
448 wake_up_interruptible(&khugepaged_wait);
b46e756f
KS
449}
450
c791576c
YS
451void khugepaged_enter_vma(struct vm_area_struct *vma,
452 unsigned long vm_flags)
b46e756f 453{
2647d11b 454 if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
1064026b 455 hugepage_flags_enabled()) {
a7f4e6e4 456 if (hugepage_vma_check(vma, vm_flags, false, false, true))
2647d11b
YS
457 __khugepaged_enter(vma->vm_mm);
458 }
b46e756f
KS
459}
460
461void __khugepaged_exit(struct mm_struct *mm)
462{
b26e2701
QZ
463 struct khugepaged_mm_slot *mm_slot;
464 struct mm_slot *slot;
b46e756f
KS
465 int free = 0;
466
467 spin_lock(&khugepaged_mm_lock);
b26e2701
QZ
468 slot = mm_slot_lookup(mm_slots_hash, mm);
469 mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
b46e756f 470 if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
b26e2701
QZ
471 hash_del(&slot->hash);
472 list_del(&slot->mm_node);
b46e756f
KS
473 free = 1;
474 }
475 spin_unlock(&khugepaged_mm_lock);
476
477 if (free) {
478 clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
b26e2701 479 mm_slot_free(mm_slot_cache, mm_slot);
b46e756f
KS
480 mmdrop(mm);
481 } else if (mm_slot) {
482 /*
483 * This is required to serialize against
7d2c4385
ZK
484 * hpage_collapse_test_exit() (which is guaranteed to run
485 * under mmap sem read mode). Stop here (after we return all
486 * pagetables will be destroyed) until khugepaged has finished
487 * working on the pagetables under the mmap_lock.
b46e756f 488 */
d8ed45c5
ML
489 mmap_write_lock(mm);
490 mmap_write_unlock(mm);
b46e756f
KS
491 }
492}
493
92644f58
VMO
494static void release_pte_folio(struct folio *folio)
495{
496 node_stat_mod_folio(folio,
497 NR_ISOLATED_ANON + folio_is_file_lru(folio),
498 -folio_nr_pages(folio));
499 folio_unlock(folio);
500 folio_putback_lru(folio);
501}
502
b46e756f
KS
/* Page-based wrapper around release_pte_folio(). */
static void release_pte_page(struct page *page)
{
	release_pte_folio(page_folio(page));
}
507
5503fbf2
KS
508static void release_pte_pages(pte_t *pte, pte_t *_pte,
509 struct list_head *compound_pagelist)
b46e756f 510{
9bdfeea4 511 struct folio *folio, *tmp;
5503fbf2 512
b46e756f 513 while (--_pte >= pte) {
c33c7948 514 pte_t pteval = ptep_get(_pte);
f528260b 515 unsigned long pfn;
5503fbf2 516
f528260b
VMO
517 if (pte_none(pteval))
518 continue;
519 pfn = pte_pfn(pteval);
520 if (is_zero_pfn(pfn))
521 continue;
522 folio = pfn_folio(pfn);
523 if (folio_test_large(folio))
524 continue;
525 release_pte_folio(folio);
5503fbf2
KS
526 }
527
9bdfeea4
VMO
528 list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) {
529 list_del(&folio->lru);
530 release_pte_folio(folio);
b46e756f
KS
531 }
532}
533
9445689f
KS
534static bool is_refcount_suitable(struct page *page)
535{
536 int expected_refcount;
537
538 expected_refcount = total_mapcount(page);
539 if (PageSwapCache(page))
540 expected_refcount += compound_nr(page);
541
542 return page_count(page) == expected_refcount;
543}
544
b46e756f
KS
545static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
546 unsigned long address,
5503fbf2 547 pte_t *pte,
d8ea7cc8 548 struct collapse_control *cc,
5503fbf2 549 struct list_head *compound_pagelist)
b46e756f
KS
550{
551 struct page *page = NULL;
552 pte_t *_pte;
50ad2f24 553 int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
0db501f7 554 bool writable = false;
b46e756f 555
36ee2c78 556 for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
b46e756f 557 _pte++, address += PAGE_SIZE) {
c33c7948 558 pte_t pteval = ptep_get(_pte);
b46e756f
KS
559 if (pte_none(pteval) || (pte_present(pteval) &&
560 is_zero_pfn(pte_pfn(pteval)))) {
d8ea7cc8 561 ++none_or_zero;
b46e756f 562 if (!userfaultfd_armed(vma) &&
d8ea7cc8
ZK
563 (!cc->is_khugepaged ||
564 none_or_zero <= khugepaged_max_ptes_none)) {
b46e756f
KS
565 continue;
566 } else {
567 result = SCAN_EXCEED_NONE_PTE;
e9ea874a 568 count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
b46e756f
KS
569 goto out;
570 }
571 }
572 if (!pte_present(pteval)) {
573 result = SCAN_PTE_NON_PRESENT;
574 goto out;
575 }
dd47ac42
PX
576 if (pte_uffd_wp(pteval)) {
577 result = SCAN_PTE_UFFD_WP;
578 goto out;
579 }
b46e756f 580 page = vm_normal_page(vma, address, pteval);
3218f871 581 if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
b46e756f
KS
582 result = SCAN_PAGE_NULL;
583 goto out;
584 }
585
5503fbf2
KS
586 VM_BUG_ON_PAGE(!PageAnon(page), page);
587
d8ea7cc8
ZK
588 if (page_mapcount(page) > 1) {
589 ++shared;
590 if (cc->is_khugepaged &&
591 shared > khugepaged_max_ptes_shared) {
592 result = SCAN_EXCEED_SHARED_PTE;
593 count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
594 goto out;
595 }
71a2c112
KS
596 }
597
fece2029 598 if (PageCompound(page)) {
5503fbf2
KS
599 struct page *p;
600 page = compound_head(page);
fece2029 601
5503fbf2
KS
602 /*
603 * Check if we have dealt with the compound page
604 * already
605 */
606 list_for_each_entry(p, compound_pagelist, lru) {
607 if (page == p)
608 goto next;
609 }
610 }
b46e756f
KS
611
612 /*
613 * We can do it before isolate_lru_page because the
614 * page can't be freed from under us. NOTE: PG_lock
615 * is needed to serialize against split_huge_page
616 * when invoked from the VM.
617 */
618 if (!trylock_page(page)) {
619 result = SCAN_PAGE_LOCK;
620 goto out;
621 }
622
623 /*
9445689f
KS
624 * Check if the page has any GUP (or other external) pins.
625 *
626 * The page table that maps the page has been already unlinked
627 * from the page table tree and this process cannot get
f0953a1b 628 * an additional pin on the page.
9445689f
KS
629 *
630 * New pins can come later if the page is shared across fork,
631 * but not from this process. The other process cannot write to
632 * the page, only trigger CoW.
b46e756f 633 */
9445689f 634 if (!is_refcount_suitable(page)) {
b46e756f
KS
635 unlock_page(page);
636 result = SCAN_PAGE_COUNT;
637 goto out;
638 }
b46e756f
KS
639
640 /*
641 * Isolate the page to avoid collapsing an hugepage
642 * currently in use by the VM.
643 */
f7f9c00d 644 if (!isolate_lru_page(page)) {
b46e756f
KS
645 unlock_page(page);
646 result = SCAN_DEL_PAGE_LRU;
647 goto out;
648 }
5503fbf2
KS
649 mod_node_page_state(page_pgdat(page),
650 NR_ISOLATED_ANON + page_is_file_lru(page),
651 compound_nr(page));
b46e756f
KS
652 VM_BUG_ON_PAGE(!PageLocked(page), page);
653 VM_BUG_ON_PAGE(PageLRU(page), page);
654
5503fbf2
KS
655 if (PageCompound(page))
656 list_add_tail(&page->lru, compound_pagelist);
657next:
d8ea7cc8
ZK
658 /*
659 * If collapse was initiated by khugepaged, check that there is
660 * enough young pte to justify collapsing the page
661 */
662 if (cc->is_khugepaged &&
663 (pte_young(pteval) || page_is_young(page) ||
664 PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
665 address)))
0db501f7 666 referenced++;
5503fbf2
KS
667
668 if (pte_write(pteval))
669 writable = true;
b46e756f 670 }
74e579bf
ML
671
672 if (unlikely(!writable)) {
b46e756f 673 result = SCAN_PAGE_RO;
d8ea7cc8 674 } else if (unlikely(cc->is_khugepaged && !referenced)) {
74e579bf
ML
675 result = SCAN_LACK_REFERENCED_PAGE;
676 } else {
677 result = SCAN_SUCCEED;
678 trace_mm_collapse_huge_page_isolate(page, none_or_zero,
679 referenced, writable, result);
50ad2f24 680 return result;
b46e756f 681 }
b46e756f 682out:
5503fbf2 683 release_pte_pages(pte, _pte, compound_pagelist);
b46e756f
KS
684 trace_mm_collapse_huge_page_isolate(page, none_or_zero,
685 referenced, writable, result);
50ad2f24 686 return result;
b46e756f
KS
687}
688
98c76c9f
JY
689static void __collapse_huge_page_copy_succeeded(pte_t *pte,
690 struct vm_area_struct *vma,
691 unsigned long address,
692 spinlock_t *ptl,
693 struct list_head *compound_pagelist)
b46e756f 694{
98c76c9f
JY
695 struct page *src_page;
696 struct page *tmp;
b46e756f 697 pte_t *_pte;
98c76c9f 698 pte_t pteval;
b46e756f 699
98c76c9f
JY
700 for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
701 _pte++, address += PAGE_SIZE) {
c33c7948 702 pteval = ptep_get(_pte);
b46e756f 703 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
b46e756f
KS
704 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
705 if (is_zero_pfn(pte_pfn(pteval))) {
706 /*
707 * ptl mostly unnecessary.
708 */
709 spin_lock(ptl);
08d5b29e 710 ptep_clear(vma->vm_mm, address, _pte);
b46e756f
KS
711 spin_unlock(ptl);
712 }
713 } else {
714 src_page = pte_page(pteval);
5503fbf2
KS
715 if (!PageCompound(src_page))
716 release_pte_page(src_page);
b46e756f
KS
717 /*
718 * ptl mostly unnecessary, but preempt has to
719 * be disabled to update the per-cpu stats
720 * inside page_remove_rmap().
721 */
722 spin_lock(ptl);
08d5b29e 723 ptep_clear(vma->vm_mm, address, _pte);
cea86fe2 724 page_remove_rmap(src_page, vma, false);
b46e756f
KS
725 spin_unlock(ptl);
726 free_page_and_swap_cache(src_page);
727 }
b46e756f 728 }
5503fbf2
KS
729
730 list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) {
731 list_del(&src_page->lru);
1baec203
ML
732 mod_node_page_state(page_pgdat(src_page),
733 NR_ISOLATED_ANON + page_is_file_lru(src_page),
734 -compound_nr(src_page));
735 unlock_page(src_page);
736 free_swap_cache(src_page);
737 putback_lru_page(src_page);
5503fbf2 738 }
b46e756f
KS
739}
740
98c76c9f
JY
741static void __collapse_huge_page_copy_failed(pte_t *pte,
742 pmd_t *pmd,
743 pmd_t orig_pmd,
744 struct vm_area_struct *vma,
745 struct list_head *compound_pagelist)
746{
747 spinlock_t *pmd_ptl;
748
749 /*
750 * Re-establish the PMD to point to the original page table
751 * entry. Restoring PMD needs to be done prior to releasing
752 * pages. Since pages are still isolated and locked here,
753 * acquiring anon_vma_lock_write is unnecessary.
754 */
755 pmd_ptl = pmd_lock(vma->vm_mm, pmd);
756 pmd_populate(vma->vm_mm, pmd, pmd_pgtable(orig_pmd));
757 spin_unlock(pmd_ptl);
758 /*
759 * Release both raw and compound pages isolated
760 * in __collapse_huge_page_isolate.
761 */
762 release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
763}
764
765/*
766 * __collapse_huge_page_copy - attempts to copy memory contents from raw
767 * pages to a hugepage. Cleans up the raw pages if copying succeeds;
768 * otherwise restores the original page table and releases isolated raw pages.
769 * Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC.
770 *
771 * @pte: starting of the PTEs to copy from
772 * @page: the new hugepage to copy contents to
773 * @pmd: pointer to the new hugepage's PMD
774 * @orig_pmd: the original raw pages' PMD
775 * @vma: the original raw pages' virtual memory area
776 * @address: starting address to copy
777 * @ptl: lock on raw pages' PTEs
778 * @compound_pagelist: list that stores compound pages
779 */
780static int __collapse_huge_page_copy(pte_t *pte,
781 struct page *page,
782 pmd_t *pmd,
783 pmd_t orig_pmd,
784 struct vm_area_struct *vma,
785 unsigned long address,
786 spinlock_t *ptl,
787 struct list_head *compound_pagelist)
788{
789 struct page *src_page;
790 pte_t *_pte;
791 pte_t pteval;
792 unsigned long _address;
793 int result = SCAN_SUCCEED;
794
795 /*
796 * Copying pages' contents is subject to memory poison at any iteration.
797 */
798 for (_pte = pte, _address = address; _pte < pte + HPAGE_PMD_NR;
799 _pte++, page++, _address += PAGE_SIZE) {
c33c7948 800 pteval = ptep_get(_pte);
98c76c9f
JY
801 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
802 clear_user_highpage(page, _address);
803 continue;
804 }
805 src_page = pte_page(pteval);
806 if (copy_mc_user_highpage(page, src_page, _address, vma) > 0) {
807 result = SCAN_COPY_MC;
808 break;
809 }
810 }
811
812 if (likely(result == SCAN_SUCCEED))
813 __collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
814 compound_pagelist);
815 else
816 __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
817 compound_pagelist);
818
819 return result;
820}
821
b46e756f
KS
822static void khugepaged_alloc_sleep(void)
823{
824 DEFINE_WAIT(wait);
825
826 add_wait_queue(&khugepaged_wait, &wait);
f5d39b02
PZ
827 __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
828 schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
b46e756f
KS
829 remove_wait_queue(&khugepaged_wait, &wait);
830}
831
34d6b470 832struct collapse_control khugepaged_collapse_control = {
d8ea7cc8 833 .is_khugepaged = true,
34d6b470 834};
b46e756f 835
7d2c4385 836static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc)
b46e756f
KS
837{
838 int i;
839
840 /*
a5f5f91d 841 * If node_reclaim_mode is disabled, then no extra effort is made to
b46e756f
KS
842 * allocate memory locally.
843 */
202e35db 844 if (!node_reclaim_enabled())
b46e756f
KS
845 return false;
846
847 /* If there is a count for this node already, it must be acceptable */
34d6b470 848 if (cc->node_load[nid])
b46e756f
KS
849 return false;
850
851 for (i = 0; i < MAX_NUMNODES; i++) {
34d6b470 852 if (!cc->node_load[i])
b46e756f 853 continue;
a55c7454 854 if (node_distance(nid, i) > node_reclaim_distance)
b46e756f
KS
855 return true;
856 }
857 return false;
858}
859
1064026b
YS
/* Non-zero when the khugepaged defrag bit is set in transparent_hugepage_flags. */
#define khugepaged_defrag()					\
	(transparent_hugepage_flags &				\
	 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG))
863
b46e756f
KS
864/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
865static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
866{
25160354 867 return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT;
b46e756f
KS
868}
869
#ifdef CONFIG_NUMA
/*
 * Pick the allocation target node from @cc->node_load.
 *
 * Returns the lowest-numbered node with the highest load (node 0 if no node
 * has any load). Every online node whose load equals that maximum is also
 * set in @cc->alloc_nmask; this function only sets bits and does not clear
 * the mask.
 */
static int hpage_collapse_find_target_node(struct collapse_control *cc)
{
	int nid, target_node = 0, max_value = 0;

	/* find first node with max normal pages hit */
	for (nid = 0; nid < MAX_NUMNODES; nid++)
		if (cc->node_load[nid] > max_value) {
			max_value = cc->node_load[nid];
			target_node = nid;
		}

	for_each_online_node(nid) {
		if (max_value == cc->node_load[nid])
			node_set(nid, cc->alloc_nmask);
	}

	return target_node;
}
#else
/* Without CONFIG_NUMA the target is always node 0. */
static int hpage_collapse_find_target_node(struct collapse_control *cc)
{
	return 0;
}
#endif
b46e756f 895
e031ff96
YS
896static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node,
897 nodemask_t *nmask)
b46e756f 898{
e031ff96 899 *hpage = __alloc_pages(gfp, HPAGE_PMD_ORDER, node, nmask);
b46e756f
KS
900 if (unlikely(!*hpage)) {
901 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
9710a78a 902 return false;
b46e756f
KS
903 }
904
905 prep_transhuge_page(*hpage);
906 count_vm_event(THP_COLLAPSE_ALLOC);
b46e756f
KS
907 return true;
908}
909
b46e756f 910/*
c1e8d7c6
ML
911 * If mmap_lock temporarily dropped, revalidate vma
912 * before taking mmap_lock.
50ad2f24 913 * Returns enum scan_result value.
b46e756f
KS
914 */
915
c131f751 916static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
34488399 917 bool expect_anon,
a7f4e6e4
ZK
918 struct vm_area_struct **vmap,
919 struct collapse_control *cc)
b46e756f
KS
920{
921 struct vm_area_struct *vma;
b46e756f 922
7d2c4385 923 if (unlikely(hpage_collapse_test_exit(mm)))
b46e756f
KS
924 return SCAN_ANY_PROCESS;
925
c131f751 926 *vmap = vma = find_vma(mm, address);
b46e756f
KS
927 if (!vma)
928 return SCAN_VMA_NULL;
929
4fa6893f 930 if (!transhuge_vma_suitable(vma, address))
b46e756f 931 return SCAN_ADDRESS_RANGE;
a7f4e6e4
ZK
932 if (!hugepage_vma_check(vma, vma->vm_flags, false, false,
933 cc->is_khugepaged))
b46e756f 934 return SCAN_VMA_CHECK;
f707fa49
YS
935 /*
936 * Anon VMA expected, the address may be unmapped then
937 * remapped to file after khugepaged reaquired the mmap_lock.
938 *
939 * hugepage_vma_check may return true for qualified file
940 * vmas.
941 */
34488399
ZK
942 if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap)))
943 return SCAN_PAGE_ANON;
50ad2f24 944 return SCAN_SUCCEED;
b46e756f
KS
945}
946
50722804
ZK
947static int find_pmd_or_thp_or_none(struct mm_struct *mm,
948 unsigned long address,
949 pmd_t **pmd)
950{
951 pmd_t pmde;
952
953 *pmd = mm_find_pmd(mm, address);
954 if (!*pmd)
955 return SCAN_PMD_NULL;
956
dab6e717 957 pmde = pmdp_get_lockless(*pmd);
34488399
ZK
958 if (pmd_none(pmde))
959 return SCAN_PMD_NONE;
edb5d0cf
ZK
960 if (!pmd_present(pmde))
961 return SCAN_PMD_NULL;
50722804
ZK
962 if (pmd_trans_huge(pmde))
963 return SCAN_PMD_MAPPED;
edb5d0cf
ZK
964 if (pmd_devmap(pmde))
965 return SCAN_PMD_NULL;
50722804
ZK
966 if (pmd_bad(pmde))
967 return SCAN_PMD_NULL;
968 return SCAN_SUCCEED;
969}
970
971static int check_pmd_still_valid(struct mm_struct *mm,
972 unsigned long address,
973 pmd_t *pmd)
974{
975 pmd_t *new_pmd;
976 int result = find_pmd_or_thp_or_none(mm, address, &new_pmd);
977
978 if (result != SCAN_SUCCEED)
979 return result;
980 if (new_pmd != pmd)
981 return SCAN_FAIL;
982 return SCAN_SUCCEED;
b46e756f
KS
983}
984
985/*
986 * Bring missing pages in from swap, to complete THP collapse.
7d2c4385 987 * Only done if hpage_collapse_scan_pmd believes it is worthwhile.
b46e756f 988 *
4d928e20 989 * Called and returns without pte mapped or spinlocks held.
895f5ee4 990 * Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
b46e756f 991 */
50ad2f24
ZK
992static int __collapse_huge_page_swapin(struct mm_struct *mm,
993 struct vm_area_struct *vma,
994 unsigned long haddr, pmd_t *pmd,
995 int referenced)
b46e756f 996{
2b740303
SJ
997 int swapped_in = 0;
998 vm_fault_t ret = 0;
2b635dd3 999 unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
895f5ee4
HD
1000 int result;
1001 pte_t *pte = NULL;
c7ad0880 1002 spinlock_t *ptl;
2b635dd3
WD
1003
1004 for (address = haddr; address < end; address += PAGE_SIZE) {
1005 struct vm_fault vmf = {
1006 .vma = vma,
1007 .address = address,
895f5ee4 1008 .pgoff = linear_page_index(vma, address),
2b635dd3
WD
1009 .flags = FAULT_FLAG_ALLOW_RETRY,
1010 .pmd = pmd,
1011 };
1012
895f5ee4 1013 if (!pte++) {
c7ad0880 1014 pte = pte_offset_map_nolock(mm, pmd, address, &ptl);
895f5ee4
HD
1015 if (!pte) {
1016 mmap_read_unlock(mm);
1017 result = SCAN_PMD_NULL;
1018 goto out;
1019 }
2b635dd3 1020 }
895f5ee4 1021
c7ad0880 1022 vmf.orig_pte = ptep_get_lockless(pte);
895f5ee4
HD
1023 if (!is_swap_pte(vmf.orig_pte))
1024 continue;
1025
1026 vmf.pte = pte;
c7ad0880 1027 vmf.ptl = ptl;
2994302b 1028 ret = do_swap_page(&vmf);
895f5ee4
HD
1029 /* Which unmaps pte (after perhaps re-checking the entry) */
1030 pte = NULL;
0db501f7 1031
4d928e20
ML
1032 /*
1033 * do_swap_page returns VM_FAULT_RETRY with released mmap_lock.
1034 * Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because
1035 * we do not retry here and swap entry will remain in pagetable
1036 * resulting in later failure.
1037 */
b46e756f 1038 if (ret & VM_FAULT_RETRY) {
50ad2f24 1039 /* Likely, but not guaranteed, that page lock failed */
895f5ee4
HD
1040 result = SCAN_PAGE_LOCK;
1041 goto out;
b46e756f
KS
1042 }
1043 if (ret & VM_FAULT_ERROR) {
4d928e20 1044 mmap_read_unlock(mm);
895f5ee4
HD
1045 result = SCAN_FAIL;
1046 goto out;
b46e756f 1047 }
4d928e20 1048 swapped_in++;
b46e756f 1049 }
ae2c5d80 1050
895f5ee4
HD
1051 if (pte)
1052 pte_unmap(pte);
1053
1fec6890 1054 /* Drain LRU cache to remove extra pin on the swapped in pages */
ae2c5d80
KS
1055 if (swapped_in)
1056 lru_add_drain();
1057
895f5ee4
HD
1058 result = SCAN_SUCCEED;
1059out:
1060 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result);
1061 return result;
b46e756f
KS
1062}
1063
9710a78a
ZK
1064static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm,
1065 struct collapse_control *cc)
1066{
7d8faaf1 1067 gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
e031ff96 1068 GFP_TRANSHUGE);
7d2c4385 1069 int node = hpage_collapse_find_target_node(cc);
94c02ad7 1070 struct folio *folio;
9710a78a 1071
e031ff96 1072 if (!hpage_collapse_alloc_page(hpage, gfp, node, &cc->alloc_nmask))
9710a78a 1073 return SCAN_ALLOC_HUGE_PAGE_FAIL;
94c02ad7
PX
1074
1075 folio = page_folio(*hpage);
1076 if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
1077 folio_put(folio);
1078 *hpage = NULL;
9710a78a 1079 return SCAN_CGROUP_CHARGE_FAIL;
94c02ad7 1080 }
9710a78a 1081 count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC);
94c02ad7 1082
9710a78a
ZK
1083 return SCAN_SUCCEED;
1084}
1085
50ad2f24
ZK
1086static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
1087 int referenced, int unmapped,
1088 struct collapse_control *cc)
b46e756f 1089{
5503fbf2 1090 LIST_HEAD(compound_pagelist);
b46e756f
KS
1091 pmd_t *pmd, _pmd;
1092 pte_t *pte;
1093 pgtable_t pgtable;
50ad2f24 1094 struct page *hpage;
b46e756f 1095 spinlock_t *pmd_ptl, *pte_ptl;
50ad2f24 1096 int result = SCAN_FAIL;
c131f751 1097 struct vm_area_struct *vma;
ac46d4f3 1098 struct mmu_notifier_range range;
b46e756f
KS
1099
1100 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1101
988ddb71 1102 /*
c1e8d7c6 1103 * Before allocating the hugepage, release the mmap_lock read lock.
988ddb71 1104 * The allocation can take potentially a long time if it involves
c1e8d7c6 1105 * sync compaction, and we do not need to hold the mmap_lock during
988ddb71
KS
1106 * that. We will recheck the vma after taking it again in write mode.
1107 */
d8ed45c5 1108 mmap_read_unlock(mm);
b46e756f 1109
50ad2f24 1110 result = alloc_charge_hpage(&hpage, mm, cc);
9710a78a 1111 if (result != SCAN_SUCCEED)
b46e756f 1112 goto out_nolock;
b46e756f 1113
d8ed45c5 1114 mmap_read_lock(mm);
34488399 1115 result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
50ad2f24 1116 if (result != SCAN_SUCCEED) {
d8ed45c5 1117 mmap_read_unlock(mm);
b46e756f
KS
1118 goto out_nolock;
1119 }
1120
50722804
ZK
1121 result = find_pmd_or_thp_or_none(mm, address, &pmd);
1122 if (result != SCAN_SUCCEED) {
d8ed45c5 1123 mmap_read_unlock(mm);
b46e756f
KS
1124 goto out_nolock;
1125 }
1126
50ad2f24
ZK
1127 if (unmapped) {
1128 /*
1129 * __collapse_huge_page_swapin will return with mmap_lock
1130 * released when it fails. So we jump out_nolock directly in
1131 * that case. Continuing to collapse causes inconsistency.
1132 */
1133 result = __collapse_huge_page_swapin(mm, vma, address, pmd,
1134 referenced);
1135 if (result != SCAN_SUCCEED)
1136 goto out_nolock;
b46e756f
KS
1137 }
1138
d8ed45c5 1139 mmap_read_unlock(mm);
b46e756f
KS
1140 /*
1141 * Prevent all access to pagetables with the exception of
1142 * gup_fast later handled by the ptep_clear_flush and the VM
1143 * handled by the anon_vma lock + PG_lock.
1144 */
d8ed45c5 1145 mmap_write_lock(mm);
34488399 1146 result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
50ad2f24 1147 if (result != SCAN_SUCCEED)
18d24a7c 1148 goto out_up_write;
b46e756f 1149 /* check if the pmd is still valid */
50722804
ZK
1150 result = check_pmd_still_valid(mm, address, pmd);
1151 if (result != SCAN_SUCCEED)
18d24a7c 1152 goto out_up_write;
b46e756f 1153
55fd6fcc 1154 vma_start_write(vma);
b46e756f
KS
1155 anon_vma_lock_write(vma->anon_vma);
1156
7d4a8be0
AP
1157 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
1158 address + HPAGE_PMD_SIZE);
ac46d4f3 1159 mmu_notifier_invalidate_range_start(&range);
ec649c9d 1160
b46e756f
KS
1161 pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
1162 /*
70cbc3cc
YS
1163 * This removes any huge TLB entry from the CPU so we won't allow
1164 * huge and small TLB entries for the same virtual address to
1165 * avoid the risk of CPU bugs in that area.
1166 *
1167 * Parallel fast GUP is fine since fast GUP will back off when
1168 * it detects PMD is changed.
b46e756f
KS
1169 */
1170 _pmd = pmdp_collapse_flush(vma, address, pmd);
1171 spin_unlock(pmd_ptl);
ac46d4f3 1172 mmu_notifier_invalidate_range_end(&range);
2ba99c5e 1173 tlb_remove_table_sync_one();
b46e756f 1174
895f5ee4
HD
1175 pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
1176 if (pte) {
1177 result = __collapse_huge_page_isolate(vma, address, pte, cc,
1178 &compound_pagelist);
1179 spin_unlock(pte_ptl);
1180 } else {
1181 result = SCAN_PMD_NULL;
1182 }
b46e756f 1183
50ad2f24 1184 if (unlikely(result != SCAN_SUCCEED)) {
895f5ee4
HD
1185 if (pte)
1186 pte_unmap(pte);
b46e756f
KS
1187 spin_lock(pmd_ptl);
1188 BUG_ON(!pmd_none(*pmd));
1189 /*
1190 * We can only use set_pmd_at when establishing
1191 * hugepmds and never for establishing regular pmds that
1192 * points to regular pagetables. Use pmd_populate for that
1193 */
1194 pmd_populate(mm, pmd, pmd_pgtable(_pmd));
1195 spin_unlock(pmd_ptl);
1196 anon_vma_unlock_write(vma->anon_vma);
18d24a7c 1197 goto out_up_write;
b46e756f
KS
1198 }
1199
1200 /*
1201 * All pages are isolated and locked so anon_vma rmap
1202 * can't run anymore.
1203 */
1204 anon_vma_unlock_write(vma->anon_vma);
1205
98c76c9f
JY
1206 result = __collapse_huge_page_copy(pte, hpage, pmd, _pmd,
1207 vma, address, pte_ptl,
1208 &compound_pagelist);
b46e756f 1209 pte_unmap(pte);
98c76c9f
JY
1210 if (unlikely(result != SCAN_SUCCEED))
1211 goto out_up_write;
1212
588d01f9
ML
1213 /*
1214 * spin_lock() below is not the equivalent of smp_wmb(), but
1215 * the smp_wmb() inside __SetPageUptodate() can be reused to
1216 * avoid the copy_huge_page writes to become visible after
1217 * the set_pmd_at() write.
1218 */
50ad2f24 1219 __SetPageUptodate(hpage);
b46e756f
KS
1220 pgtable = pmd_pgtable(_pmd);
1221
50ad2f24 1222 _pmd = mk_huge_pmd(hpage, vma->vm_page_prot);
f55e1014 1223 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
b46e756f 1224
b46e756f
KS
1225 spin_lock(pmd_ptl);
1226 BUG_ON(!pmd_none(*pmd));
50ad2f24
ZK
1227 page_add_new_anon_rmap(hpage, vma, address);
1228 lru_cache_add_inactive_or_unevictable(hpage, vma);
b46e756f
KS
1229 pgtable_trans_huge_deposit(mm, pmd, pgtable);
1230 set_pmd_at(mm, address, pmd, _pmd);
1231 update_mmu_cache_pmd(vma, address, pmd);
1232 spin_unlock(pmd_ptl);
1233
50ad2f24 1234 hpage = NULL;
b46e756f 1235
b46e756f
KS
1236 result = SCAN_SUCCEED;
1237out_up_write:
d8ed45c5 1238 mmap_write_unlock(mm);
b46e756f 1239out_nolock:
7cb1d7ef 1240 if (hpage)
50ad2f24 1241 put_page(hpage);
50ad2f24
ZK
1242 trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
1243 return result;
b46e756f
KS
1244}
1245
7d2c4385
ZK
1246static int hpage_collapse_scan_pmd(struct mm_struct *mm,
1247 struct vm_area_struct *vma,
1248 unsigned long address, bool *mmap_locked,
1249 struct collapse_control *cc)
b46e756f
KS
1250{
1251 pmd_t *pmd;
1252 pte_t *pte, *_pte;
50ad2f24 1253 int result = SCAN_FAIL, referenced = 0;
71a2c112 1254 int none_or_zero = 0, shared = 0;
b46e756f
KS
1255 struct page *page = NULL;
1256 unsigned long _address;
1257 spinlock_t *ptl;
1258 int node = NUMA_NO_NODE, unmapped = 0;
0db501f7 1259 bool writable = false;
b46e756f
KS
1260
1261 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1262
50722804
ZK
1263 result = find_pmd_or_thp_or_none(mm, address, &pmd);
1264 if (result != SCAN_SUCCEED)
b46e756f 1265 goto out;
b46e756f 1266
34d6b470 1267 memset(cc->node_load, 0, sizeof(cc->node_load));
e031ff96 1268 nodes_clear(cc->alloc_nmask);
b46e756f 1269 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
895f5ee4
HD
1270 if (!pte) {
1271 result = SCAN_PMD_NULL;
1272 goto out;
1273 }
1274
36ee2c78 1275 for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
b46e756f 1276 _pte++, _address += PAGE_SIZE) {
c33c7948 1277 pte_t pteval = ptep_get(_pte);
b46e756f 1278 if (is_swap_pte(pteval)) {
d8ea7cc8
ZK
1279 ++unmapped;
1280 if (!cc->is_khugepaged ||
1281 unmapped <= khugepaged_max_ptes_swap) {
e1e267c7
PX
1282 /*
1283 * Always be strict with uffd-wp
1284 * enabled swap entries. Please see
1285 * comment below for pte_uffd_wp().
1286 */
2bad466c 1287 if (pte_swp_uffd_wp_any(pteval)) {
e1e267c7
PX
1288 result = SCAN_PTE_UFFD_WP;
1289 goto out_unmap;
1290 }
b46e756f
KS
1291 continue;
1292 } else {
1293 result = SCAN_EXCEED_SWAP_PTE;
e9ea874a 1294 count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
b46e756f
KS
1295 goto out_unmap;
1296 }
1297 }
1298 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
d8ea7cc8 1299 ++none_or_zero;
b46e756f 1300 if (!userfaultfd_armed(vma) &&
d8ea7cc8
ZK
1301 (!cc->is_khugepaged ||
1302 none_or_zero <= khugepaged_max_ptes_none)) {
b46e756f
KS
1303 continue;
1304 } else {
1305 result = SCAN_EXCEED_NONE_PTE;
e9ea874a 1306 count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
b46e756f
KS
1307 goto out_unmap;
1308 }
1309 }
e1e267c7
PX
1310 if (pte_uffd_wp(pteval)) {
1311 /*
1312 * Don't collapse the page if any of the small
1313 * PTEs are armed with uffd write protection.
1314 * Here we can also mark the new huge pmd as
1315 * write protected if any of the small ones is
8958b249 1316 * marked but that could bring unknown
e1e267c7
PX
1317 * userfault messages that falls outside of
1318 * the registered range. So, just be simple.
1319 */
1320 result = SCAN_PTE_UFFD_WP;
1321 goto out_unmap;
1322 }
b46e756f
KS
1323 if (pte_write(pteval))
1324 writable = true;
1325
1326 page = vm_normal_page(vma, _address, pteval);
3218f871 1327 if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
b46e756f
KS
1328 result = SCAN_PAGE_NULL;
1329 goto out_unmap;
1330 }
1331
d8ea7cc8
ZK
1332 if (page_mapcount(page) > 1) {
1333 ++shared;
1334 if (cc->is_khugepaged &&
1335 shared > khugepaged_max_ptes_shared) {
1336 result = SCAN_EXCEED_SHARED_PTE;
1337 count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
1338 goto out_unmap;
1339 }
71a2c112
KS
1340 }
1341
5503fbf2 1342 page = compound_head(page);
b46e756f
KS
1343
1344 /*
1345 * Record which node the original page is from and save this
34d6b470 1346 * information to cc->node_load[].
0b8f0d87 1347 * Khugepaged will allocate hugepage from the node has the max
b46e756f
KS
1348 * hit record.
1349 */
1350 node = page_to_nid(page);
7d2c4385 1351 if (hpage_collapse_scan_abort(node, cc)) {
b46e756f
KS
1352 result = SCAN_SCAN_ABORT;
1353 goto out_unmap;
1354 }
34d6b470 1355 cc->node_load[node]++;
b46e756f
KS
1356 if (!PageLRU(page)) {
1357 result = SCAN_PAGE_LRU;
1358 goto out_unmap;
1359 }
1360 if (PageLocked(page)) {
1361 result = SCAN_PAGE_LOCK;
1362 goto out_unmap;
1363 }
1364 if (!PageAnon(page)) {
1365 result = SCAN_PAGE_ANON;
1366 goto out_unmap;
1367 }
1368
1369 /*
9445689f
KS
1370 * Check if the page has any GUP (or other external) pins.
1371 *
cb67f428
HD
1372 * Here the check may be racy:
1373 * it may see total_mapcount > refcount in some cases?
9445689f
KS
1374 * But such case is ephemeral we could always retry collapse
1375 * later. However it may report false positive if the page
1376 * has excessive GUP pins (i.e. 512). Anyway the same check
1377 * will be done again later the risk seems low.
b46e756f 1378 */
9445689f 1379 if (!is_refcount_suitable(page)) {
b46e756f
KS
1380 result = SCAN_PAGE_COUNT;
1381 goto out_unmap;
1382 }
d8ea7cc8
ZK
1383
1384 /*
1385 * If collapse was initiated by khugepaged, check that there is
1386 * enough young pte to justify collapsing the page
1387 */
1388 if (cc->is_khugepaged &&
1389 (pte_young(pteval) || page_is_young(page) ||
1390 PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
1391 address)))
0db501f7 1392 referenced++;
b46e756f 1393 }
ffe945e6 1394 if (!writable) {
b46e756f 1395 result = SCAN_PAGE_RO;
d8ea7cc8
ZK
1396 } else if (cc->is_khugepaged &&
1397 (!referenced ||
1398 (unmapped && referenced < HPAGE_PMD_NR / 2))) {
ffe945e6
KS
1399 result = SCAN_LACK_REFERENCED_PAGE;
1400 } else {
1401 result = SCAN_SUCCEED;
b46e756f
KS
1402 }
1403out_unmap:
1404 pte_unmap_unlock(pte, ptl);
50ad2f24
ZK
1405 if (result == SCAN_SUCCEED) {
1406 result = collapse_huge_page(mm, address, referenced,
1407 unmapped, cc);
c1e8d7c6 1408 /* collapse_huge_page will return with the mmap_lock released */
50ad2f24 1409 *mmap_locked = false;
b46e756f
KS
1410 }
1411out:
1412 trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
1413 none_or_zero, result, unmapped);
50ad2f24 1414 return result;
b46e756f
KS
1415}
1416
b26e2701 1417static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
b46e756f 1418{
b26e2701
QZ
1419 struct mm_slot *slot = &mm_slot->slot;
1420 struct mm_struct *mm = slot->mm;
b46e756f 1421
35f3aa39 1422 lockdep_assert_held(&khugepaged_mm_lock);
b46e756f 1423
7d2c4385 1424 if (hpage_collapse_test_exit(mm)) {
b46e756f 1425 /* free mm_slot */
b26e2701
QZ
1426 hash_del(&slot->hash);
1427 list_del(&slot->mm_node);
b46e756f
KS
1428
1429 /*
1430 * Not strictly needed because the mm exited already.
1431 *
1432 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1433 */
1434
1435 /* khugepaged_mm_lock actually not necessary for the below */
b26e2701 1436 mm_slot_free(mm_slot_cache, mm_slot);
b46e756f
KS
1437 mmdrop(mm);
1438 }
1439}
1440
396bcc52 1441#ifdef CONFIG_SHMEM
27e1f827
SL
1442/*
1443 * Notify khugepaged that given addr of the mm is pte-mapped THP. Then
1444 * khugepaged should try to collapse the page table.
34488399
ZK
1445 *
1446 * Note that following race exists:
1447 * (1) khugepaged calls khugepaged_collapse_pte_mapped_thps() for mm_struct A,
1448 * emptying the A's ->pte_mapped_thp[] array.
1449 * (2) MADV_COLLAPSE collapses some file extent with target mm_struct B, and
1450 * retract_page_tables() finds a VMA in mm_struct A mapping the same extent
1451 * (at virtual address X) and adds an entry (for X) into mm_struct A's
1452 * ->pte_mapped_thp[] array.
1453 * (3) khugepaged calls khugepaged_collapse_scan_file() for mm_struct A at X,
1454 * sees a pte-mapped THP (SCAN_PTE_MAPPED_HUGEPAGE) and adds an entry
1455 * (for X) into mm_struct A's ->pte_mapped_thp[] array.
1456 * Thus, it's possible the same address is added multiple times for the same
1457 * mm_struct. Should this happen, we'll simply attempt
1458 * collapse_pte_mapped_thp() multiple times for the same address, under the same
1459 * exclusive mmap_lock, and assuming the first call is successful, subsequent
1460 * attempts will return quickly (without grabbing any additional locks) when
1461 * a huge pmd is found in find_pmd_or_thp_or_none(). Since this is a cheap
1462 * check, and since this is a rare occurrence, the cost of preventing this
1463 * "multiple-add" is thought to be more expensive than just handling it, should
1464 * it occur.
27e1f827 1465 */
58ac9a89 1466static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
081c3256 1467 unsigned long addr)
27e1f827 1468{
b26e2701
QZ
1469 struct khugepaged_mm_slot *mm_slot;
1470 struct mm_slot *slot;
58ac9a89 1471 bool ret = false;
27e1f827
SL
1472
1473 VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
1474
1475 spin_lock(&khugepaged_mm_lock);
b26e2701
QZ
1476 slot = mm_slot_lookup(mm_slots_hash, mm);
1477 mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
58ac9a89 1478 if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) {
27e1f827 1479 mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr;
58ac9a89
ZK
1480 ret = true;
1481 }
27e1f827 1482 spin_unlock(&khugepaged_mm_lock);
58ac9a89 1483 return ret;
27e1f827
SL
1484}
1485
34488399
ZK
1486/* hpage must be locked, and mmap_lock must be held in write */
1487static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
1488 pmd_t *pmdp, struct page *hpage)
1489{
1490 struct vm_fault vmf = {
1491 .vma = vma,
1492 .address = addr,
1493 .flags = 0,
1494 .pmd = pmdp,
1495 };
1496
1497 VM_BUG_ON(!PageTransHuge(hpage));
1498 mmap_assert_write_locked(vma->vm_mm);
1499
1500 if (do_set_pmd(&vmf, hpage))
1501 return SCAN_FAIL;
1502
1503 get_page(hpage);
1504 return SCAN_SUCCEED;
27e1f827
SL
1505}
1506
8d3c106e
JH
1507/*
1508 * A note about locking:
1509 * Trying to take the page table spinlocks would be useless here because those
1510 * are only used to synchronize:
1511 *
1512 * - modifying terminal entries (ones that point to a data page, not to another
1513 * page table)
1514 * - installing *new* non-terminal entries
1515 *
1516 * Instead, we need roughly the same kind of protection as free_pgtables() or
1517 * mm_take_all_locks() (but only for a single VMA):
1518 * The mmap lock together with this VMA's rmap locks covers all paths towards
1519 * the page table entries we're messing with here, except for hardware page
1520 * table walks and lockless_pages_from_mm().
1521 */
e59a47b8
PT
1522static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
1523 unsigned long addr, pmd_t *pmdp)
1524{
e59a47b8 1525 pmd_t pmd;
f268f6cf 1526 struct mmu_notifier_range range;
e59a47b8 1527
80110bbf 1528 mmap_assert_write_locked(mm);
8d3c106e
JH
1529 if (vma->vm_file)
1530 lockdep_assert_held_write(&vma->vm_file->f_mapping->i_mmap_rwsem);
1531 /*
1532 * All anon_vmas attached to the VMA have the same root and are
1533 * therefore locked by the same lock.
1534 */
1535 if (vma->anon_vma)
1536 lockdep_assert_held_write(&vma->anon_vma->root->rwsem);
1537
7d4a8be0 1538 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
f268f6cf
JH
1539 addr + HPAGE_PMD_SIZE);
1540 mmu_notifier_invalidate_range_start(&range);
e59a47b8 1541 pmd = pmdp_collapse_flush(vma, addr, pmdp);
2ba99c5e 1542 tlb_remove_table_sync_one();
f268f6cf 1543 mmu_notifier_invalidate_range_end(&range);
e59a47b8 1544 mm_dec_nr_ptes(mm);
80110bbf 1545 page_table_check_pte_clear_range(mm, addr, pmd);
e59a47b8
PT
1546 pte_free(mm, pmd_pgtable(pmd));
1547}
1548
27e1f827 1549/**
336e6b53
AS
1550 * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
1551 * address haddr.
1552 *
1553 * @mm: process address space where collapse happens
1554 * @addr: THP collapse address
34488399 1555 * @install_pmd: If a huge PMD should be installed
27e1f827
SL
1556 *
1557 * This function checks whether all the PTEs in the PMD are pointing to the
1558 * right THP. If so, retract the page table so the THP can refault in with
34488399 1559 * as pmd-mapped. Possibly install a huge PMD mapping the THP.
27e1f827 1560 */
34488399
ZK
1561int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
1562 bool install_pmd)
27e1f827
SL
1563{
1564 unsigned long haddr = addr & HPAGE_PMD_MASK;
94d815b2 1565 struct vm_area_struct *vma = vma_lookup(mm, haddr);
119a5fc1 1566 struct page *hpage;
27e1f827 1567 pte_t *start_pte, *pte;
e59a47b8 1568 pmd_t *pmd;
27e1f827 1569 spinlock_t *ptl;
58ac9a89 1570 int count = 0, result = SCAN_FAIL;
27e1f827
SL
1571 int i;
1572
58ac9a89
ZK
1573 mmap_assert_write_locked(mm);
1574
34488399 1575 /* Fast check before locking page if already PMD-mapped */
58ac9a89 1576 result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
34488399
ZK
1577 if (result == SCAN_PMD_MAPPED)
1578 return result;
58ac9a89 1579
27e1f827 1580 if (!vma || !vma->vm_file ||
fef792a4 1581 !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
34488399 1582 return SCAN_VMA_CHECK;
27e1f827
SL
1583
1584 /*
a7f4e6e4
ZK
1585 * If we are here, we've succeeded in replacing all the native pages
1586 * in the page cache with a single hugepage. If a mm were to fault-in
1587 * this memory (mapped by a suitably aligned VMA), we'd get the hugepage
1588 * and map it by a PMD, regardless of sysfs THP settings. As such, let's
1589 * analogously elide sysfs THP settings here.
27e1f827 1590 */
a7f4e6e4 1591 if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
34488399 1592 return SCAN_VMA_CHECK;
27e1f827 1593
deb4c93a
PX
1594 /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
1595 if (userfaultfd_wp(vma))
34488399 1596 return SCAN_PTE_UFFD_WP;
deb4c93a 1597
119a5fc1
HD
1598 hpage = find_lock_page(vma->vm_file->f_mapping,
1599 linear_page_index(vma, haddr));
1600 if (!hpage)
34488399 1601 return SCAN_PAGE_NULL;
119a5fc1 1602
34488399
ZK
1603 if (!PageHead(hpage)) {
1604 result = SCAN_FAIL;
119a5fc1 1605 goto drop_hpage;
34488399 1606 }
119a5fc1 1607
34488399
ZK
1608 if (compound_order(hpage) != HPAGE_PMD_ORDER) {
1609 result = SCAN_PAGE_COMPOUND;
119a5fc1 1610 goto drop_hpage;
34488399 1611 }
119a5fc1 1612
34488399
ZK
1613 switch (result) {
1614 case SCAN_SUCCEED:
1615 break;
1616 case SCAN_PMD_NONE:
1617 /*
1618 * In MADV_COLLAPSE path, possible race with khugepaged where
1619 * all pte entries have been removed and pmd cleared. If so,
1620 * skip all the pte checks and just update the pmd mapping.
1621 */
1622 goto maybe_install_pmd;
1623 default:
119a5fc1 1624 goto drop_hpage;
34488399 1625 }
27e1f827 1626
55fd6fcc
SB
1627 /* Lock the vma before taking i_mmap and page table locks */
1628 vma_start_write(vma);
1629
8d3c106e
JH
1630 /*
1631 * We need to lock the mapping so that from here on, only GUP-fast and
1632 * hardware page walks can access the parts of the page tables that
1633 * we're operating on.
1634 * See collapse_and_free_pmd().
1635 */
1636 i_mmap_lock_write(vma->vm_file->f_mapping);
1637
1638 /*
1639 * This spinlock should be unnecessary: Nobody else should be accessing
1640 * the page tables under spinlock protection here, only
1641 * lockless_pages_from_mm() and the hardware page walker can access page
1642 * tables while all the high-level locks are held in write mode.
1643 */
34488399 1644 result = SCAN_FAIL;
895f5ee4
HD
1645 start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
1646 if (!start_pte)
1647 goto drop_immap;
27e1f827
SL
1648
1649 /* step 1: check all mapped PTEs are to the right huge page */
1650 for (i = 0, addr = haddr, pte = start_pte;
1651 i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
1652 struct page *page;
c33c7948 1653 pte_t ptent = ptep_get(pte);
27e1f827
SL
1654
1655 /* empty pte, skip */
c33c7948 1656 if (pte_none(ptent))
27e1f827
SL
1657 continue;
1658
1659 /* page swapped out, abort */
c33c7948 1660 if (!pte_present(ptent)) {
34488399 1661 result = SCAN_PTE_NON_PRESENT;
27e1f827 1662 goto abort;
34488399 1663 }
27e1f827 1664
c33c7948 1665 page = vm_normal_page(vma, addr, ptent);
3218f871
AS
1666 if (WARN_ON_ONCE(page && is_zone_device_page(page)))
1667 page = NULL;
27e1f827 1668 /*
119a5fc1
HD
1669 * Note that uprobe, debugger, or MAP_PRIVATE may change the
1670 * page table, but the new page will not be a subpage of hpage.
27e1f827 1671 */
119a5fc1 1672 if (hpage + i != page)
27e1f827
SL
1673 goto abort;
1674 count++;
1675 }
1676
1677 /* step 2: adjust rmap */
1678 for (i = 0, addr = haddr, pte = start_pte;
1679 i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
1680 struct page *page;
c33c7948 1681 pte_t ptent = ptep_get(pte);
27e1f827 1682
c33c7948 1683 if (pte_none(ptent))
27e1f827 1684 continue;
c33c7948 1685 page = vm_normal_page(vma, addr, ptent);
3218f871
AS
1686 if (WARN_ON_ONCE(page && is_zone_device_page(page)))
1687 goto abort;
cea86fe2 1688 page_remove_rmap(page, vma, false);
27e1f827
SL
1689 }
1690
1691 pte_unmap_unlock(start_pte, ptl);
1692
1693 /* step 3: set proper refcount and mm_counters. */
119a5fc1 1694 if (count) {
27e1f827
SL
1695 page_ref_sub(hpage, count);
1696 add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
1697 }
1698
34488399 1699 /* step 4: remove pte entries */
ab0c3f12
HD
1700 /* we make no change to anon, but protect concurrent anon page lookup */
1701 if (vma->anon_vma)
1702 anon_vma_lock_write(vma->anon_vma);
1703
e59a47b8 1704 collapse_and_free_pmd(mm, vma, haddr, pmd);
34488399 1705
ab0c3f12
HD
1706 if (vma->anon_vma)
1707 anon_vma_unlock_write(vma->anon_vma);
8d3c106e
JH
1708 i_mmap_unlock_write(vma->vm_file->f_mapping);
1709
34488399
ZK
1710maybe_install_pmd:
1711 /* step 5: install pmd entry */
1712 result = install_pmd
1713 ? set_huge_pmd(vma, haddr, pmd, hpage)
1714 : SCAN_SUCCEED;
1715
119a5fc1
HD
1716drop_hpage:
1717 unlock_page(hpage);
1718 put_page(hpage);
34488399 1719 return result;
27e1f827
SL
1720
1721abort:
1722 pte_unmap_unlock(start_pte, ptl);
895f5ee4 1723drop_immap:
8d3c106e 1724 i_mmap_unlock_write(vma->vm_file->f_mapping);
119a5fc1 1725 goto drop_hpage;
27e1f827
SL
1726}
1727
b26e2701 1728static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot)
27e1f827 1729{
b26e2701
QZ
1730 struct mm_slot *slot = &mm_slot->slot;
1731 struct mm_struct *mm = slot->mm;
27e1f827
SL
1732 int i;
1733
1734 if (likely(mm_slot->nr_pte_mapped_thp == 0))
0edf61e5 1735 return;
27e1f827 1736
d8ed45c5 1737 if (!mmap_write_trylock(mm))
0edf61e5 1738 return;
27e1f827 1739
7d2c4385 1740 if (unlikely(hpage_collapse_test_exit(mm)))
27e1f827
SL
1741 goto out;
1742
1743 for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++)
34488399 1744 collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i], false);
27e1f827
SL
1745
1746out:
1747 mm_slot->nr_pte_mapped_thp = 0;
d8ed45c5 1748 mmap_write_unlock(mm);
27e1f827
SL
1749}
1750
34488399
ZK
1751static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
1752 struct mm_struct *target_mm,
1753 unsigned long target_addr, struct page *hpage,
1754 struct collapse_control *cc)
f3f0e1d2
KS
1755{
1756 struct vm_area_struct *vma;
34488399 1757 int target_result = SCAN_FAIL;
f3f0e1d2
KS
1758
1759 i_mmap_lock_write(mapping);
1760 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
34488399
ZK
1761 int result = SCAN_FAIL;
1762 struct mm_struct *mm = NULL;
1763 unsigned long addr = 0;
1764 pmd_t *pmd;
1765 bool is_target = false;
1766
27e1f827
SL
1767 /*
1768 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
1769 * got written to. These VMAs are likely not worth investing
3e4e28c5 1770 * mmap_write_lock(mm) as PMD-mapping is likely to be split
27e1f827
SL
1771 * later.
1772 *
36ee2c78 1773 * Note that vma->anon_vma check is racy: it can be set up after
c1e8d7c6 1774 * the check but before we took mmap_lock by the fault path.
27e1f827
SL
1775 * But page lock would prevent establishing any new ptes of the
1776 * page, so we are safe.
1777 *
1778 * An alternative would be drop the check, but check that page
1779 * table is clear before calling pmdp_collapse_flush() under
1780 * ptl. It has higher chance to recover THP for the VMA, but
8d3c106e
JH
1781 * has higher cost too. It would also probably require locking
1782 * the anon_vma.
27e1f827 1783 */
023f47a8 1784 if (READ_ONCE(vma->anon_vma)) {
34488399
ZK
1785 result = SCAN_PAGE_ANON;
1786 goto next;
1787 }
f3f0e1d2 1788 addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
34488399
ZK
1789 if (addr & ~HPAGE_PMD_MASK ||
1790 vma->vm_end < addr + HPAGE_PMD_SIZE) {
1791 result = SCAN_VMA_CHECK;
1792 goto next;
1793 }
18e77600 1794 mm = vma->vm_mm;
34488399
ZK
1795 is_target = mm == target_mm && addr == target_addr;
1796 result = find_pmd_or_thp_or_none(mm, addr, &pmd);
1797 if (result != SCAN_SUCCEED)
1798 goto next;
f3f0e1d2 1799 /*
c1e8d7c6 1800 * We need exclusive mmap_lock to retract page table.
27e1f827
SL
1801 *
1802 * We use trylock due to lock inversion: we need to acquire
c1e8d7c6 1803 * mmap_lock while holding page lock. Fault path does it in
27e1f827 1804 * reverse order. Trylock is a way to avoid deadlock.
34488399
ZK
1805 *
1806 * Also, it's not MADV_COLLAPSE's job to collapse other
1807 * mappings - let khugepaged take care of them later.
f3f0e1d2 1808 */
34488399
ZK
1809 result = SCAN_PTE_MAPPED_HUGEPAGE;
1810 if ((cc->is_khugepaged || is_target) &&
1811 mmap_write_trylock(mm)) {
55fd6fcc
SB
1812 /* trylock for the same lock inversion as above */
1813 if (!vma_try_start_write(vma))
1814 goto unlock_next;
1815
023f47a8
JH
1816 /*
1817 * Re-check whether we have an ->anon_vma, because
1818 * collapse_and_free_pmd() requires that either no
1819 * ->anon_vma exists or the anon_vma is locked.
1820 * We already checked ->anon_vma above, but that check
1821 * is racy because ->anon_vma can be populated under the
1822 * mmap lock in read mode.
1823 */
1824 if (vma->anon_vma) {
1825 result = SCAN_PAGE_ANON;
1826 goto unlock_next;
1827 }
deb4c93a
PX
1828 /*
1829 * When a vma is registered with uffd-wp, we can't
1830 * recycle the pmd pgtable because there can be pte
1831 * markers installed. Skip it only, so the rest mm/vma
1832 * can still have the same file mapped hugely, however
1833 * it'll always mapped in small page size for uffd-wp
1834 * registered ranges.
1835 */
34488399
ZK
1836 if (hpage_collapse_test_exit(mm)) {
1837 result = SCAN_ANY_PROCESS;
1838 goto unlock_next;
1839 }
1840 if (userfaultfd_wp(vma)) {
1841 result = SCAN_PTE_UFFD_WP;
1842 goto unlock_next;
1843 }
1844 collapse_and_free_pmd(mm, vma, addr, pmd);
1845 if (!cc->is_khugepaged && is_target)
1846 result = set_huge_pmd(vma, addr, pmd, hpage);
1847 else
1848 result = SCAN_SUCCEED;
1849
1850unlock_next:
18e77600 1851 mmap_write_unlock(mm);
34488399
ZK
1852 goto next;
1853 }
1854 /*
1855 * Calling context will handle target mm/addr. Otherwise, let
1856 * khugepaged try again later.
1857 */
1858 if (!is_target) {
18e77600 1859 khugepaged_add_pte_mapped_thp(mm, addr);
34488399 1860 continue;
f3f0e1d2 1861 }
34488399
ZK
1862next:
1863 if (is_target)
1864 target_result = result;
f3f0e1d2
KS
1865 }
1866 i_mmap_unlock_write(mapping);
34488399 1867 return target_result;
f3f0e1d2
KS
1868}
1869
1870/**
99cb0dbd 1871 * collapse_file - collapse filemap/tmpfs/shmem pages into huge one.
f3f0e1d2 1872 *
336e6b53 1873 * @mm: process address space where collapse happens
34488399 1874 * @addr: virtual collapse start address
336e6b53
AS
1875 * @file: file whose page cache range is collapsed
1876 * @start: first page cache index of the range (HPAGE_PMD_NR aligned)
9710a78a 1877 * @cc: collapse context and scratchpad
336e6b53 1878 *
f3f0e1d2 1879 * Basic scheme is simple, details are more complex:
87c460a0 1880 * - allocate and lock a new huge page;
a2e17cc2 1881 * - scan page cache, locking old pages
99cb0dbd 1882 * + swap/gup in pages if necessary;
a2e17cc2
DS
1883 * - copy data to new page
1884 * - handle shmem holes
1885 * + re-validate that holes weren't filled by someone else
1886 * + check for userfaultfd
ac492b9c 1887 * - finalize updates to the page cache;
77da9389 1888 * - if replacing succeeds:
87c460a0 1889 * + unlock huge page;
a2e17cc2 1890 * + free old pages;
f3f0e1d2 1891 * - if replacing failed;
a2e17cc2 1892 * + unlock old pages
87c460a0 1893 * + unlock and free huge page;
 *
 * Return: an enum scan_result value; on the success path this is the
 * value returned by retract_page_tables().
f3f0e1d2 1894 */
34488399
ZK
1895static int collapse_file(struct mm_struct *mm, unsigned long addr,
1896 struct file *file, pgoff_t start,
1897 struct collapse_control *cc)
f3f0e1d2 1898{
579c571e 1899 struct address_space *mapping = file->f_mapping;
50ad2f24 1900 struct page *hpage;
12904d95
JY
1901 struct page *page;
1902 struct page *tmp;
1903 struct folio *folio;
4c9473e8 1904 pgoff_t index = 0, end = start + HPAGE_PMD_NR;
f3f0e1d2 1905 LIST_HEAD(pagelist);
77da9389 1906 XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
f3f0e1d2 1907 int nr_none = 0, result = SCAN_SUCCEED;
99cb0dbd 1908 bool is_shmem = shmem_file(file);
4c9473e8 1909 int nr = 0;
f3f0e1d2 1910
99cb0dbd 1911 VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
f3f0e1d2
KS
1912 VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
1913
/* Allocate and charge the new huge page; on failure nothing needs rolling back. */
50ad2f24 1914 result = alloc_charge_hpage(&hpage, mm, cc);
9710a78a 1915 if (result != SCAN_SUCCEED)
f3f0e1d2 1916 goto out;
f3f0e1d2 1917
/* Lock the new page and set its index/mapping before the page cache is touched. */
cae106dd
DS
1918 __SetPageLocked(hpage);
1919 if (is_shmem)
1920 __SetPageSwapBacked(hpage);
1921 hpage->index = start;
1922 hpage->mapping = mapping;
1923
6b24ca4a
MWO
1924 /*
1925 * Ensure we have slots for all the pages in the range. This is
1926 * almost certainly a no-op because most of the pages must be present
1927 */
/* The loop below is left via "break" with the xarray lock still held. */
95feeabb
HD
1928 do {
1929 xas_lock_irq(&xas);
1930 xas_create_range(&xas);
1931 if (!xas_error(&xas))
1932 break;
1933 xas_unlock_irq(&xas);
1934 if (!xas_nomem(&xas, GFP_KERNEL)) {
95feeabb 1935 result = SCAN_FAIL;
cae106dd 1936 goto rollback;
95feeabb
HD
1937 }
1938 } while (1);
1939
/*
 * Scan pass: every iteration starts with the xarray lock held. Each old
 * page is locked, pinned, isolated from the LRU, unmapped and queued on
 * pagelist; holes in a shmem range are only counted in nr_none.
 */
77da9389 1940 for (index = start; index < end; index++) {
e8c716bc
HD
1941 xas_set(&xas, index);
1942 page = xas_load(&xas);
77da9389
MW
1943
1944 VM_BUG_ON(index != xas.xa_index);
99cb0dbd
SL
1945 if (is_shmem) {
1946 if (!page) {
1947 /*
1948 * Stop if extent has been truncated or
1949 * hole-punched, and is now completely
1950 * empty.
1951 */
1952 if (index == start) {
1953 if (!xas_next_entry(&xas, end - 1)) {
1954 result = SCAN_TRUNCATED;
1955 goto xa_locked;
1956 }
99cb0dbd
SL
1957 }
/* Charge one page for the hole; the charges are undone in bulk at rollback. */
1958 if (!shmem_charge(mapping->host, 1)) {
1959 result = SCAN_FAIL;
042a3082 1960 goto xa_locked;
701270fa 1961 }
99cb0dbd
SL
1962 nr_none++;
1963 continue;
701270fa 1964 }
99cb0dbd
SL
1965
1966 if (xa_is_value(page) || !PageUptodate(page)) {
1967 xas_unlock_irq(&xas);
1968 /* swap in or instantiate fallocated page */
/* NOTE(review): the code below relies on shmem_get_folio() handing back a locked, referenced folio - confirm. */
7459c149
MWO
1969 if (shmem_get_folio(mapping->host, index,
1970 &folio, SGP_NOALLOC)) {
99cb0dbd
SL
1971 result = SCAN_FAIL;
1972 goto xa_unlocked;
1973 }
1fec6890 1974 /* drain lru cache to help isolate_lru_page() */
efa3d814 1975 lru_add_drain();
7459c149 1976 page = folio_file_page(folio, index);
99cb0dbd
SL
1977 } else if (trylock_page(page)) {
1978 get_page(page);
1979 xas_unlock_irq(&xas);
1980 } else {
1981 result = SCAN_PAGE_LOCK;
042a3082 1982 goto xa_locked;
77da9389 1983 }
99cb0dbd
SL
1984 } else { /* !is_shmem */
1985 if (!page || xa_is_value(page)) {
1986 xas_unlock_irq(&xas);
1987 page_cache_sync_readahead(mapping, &file->f_ra,
1988 file, index,
e5a59d30 1989 end - index);
1fec6890 1990 /* drain lru cache to help isolate_lru_page() */
99cb0dbd
SL
1991 lru_add_drain();
1992 page = find_lock_page(mapping, index);
1993 if (unlikely(page == NULL)) {
1994 result = SCAN_FAIL;
1995 goto xa_unlocked;
1996 }
75f36069
SL
1997 } else if (PageDirty(page)) {
1998 /*
1999 * khugepaged only works on read-only fd,
2000 * so this page is dirty because it hasn't
2001 * been flushed since first write. There
2002 * won't be new dirty pages.
2003 *
2004 * Trigger async flush here and hope the
2005 * writeback is done when khugepaged
2006 * revisits this page.
2007 *
2008 * This is a one-off situation. We are not
2009 * forcing writeback in loop.
2010 */
2011 xas_unlock_irq(&xas);
2012 filemap_flush(mapping);
2013 result = SCAN_FAIL;
2014 goto xa_unlocked;
74c42e1b
RW
2015 } else if (PageWriteback(page)) {
2016 xas_unlock_irq(&xas);
2017 result = SCAN_FAIL;
2018 goto xa_unlocked;
99cb0dbd
SL
2019 } else if (trylock_page(page)) {
2020 get_page(page);
2021 xas_unlock_irq(&xas);
2022 } else {
2023 result = SCAN_PAGE_LOCK;
2024 goto xa_locked;
f3f0e1d2 2025 }
f3f0e1d2
KS
2026 }
2027
2028 /*
b93b0163 2029 * The page must be locked, so we can drop the i_pages lock
f3f0e1d2
KS
2030 * without racing with truncate.
2031 */
2032 VM_BUG_ON_PAGE(!PageLocked(page), page);
4655e5e5
SL
2033
2034 /* make sure the page is up to date */
2035 if (unlikely(!PageUptodate(page))) {
2036 result = SCAN_FAIL;
2037 goto out_unlock;
2038 }
06a5e126
HD
2039
2040 /*
2041 * If file was truncated then extended, or hole-punched, before
2042 * we locked the first page, then a THP might be there already.
58ac9a89 2043 * This will be discovered on the first iteration.
06a5e126
HD
2044 */
2045 if (PageTransCompound(page)) {
58ac9a89
ZK
2046 struct page *head = compound_head(page);
2047
2048 result = compound_order(head) == HPAGE_PMD_ORDER &&
2049 head->index == start
2050 /* Maybe PMD-mapped */
2051 ? SCAN_PTE_MAPPED_HUGEPAGE
2052 : SCAN_PAGE_COMPOUND;
06a5e126
HD
2053 goto out_unlock;
2054 }
f3f0e1d2 2055
64ab3195
VMO
2056 folio = page_folio(page);
2057
2058 if (folio_mapping(folio) != mapping) {
f3f0e1d2
KS
2059 result = SCAN_TRUNCATED;
2060 goto out_unlock;
2061 }
f3f0e1d2 2062
64ab3195
VMO
2063 if (!is_shmem && (folio_test_dirty(folio) ||
2064 folio_test_writeback(folio))) {
4655e5e5
SL
2065 /*
2066 * khugepaged only works on read-only fd, so this
2067 * page is dirty because it hasn't been flushed
2068 * since first write.
2069 */
2070 result = SCAN_FAIL;
2071 goto out_unlock;
2072 }
2073
be2d5756 2074 if (!folio_isolate_lru(folio)) {
f3f0e1d2 2075 result = SCAN_DEL_PAGE_LRU;
042a3082 2076 goto out_unlock;
f3f0e1d2
KS
2077 }
2078
/* From here on a failure must also put the folio back on the LRU before unlocking it. */
64ab3195
VMO
2079 if (folio_has_private(folio) &&
2080 !filemap_release_folio(folio, GFP_KERNEL)) {
99cb0dbd 2081 result = SCAN_PAGE_HAS_PRIVATE;
64ab3195 2082 folio_putback_lru(folio);
99cb0dbd
SL
2083 goto out_unlock;
2084 }
2085
64ab3195
VMO
2086 if (folio_mapped(folio))
2087 try_to_unmap(folio,
869f7ee6 2088 TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH);
f3f0e1d2 2089
77da9389 2090 xas_lock_irq(&xas);
f3f0e1d2 2091
e8c716bc 2092 VM_BUG_ON_PAGE(page != xa_load(xas.xa, index), page);
f3f0e1d2
KS
2093
2094 /*
a2e17cc2 2095 * We control three references to the page:
f3f0e1d2 2096 * - we hold a pin on it;
77da9389 2097 * - one reference from page cache;
f3f0e1d2 2098 * - one from isolate_lru_page;
a2e17cc2
DS
2099 * If those are the only references, then any new usage of the
2100 * page will have to fetch it from the page cache. That requires
2101 * locking the page to handle truncate, so any new usage will be
2102 * blocked until we unlock page after collapse/during rollback.
f3f0e1d2 2103 */
a2e17cc2 2104 if (page_count(page) != 3) {
f3f0e1d2 2105 result = SCAN_PAGE_COUNT;
042a3082
HD
2106 xas_unlock_irq(&xas);
2107 putback_lru_page(page);
2108 goto out_unlock;
f3f0e1d2
KS
2109 }
2110
2111 /*
a2e17cc2 2112 * Accumulate the pages that are being collapsed.
f3f0e1d2
KS
2113 */
2114 list_add_tail(&page->lru, &pagelist);
f3f0e1d2 2115 continue;
/*
 * Per-page failure exit. Every jump here happens with the xarray lock
 * dropped; release the page lock and the reference taken for this page.
 */
f3f0e1d2
KS
2116out_unlock:
2117 unlock_page(page);
2118 put_page(page);
042a3082 2119 goto xa_unlocked;
f3f0e1d2
KS
2120 }
2121
/* The whole range was scanned without failure; the xarray lock is still held. */
12904d95 2122 if (!is_shmem) {
09d91cda 2123 filemap_nr_thps_inc(mapping);
eb6ecbed
CF
2124 /*
2125 * Paired with smp_mb() in do_dentry_open() to ensure
2126 * i_writecount is up to date and the update to nr_thps is
2127 * visible. Ensures the page cache will be truncated if the
2128 * file is opened writable.
2129 */
2130 smp_mb();
2131 if (inode_is_open_for_write(mapping->host)) {
2132 result = SCAN_FAIL;
eb6ecbed 2133 filemap_nr_thps_dec(mapping);
eb6ecbed 2134 }
09d91cda 2135 }
99cb0dbd 2136
/* xa_locked: entered with the xarray lock held. xa_unlocked: entered with it already dropped. */
042a3082
HD
2137xa_locked:
2138 xas_unlock_irq(&xas);
77da9389 2139xa_unlocked:
042a3082 2140
6d9df8a5
HD
2141 /*
2142 * If collapse is successful, flush must be done now before copying.
2143 * If collapse is unsuccessful, does flush actually need to be done?
2144 * Do it anyway, to clear the state.
2145 */
2146 try_to_unmap_flush();
2147
cae106dd
DS
2148 if (result != SCAN_SUCCEED)
2149 goto rollback;
2150
2151 /*
a2e17cc2 2152 * The old pages are locked, so they won't change anymore.
cae106dd
DS
2153 */
/*
 * Copy pass: each old page is copied into its slot of hpage; indices that
 * have no old page on pagelist (holes) are zero-filled instead.
 */
2154 index = start;
2155 list_for_each_entry(page, &pagelist, lru) {
2156 while (index < page->index) {
12904d95
JY
2157 clear_highpage(hpage + (index % HPAGE_PMD_NR));
2158 index++;
2159 }
cae106dd
DS
2160 if (copy_mc_highpage(hpage + (page->index % HPAGE_PMD_NR), page) > 0) {
2161 result = SCAN_COPY_MC;
2162 goto rollback;
2163 }
2164 index++;
2165 }
2166 while (index < end) {
2167 clear_highpage(hpage + (index % HPAGE_PMD_NR));
2168 index++;
2169 }
2170
/*
 * Holes were seen during the scan: recount them under the xarray lock,
 * storing a retry entry in each still-empty slot, and fail if the count
 * no longer matches nr_none.
 */
ac492b9c
DS
2171 if (nr_none) {
2172 struct vm_area_struct *vma;
2173 int nr_none_check = 0;
2174
2175 i_mmap_lock_read(mapping);
2176 xas_lock_irq(&xas);
2177
2178 xas_set(&xas, start);
2179 for (index = start; index < end; index++) {
2180 if (!xas_next(&xas)) {
2181 xas_store(&xas, XA_RETRY_ENTRY);
2182 if (xas_error(&xas)) {
2183 result = SCAN_STORE_FAILED;
2184 goto immap_locked;
2185 }
2186 nr_none_check++;
2187 }
2188 }
2189
2190 if (nr_none != nr_none_check) {
2191 result = SCAN_PAGE_FILLED;
2192 goto immap_locked;
2193 }
2194
2195 /*
2196 * If userspace observed a missing page in a VMA with a MODE_MISSING
2197 * userfaultfd, then it might expect a UFFD_EVENT_PAGEFAULT for that
2198 * page. If so, we need to roll back to avoid suppressing such an
2199 * event. Since wp/minor userfaultfds don't give userspace any
2200 * guarantees that the kernel doesn't fill a missing page with a zero
2201 * page, so they don't matter here.
2202 *
2203 * Any userfaultfds registered after this point will not be able to
2204 * observe any missing pages due to the previously inserted retry
2205 * entries.
2206 */
2207 vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) {
2208 if (userfaultfd_missing(vma)) {
2209 result = SCAN_EXCEED_NONE_PTE;
2210 goto immap_locked;
2211 }
2212 }
2213
2214immap_locked:
2215 i_mmap_unlock_read(mapping);
/* On failure, clear the retry entries stored above before rolling back. */
2216 if (result != SCAN_SUCCEED) {
2217 xas_set(&xas, start);
2218 for (index = start; index < end; index++) {
2219 if (xas_next(&xas) == XA_RETRY_ENTRY)
2220 xas_store(&xas, NULL);
2221 }
2222
2223 xas_unlock_irq(&xas);
2224 goto rollback;
2225 }
2226 } else {
2227 xas_lock_irq(&xas);
12904d95
JY
2228 }
2229
/* The xarray lock is held on both branches above; no path below jumps to rollback. */
2230 nr = thp_nr_pages(hpage);
cae106dd
DS
2231 if (is_shmem)
2232 __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
2233 else
2234 __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);
12904d95 2235
cae106dd
DS
2236 if (nr_none) {
2237 __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none);
2238 /* nr_none is always 0 for non-shmem. */
2239 __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none);
2240 }
f3f0e1d2 2241
a2e17cc2
DS
2242 /*
2243 * Mark hpage as uptodate before inserting it into the page cache so
2244 * that it isn't mistaken for an fallocated but unwritten page.
2245 */
cae106dd
DS
2246 folio = page_folio(hpage);
2247 folio_mark_uptodate(folio);
2248 folio_ref_add(folio, HPAGE_PMD_NR - 1);
284a344e 2249
cae106dd
DS
2250 if (is_shmem)
2251 folio_mark_dirty(folio);
2252 folio_add_lru(folio);
f3f0e1d2 2253
a2e17cc2
DS
2254 /* Join all the small entries into a single multi-index entry. */
2255 xas_set_order(&xas, start, HPAGE_PMD_ORDER);
2256 xas_store(&xas, hpage);
0175ab61 2257 WARN_ON_ONCE(xas_error(&xas));
a2e17cc2
DS
2258 xas_unlock_irq(&xas);
2259
cae106dd
DS
2260 /*
2261 * Remove pte page tables, so we can re-fault the page as huge.
2262 */
2263 result = retract_page_tables(mapping, start, mm, addr, hpage,
2264 cc);
2265 unlock_page(hpage);
ac492b9c
DS
2266
2267 /*
2268 * The collapse has succeeded, so free the old pages.
2269 */
2270 list_for_each_entry_safe(page, tmp, &pagelist, lru) {
2271 list_del(&page->lru);
2272 page->mapping = NULL;
ac492b9c
DS
2273 ClearPageActive(page);
2274 ClearPageUnevictable(page);
2275 unlock_page(page);
/* Drop the three references checked during the scan: our pin, the page cache's and the LRU isolation's. */
a2e17cc2 2276 folio_put_refs(page_folio(page), 3);
ac492b9c
DS
2277 }
2278
cae106dd
DS
2279 goto out;
2280
2281rollback:
2282 /* Something went wrong: roll back page cache changes */
/*
 * Undo the per-hole shmem_charge() calls made during the scan.
 * NOTE(review): nrpages is decremented here on the assumption that
 * shmem_charge() incremented it - confirm against mm/shmem.c.
 */
cae106dd 2283 if (nr_none) {
a2e17cc2 2284 xas_lock_irq(&xas);
cae106dd
DS
2285 mapping->nrpages -= nr_none;
2286 shmem_uncharge(mapping->host, nr_none);
a2e17cc2 2287 xas_unlock_irq(&xas);
cae106dd 2288 }
aaa52e34 2289
/* Release every old page queued so far: unlock it, return it to the LRU and drop our pin. */
a2e17cc2 2290 list_for_each_entry_safe(page, tmp, &pagelist, lru) {
cae106dd 2291 list_del(&page->lru);
cae106dd
DS
2292 unlock_page(page);
2293 putback_lru_page(page);
a2e17cc2 2294 put_page(page);
cae106dd 2295 }
cae106dd
DS
2296 /*
2297 * Undo the updates of filemap_nr_thps_inc for non-SHMEM
2298 * file only. This undo is not needed unless failure is
2299 * due to SCAN_COPY_MC.
2300 */
2301 if (!is_shmem && result == SCAN_COPY_MC) {
2302 filemap_nr_thps_dec(mapping);
12904d95 2303 /*
cae106dd
DS
2304 * Paired with smp_mb() in do_dentry_open() to
2305 * ensure the update to nr_thps is visible.
12904d95 2306 */
cae106dd
DS
2307 smp_mb();
2308 }
12904d95 2309
/* The huge page never reached the page cache: detach, unlock and drop it. */
cae106dd 2310 hpage->mapping = NULL;
042a3082 2311
cae106dd
DS
2312 unlock_page(hpage);
2313 put_page(hpage);
/* Common exit: pagelist has been emptied on every path that reaches this point. */
f3f0e1d2
KS
2314out:
2315 VM_BUG_ON(!list_empty(&pagelist));
4c9473e8 2316 trace_mm_khugepaged_collapse_file(mm, hpage, index, is_shmem, addr, file, nr, result);
50ad2f24 2317 return result;
f3f0e1d2
KS
2318}
2319
34488399
ZK
2320static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
2321 struct file *file, pgoff_t start,
2322 struct collapse_control *cc)
f3f0e1d2
KS
2323{
2324 struct page *page = NULL;
579c571e 2325 struct address_space *mapping = file->f_mapping;
85b392db 2326 XA_STATE(xas, &mapping->i_pages, start);
f3f0e1d2
KS
2327 int present, swap;
2328 int node = NUMA_NO_NODE;
2329 int result = SCAN_SUCCEED;
2330
2331 present = 0;
2332 swap = 0;
34d6b470 2333 memset(cc->node_load, 0, sizeof(cc->node_load));
e031ff96 2334 nodes_clear(cc->alloc_nmask);
f3f0e1d2 2335 rcu_read_lock();
85b392db
MW
2336 xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
2337 if (xas_retry(&xas, page))
f3f0e1d2 2338 continue;
f3f0e1d2 2339
85b392db 2340 if (xa_is_value(page)) {
d8ea7cc8
ZK
2341 ++swap;
2342 if (cc->is_khugepaged &&
2343 swap > khugepaged_max_ptes_swap) {
f3f0e1d2 2344 result = SCAN_EXCEED_SWAP_PTE;
e9ea874a 2345 count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
f3f0e1d2
KS
2346 break;
2347 }
2348 continue;
2349 }
2350
6b24ca4a 2351 /*
58ac9a89 2352 * TODO: khugepaged should compact smaller compound pages
6b24ca4a
MWO
2353 * into a PMD sized page
2354 */
f3f0e1d2 2355 if (PageTransCompound(page)) {
58ac9a89
ZK
2356 struct page *head = compound_head(page);
2357
2358 result = compound_order(head) == HPAGE_PMD_ORDER &&
2359 head->index == start
2360 /* Maybe PMD-mapped */
2361 ? SCAN_PTE_MAPPED_HUGEPAGE
2362 : SCAN_PAGE_COMPOUND;
2363 /*
2364 * For SCAN_PTE_MAPPED_HUGEPAGE, further processing
2365 * by the caller won't touch the page cache, and so
2366 * it's safe to skip LRU and refcount checks before
2367 * returning.
2368 */
f3f0e1d2
KS
2369 break;
2370 }
2371
2372 node = page_to_nid(page);
7d2c4385 2373 if (hpage_collapse_scan_abort(node, cc)) {
f3f0e1d2
KS
2374 result = SCAN_SCAN_ABORT;
2375 break;
2376 }
34d6b470 2377 cc->node_load[node]++;
f3f0e1d2
KS
2378
2379 if (!PageLRU(page)) {
2380 result = SCAN_PAGE_LRU;
2381 break;
2382 }
2383
99cb0dbd
SL
2384 if (page_count(page) !=
2385 1 + page_mapcount(page) + page_has_private(page)) {
f3f0e1d2
KS
2386 result = SCAN_PAGE_COUNT;
2387 break;
2388 }
2389
2390 /*
2391 * We probably should check if the page is referenced here, but
2392 * nobody would transfer pte_young() to PageReferenced() for us.
2393 * And rmap walk here is just too costly...
2394 */
2395
2396 present++;
2397
2398 if (need_resched()) {
85b392db 2399 xas_pause(&xas);
f3f0e1d2 2400 cond_resched_rcu();
f3f0e1d2
KS
2401 }
2402 }
2403 rcu_read_unlock();
2404
2405 if (result == SCAN_SUCCEED) {
d8ea7cc8
ZK
2406 if (cc->is_khugepaged &&
2407 present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
f3f0e1d2 2408 result = SCAN_EXCEED_NONE_PTE;
e9ea874a 2409 count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
f3f0e1d2 2410 } else {
34488399 2411 result = collapse_file(mm, addr, file, start, cc);
f3f0e1d2
KS
2412 }
2413 }
2414
045634ff 2415 trace_mm_khugepaged_scan_file(mm, page, file, present, swap, result);
50ad2f24 2416 return result;
f3f0e1d2
KS
2417}
2418#else
34488399
ZK
2419static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
2420 struct file *file, pgoff_t start,
2421 struct collapse_control *cc)
f3f0e1d2
KS
2422{
2423 BUILD_BUG();
2424}
27e1f827 2425
/* Stub for the #else configuration: there is nothing to collapse. */
static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot)
{
}
58ac9a89
ZK
2429
2430static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
2431 unsigned long addr)
2432{
2433 return false;
2434}
f3f0e1d2
KS
2435#endif
2436
50ad2f24 2437static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
34d6b470 2438 struct collapse_control *cc)
b46e756f
KS
2439 __releases(&khugepaged_mm_lock)
2440 __acquires(&khugepaged_mm_lock)
2441{
68540502 2442 struct vma_iterator vmi;
b26e2701
QZ
2443 struct khugepaged_mm_slot *mm_slot;
2444 struct mm_slot *slot;
b46e756f
KS
2445 struct mm_struct *mm;
2446 struct vm_area_struct *vma;
2447 int progress = 0;
2448
2449 VM_BUG_ON(!pages);
35f3aa39 2450 lockdep_assert_held(&khugepaged_mm_lock);
50ad2f24 2451 *result = SCAN_FAIL;
b46e756f 2452
b26e2701 2453 if (khugepaged_scan.mm_slot) {
b46e756f 2454 mm_slot = khugepaged_scan.mm_slot;
b26e2701
QZ
2455 slot = &mm_slot->slot;
2456 } else {
2457 slot = list_entry(khugepaged_scan.mm_head.next,
b46e756f 2458 struct mm_slot, mm_node);
b26e2701 2459 mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
b46e756f
KS
2460 khugepaged_scan.address = 0;
2461 khugepaged_scan.mm_slot = mm_slot;
2462 }
2463 spin_unlock(&khugepaged_mm_lock);
27e1f827 2464 khugepaged_collapse_pte_mapped_thps(mm_slot);
b46e756f 2465
b26e2701 2466 mm = slot->mm;
3b454ad3
YS
2467 /*
2468 * Don't wait for semaphore (to avoid long wait times). Just move to
2469 * the next mm on the list.
2470 */
2471 vma = NULL;
d8ed45c5 2472 if (unlikely(!mmap_read_trylock(mm)))
c1e8d7c6 2473 goto breakouterloop_mmap_lock;
b46e756f
KS
2474
2475 progress++;
68540502
MWO
2476 if (unlikely(hpage_collapse_test_exit(mm)))
2477 goto breakouterloop;
2478
2479 vma_iter_init(&vmi, mm, khugepaged_scan.address);
2480 for_each_vma(vmi, vma) {
b46e756f
KS
2481 unsigned long hstart, hend;
2482
2483 cond_resched();
7d2c4385 2484 if (unlikely(hpage_collapse_test_exit(mm))) {
b46e756f
KS
2485 progress++;
2486 break;
2487 }
a7f4e6e4 2488 if (!hugepage_vma_check(vma, vma->vm_flags, false, false, true)) {
b46e756f
KS
2489skip:
2490 progress++;
2491 continue;
2492 }
4fa6893f
YS
2493 hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE);
2494 hend = round_down(vma->vm_end, HPAGE_PMD_SIZE);
b46e756f
KS
2495 if (khugepaged_scan.address > hend)
2496 goto skip;
2497 if (khugepaged_scan.address < hstart)
2498 khugepaged_scan.address = hstart;
2499 VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
2500
2501 while (khugepaged_scan.address < hend) {
50ad2f24
ZK
2502 bool mmap_locked = true;
2503
b46e756f 2504 cond_resched();
7d2c4385 2505 if (unlikely(hpage_collapse_test_exit(mm)))
b46e756f
KS
2506 goto breakouterloop;
2507
2508 VM_BUG_ON(khugepaged_scan.address < hstart ||
2509 khugepaged_scan.address + HPAGE_PMD_SIZE >
2510 hend);
99cb0dbd 2511 if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
396bcc52 2512 struct file *file = get_file(vma->vm_file);
f3f0e1d2
KS
2513 pgoff_t pgoff = linear_page_index(vma,
2514 khugepaged_scan.address);
99cb0dbd 2515
d8ed45c5 2516 mmap_read_unlock(mm);
34488399
ZK
2517 *result = hpage_collapse_scan_file(mm,
2518 khugepaged_scan.address,
2519 file, pgoff, cc);
50ad2f24 2520 mmap_locked = false;
f3f0e1d2
KS
2521 fput(file);
2522 } else {
7d2c4385
ZK
2523 *result = hpage_collapse_scan_pmd(mm, vma,
2524 khugepaged_scan.address,
2525 &mmap_locked,
2526 cc);
f3f0e1d2 2527 }
58ac9a89
ZK
2528 switch (*result) {
2529 case SCAN_PTE_MAPPED_HUGEPAGE: {
2530 pmd_t *pmd;
2531
2532 *result = find_pmd_or_thp_or_none(mm,
2533 khugepaged_scan.address,
2534 &pmd);
2535 if (*result != SCAN_SUCCEED)
2536 break;
2537 if (!khugepaged_add_pte_mapped_thp(mm,
2538 khugepaged_scan.address))
2539 break;
2540 } fallthrough;
2541 case SCAN_SUCCEED:
50ad2f24 2542 ++khugepaged_pages_collapsed;
58ac9a89
ZK
2543 break;
2544 default:
2545 break;
f3f0e1d2 2546 }
58ac9a89 2547
b46e756f
KS
2548 /* move to next address */
2549 khugepaged_scan.address += HPAGE_PMD_SIZE;
2550 progress += HPAGE_PMD_NR;
50ad2f24
ZK
2551 if (!mmap_locked)
2552 /*
2553 * We released mmap_lock so break loop. Note
2554 * that we drop mmap_lock before all hugepage
2555 * allocations, so if allocation fails, we are
2556 * guaranteed to break here and report the
2557 * correct result back to caller.
2558 */
c1e8d7c6 2559 goto breakouterloop_mmap_lock;
b46e756f
KS
2560 if (progress >= pages)
2561 goto breakouterloop;
2562 }
2563 }
2564breakouterloop:
d8ed45c5 2565 mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */
c1e8d7c6 2566breakouterloop_mmap_lock:
b46e756f
KS
2567
2568 spin_lock(&khugepaged_mm_lock);
2569 VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
2570 /*
2571 * Release the current mm_slot if this mm is about to die, or
2572 * if we scanned all vmas of this mm.
2573 */
7d2c4385 2574 if (hpage_collapse_test_exit(mm) || !vma) {
b46e756f
KS
2575 /*
2576 * Make sure that if mm_users is reaching zero while
2577 * khugepaged runs here, khugepaged_exit will find
2578 * mm_slot not pointing to the exiting mm.
2579 */
b26e2701
QZ
2580 if (slot->mm_node.next != &khugepaged_scan.mm_head) {
2581 slot = list_entry(slot->mm_node.next,
2582 struct mm_slot, mm_node);
2583 khugepaged_scan.mm_slot =
2584 mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
b46e756f
KS
2585 khugepaged_scan.address = 0;
2586 } else {
2587 khugepaged_scan.mm_slot = NULL;
2588 khugepaged_full_scans++;
2589 }
2590
2591 collect_mm_slot(mm_slot);
2592 }
2593
2594 return progress;
2595}
2596
2597static int khugepaged_has_work(void)
2598{
2599 return !list_empty(&khugepaged_scan.mm_head) &&
1064026b 2600 hugepage_flags_enabled();
b46e756f
KS
2601}
2602
2603static int khugepaged_wait_event(void)
2604{
2605 return !list_empty(&khugepaged_scan.mm_head) ||
2606 kthread_should_stop();
2607}
2608
34d6b470 2609static void khugepaged_do_scan(struct collapse_control *cc)
b46e756f 2610{
b46e756f 2611 unsigned int progress = 0, pass_through_head = 0;
89dc6a96 2612 unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
b46e756f 2613 bool wait = true;
50ad2f24 2614 int result = SCAN_SUCCEED;
b46e756f 2615
a980df33
KS
2616 lru_add_drain_all();
2617
c6a7f445 2618 while (true) {
b46e756f
KS
2619 cond_resched();
2620
2621 if (unlikely(kthread_should_stop() || try_to_freeze()))
2622 break;
2623
2624 spin_lock(&khugepaged_mm_lock);
2625 if (!khugepaged_scan.mm_slot)
2626 pass_through_head++;
2627 if (khugepaged_has_work() &&
2628 pass_through_head < 2)
2629 progress += khugepaged_scan_mm_slot(pages - progress,
50ad2f24 2630 &result, cc);
b46e756f
KS
2631 else
2632 progress = pages;
2633 spin_unlock(&khugepaged_mm_lock);
b46e756f 2634
c6a7f445
YS
2635 if (progress >= pages)
2636 break;
2637
50ad2f24 2638 if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) {
c6a7f445
YS
2639 /*
2640 * If fail to allocate the first time, try to sleep for
2641 * a while. When hit again, cancel the scan.
2642 */
2643 if (!wait)
2644 break;
2645 wait = false;
c6a7f445
YS
2646 khugepaged_alloc_sleep();
2647 }
2648 }
b46e756f
KS
2649}
2650
2651static bool khugepaged_should_wakeup(void)
2652{
2653 return kthread_should_stop() ||
2654 time_after_eq(jiffies, khugepaged_sleep_expire);
2655}
2656
2657static void khugepaged_wait_work(void)
2658{
2659 if (khugepaged_has_work()) {
2660 const unsigned long scan_sleep_jiffies =
2661 msecs_to_jiffies(khugepaged_scan_sleep_millisecs);
2662
2663 if (!scan_sleep_jiffies)
2664 return;
2665
2666 khugepaged_sleep_expire = jiffies + scan_sleep_jiffies;
2667 wait_event_freezable_timeout(khugepaged_wait,
2668 khugepaged_should_wakeup(),
2669 scan_sleep_jiffies);
2670 return;
2671 }
2672
1064026b 2673 if (hugepage_flags_enabled())
b46e756f
KS
2674 wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
2675}
2676
2677static int khugepaged(void *none)
2678{
b26e2701 2679 struct khugepaged_mm_slot *mm_slot;
b46e756f
KS
2680
2681 set_freezable();
2682 set_user_nice(current, MAX_NICE);
2683
2684 while (!kthread_should_stop()) {
34d6b470 2685 khugepaged_do_scan(&khugepaged_collapse_control);
b46e756f
KS
2686 khugepaged_wait_work();
2687 }
2688
2689 spin_lock(&khugepaged_mm_lock);
2690 mm_slot = khugepaged_scan.mm_slot;
2691 khugepaged_scan.mm_slot = NULL;
2692 if (mm_slot)
2693 collect_mm_slot(mm_slot);
2694 spin_unlock(&khugepaged_mm_lock);
2695 return 0;
2696}
2697
2698static void set_recommended_min_free_kbytes(void)
2699{
2700 struct zone *zone;
2701 int nr_zones = 0;
2702 unsigned long recommended_min;
2703
1064026b 2704 if (!hugepage_flags_enabled()) {
bd3400ea
LF
2705 calculate_min_free_kbytes();
2706 goto update_wmarks;
2707 }
2708
b7d349c7
JK
2709 for_each_populated_zone(zone) {
2710 /*
2711 * We don't need to worry about fragmentation of
2712 * ZONE_MOVABLE since it only has movable pages.
2713 */
2714 if (zone_idx(zone) > gfp_zone(GFP_USER))
2715 continue;
2716
b46e756f 2717 nr_zones++;
b7d349c7 2718 }
b46e756f
KS
2719
2720 /* Ensure 2 pageblocks are free to assist fragmentation avoidance */
2721 recommended_min = pageblock_nr_pages * nr_zones * 2;
2722
2723 /*
2724 * Make sure that on average at least two pageblocks are almost free
2725 * of another type, one for a migratetype to fall back to and a
2726 * second to avoid subsequent fallbacks of other types There are 3
2727 * MIGRATE_TYPES we care about.
2728 */
2729 recommended_min += pageblock_nr_pages * nr_zones *
2730 MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
2731
2732 /* don't ever allow to reserve more than 5% of the lowmem */
2733 recommended_min = min(recommended_min,
2734 (unsigned long) nr_free_buffer_pages() / 20);
2735 recommended_min <<= (PAGE_SHIFT-10);
2736
2737 if (recommended_min > min_free_kbytes) {
2738 if (user_min_free_kbytes >= 0)
2739 pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
2740 min_free_kbytes, recommended_min);
2741
2742 min_free_kbytes = recommended_min;
2743 }
bd3400ea
LF
2744
2745update_wmarks:
b46e756f
KS
2746 setup_per_zone_wmarks();
2747}
2748
2749int start_stop_khugepaged(void)
2750{
b46e756f
KS
2751 int err = 0;
2752
2753 mutex_lock(&khugepaged_mutex);
1064026b 2754 if (hugepage_flags_enabled()) {
b46e756f
KS
2755 if (!khugepaged_thread)
2756 khugepaged_thread = kthread_run(khugepaged, NULL,
2757 "khugepaged");
2758 if (IS_ERR(khugepaged_thread)) {
2759 pr_err("khugepaged: kthread_run(khugepaged) failed\n");
2760 err = PTR_ERR(khugepaged_thread);
2761 khugepaged_thread = NULL;
2762 goto fail;
2763 }
2764
2765 if (!list_empty(&khugepaged_scan.mm_head))
2766 wake_up_interruptible(&khugepaged_wait);
b46e756f
KS
2767 } else if (khugepaged_thread) {
2768 kthread_stop(khugepaged_thread);
2769 khugepaged_thread = NULL;
2770 }
bd3400ea 2771 set_recommended_min_free_kbytes();
b46e756f
KS
2772fail:
2773 mutex_unlock(&khugepaged_mutex);
2774 return err;
2775}
4aab2be0
VB
2776
2777void khugepaged_min_free_kbytes_update(void)
2778{
2779 mutex_lock(&khugepaged_mutex);
1064026b 2780 if (hugepage_flags_enabled() && khugepaged_thread)
4aab2be0
VB
2781 set_recommended_min_free_kbytes();
2782 mutex_unlock(&khugepaged_mutex);
2783}
7d8faaf1 2784
57e9cc50
JW
2785bool current_is_khugepaged(void)
2786{
2787 return kthread_func(current) == khugepaged;
2788}
2789
7d8faaf1
ZK
2790static int madvise_collapse_errno(enum scan_result r)
2791{
2792 /*
2793 * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide
2794 * actionable feedback to caller, so they may take an appropriate
2795 * fallback measure depending on the nature of the failure.
2796 */
2797 switch (r) {
2798 case SCAN_ALLOC_HUGE_PAGE_FAIL:
2799 return -ENOMEM;
2800 case SCAN_CGROUP_CHARGE_FAIL:
ac492b9c 2801 case SCAN_EXCEED_NONE_PTE:
7d8faaf1
ZK
2802 return -EBUSY;
2803 /* Resource temporary unavailable - trying again might succeed */
ae63c898 2804 case SCAN_PAGE_COUNT:
7d8faaf1
ZK
2805 case SCAN_PAGE_LOCK:
2806 case SCAN_PAGE_LRU:
0f3e2a2c 2807 case SCAN_DEL_PAGE_LRU:
ac492b9c 2808 case SCAN_PAGE_FILLED:
7d8faaf1
ZK
2809 return -EAGAIN;
2810 /*
2811 * Other: Trying again likely not to succeed / error intrinsic to
2812 * specified memory range. khugepaged likely won't be able to collapse
2813 * either.
2814 */
2815 default:
2816 return -EINVAL;
2817 }
2818}
2819
2820int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
2821 unsigned long start, unsigned long end)
2822{
2823 struct collapse_control *cc;
2824 struct mm_struct *mm = vma->vm_mm;
2825 unsigned long hstart, hend, addr;
2826 int thps = 0, last_fail = SCAN_FAIL;
2827 bool mmap_locked = true;
2828
2829 BUG_ON(vma->vm_start > start);
2830 BUG_ON(vma->vm_end < end);
2831
2832 *prev = vma;
2833
7d8faaf1
ZK
2834 if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
2835 return -EINVAL;
2836
2837 cc = kmalloc(sizeof(*cc), GFP_KERNEL);
2838 if (!cc)
2839 return -ENOMEM;
2840 cc->is_khugepaged = false;
7d8faaf1
ZK
2841
2842 mmgrab(mm);
2843 lru_add_drain_all();
2844
2845 hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2846 hend = end & HPAGE_PMD_MASK;
2847
2848 for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
2849 int result = SCAN_FAIL;
2850
2851 if (!mmap_locked) {
2852 cond_resched();
2853 mmap_read_lock(mm);
2854 mmap_locked = true;
34488399
ZK
2855 result = hugepage_vma_revalidate(mm, addr, false, &vma,
2856 cc);
7d8faaf1
ZK
2857 if (result != SCAN_SUCCEED) {
2858 last_fail = result;
2859 goto out_nolock;
2860 }
4d24de94 2861
52dc0310 2862 hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
7d8faaf1
ZK
2863 }
2864 mmap_assert_locked(mm);
2865 memset(cc->node_load, 0, sizeof(cc->node_load));
e031ff96 2866 nodes_clear(cc->alloc_nmask);
34488399
ZK
2867 if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
2868 struct file *file = get_file(vma->vm_file);
2869 pgoff_t pgoff = linear_page_index(vma, addr);
2870
2871 mmap_read_unlock(mm);
2872 mmap_locked = false;
2873 result = hpage_collapse_scan_file(mm, addr, file, pgoff,
2874 cc);
2875 fput(file);
2876 } else {
2877 result = hpage_collapse_scan_pmd(mm, vma, addr,
2878 &mmap_locked, cc);
2879 }
7d8faaf1
ZK
2880 if (!mmap_locked)
2881 *prev = NULL; /* Tell caller we dropped mmap_lock */
2882
34488399 2883handle_result:
7d8faaf1
ZK
2884 switch (result) {
2885 case SCAN_SUCCEED:
2886 case SCAN_PMD_MAPPED:
2887 ++thps;
2888 break;
34488399
ZK
2889 case SCAN_PTE_MAPPED_HUGEPAGE:
2890 BUG_ON(mmap_locked);
2891 BUG_ON(*prev);
2892 mmap_write_lock(mm);
2893 result = collapse_pte_mapped_thp(mm, addr, true);
2894 mmap_write_unlock(mm);
2895 goto handle_result;
7d8faaf1
ZK
2896 /* Whitelisted set of results where continuing OK */
2897 case SCAN_PMD_NULL:
2898 case SCAN_PTE_NON_PRESENT:
2899 case SCAN_PTE_UFFD_WP:
2900 case SCAN_PAGE_RO:
2901 case SCAN_LACK_REFERENCED_PAGE:
2902 case SCAN_PAGE_NULL:
2903 case SCAN_PAGE_COUNT:
2904 case SCAN_PAGE_LOCK:
2905 case SCAN_PAGE_COMPOUND:
2906 case SCAN_PAGE_LRU:
0f3e2a2c 2907 case SCAN_DEL_PAGE_LRU:
7d8faaf1
ZK
2908 last_fail = result;
2909 break;
2910 default:
2911 last_fail = result;
2912 /* Other error, exit */
2913 goto out_maybelock;
2914 }
2915 }
2916
2917out_maybelock:
2918 /* Caller expects us to hold mmap_lock on return */
2919 if (!mmap_locked)
2920 mmap_read_lock(mm);
2921out_nolock:
2922 mmap_assert_locked(mm);
2923 mmdrop(mm);
2924 kfree(cc);
2925
2926 return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0
2927 : madvise_collapse_errno(last_fail);
2928}