// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/page_alloc.c
 *
 *  Manages the free list, the system allocates free pages here.
 *  Note that kmalloc() lives in slab.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
 *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/interrupt.h>
#include <linux/jiffies.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/kasan.h>
#include <linux/kmsan.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/pagevec.h>
#include <linux/ratelimit.h>
#include <linux/oom.h>
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmstat.h>
#include <linux/sort.h>
#include <linux/pfn.h>
#include <linux/fault-inject.h>
#include <linux/compaction.h>
#include <trace/events/kmem.h>
#include <trace/events/oom.h>
#include <linux/prefetch.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/sched/mm.h>
#include <linux/page_owner.h>
#include <linux/page_table_check.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/lockdep.h>
#include <linux/nmi.h>
#include <linux/psi.h>
#include <linux/khugepaged.h>
#include <linux/delayacct.h>
#include <asm/div64.h>
#include "internal.h"
#include "shuffle.h"
#include "page_reporting.h"

/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
typedef int __bitwise fpi_t;

/* No special request */
#define FPI_NONE		((__force fpi_t)0)

/*
 * Skip free page reporting notification for the (possibly merged) page.
 * This does not hinder free page reporting from grabbing the page,
 * reporting it and marking it "reported" - it only skips notifying
 * the free page reporting infrastructure about a newly freed page. For
 * example, used when temporarily pulling a page from a freelist and
 * putting it back unmodified.
 */
#define FPI_SKIP_REPORT_NOTIFY	((__force fpi_t)BIT(0))

/*
 * Place the (possibly merged) page to the tail of the freelist. Will ignore
 * page shuffling (relevant code - e.g., memory onlining - is expected to
 * shuffle the whole zone).
 *
 * Note: No code should rely on this flag for correctness - it's purely
 *       to allow for optimizations when handing back either fresh pages
 *       (memory onlining) or untouched pages (page isolation, free page
 *       reporting).
 */
#define FPI_TO_TAIL		((__force fpi_t)BIT(1))

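/*
 * Illustrative sketch (not part of the original file): fpi_t values are
 * plain bit flags, so internal callers can OR them into one argument and
 * the free path tests each bit individually, as __free_one_page() does
 * further below.  The combination shown here is only an example.
 */
#if 0
static void fpi_flags_sketch(void)
{
	fpi_t fpi_flags = FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL;

	if (fpi_flags & FPI_TO_TAIL)
		;	/* the (possibly merged) page would go to the freelist tail */
	if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
		;	/* free page reporting would be notified */
}
#endif
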
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)

#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
/*
 * On SMP, spin_trylock is sufficient protection.
 * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
 */
#define pcp_trylock_prepare(flags)	do { } while (0)
#define pcp_trylock_finish(flag)	do { } while (0)
#else

/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
#define pcp_trylock_prepare(flags)	local_irq_save(flags)
#define pcp_trylock_finish(flags)	local_irq_restore(flags)
#endif

/*
 * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid
 * a migration causing the wrong PCP to be locked and remote memory being
 * potentially allocated, pin the task to the CPU for the lookup+lock.
 * preempt_disable is used on !RT because it is faster than migrate_disable.
 * migrate_disable is used on RT because otherwise RT spinlock usage is
 * interfered with and a high priority task cannot preempt the allocator.
 */
#ifndef CONFIG_PREEMPT_RT
#define pcpu_task_pin()		preempt_disable()
#define pcpu_task_unpin()	preempt_enable()
#else
#define pcpu_task_pin()		migrate_disable()
#define pcpu_task_unpin()	migrate_enable()
#endif

/*
 * Generic helper to look up and lock a per-cpu variable with an embedded
 * spinlock. The return value should be used with the matching unlock helper.
 */
#define pcpu_spin_lock(type, member, ptr)			\
({								\
	type *_ret;						\
	pcpu_task_pin();					\
	_ret = this_cpu_ptr(ptr);				\
	spin_lock(&_ret->member);				\
	_ret;							\
})

#define pcpu_spin_trylock(type, member, ptr)			\
({								\
	type *_ret;						\
	pcpu_task_pin();					\
	_ret = this_cpu_ptr(ptr);				\
	if (!spin_trylock(&_ret->member)) {			\
		pcpu_task_unpin();				\
		_ret = NULL;					\
	}							\
	_ret;							\
})

#define pcpu_spin_unlock(member, ptr)				\
({								\
	spin_unlock(&ptr->member);				\
	pcpu_task_unpin();					\
})

/* struct per_cpu_pages specific helpers. */
#define pcp_spin_lock(ptr)					\
	pcpu_spin_lock(struct per_cpu_pages, lock, ptr)

#define pcp_spin_trylock(ptr)					\
	pcpu_spin_trylock(struct per_cpu_pages, lock, ptr)

#define pcp_spin_unlock(ptr)					\
	pcpu_spin_unlock(lock, ptr)

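/*
 * Illustrative sketch (not part of the original file): how the pcp lock
 * helpers above are meant to be paired.  The per-cpu pointer and the work
 * done under the lock are placeholders.
 */
#if 0
static void pcp_lock_usage_sketch(struct per_cpu_pages __percpu *pcp_ptr)
{
	struct per_cpu_pages *pcp;

	/* Pins the task, then tries the embedded spinlock (or bails out). */
	pcp = pcp_spin_trylock(pcp_ptr);
	if (!pcp)
		return;		/* caller falls back to the zone lock path */

	/* ... operate on pcp->lists / pcp->count here ... */

	pcp_spin_unlock(pcp);	/* drops the lock, then unpins the task */
}
#endif
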
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
#endif

DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);

#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
 * defined in <linux/topology.h>.
 */
DEFINE_PER_CPU(int, _numa_mem_);	/* Kernel "local memory" node */
EXPORT_PER_CPU_SYMBOL(_numa_mem_);
#endif

static DEFINE_MUTEX(pcpu_drain_mutex);

#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
volatile unsigned long latent_entropy __latent_entropy;
EXPORT_SYMBOL(latent_entropy);
#endif

/*
 * Array of node states.
 */
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
	[N_POSSIBLE] = NODE_MASK_ALL,
	[N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
	[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
	[N_MEMORY] = { { [0] = 1UL } },
	[N_CPU] = { { [0] = 1UL } },
#endif	/* NUMA */
};
EXPORT_SYMBOL(node_states);

int percpu_pagelist_high_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;

/*
 * A cached value of the page's pageblock's migratetype, used when the page is
 * put on a pcplist. Used to avoid the pageblock migratetype lookup when
 * freeing from pcplists in most cases, at the cost of possibly becoming stale.
 * Also the migratetype set in the page does not necessarily match the pcplist
 * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
 * other index - this ensures that it will be put on the correct CMA freelist.
 */
static inline int get_pcppage_migratetype(struct page *page)
{
	return page->index;
}

static inline void set_pcppage_migratetype(struct page *page, int migratetype)
{
	page->index = migratetype;
}

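/*
 * Illustrative sketch (not part of the original file): the migratetype is
 * cached when a page enters a pcplist and read back when it leaves, which
 * saves a pageblock bitmap lookup on the free fast path (at the cost of the
 * value possibly going stale).
 */
#if 0
static void pcppage_migratetype_sketch(struct page *page)
{
	int mt;

	/* On the way into a pcplist: remember the pageblock's migratetype. */
	set_pcppage_migratetype(page, get_pageblock_migratetype(page));

	/* On the way out (bulk free): reuse the cached, possibly stale value. */
	mt = get_pcppage_migratetype(page);
	(void)mt;
}
#endif
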
#ifdef CONFIG_PM_SLEEP
/*
 * The following functions are used by the suspend/hibernate code to temporarily
 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
 * while devices are suspended. To avoid races with the suspend/hibernate code,
 * they should always be called with system_transition_mutex held
 * (gfp_allowed_mask also should only be modified with system_transition_mutex
 * held, unless the suspend/hibernate code is guaranteed not to run in parallel
 * with that modification).
 */

static gfp_t saved_gfp_mask;

void pm_restore_gfp_mask(void)
{
	WARN_ON(!mutex_is_locked(&system_transition_mutex));
	if (saved_gfp_mask) {
		gfp_allowed_mask = saved_gfp_mask;
		saved_gfp_mask = 0;
	}
}

void pm_restrict_gfp_mask(void)
{
	WARN_ON(!mutex_is_locked(&system_transition_mutex));
	WARN_ON(saved_gfp_mask);
	saved_gfp_mask = gfp_allowed_mask;
	gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
}

bool pm_suspended_storage(void)
{
	if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
		return false;
	return true;
}
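
/*
 * Illustrative sketch (not part of the original file): the restrict/restore
 * pair brackets the period in which devices are suspended, under
 * system_transition_mutex, so allocations cannot start I/O meanwhile.
 */
#if 0
static void pm_gfp_mask_sketch(void)
{
	mutex_lock(&system_transition_mutex);
	pm_restrict_gfp_mask();	/* strips __GFP_IO | __GFP_FS from gfp_allowed_mask */

	/* ... devices are suspended; memory allocations avoid I/O ... */

	pm_restore_gfp_mask();	/* puts the saved mask back */
	mutex_unlock(&system_transition_mutex);
}
#endif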
#endif /* CONFIG_PM_SLEEP */

#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
unsigned int pageblock_order __read_mostly;
#endif

static void __free_pages_ok(struct page *page, unsigned int order,
			    fpi_t fpi_flags);

/*
 * results with 256, 32 in the lowmem_reserve sysctl:
 *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
 *	1G machine -> (16M dma, 784M normal, 224M high)
 *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
 *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
 *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
 *
 * TBD: should special case ZONE_DMA32 machines here - in those we normally
 * don't need any ZONE_NORMAL reservation
 */
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	[ZONE_DMA] = 256,
#endif
#ifdef CONFIG_ZONE_DMA32
	[ZONE_DMA32] = 256,
#endif
	[ZONE_NORMAL] = 32,
#ifdef CONFIG_HIGHMEM
	[ZONE_HIGHMEM] = 0,
#endif
	[ZONE_MOVABLE] = 0,
};

char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	"DMA",
#endif
#ifdef CONFIG_ZONE_DMA32
	"DMA32",
#endif
	"Normal",
#ifdef CONFIG_HIGHMEM
	"HighMem",
#endif
	"Movable",
#ifdef CONFIG_ZONE_DEVICE
	"Device",
#endif
};

const char * const migratetype_names[MIGRATE_TYPES] = {
	"Unmovable",
	"Movable",
	"Reclaimable",
	"HighAtomic",
#ifdef CONFIG_CMA
	"CMA",
#endif
#ifdef CONFIG_MEMORY_ISOLATION
	"Isolate",
#endif
};

compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
	[NULL_COMPOUND_DTOR] = NULL,
	[COMPOUND_PAGE_DTOR] = free_compound_page,
#ifdef CONFIG_HUGETLB_PAGE
	[HUGETLB_PAGE_DTOR] = free_huge_page,
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	[TRANSHUGE_PAGE_DTOR] = free_transhuge_page,
#endif
};

int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
int watermark_boost_factor __read_mostly = 15000;
int watermark_scale_factor = 10;

/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
EXPORT_SYMBOL(movable_zone);

#if MAX_NUMNODES > 1
unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
unsigned int nr_online_nodes __read_mostly = 1;
EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif

int page_group_by_mobility_disabled __read_mostly;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
 * During boot we initialize deferred pages on-demand, as needed, but once
 * page_alloc_init_late() has finished, the deferred pages are all initialized,
 * and we can permanently disable that path.
 */
DEFINE_STATIC_KEY_TRUE(deferred_pages);

static inline bool deferred_pages_enabled(void)
{
	return static_branch_unlikely(&deferred_pages);
}

/*
 * deferred_grow_zone() is __init, but it is called from
 * get_page_from_freelist() during early boot until deferred_pages permanently
 * disables this call. This is why we have the __ref wrapper: it avoids the
 * section mismatch warning and ensures that the function body can be unloaded.
 */
static bool __ref
_deferred_grow_zone(struct zone *zone, unsigned int order)
{
	return deferred_grow_zone(zone, order);
}
#else
static inline bool deferred_pages_enabled(void)
{
	return false;
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

/* Return a pointer to the bitmap storing bits affecting a block of pages */
static inline unsigned long *get_pageblock_bitmap(const struct page *page,
							unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
	return section_to_usemap(__pfn_to_section(pfn));
#else
	return page_zone(page)->pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
}

static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
	pfn &= (PAGES_PER_SECTION-1);
#else
	pfn = pfn - pageblock_start_pfn(page_zone(page)->zone_start_pfn);
#endif /* CONFIG_SPARSEMEM */
	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
}

static __always_inline
unsigned long __get_pfnblock_flags_mask(const struct page *page,
					unsigned long pfn,
					unsigned long mask)
{
	unsigned long *bitmap;
	unsigned long bitidx, word_bitidx;
	unsigned long word;

	bitmap = get_pageblock_bitmap(page, pfn);
	bitidx = pfn_to_bitidx(page, pfn);
	word_bitidx = bitidx / BITS_PER_LONG;
	bitidx &= (BITS_PER_LONG-1);
	/*
	 * This races, without locks, with set_pfnblock_flags_mask(). Ensure
	 * a consistent read of the memory array, so that results, even though
	 * racy, are not corrupted.
	 */
	word = READ_ONCE(bitmap[word_bitidx]);
	return (word >> bitidx) & mask;
}

/**
 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @pfn: The target page frame number
 * @mask: mask of bits that the caller is interested in
 *
 * Return: pageblock_bits flags
 */
unsigned long get_pfnblock_flags_mask(const struct page *page,
					unsigned long pfn, unsigned long mask)
{
	return __get_pfnblock_flags_mask(page, pfn, mask);
}

static __always_inline int get_pfnblock_migratetype(const struct page *page,
					unsigned long pfn)
{
	return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
}

/**
 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @flags: The flags to set
 * @pfn: The target page frame number
 * @mask: mask of bits that the caller is interested in
 */
void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
					unsigned long pfn,
					unsigned long mask)
{
	unsigned long *bitmap;
	unsigned long bitidx, word_bitidx;
	unsigned long word;

	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
	BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));

	bitmap = get_pageblock_bitmap(page, pfn);
	bitidx = pfn_to_bitidx(page, pfn);
	word_bitidx = bitidx / BITS_PER_LONG;
	bitidx &= (BITS_PER_LONG-1);

	VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);

	mask <<= bitidx;
	flags <<= bitidx;

	word = READ_ONCE(bitmap[word_bitidx]);
	do {
	} while (!try_cmpxchg(&bitmap[word_bitidx], &word, (word & ~mask) | flags));
}

void set_pageblock_migratetype(struct page *page, int migratetype)
{
	if (unlikely(page_group_by_mobility_disabled &&
		     migratetype < MIGRATE_PCPTYPES))
		migratetype = MIGRATE_UNMOVABLE;

	set_pfnblock_flags_mask(page, (unsigned long)migratetype,
				page_to_pfn(page), MIGRATETYPE_MASK);
}

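/*
 * Illustrative sketch (not part of the original file): how a pfn maps onto
 * the pageblock bitmap.  The numbers assume pageblock_order == 9 (2MB
 * pageblocks, the x86-64 default) and NR_PAGEBLOCK_BITS == 4, and ignore
 * the SPARSEMEM section masking step for simplicity.
 */
#if 0
static void pageblock_bitidx_sketch(void)
{
	unsigned long pfn = 0x2345;

	/* Pageblock index: 0x2345 >> 9 == 0x11. */
	unsigned long block = pfn >> 9;

	/* Bit offset of this block's 4-bit group: 0x11 * 4 == 68. */
	unsigned long bitidx = block * 4;

	/* Word and in-word offset read by __get_pfnblock_flags_mask():
	 * on 64-bit, word 1, bit offset 4 within that word. */
	unsigned long word_bitidx = bitidx / BITS_PER_LONG;
	unsigned long in_word = bitidx & (BITS_PER_LONG - 1);

	(void)word_bitidx;
	(void)in_word;
}
#endif
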
#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
	int ret = 0;
	unsigned seq;
	unsigned long pfn = page_to_pfn(page);
	unsigned long sp, start_pfn;

	do {
		seq = zone_span_seqbegin(zone);
		start_pfn = zone->zone_start_pfn;
		sp = zone->spanned_pages;
		if (!zone_spans_pfn(zone, pfn))
			ret = 1;
	} while (zone_span_seqretry(zone, seq));

	if (ret)
		pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
			pfn, zone_to_nid(zone), zone->name,
			start_pfn, start_pfn + sp);

	return ret;
}

static int page_is_consistent(struct zone *zone, struct page *page)
{
	if (zone != page_zone(page))
		return 0;

	return 1;
}
/*
 * Temporary debugging check for pages not lying within a given zone.
 */
static int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
	if (page_outside_zone_boundaries(zone, page))
		return 1;
	if (!page_is_consistent(zone, page))
		return 1;

	return 0;
}
#else
static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
	return 0;
}
#endif

static void bad_page(struct page *page, const char *reason)
{
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			goto out;
		}
		if (nr_unshown) {
			pr_alert(
			      "BUG: Bad page state: %lu messages suppressed\n",
				nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	pr_alert("BUG: Bad page state in process %s  pfn:%05lx\n",
		current->comm, page_to_pfn(page));
	dump_page(page, reason);

	print_modules();
	dump_stack();
out:
	/* Leave bad fields for debug, except PageBuddy could make trouble */
	page_mapcount_reset(page); /* remove PageBuddy */
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

static inline unsigned int order_to_pindex(int migratetype, int order)
{
	int base = order;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (order > PAGE_ALLOC_COSTLY_ORDER) {
		VM_BUG_ON(order != pageblock_order);
		return NR_LOWORDER_PCP_LISTS;
	}
#else
	VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
#endif

	return (MIGRATE_PCPTYPES * base) + migratetype;
}

static inline int pindex_to_order(unsigned int pindex)
{
	int order = pindex / MIGRATE_PCPTYPES;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pindex == NR_LOWORDER_PCP_LISTS)
		order = pageblock_order;
#else
	VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
#endif

	return order;
}

static inline bool pcp_allowed_order(unsigned int order)
{
	if (order <= PAGE_ALLOC_COSTLY_ORDER)
		return true;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (order == pageblock_order)
		return true;
#endif
	return false;
}

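/*
 * Illustrative sketch (not part of the original file): the pcp list index
 * encodes (order, migratetype) pairs and the conversion round-trips.  The
 * numbers assume MIGRATE_PCPTYPES == 3 and PAGE_ALLOC_COSTLY_ORDER == 3;
 * the last check assumes CONFIG_TRANSPARENT_HUGEPAGE.
 */
#if 0
static void pcp_pindex_sketch(void)
{
	/* order 2, MIGRATE_MOVABLE (1) -> pindex 3 * 2 + 1 == 7 */
	unsigned int pindex = order_to_pindex(MIGRATE_MOVABLE, 2);

	/* ... and back: 7 / 3 == 2 */
	VM_BUG_ON(pindex_to_order(pindex) != 2);

	/* THP-sized blocks get the single dedicated high-order pcp list. */
	VM_BUG_ON(!pcp_allowed_order(pageblock_order));
}
#endif
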
static inline void free_the_page(struct page *page, unsigned int order)
{
	if (pcp_allowed_order(order))		/* Via pcp? */
		free_unref_page(page, order);
	else
		__free_pages_ok(page, order, FPI_NONE);
}

/*
 * Higher-order pages are called "compound pages". They are structured thusly:
 *
 * The first PAGE_SIZE page is called the "head page" and has PG_head set.
 *
 * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
 * in bit 0 of page->compound_head. The rest of the bits is a pointer to the
 * head page.
 *
 * The first tail page's ->compound_dtor holds the offset in the array of
 * compound page destructors. See compound_page_dtors.
 *
 * The first tail page's ->compound_order holds the order of allocation.
 * This usage means that zero-order pages may not be compound.
 */

void free_compound_page(struct page *page)
{
	mem_cgroup_uncharge(page_folio(page));
	free_the_page(page, compound_order(page));
}

void prep_compound_page(struct page *page, unsigned int order)
{
	int i;
	int nr_pages = 1 << order;

	__SetPageHead(page);
	for (i = 1; i < nr_pages; i++)
		prep_compound_tail(page, i);

	prep_compound_head(page, order);
}

void destroy_large_folio(struct folio *folio)
{
	enum compound_dtor_id dtor = folio->_folio_dtor;

	VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio);
	compound_page_dtors[dtor](&folio->page);
}

#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;

bool _debug_pagealloc_enabled_early __read_mostly
			= IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
EXPORT_SYMBOL(_debug_pagealloc_enabled);

DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);

static int __init early_debug_pagealloc(char *buf)
{
	return kstrtobool(buf, &_debug_pagealloc_enabled_early);
}
early_param("debug_pagealloc", early_debug_pagealloc);

static int __init debug_guardpage_minorder_setup(char *buf)
{
	unsigned long res;

	if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
		pr_err("Bad debug_guardpage_minorder value\n");
		return 0;
	}
	_debug_guardpage_minorder = res;
	pr_info("Setting debug_guardpage_minorder to %lu\n", res);
	return 0;
}
early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);

static inline bool set_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype)
{
	if (!debug_guardpage_enabled())
		return false;

	if (order >= debug_guardpage_minorder())
		return false;

	__SetPageGuard(page);
	INIT_LIST_HEAD(&page->buddy_list);
	set_page_private(page, order);
	/* Guard pages are not available for any usage */
	if (!is_migrate_isolate(migratetype))
		__mod_zone_freepage_state(zone, -(1 << order), migratetype);

	return true;
}

static inline void clear_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype)
{
	if (!debug_guardpage_enabled())
		return;

	__ClearPageGuard(page);

	set_page_private(page, 0);
	if (!is_migrate_isolate(migratetype))
		__mod_zone_freepage_state(zone, (1 << order), migratetype);
}
#else
static inline bool set_page_guard(struct zone *zone, struct page *page,
			unsigned int order, int migratetype) { return false; }
static inline void clear_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype) {}
#endif

static inline void set_buddy_order(struct page *page, unsigned int order)
{
	set_page_private(page, order);
	__SetPageBuddy(page);
}

#ifdef CONFIG_COMPACTION
static inline struct capture_control *task_capc(struct zone *zone)
{
	struct capture_control *capc = current->capture_control;

	return unlikely(capc) &&
		!(current->flags & PF_KTHREAD) &&
		!capc->page &&
		capc->cc->zone == zone ? capc : NULL;
}

static inline bool
compaction_capture(struct capture_control *capc, struct page *page,
		   int order, int migratetype)
{
	if (!capc || order != capc->cc->order)
		return false;

	/* Do not accidentally pollute CMA or isolated regions */
	if (is_migrate_cma(migratetype) ||
	    is_migrate_isolate(migratetype))
		return false;

	/*
	 * Do not let lower order allocations pollute a movable pageblock.
	 * This might let an unmovable request use a reclaimable pageblock
	 * and vice-versa but no more than normal fallback logic which can
	 * have trouble finding a high-order free page.
	 */
	if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
		return false;

	capc->page = page;
	return true;
}

#else
static inline struct capture_control *task_capc(struct zone *zone)
{
	return NULL;
}

static inline bool
compaction_capture(struct capture_control *capc, struct page *page,
		   int order, int migratetype)
{
	return false;
}
#endif /* CONFIG_COMPACTION */

/* Used for pages not on another list */
static inline void add_to_free_list(struct page *page, struct zone *zone,
				    unsigned int order, int migratetype)
{
	struct free_area *area = &zone->free_area[order];

	list_add(&page->buddy_list, &area->free_list[migratetype]);
	area->nr_free++;
}

/* Used for pages not on another list */
static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
					 unsigned int order, int migratetype)
{
	struct free_area *area = &zone->free_area[order];

	list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
	area->nr_free++;
}

/*
 * Used for pages which are on another list. Move the pages to the tail
 * of the list - so the moved pages won't immediately be considered for
 * allocation again (e.g., optimization for memory onlining).
 */
static inline void move_to_free_list(struct page *page, struct zone *zone,
				     unsigned int order, int migratetype)
{
	struct free_area *area = &zone->free_area[order];

	list_move_tail(&page->buddy_list, &area->free_list[migratetype]);
}

static inline void del_page_from_free_list(struct page *page, struct zone *zone,
					   unsigned int order)
{
	/* clear reported state and update reported page count */
	if (page_reported(page))
		__ClearPageReported(page);

	list_del(&page->buddy_list);
	__ClearPageBuddy(page);
	set_page_private(page, 0);
	zone->free_area[order].nr_free--;
}

static inline struct page *get_page_from_free_area(struct free_area *area,
						   int migratetype)
{
	return list_first_entry_or_null(&area->free_list[migratetype],
					struct page, lru);
}

/*
 * If this is not the largest possible page, check if the buddy
 * of the next-highest order is free. If it is, it's possible
 * that pages are being freed that will coalesce soon. In case
 * that is happening, add the free page to the tail of the list
 * so it's less likely to be used soon and more likely to be merged
 * as a higher order page.
 */
static inline bool
buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
		   struct page *page, unsigned int order)
{
	unsigned long higher_page_pfn;
	struct page *higher_page;

	if (order >= MAX_ORDER - 1)
		return false;

	higher_page_pfn = buddy_pfn & pfn;
	higher_page = page + (higher_page_pfn - pfn);

	return find_buddy_page_pfn(higher_page, higher_page_pfn, order + 1,
			NULL) != NULL;
}

/*
 * Freeing function for a buddy system allocator.
 *
 * The concept of a buddy system is to maintain a direct-mapped table
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep a list of pages, which are heads of continuous
 * free pages of length (1 << order) and marked with PageBuddy.
 * Page's order is recorded in the page_private(page) field.
 * So when we are allocating or freeing one, we can derive the state of the
 * other. That is, if we allocate a small block, and both were
 * free, the remainder of the region must be split into blocks.
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.
 *
 * -- nyc
 */

static inline void __free_one_page(struct page *page,
		unsigned long pfn,
		struct zone *zone, unsigned int order,
		int migratetype, fpi_t fpi_flags)
{
	struct capture_control *capc = task_capc(zone);
	unsigned long buddy_pfn = 0;
	unsigned long combined_pfn;
	struct page *buddy;
	bool to_tail;

	VM_BUG_ON(!zone_is_initialized(zone));
	VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);

	VM_BUG_ON(migratetype == -1);
	if (likely(!is_migrate_isolate(migratetype)))
		__mod_zone_freepage_state(zone, 1 << order, migratetype);

	VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
	VM_BUG_ON_PAGE(bad_range(zone, page), page);

	while (order < MAX_ORDER) {
		if (compaction_capture(capc, page, order, migratetype)) {
			__mod_zone_freepage_state(zone, -(1 << order),
								migratetype);
			return;
		}

		buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn);
		if (!buddy)
			goto done_merging;

		if (unlikely(order >= pageblock_order)) {
			/*
			 * We want to prevent merge between freepages on pageblock
			 * without fallbacks and normal pageblock. Without this,
			 * pageblock isolation could cause incorrect freepage or CMA
			 * accounting or HIGHATOMIC accounting.
			 */
			int buddy_mt = get_pageblock_migratetype(buddy);

			if (migratetype != buddy_mt
					&& (!migratetype_is_mergeable(migratetype) ||
						!migratetype_is_mergeable(buddy_mt)))
				goto done_merging;
		}

		/*
		 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
		 * merge with it and move up one order.
		 */
		if (page_is_guard(buddy))
			clear_page_guard(zone, buddy, order, migratetype);
		else
			del_page_from_free_list(buddy, zone, order);
		combined_pfn = buddy_pfn & pfn;
		page = page + (combined_pfn - pfn);
		pfn = combined_pfn;
		order++;
	}

done_merging:
	set_buddy_order(page, order);

	if (fpi_flags & FPI_TO_TAIL)
		to_tail = true;
	else if (is_shuffle_order(order))
		to_tail = shuffle_pick_tail();
	else
		to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);

	if (to_tail)
		add_to_free_list_tail(page, zone, order, migratetype);
	else
		add_to_free_list(page, zone, order, migratetype);

	/* Notify page reporting subsystem of freed page */
	if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
		page_reporting_notify_free(order);
}

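/*
 * Illustrative sketch (not part of the original file): buddy arithmetic on
 * raw pfns.  A block's buddy at a given order differs only in bit 'order',
 * and the merged block starts at the lower of the two pfns, which is what
 * the combined_pfn computation above relies on.
 */
#if 0
static void buddy_pfn_sketch(void)
{
	unsigned long pfn = 0x1000;			/* order-3 block at pfn 0x1000 */
	unsigned int order = 3;

	unsigned long buddy_pfn = pfn ^ (1UL << order);	/* 0x1008 */
	unsigned long combined_pfn = buddy_pfn & pfn;	/* 0x1000, the order-4 head */

	VM_BUG_ON(buddy_pfn != 0x1008);
	VM_BUG_ON(combined_pfn != 0x1000);
}
#endif
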
/**
 * split_free_page() -- split a free page at split_pfn_offset
 * @free_page:		the original free page
 * @order:		the order of the page
 * @split_pfn_offset:	split offset within the page
 *
 * Return -ENOENT if the free page is changed, otherwise 0
 *
 * It is used when the free page crosses two pageblocks with different migratetypes
 * at split_pfn_offset within the page. The split free page will be put into
 * separate migratetype lists afterwards. Otherwise, the function achieves
 * nothing.
 */
int split_free_page(struct page *free_page,
			unsigned int order, unsigned long split_pfn_offset)
{
	struct zone *zone = page_zone(free_page);
	unsigned long free_page_pfn = page_to_pfn(free_page);
	unsigned long pfn;
	unsigned long flags;
	int free_page_order;
	int mt;
	int ret = 0;

	if (split_pfn_offset == 0)
		return ret;

	spin_lock_irqsave(&zone->lock, flags);

	if (!PageBuddy(free_page) || buddy_order(free_page) != order) {
		ret = -ENOENT;
		goto out;
	}

	mt = get_pageblock_migratetype(free_page);
	if (likely(!is_migrate_isolate(mt)))
		__mod_zone_freepage_state(zone, -(1UL << order), mt);

	del_page_from_free_list(free_page, zone, order);
	for (pfn = free_page_pfn;
	     pfn < free_page_pfn + (1UL << order);) {
		int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);

		free_page_order = min_t(unsigned int,
					pfn ? __ffs(pfn) : order,
					__fls(split_pfn_offset));
		__free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
				mt, FPI_NONE);
		pfn += 1UL << free_page_order;
		split_pfn_offset -= (1UL << free_page_order);
		/* we have done the first part, now switch to second part */
		if (split_pfn_offset == 0)
			split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
	}
out:
	spin_unlock_irqrestore(&zone->lock, flags);
	return ret;
}
/*
 * A bad page could be due to a number of fields. Instead of multiple branches,
 * try and check multiple fields with one check. The caller must do a detailed
 * check if necessary.
 */
static inline bool page_expected_state(struct page *page,
					unsigned long check_flags)
{
	if (unlikely(atomic_read(&page->_mapcount) != -1))
		return false;

	if (unlikely((unsigned long)page->mapping |
			page_ref_count(page) |
#ifdef CONFIG_MEMCG
			page->memcg_data |
#endif
			(page->flags & check_flags)))
		return false;

	return true;
}

static const char *page_bad_reason(struct page *page, unsigned long flags)
{
	const char *bad_reason = NULL;

	if (unlikely(atomic_read(&page->_mapcount) != -1))
		bad_reason = "nonzero mapcount";
	if (unlikely(page->mapping != NULL))
		bad_reason = "non-NULL mapping";
	if (unlikely(page_ref_count(page) != 0))
		bad_reason = "nonzero _refcount";
	if (unlikely(page->flags & flags)) {
		if (flags == PAGE_FLAGS_CHECK_AT_PREP)
			bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
		else
			bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
	}
#ifdef CONFIG_MEMCG
	if (unlikely(page->memcg_data))
		bad_reason = "page still charged to cgroup";
#endif
	return bad_reason;
}

static void free_page_is_bad_report(struct page *page)
{
	bad_page(page,
		 page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
}

static inline bool free_page_is_bad(struct page *page)
{
	if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
		return false;

	/* Something has gone sideways, find it */
	free_page_is_bad_report(page);
	return true;
}

static int free_tail_page_prepare(struct page *head_page, struct page *page)
{
	struct folio *folio = (struct folio *)head_page;
	int ret = 1;

	/*
	 * We rely page->lru.next never has bit 0 set, unless the page
	 * is PageTail(). Let's make sure that's true even for poisoned ->lru.
	 */
	BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);

	if (!static_branch_unlikely(&check_pages_enabled)) {
		ret = 0;
		goto out;
	}
	switch (page - head_page) {
	case 1:
		/* the first tail page: these may be in place of ->mapping */
		if (unlikely(folio_entire_mapcount(folio))) {
			bad_page(page, "nonzero entire_mapcount");
			goto out;
		}
		if (unlikely(atomic_read(&folio->_nr_pages_mapped))) {
			bad_page(page, "nonzero nr_pages_mapped");
			goto out;
		}
		if (unlikely(atomic_read(&folio->_pincount))) {
			bad_page(page, "nonzero pincount");
			goto out;
		}
		break;
	case 2:
		/*
		 * the second tail page: ->mapping is
		 * deferred_list.next -- ignore value.
		 */
		break;
	default:
		if (page->mapping != TAIL_MAPPING) {
			bad_page(page, "corrupted mapping in tail page");
			goto out;
		}
		break;
	}
	if (unlikely(!PageTail(page))) {
		bad_page(page, "PageTail not set");
		goto out;
	}
	if (unlikely(compound_head(page) != head_page)) {
		bad_page(page, "compound_head not consistent");
		goto out;
	}
	ret = 0;
out:
	page->mapping = NULL;
	clear_compound_head(page);
	return ret;
}

/*
 * Skip KASAN memory poisoning when either:
 *
 * 1. For generic KASAN: deferred memory initialization has not yet completed.
 *    Tag-based KASAN modes skip pages freed via deferred memory initialization
 *    using page tags instead (see below).
 * 2. For tag-based KASAN modes: the page has a match-all KASAN tag, indicating
 *    that error detection is disabled for accesses via the page address.
 *
 * Pages will have match-all tags in the following circumstances:
 *
 * 1. Pages are being initialized for the first time, including during deferred
 *    memory init; see the call to page_kasan_tag_reset in __init_single_page.
 * 2. The allocation was not unpoisoned due to __GFP_SKIP_KASAN, with the
 *    exception of pages unpoisoned by kasan_unpoison_vmalloc.
 * 3. The allocation was excluded from being checked due to sampling,
 *    see the call to kasan_unpoison_pages.
 *
 * Poisoning pages during deferred memory init will greatly lengthen the
 * process and cause problem in large memory systems as the deferred pages
 * initialization is done with interrupt disabled.
 *
 * Assuming that there will be no reference to those newly initialized
 * pages before they are ever allocated, this should have no effect on
 * KASAN memory tracking as the poison will be properly inserted at page
 * allocation time. The only corner case is when pages are allocated by
 * on-demand allocation and then freed again before the deferred pages
 * initialization is done, but this is not likely to happen.
 */
static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
{
	if (IS_ENABLED(CONFIG_KASAN_GENERIC))
		return deferred_pages_enabled();

	return page_kasan_tag(page) == 0xff;
}

static void kernel_init_pages(struct page *page, int numpages)
{
	int i;

	/* s390's use of memset() could override KASAN redzones. */
	kasan_disable_current();
	for (i = 0; i < numpages; i++)
		clear_highpage_kasan_tagged(page + i);
	kasan_enable_current();
}

static __always_inline bool free_pages_prepare(struct page *page,
			unsigned int order, fpi_t fpi_flags)
{
	int bad = 0;
	bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
	bool init = want_init_on_free();

	VM_BUG_ON_PAGE(PageTail(page), page);

	trace_mm_page_free(page, order);
	kmsan_free_page(page, order);

	if (unlikely(PageHWPoison(page)) && !order) {
		/*
		 * Do not let hwpoison pages hit pcplists/buddy
		 * Untie memcg state and reset page's owner
		 */
		if (memcg_kmem_online() && PageMemcgKmem(page))
			__memcg_kmem_uncharge_page(page, order);
		reset_page_owner(page, order);
		page_table_check_free(page, order);
		return false;
	}

	/*
	 * Check tail pages before head page information is cleared to
	 * avoid checking PageCompound for order-0 pages.
	 */
	if (unlikely(order)) {
		bool compound = PageCompound(page);
		int i;

		VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);

		if (compound)
			ClearPageHasHWPoisoned(page);
		for (i = 1; i < (1 << order); i++) {
			if (compound)
				bad += free_tail_page_prepare(page, page + i);
			if (is_check_pages_enabled()) {
				if (free_page_is_bad(page + i)) {
					bad++;
					continue;
				}
			}
			(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
		}
	}
	if (PageMappingFlags(page))
		page->mapping = NULL;
	if (memcg_kmem_online() && PageMemcgKmem(page))
		__memcg_kmem_uncharge_page(page, order);
	if (is_check_pages_enabled()) {
		if (free_page_is_bad(page))
			bad++;
		if (bad)
			return false;
	}

	page_cpupid_reset_last(page);
	page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
	reset_page_owner(page, order);
	page_table_check_free(page, order);

	if (!PageHighMem(page)) {
		debug_check_no_locks_freed(page_address(page),
					   PAGE_SIZE << order);
		debug_check_no_obj_freed(page_address(page),
					 PAGE_SIZE << order);
	}

	kernel_poison_pages(page, 1 << order);

	/*
	 * As memory initialization might be integrated into KASAN,
	 * KASAN poisoning and memory initialization code must be
	 * kept together to avoid discrepancies in behavior.
	 *
	 * With hardware tag-based KASAN, memory tags must be set before the
	 * page becomes unavailable via debug_pagealloc or arch_free_page.
	 */
	if (!skip_kasan_poison) {
		kasan_poison_pages(page, order, init);

		/* Memory is already initialized if KASAN did it internally. */
		if (kasan_has_integrated_init())
			init = false;
	}
	if (init)
		kernel_init_pages(page, 1 << order);

	/*
	 * arch_free_page() can make the page's contents inaccessible. s390
	 * does this. So nothing which can access the page's contents should
	 * happen after this.
	 */
	arch_free_page(page, order);

	debug_pagealloc_unmap_pages(page, 1 << order);

	return true;
}

/*
 * Frees a number of pages from the PCP lists.
 * Assumes all pages on the list are in the same zone.
 * count is the number of pages to free.
 */
static void free_pcppages_bulk(struct zone *zone, int count,
					struct per_cpu_pages *pcp,
					int pindex)
{
	unsigned long flags;
	int min_pindex = 0;
	int max_pindex = NR_PCP_LISTS - 1;
	unsigned int order;
	bool isolated_pageblocks;
	struct page *page;

	/*
	 * Ensure a proper count is passed which otherwise would get stuck
	 * in the while (list_empty(list)) loop below.
	 */
	count = min(pcp->count, count);

	/* Ensure requested pindex is drained first. */
	pindex = pindex - 1;

	spin_lock_irqsave(&zone->lock, flags);
	isolated_pageblocks = has_isolate_pageblock(zone);

	while (count > 0) {
		struct list_head *list;
		int nr_pages;

		/* Remove pages from lists in a round-robin fashion. */
		do {
			if (++pindex > max_pindex)
				pindex = min_pindex;
			list = &pcp->lists[pindex];
			if (!list_empty(list))
				break;

			if (pindex == max_pindex)
				max_pindex--;
			if (pindex == min_pindex)
				min_pindex++;
		} while (1);

		order = pindex_to_order(pindex);
		nr_pages = 1 << order;
		do {
			int mt;

			page = list_last_entry(list, struct page, pcp_list);
			mt = get_pcppage_migratetype(page);

			/* must delete to avoid corrupting pcp list */
			list_del(&page->pcp_list);
			count -= nr_pages;
			pcp->count -= nr_pages;

			/* MIGRATE_ISOLATE page should not go to pcplists */
			VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
			/* Pageblock could have been isolated meanwhile */
			if (unlikely(isolated_pageblocks))
				mt = get_pageblock_migratetype(page);

			__free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE);
			trace_mm_page_pcpu_drain(page, order, mt);
		} while (count > 0 && !list_empty(list));
	}

	spin_unlock_irqrestore(&zone->lock, flags);
}

static void free_one_page(struct zone *zone,
				struct page *page, unsigned long pfn,
				unsigned int order,
				int migratetype, fpi_t fpi_flags)
{
	unsigned long flags;

	spin_lock_irqsave(&zone->lock, flags);
	if (unlikely(has_isolate_pageblock(zone) ||
		is_migrate_isolate(migratetype))) {
		migratetype = get_pfnblock_migratetype(page, pfn);
	}
	__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
	spin_unlock_irqrestore(&zone->lock, flags);
}

static void __free_pages_ok(struct page *page, unsigned int order,
			    fpi_t fpi_flags)
{
	unsigned long flags;
	int migratetype;
	unsigned long pfn = page_to_pfn(page);
	struct zone *zone = page_zone(page);

	if (!free_pages_prepare(page, order, fpi_flags))
		return;

	/*
	 * get_pfnblock_migratetype() is called without spin_lock_irqsave()
	 * here to avoid doing the lookup under the zone lock. This reduces
	 * the lock holding time.
	 */
	migratetype = get_pfnblock_migratetype(page, pfn);

	spin_lock_irqsave(&zone->lock, flags);
	if (unlikely(has_isolate_pageblock(zone) ||
		is_migrate_isolate(migratetype))) {
		migratetype = get_pfnblock_migratetype(page, pfn);
	}
	__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
	spin_unlock_irqrestore(&zone->lock, flags);

	__count_vm_events(PGFREE, 1 << order);
}

void __free_pages_core(struct page *page, unsigned int order)
{
	unsigned int nr_pages = 1 << order;
	struct page *p = page;
	unsigned int loop;

	/*
	 * When initializing the memmap, __init_single_page() sets the refcount
	 * of all pages to 1 ("allocated"/"not free"). We have to set the
	 * refcount of all involved pages to 0.
	 */
	prefetchw(p);
	for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
		prefetchw(p + 1);
		__ClearPageReserved(p);
		set_page_count(p, 0);
	}
	__ClearPageReserved(p);
	set_page_count(p, 0);

	atomic_long_add(nr_pages, &page_zone(page)->managed_pages);

	/*
	 * Bypass PCP and place fresh pages right to the tail, primarily
	 * relevant for memory onlining.
	 */
	__free_pages_ok(page, order, FPI_TO_TAIL);
}

/*
 * Check that the whole (or subset of) a pageblock given by the interval of
 * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
 * with the migration or free compaction scanner.
 *
 * Return struct page pointer of start_pfn, or NULL if checks were not passed.
 *
 * It's possible on some configurations to have a setup like node0 node1 node0
 * i.e. it's possible that all pages within a zone's range of pages do not
 * belong to a single zone. We assume that a border between node0 and node1
 * can occur within a single pageblock, but not a node0 node1 node0
 * interleaving within a single pageblock. It is therefore sufficient to check
 * the first and last page of a pageblock and avoid checking each individual
 * page in a pageblock.
 *
 * Note: the function may return non-NULL struct page even for a page block
 * which contains a memory hole (i.e. there is no physical memory for a subset
 * of the pfn range). For example, if the pageblock order is MAX_ORDER, which
 * will fall into 2 sub-sections, the end pfn of the pageblock may be a hole
 * even though the start pfn is online and valid. This should be safe most of
 * the time because struct pages are still initialized via init_unavailable_range()
 * and pfn walkers shouldn't touch any physical memory range for which they do
 * not recognize any specific metadata in struct pages.
 */
struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
				     unsigned long end_pfn, struct zone *zone)
{
	struct page *start_page;
	struct page *end_page;

	/* end_pfn is one past the range we are checking */
	end_pfn--;

	if (!pfn_valid(end_pfn))
		return NULL;

	start_page = pfn_to_online_page(start_pfn);
	if (!start_page)
		return NULL;

	if (page_zone(start_page) != zone)
		return NULL;

	end_page = pfn_to_page(end_pfn);

	/* This gives a shorter code than deriving page_zone(end_page) */
	if (page_zone_id(start_page) != page_zone_id(end_page))
		return NULL;

	return start_page;
}

/*
 * The order of subdivision here is critical for the IO subsystem.
 * Please do not alter this order without good reasons and regression
 * testing. Specifically, as large blocks of memory are subdivided,
 * the order in which smaller blocks are delivered depends on the order
 * they're subdivided in this function. This is the primary factor
 * influencing the order in which pages are delivered to the IO
 * subsystem according to empirical testing, and this is also justified
 * by considering the behavior of a buddy system containing a single
 * large block of memory acted on by a series of small allocations.
 * This behavior is a critical factor in sglist merging's success.
 *
 * -- nyc
 */
static inline void expand(struct zone *zone, struct page *page,
	int low, int high, int migratetype)
{
	unsigned long size = 1 << high;

	while (high > low) {
		high--;
		size >>= 1;
		VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);

		/*
		 * Mark as guard pages (or page); this allows the block to be
		 * merged back into the allocator when the buddy is freed.
		 * The corresponding page table entries are not touched, so
		 * the pages stay not-present in the virtual address space.
		 */
		if (set_page_guard(zone, &page[size], high, migratetype))
			continue;

		add_to_free_list(&page[size], zone, high, migratetype);
		set_buddy_order(&page[size], high);
	}
}

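/*
 * Illustrative sketch (not part of the original file): what expand() does for
 * an order-3 block satisfying an order-0 request (low = 0, high = 3).  The
 * unused halves are returned to the free lists at successively lower orders.
 */
#if 0
static void expand_sketch(struct zone *zone, struct page *page)
{
	/*
	 * expand(zone, page, 0, 3, MIGRATE_MOVABLE) walks high = 2, 1, 0:
	 *   pages [4..7] go to the order-2 free list,
	 *   pages [2..3] go to the order-1 free list,
	 *   page  [1]    goes to the order-0 free list,
	 * leaving page [0] (order 0) for the caller.
	 */
	expand(zone, page, 0, 3, MIGRATE_MOVABLE);
}
#endif
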
9420f89d 1548static void check_new_page_bad(struct page *page)
0e56acae 1549{
9420f89d
MRI
1550 if (unlikely(page->flags & __PG_HWPOISON)) {
1551 /* Don't complain about hwpoisoned pages */
1552 page_mapcount_reset(page); /* remove PageBuddy */
1553 return;
0e56acae
AD
1554 }
1555
9420f89d
MRI
1556 bad_page(page,
1557 page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
0e56acae
AD
1558}
1559
1560/*
9420f89d 1561 * This page is about to be returned from the page allocator
0e56acae 1562 */
9420f89d 1563static int check_new_page(struct page *page)
0e56acae 1564{
9420f89d
MRI
1565 if (likely(page_expected_state(page,
1566 PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
1567 return 0;
0e56acae 1568
9420f89d
MRI
1569 check_new_page_bad(page);
1570 return 1;
1571}
0e56acae 1572
9420f89d
MRI
1573static inline bool check_new_pages(struct page *page, unsigned int order)
1574{
1575 if (is_check_pages_enabled()) {
1576 for (int i = 0; i < (1 << order); i++) {
1577 struct page *p = page + i;
0e56acae 1578
8666925c 1579 if (check_new_page(p))
9420f89d 1580 return true;
0e56acae
AD
1581 }
1582 }
1583
9420f89d 1584 return false;
0e56acae
AD
1585}
1586
9420f89d 1587static inline bool should_skip_kasan_unpoison(gfp_t flags)
e4443149 1588{
9420f89d
MRI
1589 /* Don't skip if a software KASAN mode is enabled. */
1590 if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
1591 IS_ENABLED(CONFIG_KASAN_SW_TAGS))
1592 return false;
e4443149 1593
9420f89d
MRI
1594 /* Skip, if hardware tag-based KASAN is not enabled. */
1595 if (!kasan_hw_tags_enabled())
1596 return true;
e4443149
DJ
1597
1598 /*
9420f89d
MRI
1599 * With hardware tag-based KASAN enabled, skip if this has been
1600 * requested via __GFP_SKIP_KASAN.
e4443149 1601 */
9420f89d 1602 return flags & __GFP_SKIP_KASAN;
e4443149
DJ
1603}
1604
9420f89d 1605static inline bool should_skip_init(gfp_t flags)
ecd09650 1606{
9420f89d
MRI
1607 /* Don't skip, if hardware tag-based KASAN is not enabled. */
1608 if (!kasan_hw_tags_enabled())
1609 return false;
1610
1611 /* For hardware tag-based KASAN, skip if requested. */
1612 return (flags & __GFP_SKIP_ZERO);
ecd09650
DJ
1613}
1614
9420f89d
MRI
1615inline void post_alloc_hook(struct page *page, unsigned int order,
1616 gfp_t gfp_flags)
7e18adb4 1617{
9420f89d
MRI
1618 bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
1619 !should_skip_init(gfp_flags);
1620 bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS);
1621 int i;
1622
1623 set_page_private(page, 0);
1624 set_page_refcounted(page);
0e1cc95b 1625
9420f89d
MRI
1626 arch_alloc_page(page, order);
1627 debug_pagealloc_map_pages(page, 1 << order);
7e18adb4 1628
3d060856 1629 /*
9420f89d
MRI
1630 * Page unpoisoning must happen before memory initialization.
1631 * Otherwise, the poison pattern will be overwritten for __GFP_ZERO
1632 * allocations and the page unpoisoning code will complain.
3d060856 1633 */
9420f89d 1634 kernel_unpoison_pages(page, 1 << order);
862b6dee 1635
1bb5eab3
AK
1636 /*
1637 * As memory initialization might be integrated into KASAN,
b42090ae 1638	 * KASAN unpoisoning and memory initialization code must be
1bb5eab3
AK
1639 * kept together to avoid discrepancies in behavior.
1640 */
9294b128
AK
1641
1642 /*
44383cef
AK
 1643	 * Check whether memory tags should be zeroed
 1644	 * (which happens only when memory should also be initialized).
9294b128 1645 */
44383cef 1646 if (zero_tags) {
420ef683 1647 /* Initialize both memory and memory tags. */
9294b128
AK
1648 for (i = 0; i != 1 << order; ++i)
1649 tag_clear_highpage(page + i);
1650
44383cef 1651 /* Take note that memory was initialized by the loop above. */
9294b128
AK
1652 init = false;
1653 }
0a54864f
PC
1654 if (!should_skip_kasan_unpoison(gfp_flags) &&
1655 kasan_unpoison_pages(page, order, init)) {
1656 /* Take note that memory was initialized by KASAN. */
1657 if (kasan_has_integrated_init())
1658 init = false;
1659 } else {
1660 /*
1661 * If memory tags have not been set by KASAN, reset the page
1662 * tags to ensure page_address() dereferencing does not fault.
1663 */
70c248ac
CM
1664 for (i = 0; i != 1 << order; ++i)
1665 page_kasan_tag_reset(page + i);
7a3b8353 1666 }
44383cef 1667 /* If memory is still not initialized, initialize it now. */
7e3cbba6 1668 if (init)
aeaec8e2 1669 kernel_init_pages(page, 1 << order);
1bb5eab3
AK
1670
1671 set_page_owner(page, order, gfp_flags);
df4e817b 1672 page_table_check_alloc(page, order);
46f24fd8
JK
1673}
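/*
 * A sketch of one path through post_alloc_hook() above (illustrative, and
 * assuming want_init_on_free() is false): with hardware tag-based KASAN
 * enabled, a __GFP_ZERO | __GFP_ZEROTAGS allocation takes the zero_tags
 * path, where tag_clear_highpage() initializes both the data and the memory
 * tags, so init is cleared and neither kasan_unpoison_pages() with
 * init == true nor kernel_init_pages() has to touch the memory again.
 * Without __GFP_ZEROTAGS, the same request relies on
 * kasan_unpoison_pages(page, order, init) and, if KASAN does not integrate
 * the initialization, falls through to kernel_init_pages().
 */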
1674
479f854a 1675static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
c603844b 1676 unsigned int alloc_flags)
2a7684a2 1677{
46f24fd8 1678 post_alloc_hook(page, order, gfp_flags);
17cf4406 1679
17cf4406
NP
1680 if (order && (gfp_flags & __GFP_COMP))
1681 prep_compound_page(page, order);
1682
75379191 1683 /*
2f064f34 1684 * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
75379191
VB
1685 * allocate the page. The expectation is that the caller is taking
1686 * steps that will free more memory. The caller should avoid the page
1687 * being used for !PFMEMALLOC purposes.
1688 */
2f064f34
MH
1689 if (alloc_flags & ALLOC_NO_WATERMARKS)
1690 set_page_pfmemalloc(page);
1691 else
1692 clear_page_pfmemalloc(page);
1da177e4
LT
1693}
1694
56fd56b8
MG
1695/*
1696 * Go through the free lists for the given migratetype and remove
1697 * the smallest available page from the freelists
1698 */
85ccc8fa 1699static __always_inline
728ec980 1700struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
56fd56b8
MG
1701 int migratetype)
1702{
1703 unsigned int current_order;
b8af2941 1704 struct free_area *area;
56fd56b8
MG
1705 struct page *page;
1706
1707 /* Find a page of the appropriate size in the preferred list */
23baf831 1708 for (current_order = order; current_order <= MAX_ORDER; ++current_order) {
56fd56b8 1709 area = &(zone->free_area[current_order]);
b03641af 1710 page = get_page_from_free_area(area, migratetype);
a16601c5
GT
1711 if (!page)
1712 continue;
6ab01363
AD
1713 del_page_from_free_list(page, zone, current_order);
1714 expand(zone, page, order, current_order, migratetype);
bb14c2c7 1715 set_pcppage_migratetype(page, migratetype);
10e0f753
WY
1716 trace_mm_page_alloc_zone_locked(page, order, migratetype,
1717 pcp_allowed_order(order) &&
1718 migratetype < MIGRATE_PCPTYPES);
56fd56b8
MG
1719 return page;
1720 }
1721
1722 return NULL;
1723}
1724
1725
b2a0ac88
MG
1726/*
 1727 * This array describes the order in which other migratetypes are fallen
 1728 * back to when the free lists for the desired migratetype are depleted.
1dd214b8
ZY
1729 *
1730 * The other migratetypes do not have fallbacks.
b2a0ac88 1731 */
aa02d3c1
YD
1732static int fallbacks[MIGRATE_TYPES][MIGRATE_PCPTYPES - 1] = {
1733 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE },
1734 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE },
1735 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE },
b2a0ac88
MG
1736};
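/*
 * Illustrative reading of the fallback table above (editorial example): a
 * MIGRATE_UNMOVABLE request whose own free lists are empty first tries the
 * MIGRATE_RECLAIMABLE free lists and then the MIGRATE_MOVABLE ones;
 * MIGRATE_RECLAIMABLE falls back to UNMOVABLE, then MOVABLE. The remaining
 * migratetypes are not consulted through this table (CMA is handled
 * separately via __rmqueue_cma_fallback()).
 */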
1737
dc67647b 1738#ifdef CONFIG_CMA
85ccc8fa 1739static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone,
dc67647b
JK
1740 unsigned int order)
1741{
1742 return __rmqueue_smallest(zone, order, MIGRATE_CMA);
1743}
1744#else
1745static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
1746 unsigned int order) { return NULL; }
1747#endif
1748
c361be55 1749/*
293ffa5e 1750 * Move the free pages in a range to the freelist tail of the requested type.
d9c23400 1751 * Note that start_pfn and end_pfn are not necessarily aligned on a pageblock
c361be55
MG
1752 * boundary. If alignment is required, use move_freepages_block()
1753 */
02aa0cdd 1754static int move_freepages(struct zone *zone,
39ddb991 1755 unsigned long start_pfn, unsigned long end_pfn,
02aa0cdd 1756 int migratetype, int *num_movable)
c361be55
MG
1757{
1758 struct page *page;
39ddb991 1759 unsigned long pfn;
d00181b9 1760 unsigned int order;
d100313f 1761 int pages_moved = 0;
c361be55 1762
39ddb991 1763 for (pfn = start_pfn; pfn <= end_pfn;) {
39ddb991 1764 page = pfn_to_page(pfn);
c361be55 1765 if (!PageBuddy(page)) {
02aa0cdd
VB
1766 /*
1767 * We assume that pages that could be isolated for
1768 * migration are movable. But we don't actually try
1769 * isolating, as that would be expensive.
1770 */
1771 if (num_movable &&
1772 (PageLRU(page) || __PageMovable(page)))
1773 (*num_movable)++;
39ddb991 1774 pfn++;
c361be55
MG
1775 continue;
1776 }
1777
cd961038
DR
1778 /* Make sure we are not inadvertently changing nodes */
1779 VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
1780 VM_BUG_ON_PAGE(page_zone(page) != zone, page);
1781
ab130f91 1782 order = buddy_order(page);
6ab01363 1783 move_to_free_list(page, zone, order, migratetype);
39ddb991 1784 pfn += 1 << order;
d100313f 1785 pages_moved += 1 << order;
c361be55
MG
1786 }
1787
d100313f 1788 return pages_moved;
c361be55
MG
1789}
1790
ee6f509c 1791int move_freepages_block(struct zone *zone, struct page *page,
02aa0cdd 1792 int migratetype, int *num_movable)
c361be55 1793{
39ddb991 1794 unsigned long start_pfn, end_pfn, pfn;
c361be55 1795
4a222127
DR
1796 if (num_movable)
1797 *num_movable = 0;
1798
39ddb991 1799 pfn = page_to_pfn(page);
4f9bc69a
KW
1800 start_pfn = pageblock_start_pfn(pfn);
1801 end_pfn = pageblock_end_pfn(pfn) - 1;
c361be55
MG
1802
1803 /* Do not cross zone boundaries */
108bcc96 1804 if (!zone_spans_pfn(zone, start_pfn))
39ddb991 1805 start_pfn = pfn;
108bcc96 1806 if (!zone_spans_pfn(zone, end_pfn))
c361be55
MG
1807 return 0;
1808
39ddb991 1809 return move_freepages(zone, start_pfn, end_pfn, migratetype,
02aa0cdd 1810 num_movable);
c361be55
MG
1811}
1812
2f66a68f
MG
1813static void change_pageblock_range(struct page *pageblock_page,
1814 int start_order, int migratetype)
1815{
1816 int nr_pageblocks = 1 << (start_order - pageblock_order);
1817
1818 while (nr_pageblocks--) {
1819 set_pageblock_migratetype(pageblock_page, migratetype);
1820 pageblock_page += pageblock_nr_pages;
1821 }
1822}
1823
fef903ef 1824/*
9c0415eb
VB
1825 * When we are falling back to another migratetype during allocation, try to
1826 * steal extra free pages from the same pageblocks to satisfy further
1827 * allocations, instead of polluting multiple pageblocks.
1828 *
1829 * If we are stealing a relatively large buddy page, it is likely there will
1830 * be more free pages in the pageblock, so try to steal them all. For
1831 * reclaimable and unmovable allocations, we steal regardless of page size,
1832 * as fragmentation caused by those allocations polluting movable pageblocks
1833 * is worse than movable allocations stealing from unmovable and reclaimable
1834 * pageblocks.
fef903ef 1835 */
4eb7dce6
JK
1836static bool can_steal_fallback(unsigned int order, int start_mt)
1837{
1838 /*
 1839	 * Leaving this order check in place is intentional, although there is
 1840	 * a more relaxed order check below. The reason is that we can steal the
 1841	 * whole pageblock if this condition is met, whereas the check below
 1842	 * does not guarantee that and is just a heuristic, so it could be
 1843	 * changed at any time.
1844 */
1845 if (order >= pageblock_order)
1846 return true;
1847
1848 if (order >= pageblock_order / 2 ||
1849 start_mt == MIGRATE_RECLAIMABLE ||
1850 start_mt == MIGRATE_UNMOVABLE ||
1851 page_group_by_mobility_disabled)
1852 return true;
1853
1854 return false;
1855}
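/*
 * Worked example for the heuristic above (illustrative, assuming a
 * pageblock_order of 9 as is typical on x86-64): any request of order >=
 * pageblock_order / 2 (order 4 and up) may steal surrounding free pages, as
 * may a MIGRATE_RECLAIMABLE or MIGRATE_UNMOVABLE request of any order. A
 * small MIGRATE_MOVABLE request (say order 2) gets can_steal == false and
 * only takes the single buddy page it falls back to, since movable
 * allocations polluting other pageblocks are the least harmful form of
 * fragmentation.
 */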
1856
597c8920 1857static inline bool boost_watermark(struct zone *zone)
1c30844d
MG
1858{
1859 unsigned long max_boost;
1860
1861 if (!watermark_boost_factor)
597c8920 1862 return false;
14f69140
HW
1863 /*
1864 * Don't bother in zones that are unlikely to produce results.
1865 * On small machines, including kdump capture kernels running
1866 * in a small area, boosting the watermark can cause an out of
1867 * memory situation immediately.
1868 */
1869 if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
597c8920 1870 return false;
1c30844d
MG
1871
1872 max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
1873 watermark_boost_factor, 10000);
94b3334c
MG
1874
1875 /*
1876 * high watermark may be uninitialised if fragmentation occurs
1877 * very early in boot so do not boost. We do not fall
1878 * through and boost by pageblock_nr_pages as failing
1879 * allocations that early means that reclaim is not going
1880 * to help and it may even be impossible to reclaim the
1881 * boosted watermark resulting in a hang.
1882 */
1883 if (!max_boost)
597c8920 1884 return false;
94b3334c 1885
1c30844d
MG
1886 max_boost = max(pageblock_nr_pages, max_boost);
1887
1888 zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
1889 max_boost);
597c8920
JW
1890
1891 return true;
1c30844d
MG
1892}
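/*
 * Illustrative arithmetic for the boost above (example numbers, assuming
 * the default watermark_boost_factor of 15000): for a zone with a high
 * watermark of 20000 pages, max_boost = mult_frac(20000, 15000, 10000) =
 * 30000 pages. Each fallback event then raises watermark_boost by
 * pageblock_nr_pages (512 pages with 4KB pages and 2MB pageblocks) until
 * that cap is reached, temporarily pushing kswapd to reclaim harder.
 */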
1893
4eb7dce6
JK
1894/*
 1895 * This function implements the actual steal behaviour. If the order is large
 1896 * enough, we can steal the whole pageblock. If not, we first move the free pages in this
02aa0cdd
VB
1897 * pageblock to our migratetype and determine how many already-allocated pages
 1898 * there are in the pageblock with a compatible migratetype. If at least half
 1899 * of the pages are free or compatible, we can change the migratetype of the
 1900 * pageblock itself, so pages freed in the future will be put on the correct free list.
4eb7dce6
JK
1901 */
1902static void steal_suitable_fallback(struct zone *zone, struct page *page,
1c30844d 1903 unsigned int alloc_flags, int start_type, bool whole_block)
fef903ef 1904{
ab130f91 1905 unsigned int current_order = buddy_order(page);
02aa0cdd
VB
1906 int free_pages, movable_pages, alike_pages;
1907 int old_block_type;
1908
1909 old_block_type = get_pageblock_migratetype(page);
fef903ef 1910
3bc48f96
VB
1911 /*
1912 * This can happen due to races and we want to prevent broken
1913 * highatomic accounting.
1914 */
02aa0cdd 1915 if (is_migrate_highatomic(old_block_type))
3bc48f96
VB
1916 goto single_page;
1917
fef903ef
SB
1918 /* Take ownership for orders >= pageblock_order */
1919 if (current_order >= pageblock_order) {
1920 change_pageblock_range(page, current_order, start_type);
3bc48f96 1921 goto single_page;
fef903ef
SB
1922 }
1923
1c30844d
MG
1924 /*
1925 * Boost watermarks to increase reclaim pressure to reduce the
1926 * likelihood of future fallbacks. Wake kswapd now as the node
1927 * may be balanced overall and kswapd will not wake naturally.
1928 */
597c8920 1929 if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
73444bc4 1930 set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
1c30844d 1931
3bc48f96
VB
1932 /* We are not allowed to try stealing from the whole block */
1933 if (!whole_block)
1934 goto single_page;
1935
02aa0cdd
VB
1936 free_pages = move_freepages_block(zone, page, start_type,
1937 &movable_pages);
1938 /*
1939 * Determine how many pages are compatible with our allocation.
1940 * For movable allocation, it's the number of movable pages which
1941 * we just obtained. For other types it's a bit more tricky.
1942 */
1943 if (start_type == MIGRATE_MOVABLE) {
1944 alike_pages = movable_pages;
1945 } else {
1946 /*
1947 * If we are falling back a RECLAIMABLE or UNMOVABLE allocation
1948 * to MOVABLE pageblock, consider all non-movable pages as
1949 * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
1950 * vice versa, be conservative since we can't distinguish the
1951 * exact migratetype of non-movable pages.
1952 */
1953 if (old_block_type == MIGRATE_MOVABLE)
1954 alike_pages = pageblock_nr_pages
1955 - (free_pages + movable_pages);
1956 else
1957 alike_pages = 0;
1958 }
1959
3bc48f96 1960 /* moving whole block can fail due to zone boundary conditions */
02aa0cdd 1961 if (!free_pages)
3bc48f96 1962 goto single_page;
fef903ef 1963
02aa0cdd
VB
1964 /*
1965 * If a sufficient number of pages in the block are either free or of
1966 * comparable migratability as our allocation, claim the whole block.
1967 */
1968 if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
4eb7dce6
JK
1969 page_group_by_mobility_disabled)
1970 set_pageblock_migratetype(page, start_type);
3bc48f96
VB
1971
1972 return;
1973
1974single_page:
6ab01363 1975 move_to_free_list(page, zone, current_order, start_type);
4eb7dce6
JK
1976}
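/*
 * Worked example for the "claim the whole block" test above (illustrative
 * numbers, assuming pageblock_nr_pages == 512): an UNMOVABLE allocation
 * falls back into a MOVABLE pageblock; move_freepages_block() finds
 * free_pages = 200 and movable_pages = 150, so alike_pages = 512 -
 * (200 + 150) = 162. Since 200 + 162 >= 256 (half a pageblock), the block
 * is re-marked MIGRATE_UNMOVABLE. If instead 300 of the allocated pages
 * were movable, alike_pages = 512 - (200 + 300) = 12 and 200 + 12 < 256,
 * so the pageblock would keep its MOVABLE type and only the free pages
 * would be moved.
 */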
1977
2149cdae
JK
1978/*
 1979 * Check whether there is a suitable fallback freepage with the requested order.
 1980 * If only_stealable is true, this function returns fallback_mt only if
 1981 * we can steal the other free pages all together. This helps to reduce
1982 * fragmentation due to mixed migratetype pages in one pageblock.
1983 */
1984int find_suitable_fallback(struct free_area *area, unsigned int order,
1985 int migratetype, bool only_stealable, bool *can_steal)
4eb7dce6
JK
1986{
1987 int i;
1988 int fallback_mt;
1989
1990 if (area->nr_free == 0)
1991 return -1;
1992
1993 *can_steal = false;
aa02d3c1 1994 for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) {
4eb7dce6 1995 fallback_mt = fallbacks[migratetype][i];
b03641af 1996 if (free_area_empty(area, fallback_mt))
4eb7dce6 1997 continue;
fef903ef 1998
4eb7dce6
JK
1999 if (can_steal_fallback(order, migratetype))
2000 *can_steal = true;
2001
2149cdae
JK
2002 if (!only_stealable)
2003 return fallback_mt;
2004
2005 if (*can_steal)
2006 return fallback_mt;
fef903ef 2007 }
4eb7dce6
JK
2008
2009 return -1;
fef903ef
SB
2010}
2011
0aaa29a5
MG
2012/*
2013 * Reserve a pageblock for exclusive use of high-order atomic allocations if
2014 * there are no empty page blocks that contain a page with a suitable order
2015 */
2016static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
2017 unsigned int alloc_order)
2018{
2019 int mt;
2020 unsigned long max_managed, flags;
2021
2022 /*
2023 * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
2024 * Check is race-prone but harmless.
2025 */
9705bea5 2026 max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages;
0aaa29a5
MG
2027 if (zone->nr_reserved_highatomic >= max_managed)
2028 return;
2029
2030 spin_lock_irqsave(&zone->lock, flags);
2031
2032 /* Recheck the nr_reserved_highatomic limit under the lock */
2033 if (zone->nr_reserved_highatomic >= max_managed)
2034 goto out_unlock;
2035
2036 /* Yoink! */
2037 mt = get_pageblock_migratetype(page);
1dd214b8
ZY
2038 /* Only reserve normal pageblocks (i.e., they can merge with others) */
2039 if (migratetype_is_mergeable(mt)) {
0aaa29a5
MG
2040 zone->nr_reserved_highatomic += pageblock_nr_pages;
2041 set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
02aa0cdd 2042 move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
0aaa29a5
MG
2043 }
2044
2045out_unlock:
2046 spin_unlock_irqrestore(&zone->lock, flags);
2047}
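/*
 * Illustrative sizing for the limit above (example numbers, assuming 4KB
 * pages and 2MB pageblocks, i.e. pageblock_nr_pages == 512): a zone
 * managing 4GB has zone_managed_pages() of about 1048576, so max_managed =
 * 1048576 / 100 + 512 = 10997 pages, i.e. roughly 21 pageblocks may be
 * reserved as MIGRATE_HIGHATOMIC before further reservations are declined.
 */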
2048
2049/*
2050 * Used when an allocation is about to fail under memory pressure. This
2051 * potentially hurts the reliability of high-order allocations when under
2052 * intense memory pressure but failed atomic allocations should be easier
2053 * to recover from than an OOM.
29fac03b
MK
2054 *
2055 * If @force is true, try to unreserve a pageblock even though highatomic
2056 * pageblock is exhausted.
0aaa29a5 2057 */
29fac03b
MK
2058static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
2059 bool force)
0aaa29a5
MG
2060{
2061 struct zonelist *zonelist = ac->zonelist;
2062 unsigned long flags;
2063 struct zoneref *z;
2064 struct zone *zone;
2065 struct page *page;
2066 int order;
04c8716f 2067 bool ret;
0aaa29a5 2068
97a225e6 2069 for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
0aaa29a5 2070 ac->nodemask) {
29fac03b
MK
2071 /*
2072 * Preserve at least one pageblock unless memory pressure
2073 * is really high.
2074 */
2075 if (!force && zone->nr_reserved_highatomic <=
2076 pageblock_nr_pages)
0aaa29a5
MG
2077 continue;
2078
2079 spin_lock_irqsave(&zone->lock, flags);
23baf831 2080 for (order = 0; order <= MAX_ORDER; order++) {
0aaa29a5
MG
2081 struct free_area *area = &(zone->free_area[order]);
2082
b03641af 2083 page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
a16601c5 2084 if (!page)
0aaa29a5
MG
2085 continue;
2086
0aaa29a5 2087 /*
4855e4a7
MK
 2088			 * In the page freeing path, the migratetype change is racy, so
 2089			 * we can encounter several free pages in a pageblock
f0953a1b 2090 * in this loop although we changed the pageblock type
4855e4a7
MK
2091 * from highatomic to ac->migratetype. So we should
2092 * adjust the count once.
0aaa29a5 2093 */
a6ffdc07 2094 if (is_migrate_highatomic_page(page)) {
4855e4a7
MK
2095 /*
2096 * It should never happen but changes to
2097 * locking could inadvertently allow a per-cpu
2098 * drain to add pages to MIGRATE_HIGHATOMIC
2099 * while unreserving so be safe and watch for
2100 * underflows.
2101 */
2102 zone->nr_reserved_highatomic -= min(
2103 pageblock_nr_pages,
2104 zone->nr_reserved_highatomic);
2105 }
0aaa29a5
MG
2106
2107 /*
2108 * Convert to ac->migratetype and avoid the normal
2109 * pageblock stealing heuristics. Minimally, the caller
2110 * is doing the work and needs the pages. More
2111 * importantly, if the block was always converted to
2112 * MIGRATE_UNMOVABLE or another type then the number
2113 * of pageblocks that cannot be completely freed
2114 * may increase.
2115 */
2116 set_pageblock_migratetype(page, ac->migratetype);
02aa0cdd
VB
2117 ret = move_freepages_block(zone, page, ac->migratetype,
2118 NULL);
29fac03b
MK
2119 if (ret) {
2120 spin_unlock_irqrestore(&zone->lock, flags);
2121 return ret;
2122 }
0aaa29a5
MG
2123 }
2124 spin_unlock_irqrestore(&zone->lock, flags);
2125 }
04c8716f
MK
2126
2127 return false;
0aaa29a5
MG
2128}
2129
3bc48f96
VB
2130/*
2131 * Try finding a free buddy page on the fallback list and put it on the free
2132 * list of requested migratetype, possibly along with other pages from the same
2133 * block, depending on fragmentation avoidance heuristics. Returns true if
2134 * fallback was found so that __rmqueue_smallest() can grab it.
b002529d
RV
2135 *
2136 * The use of signed ints for order and current_order is a deliberate
2137 * deviation from the rest of this file, to make the for loop
2138 * condition simpler.
3bc48f96 2139 */
85ccc8fa 2140static __always_inline bool
6bb15450
MG
2141__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
2142 unsigned int alloc_flags)
b2a0ac88 2143{
b8af2941 2144 struct free_area *area;
b002529d 2145 int current_order;
6bb15450 2146 int min_order = order;
b2a0ac88 2147 struct page *page;
4eb7dce6
JK
2148 int fallback_mt;
2149 bool can_steal;
b2a0ac88 2150
6bb15450
MG
2151 /*
2152 * Do not steal pages from freelists belonging to other pageblocks
2153 * i.e. orders < pageblock_order. If there are no local zones free,
2154 * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
2155 */
e933dc4a 2156 if (order < pageblock_order && alloc_flags & ALLOC_NOFRAGMENT)
6bb15450
MG
2157 min_order = pageblock_order;
2158
7a8f58f3
VB
2159 /*
2160 * Find the largest available free page in the other list. This roughly
2161 * approximates finding the pageblock with the most free pages, which
2162 * would be too costly to do exactly.
2163 */
23baf831 2164 for (current_order = MAX_ORDER; current_order >= min_order;
7aeb09f9 2165 --current_order) {
4eb7dce6
JK
2166 area = &(zone->free_area[current_order]);
2167 fallback_mt = find_suitable_fallback(area, current_order,
2149cdae 2168 start_migratetype, false, &can_steal);
4eb7dce6
JK
2169 if (fallback_mt == -1)
2170 continue;
b2a0ac88 2171
7a8f58f3
VB
2172 /*
2173 * We cannot steal all free pages from the pageblock and the
2174 * requested migratetype is movable. In that case it's better to
2175 * steal and split the smallest available page instead of the
2176 * largest available page, because even if the next movable
2177 * allocation falls back into a different pageblock than this
2178 * one, it won't cause permanent fragmentation.
2179 */
2180 if (!can_steal && start_migratetype == MIGRATE_MOVABLE
2181 && current_order > order)
2182 goto find_smallest;
b2a0ac88 2183
7a8f58f3
VB
2184 goto do_steal;
2185 }
e0fff1bd 2186
7a8f58f3 2187 return false;
e0fff1bd 2188
7a8f58f3 2189find_smallest:
23baf831 2190 for (current_order = order; current_order <= MAX_ORDER;
7a8f58f3
VB
2191 current_order++) {
2192 area = &(zone->free_area[current_order]);
2193 fallback_mt = find_suitable_fallback(area, current_order,
2194 start_migratetype, false, &can_steal);
2195 if (fallback_mt != -1)
2196 break;
b2a0ac88
MG
2197 }
2198
7a8f58f3
VB
2199 /*
2200 * This should not happen - we already found a suitable fallback
2201 * when looking for the largest page.
2202 */
23baf831 2203 VM_BUG_ON(current_order > MAX_ORDER);
7a8f58f3
VB
2204
2205do_steal:
b03641af 2206 page = get_page_from_free_area(area, fallback_mt);
7a8f58f3 2207
1c30844d
MG
2208 steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
2209 can_steal);
7a8f58f3
VB
2210
2211 trace_mm_page_alloc_extfrag(page, order, current_order,
2212 start_migratetype, fallback_mt);
2213
2214 return true;
2215
b2a0ac88
MG
2216}
2217
56fd56b8 2218/*
1da177e4
LT
2219 * Do the hard work of removing an element from the buddy allocator.
2220 * Call me with the zone->lock already held.
2221 */
85ccc8fa 2222static __always_inline struct page *
6bb15450
MG
2223__rmqueue(struct zone *zone, unsigned int order, int migratetype,
2224 unsigned int alloc_flags)
1da177e4 2225{
1da177e4
LT
2226 struct page *page;
2227
ce8f86ee
H
2228 if (IS_ENABLED(CONFIG_CMA)) {
2229 /*
2230 * Balance movable allocations between regular and CMA areas by
2231 * allocating from CMA when over half of the zone's free memory
2232 * is in the CMA area.
2233 */
2234 if (alloc_flags & ALLOC_CMA &&
2235 zone_page_state(zone, NR_FREE_CMA_PAGES) >
2236 zone_page_state(zone, NR_FREE_PAGES) / 2) {
2237 page = __rmqueue_cma_fallback(zone, order);
2238 if (page)
10e0f753 2239 return page;
ce8f86ee 2240 }
16867664 2241 }
3bc48f96 2242retry:
56fd56b8 2243 page = __rmqueue_smallest(zone, order, migratetype);
974a786e 2244 if (unlikely(!page)) {
8510e69c 2245 if (alloc_flags & ALLOC_CMA)
dc67647b
JK
2246 page = __rmqueue_cma_fallback(zone, order);
2247
6bb15450
MG
2248 if (!page && __rmqueue_fallback(zone, order, migratetype,
2249 alloc_flags))
3bc48f96 2250 goto retry;
728ec980 2251 }
b2a0ac88 2252 return page;
1da177e4
LT
2253}
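/*
 * Illustrative example of the CMA balancing above (example numbers): in a
 * zone with NR_FREE_PAGES == 800000, of which NR_FREE_CMA_PAGES == 500000,
 * a movable request carrying ALLOC_CMA is served from the CMA free lists
 * first because 500000 > 800000 / 2. Once free CMA drops to half or less
 * of the total free pages, requests go back to the regular free lists and
 * fall back to CMA only when __rmqueue_smallest() fails.
 */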
2254
5f63b720 2255/*
1da177e4
LT
2256 * Obtain a specified number of elements from the buddy allocator, all under
2257 * a single hold of the lock, for efficiency. Add them to the supplied list.
2258 * Returns the number of new pages which were placed at *list.
2259 */
5f63b720 2260static int rmqueue_bulk(struct zone *zone, unsigned int order,
b2a0ac88 2261 unsigned long count, struct list_head *list,
6bb15450 2262 int migratetype, unsigned int alloc_flags)
1da177e4 2263{
57490774 2264 unsigned long flags;
700d2e9a 2265 int i;
5f63b720 2266
57490774 2267 spin_lock_irqsave(&zone->lock, flags);
1da177e4 2268 for (i = 0; i < count; ++i) {
6bb15450
MG
2269 struct page *page = __rmqueue(zone, order, migratetype,
2270 alloc_flags);
085cc7d5 2271 if (unlikely(page == NULL))
1da177e4 2272 break;
81eabcbe
MG
2273
2274 /*
0fac3ba5
VB
 2275		 * Split buddy pages returned by expand() are received here in
 2276		 * physical page order. The page is added to the tail of the
 2277		 * caller's list. From the caller's perspective, the linked list
 2278		 * is therefore ordered by page number under some conditions.
 2279		 * This is useful for IO devices that can only forward from the
 2280		 * head of the list, and thus read in physical page order, and
 2281		 * for IO devices that can merge IO requests when the physical
 2282		 * pages are ordered properly.
81eabcbe 2283 */
bf75f200 2284 list_add_tail(&page->pcp_list, list);
bb14c2c7 2285 if (is_migrate_cma(get_pcppage_migratetype(page)))
d1ce749a
BZ
2286 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
2287 -(1 << order));
1da177e4 2288 }
a6de734b 2289
f2260e6b 2290 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
57490774 2291 spin_unlock_irqrestore(&zone->lock, flags);
2ede3c13 2292
700d2e9a 2293 return i;
1da177e4
LT
2294}
2295
4ae7c039 2296#ifdef CONFIG_NUMA
8fce4d8e 2297/*
4037d452
CL
2298 * Called from the vmstat counter updater to drain pagesets of this
2299 * currently executing processor on remote nodes after they have
2300 * expired.
8fce4d8e 2301 */
4037d452 2302void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
4ae7c039 2303{
7be12fc9 2304 int to_drain, batch;
4ae7c039 2305
4db0c3c2 2306 batch = READ_ONCE(pcp->batch);
7be12fc9 2307 to_drain = min(pcp->count, batch);
4b23a68f 2308 if (to_drain > 0) {
57490774 2309 spin_lock(&pcp->lock);
fd56eef2 2310 free_pcppages_bulk(zone, to_drain, pcp, 0);
57490774 2311 spin_unlock(&pcp->lock);
4b23a68f 2312 }
4ae7c039
CL
2313}
2314#endif
2315
9f8f2172 2316/*
93481ff0 2317 * Drain pcplists of the indicated processor and zone.
9f8f2172 2318 */
93481ff0 2319static void drain_pages_zone(unsigned int cpu, struct zone *zone)
1da177e4 2320{
93481ff0 2321 struct per_cpu_pages *pcp;
1da177e4 2322
28f836b6 2323 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
4b23a68f 2324 if (pcp->count) {
57490774 2325 spin_lock(&pcp->lock);
4b23a68f 2326 free_pcppages_bulk(zone, pcp->count, pcp, 0);
57490774 2327 spin_unlock(&pcp->lock);
4b23a68f 2328 }
93481ff0 2329}
3dfa5721 2330
93481ff0
VB
2331/*
2332 * Drain pcplists of all zones on the indicated processor.
93481ff0
VB
2333 */
2334static void drain_pages(unsigned int cpu)
2335{
2336 struct zone *zone;
2337
2338 for_each_populated_zone(zone) {
2339 drain_pages_zone(cpu, zone);
1da177e4
LT
2340 }
2341}
1da177e4 2342
9f8f2172
CL
2343/*
2344 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
2345 */
93481ff0 2346void drain_local_pages(struct zone *zone)
9f8f2172 2347{
93481ff0
VB
2348 int cpu = smp_processor_id();
2349
2350 if (zone)
2351 drain_pages_zone(cpu, zone);
2352 else
2353 drain_pages(cpu);
9f8f2172
CL
2354}
2355
2356/*
ec6e8c7e
VB
2357 * The implementation of drain_all_pages(), exposing an extra parameter to
2358 * drain on all cpus.
93481ff0 2359 *
ec6e8c7e
VB
2360 * drain_all_pages() is optimized to only execute on cpus where pcplists are
2361 * not empty. The check for non-emptiness can however race with a free to
2362 * pcplist that has not yet increased the pcp->count from 0 to 1. Callers
2363 * that need the guarantee that every CPU has drained can disable the
2364 * optimizing racy check.
9f8f2172 2365 */
3b1f3658 2366static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
9f8f2172 2367{
74046494 2368 int cpu;
74046494
GBY
2369
2370 /*
041711ce 2371 * Allocate in the BSS so we won't require allocation in
74046494
GBY
2372 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
2373 */
2374 static cpumask_t cpus_with_pcps;
2375
bd233f53
MG
2376 /*
2377 * Do not drain if one is already in progress unless it's specific to
2378 * a zone. Such callers are primarily CMA and memory hotplug and need
2379 * the drain to be complete when the call returns.
2380 */
2381 if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
2382 if (!zone)
2383 return;
2384 mutex_lock(&pcpu_drain_mutex);
2385 }
0ccce3b9 2386
74046494
GBY
2387 /*
2388 * We don't care about racing with CPU hotplug event
2389 * as offline notification will cause the notified
2390 * cpu to drain that CPU pcps and on_each_cpu_mask
2391 * disables preemption as part of its processing
2392 */
2393 for_each_online_cpu(cpu) {
28f836b6 2394 struct per_cpu_pages *pcp;
93481ff0 2395 struct zone *z;
74046494 2396 bool has_pcps = false;
93481ff0 2397
ec6e8c7e
VB
2398 if (force_all_cpus) {
2399 /*
2400 * The pcp.count check is racy, some callers need a
2401 * guarantee that no cpu is missed.
2402 */
2403 has_pcps = true;
2404 } else if (zone) {
28f836b6
MG
2405 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
2406 if (pcp->count)
74046494 2407 has_pcps = true;
93481ff0
VB
2408 } else {
2409 for_each_populated_zone(z) {
28f836b6
MG
2410 pcp = per_cpu_ptr(z->per_cpu_pageset, cpu);
2411 if (pcp->count) {
93481ff0
VB
2412 has_pcps = true;
2413 break;
2414 }
74046494
GBY
2415 }
2416 }
93481ff0 2417
74046494
GBY
2418 if (has_pcps)
2419 cpumask_set_cpu(cpu, &cpus_with_pcps);
2420 else
2421 cpumask_clear_cpu(cpu, &cpus_with_pcps);
2422 }
0ccce3b9 2423
bd233f53 2424 for_each_cpu(cpu, &cpus_with_pcps) {
443c2acc
NSJ
2425 if (zone)
2426 drain_pages_zone(cpu, zone);
2427 else
2428 drain_pages(cpu);
0ccce3b9 2429 }
bd233f53
MG
2430
2431 mutex_unlock(&pcpu_drain_mutex);
9f8f2172
CL
2432}
2433
ec6e8c7e
VB
2434/*
2435 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
2436 *
2437 * When zone parameter is non-NULL, spill just the single zone's pages.
ec6e8c7e
VB
2438 */
2439void drain_all_pages(struct zone *zone)
2440{
2441 __drain_all_pages(zone, false);
2442}
2443
296699de 2444#ifdef CONFIG_HIBERNATION
1da177e4 2445
556b969a
CY
2446/*
2447 * Touch the watchdog for every WD_PAGE_COUNT pages.
2448 */
2449#define WD_PAGE_COUNT (128*1024)
2450
1da177e4
LT
2451void mark_free_pages(struct zone *zone)
2452{
556b969a 2453 unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
f623f0db 2454 unsigned long flags;
7aeb09f9 2455 unsigned int order, t;
86760a2c 2456 struct page *page;
1da177e4 2457
8080fc03 2458 if (zone_is_empty(zone))
1da177e4
LT
2459 return;
2460
2461 spin_lock_irqsave(&zone->lock, flags);
f623f0db 2462
108bcc96 2463 max_zone_pfn = zone_end_pfn(zone);
f623f0db
RW
2464 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
2465 if (pfn_valid(pfn)) {
86760a2c 2466 page = pfn_to_page(pfn);
ba6b0979 2467
556b969a
CY
2468 if (!--page_count) {
2469 touch_nmi_watchdog();
2470 page_count = WD_PAGE_COUNT;
2471 }
2472
ba6b0979
JK
2473 if (page_zone(page) != zone)
2474 continue;
2475
7be98234
RW
2476 if (!swsusp_page_is_forbidden(page))
2477 swsusp_unset_page_free(page);
f623f0db 2478 }
1da177e4 2479
b2a0ac88 2480 for_each_migratetype_order(order, t) {
86760a2c 2481 list_for_each_entry(page,
bf75f200 2482 &zone->free_area[order].free_list[t], buddy_list) {
f623f0db 2483 unsigned long i;
1da177e4 2484
86760a2c 2485 pfn = page_to_pfn(page);
556b969a
CY
2486 for (i = 0; i < (1UL << order); i++) {
2487 if (!--page_count) {
2488 touch_nmi_watchdog();
2489 page_count = WD_PAGE_COUNT;
2490 }
7be98234 2491 swsusp_set_page_free(pfn_to_page(pfn + i));
556b969a 2492 }
f623f0db 2493 }
b2a0ac88 2494 }
1da177e4
LT
2495 spin_unlock_irqrestore(&zone->lock, flags);
2496}
e2c55dc8 2497#endif /* CONFIG_HIBERNATION */
1da177e4 2498
44042b44
MG
2499static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
2500 unsigned int order)
1da177e4 2501{
5f8dcc21 2502 int migratetype;
1da177e4 2503
700d2e9a 2504 if (!free_pages_prepare(page, order, FPI_NONE))
9cca35d4 2505 return false;
689bcebf 2506
dc4b0caf 2507 migratetype = get_pfnblock_migratetype(page, pfn);
bb14c2c7 2508 set_pcppage_migratetype(page, migratetype);
9cca35d4
MG
2509 return true;
2510}
2511
f26b3fa0
MG
2512static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch,
2513 bool free_high)
3b12e7e9
MG
2514{
2515 int min_nr_free, max_nr_free;
2516
f26b3fa0
MG
2517 /* Free everything if batch freeing high-order pages. */
2518 if (unlikely(free_high))
2519 return pcp->count;
2520
3b12e7e9
MG
2521 /* Check for PCP disabled or boot pageset */
2522 if (unlikely(high < batch))
2523 return 1;
2524
2525 /* Leave at least pcp->batch pages on the list */
2526 min_nr_free = batch;
2527 max_nr_free = high - batch;
2528
2529 /*
2530 * Double the number of pages freed each time there is subsequent
2531 * freeing of pages without any allocation.
2532 */
2533 batch <<= pcp->free_factor;
2534 if (batch < max_nr_free)
2535 pcp->free_factor++;
2536 batch = clamp(batch, min_nr_free, max_nr_free);
2537
2538 return batch;
2539}
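/*
 * Worked example for the scaling above (illustrative values): with
 * high == 186 and batch == 63, min_nr_free == 63 and max_nr_free == 123.
 * On the first burst of frees (free_factor == 0), 63 pages are returned to
 * the buddy lists and free_factor is bumped to 1; if freeing continues
 * without allocations, the next pass computes 63 << 1 == 126, which is
 * clamped to 123. Any allocation halves free_factor again in
 * rmqueue_pcplist().
 */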
2540
f26b3fa0
MG
2541static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
2542 bool free_high)
c49c2c47
MG
2543{
2544 int high = READ_ONCE(pcp->high);
2545
f26b3fa0 2546 if (unlikely(!high || free_high))
c49c2c47
MG
2547 return 0;
2548
2549 if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags))
2550 return high;
2551
2552 /*
2553 * If reclaim is active, limit the number of pages that can be
2554 * stored on pcp lists
2555 */
2556 return min(READ_ONCE(pcp->batch) << 2, high);
2557}
2558
4b23a68f
MG
2559static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
2560 struct page *page, int migratetype,
56651377 2561 unsigned int order)
9cca35d4 2562{
3b12e7e9 2563 int high;
44042b44 2564 int pindex;
f26b3fa0 2565 bool free_high;
9cca35d4 2566
15cd9004 2567 __count_vm_events(PGFREE, 1 << order);
44042b44 2568 pindex = order_to_pindex(migratetype, order);
bf75f200 2569 list_add(&page->pcp_list, &pcp->lists[pindex]);
44042b44 2570 pcp->count += 1 << order;
f26b3fa0
MG
2571
2572 /*
2573 * As high-order pages other than THP's stored on PCP can contribute
2574 * to fragmentation, limit the number stored when PCP is heavily
2575 * freeing without allocation. The remainder after bulk freeing
2576 * stops will be drained from vmstat refresh context.
2577 */
2578 free_high = (pcp->free_factor && order && order <= PAGE_ALLOC_COSTLY_ORDER);
2579
2580 high = nr_pcp_high(pcp, zone, free_high);
3b12e7e9
MG
2581 if (pcp->count >= high) {
2582 int batch = READ_ONCE(pcp->batch);
2583
f26b3fa0 2584 free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch, free_high), pcp, pindex);
3b12e7e9 2585 }
9cca35d4 2586}
5f8dcc21 2587
9cca35d4 2588/*
44042b44 2589 * Free a pcp page
9cca35d4 2590 */
44042b44 2591void free_unref_page(struct page *page, unsigned int order)
9cca35d4 2592{
4b23a68f
MG
2593 unsigned long __maybe_unused UP_flags;
2594 struct per_cpu_pages *pcp;
2595 struct zone *zone;
9cca35d4 2596 unsigned long pfn = page_to_pfn(page);
df1acc85 2597 int migratetype;
9cca35d4 2598
44042b44 2599 if (!free_unref_page_prepare(page, pfn, order))
9cca35d4 2600 return;
da456f14 2601
5f8dcc21
MG
2602 /*
2603 * We only track unmovable, reclaimable and movable on pcp lists.
df1acc85 2604 * Place ISOLATE pages on the isolated list because they are being
a6ffdc07 2605 * offlined but treat HIGHATOMIC as movable pages so we can get those
5f8dcc21
MG
2606 * areas back if necessary. Otherwise, we may have to free
2607 * excessively into the page allocator
2608 */
df1acc85
MG
2609 migratetype = get_pcppage_migratetype(page);
2610 if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
194159fb 2611 if (unlikely(is_migrate_isolate(migratetype))) {
44042b44 2612 free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE);
9cca35d4 2613 return;
5f8dcc21
MG
2614 }
2615 migratetype = MIGRATE_MOVABLE;
2616 }
2617
4b23a68f
MG
2618 zone = page_zone(page);
2619 pcp_trylock_prepare(UP_flags);
57490774 2620 pcp = pcp_spin_trylock(zone->per_cpu_pageset);
01b44456 2621 if (pcp) {
4b23a68f 2622 free_unref_page_commit(zone, pcp, page, migratetype, order);
57490774 2623 pcp_spin_unlock(pcp);
4b23a68f
MG
2624 } else {
2625 free_one_page(zone, page, pfn, order, migratetype, FPI_NONE);
2626 }
2627 pcp_trylock_finish(UP_flags);
1da177e4
LT
2628}
2629
cc59850e
KK
2630/*
2631 * Free a list of 0-order pages
2632 */
2d4894b5 2633void free_unref_page_list(struct list_head *list)
cc59850e 2634{
57490774 2635 unsigned long __maybe_unused UP_flags;
cc59850e 2636 struct page *page, *next;
4b23a68f
MG
2637 struct per_cpu_pages *pcp = NULL;
2638 struct zone *locked_zone = NULL;
c24ad77d 2639 int batch_count = 0;
df1acc85 2640 int migratetype;
9cca35d4
MG
2641
2642 /* Prepare pages for freeing */
2643 list_for_each_entry_safe(page, next, list, lru) {
56651377 2644 unsigned long pfn = page_to_pfn(page);
053cfda1 2645 if (!free_unref_page_prepare(page, pfn, 0)) {
9cca35d4 2646 list_del(&page->lru);
053cfda1
ML
2647 continue;
2648 }
df1acc85
MG
2649
2650 /*
2651 * Free isolated pages directly to the allocator, see
2652 * comment in free_unref_page.
2653 */
2654 migratetype = get_pcppage_migratetype(page);
47aef601
DB
2655 if (unlikely(is_migrate_isolate(migratetype))) {
2656 list_del(&page->lru);
2657 free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE);
2658 continue;
df1acc85 2659 }
9cca35d4 2660 }
cc59850e
KK
2661
2662 list_for_each_entry_safe(page, next, list, lru) {
4b23a68f
MG
2663 struct zone *zone = page_zone(page);
2664
c3e58a70 2665 list_del(&page->lru);
57490774 2666 migratetype = get_pcppage_migratetype(page);
c3e58a70 2667
a4bafffb
MG
2668 /*
2669 * Either different zone requiring a different pcp lock or
2670 * excessive lock hold times when freeing a large list of
2671 * pages.
2672 */
2673 if (zone != locked_zone || batch_count == SWAP_CLUSTER_MAX) {
57490774
MG
2674 if (pcp) {
2675 pcp_spin_unlock(pcp);
2676 pcp_trylock_finish(UP_flags);
2677 }
01b44456 2678
a4bafffb
MG
2679 batch_count = 0;
2680
57490774
MG
2681 /*
2682 * trylock is necessary as pages may be getting freed
2683 * from IRQ or SoftIRQ context after an IO completion.
2684 */
2685 pcp_trylock_prepare(UP_flags);
2686 pcp = pcp_spin_trylock(zone->per_cpu_pageset);
2687 if (unlikely(!pcp)) {
2688 pcp_trylock_finish(UP_flags);
2689 free_one_page(zone, page, page_to_pfn(page),
2690 0, migratetype, FPI_NONE);
2691 locked_zone = NULL;
2692 continue;
2693 }
4b23a68f 2694 locked_zone = zone;
4b23a68f
MG
2695 }
2696
47aef601
DB
2697 /*
2698 * Non-isolated types over MIGRATE_PCPTYPES get added
2699 * to the MIGRATE_MOVABLE pcp list.
2700 */
47aef601
DB
2701 if (unlikely(migratetype >= MIGRATE_PCPTYPES))
2702 migratetype = MIGRATE_MOVABLE;
2703
2d4894b5 2704 trace_mm_page_free_batched(page);
4b23a68f 2705 free_unref_page_commit(zone, pcp, page, migratetype, 0);
a4bafffb 2706 batch_count++;
cc59850e 2707 }
4b23a68f 2708
57490774
MG
2709 if (pcp) {
2710 pcp_spin_unlock(pcp);
2711 pcp_trylock_finish(UP_flags);
2712 }
cc59850e
KK
2713}
2714
8dfcc9ba
NP
2715/*
2716 * split_page takes a non-compound higher-order page, and splits it into
 2717 * n (1 << order) sub-pages: page[0..n-1]
2718 * Each sub-page must be freed individually.
2719 *
2720 * Note: this is probably too low level an operation for use in drivers.
2721 * Please consult with lkml before using this in your driver.
2722 */
2723void split_page(struct page *page, unsigned int order)
2724{
2725 int i;
2726
309381fe
SL
2727 VM_BUG_ON_PAGE(PageCompound(page), page);
2728 VM_BUG_ON_PAGE(!page_count(page), page);
b1eeab67 2729
a9627bc5 2730 for (i = 1; i < (1 << order); i++)
7835e98b 2731 set_page_refcounted(page + i);
8fb156c9 2732 split_page_owner(page, 1 << order);
e1baddf8 2733 split_page_memcg(page, 1 << order);
8dfcc9ba 2734}
5853ff23 2735EXPORT_SYMBOL_GPL(split_page);
8dfcc9ba 2736
3c605096 2737int __isolate_free_page(struct page *page, unsigned int order)
748446bb 2738{
9a157dd8
KW
2739 struct zone *zone = page_zone(page);
2740 int mt = get_pageblock_migratetype(page);
748446bb 2741
194159fb 2742 if (!is_migrate_isolate(mt)) {
9a157dd8 2743 unsigned long watermark;
8348faf9
VB
2744 /*
2745 * Obey watermarks as if the page was being allocated. We can
2746 * emulate a high-order watermark check with a raised order-0
2747 * watermark, because we already know our high-order page
2748 * exists.
2749 */
fd1444b2 2750 watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
d883c6cf 2751 if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
2e30abd1
MS
2752 return 0;
2753
8fb74b9f 2754 __mod_zone_freepage_state(zone, -(1UL << order), mt);
2e30abd1 2755 }
748446bb 2756
6ab01363 2757 del_page_from_free_list(page, zone, order);
2139cbe6 2758
400bc7fd 2759 /*
 2760	 * Update the pageblock migratetype if the isolated page covers
 2761	 * at least half of a pageblock
2762 */
748446bb
MG
2763 if (order >= pageblock_order - 1) {
2764 struct page *endpage = page + (1 << order) - 1;
47118af0
MN
2765 for (; page < endpage; page += pageblock_nr_pages) {
2766 int mt = get_pageblock_migratetype(page);
1dd214b8
ZY
2767 /*
2768 * Only change normal pageblocks (i.e., they can merge
2769 * with others)
2770 */
2771 if (migratetype_is_mergeable(mt))
47118af0
MN
2772 set_pageblock_migratetype(page,
2773 MIGRATE_MOVABLE);
2774 }
748446bb
MG
2775 }
2776
8fb74b9f 2777 return 1UL << order;
1fb3f8ca
MG
2778}
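/*
 * Illustrative example of the emulated watermark check above (example
 * numbers): isolating an order-4 page (16 pages) from a zone whose min
 * watermark is 1024 pages requires zone_watermark_ok() to pass with a
 * raised order-0 mark of 1024 + 16 = 1040 free pages, so pulling the 16
 * pages out cannot push the zone below its min watermark.
 */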
2779
624f58d8
AD
2780/**
2781 * __putback_isolated_page - Return a now-isolated page back where we got it
2782 * @page: Page that was isolated
2783 * @order: Order of the isolated page
e6a0a7ad 2784 * @mt: The page's pageblock's migratetype
624f58d8
AD
2785 *
2786 * This function is meant to return a page pulled from the free lists via
2787 * __isolate_free_page back to the free lists they were pulled from.
2788 */
2789void __putback_isolated_page(struct page *page, unsigned int order, int mt)
2790{
2791 struct zone *zone = page_zone(page);
2792
2793 /* zone lock should be held when this function is called */
2794 lockdep_assert_held(&zone->lock);
2795
2796 /* Return isolated page to tail of freelist. */
f04a5d5d 2797 __free_one_page(page, page_to_pfn(page), zone, order, mt,
47b6a24a 2798 FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL);
624f58d8
AD
2799}
2800
060e7417
MG
2801/*
2802 * Update NUMA hit/miss statistics
060e7417 2803 */
3e23060b
MG
2804static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
2805 long nr_account)
060e7417
MG
2806{
2807#ifdef CONFIG_NUMA
3a321d2a 2808 enum numa_stat_item local_stat = NUMA_LOCAL;
060e7417 2809
4518085e
KW
2810 /* skip numa counters update if numa stats is disabled */
2811 if (!static_branch_likely(&vm_numa_stat_key))
2812 return;
2813
c1093b74 2814 if (zone_to_nid(z) != numa_node_id())
060e7417 2815 local_stat = NUMA_OTHER;
060e7417 2816
c1093b74 2817 if (zone_to_nid(z) == zone_to_nid(preferred_zone))
3e23060b 2818 __count_numa_events(z, NUMA_HIT, nr_account);
2df26639 2819 else {
3e23060b
MG
2820 __count_numa_events(z, NUMA_MISS, nr_account);
2821 __count_numa_events(preferred_zone, NUMA_FOREIGN, nr_account);
060e7417 2822 }
3e23060b 2823 __count_numa_events(z, local_stat, nr_account);
060e7417
MG
2824#endif
2825}
2826
589d9973
MG
2827static __always_inline
2828struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
2829 unsigned int order, unsigned int alloc_flags,
2830 int migratetype)
2831{
2832 struct page *page;
2833 unsigned long flags;
2834
2835 do {
2836 page = NULL;
2837 spin_lock_irqsave(&zone->lock, flags);
2838 /*
2839 * order-0 request can reach here when the pcplist is skipped
2840 * due to non-CMA allocation context. HIGHATOMIC area is
2841 * reserved for high-order atomic allocation, so order-0
2842 * request should skip it.
2843 */
eb2e2b42 2844 if (alloc_flags & ALLOC_HIGHATOMIC)
589d9973
MG
2845 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
2846 if (!page) {
2847 page = __rmqueue(zone, order, migratetype, alloc_flags);
eb2e2b42
MG
2848
2849 /*
2850 * If the allocation fails, allow OOM handling access
2851 * to HIGHATOMIC reserves as failing now is worse than
2852 * failing a high-order atomic allocation in the
2853 * future.
2854 */
2855 if (!page && (alloc_flags & ALLOC_OOM))
2856 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
2857
589d9973
MG
2858 if (!page) {
2859 spin_unlock_irqrestore(&zone->lock, flags);
2860 return NULL;
2861 }
2862 }
2863 __mod_zone_freepage_state(zone, -(1 << order),
2864 get_pcppage_migratetype(page));
2865 spin_unlock_irqrestore(&zone->lock, flags);
2866 } while (check_new_pages(page, order));
2867
2868 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
2869 zone_statistics(preferred_zone, zone, 1);
2870
2871 return page;
2872}
2873
066b2393 2874/* Remove page from the per-cpu list, caller must protect the list */
3b822017 2875static inline
44042b44
MG
2876struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
2877 int migratetype,
6bb15450 2878 unsigned int alloc_flags,
453f85d4 2879 struct per_cpu_pages *pcp,
066b2393
MG
2880 struct list_head *list)
2881{
2882 struct page *page;
2883
2884 do {
2885 if (list_empty(list)) {
44042b44
MG
2886 int batch = READ_ONCE(pcp->batch);
2887 int alloced;
2888
2889 /*
2890 * Scale batch relative to order if batch implies
2891 * free pages can be stored on the PCP. Batch can
2892 * be 1 for small zones or for boot pagesets which
2893 * should never store free pages as the pages may
2894 * belong to arbitrary zones.
2895 */
2896 if (batch > 1)
2897 batch = max(batch >> order, 2);
2898 alloced = rmqueue_bulk(zone, order,
2899 batch, list,
6bb15450 2900 migratetype, alloc_flags);
44042b44
MG
2901
2902 pcp->count += alloced << order;
066b2393
MG
2903 if (unlikely(list_empty(list)))
2904 return NULL;
2905 }
2906
bf75f200
MG
2907 page = list_first_entry(list, struct page, pcp_list);
2908 list_del(&page->pcp_list);
44042b44 2909 pcp->count -= 1 << order;
700d2e9a 2910 } while (check_new_pages(page, order));
066b2393
MG
2911
2912 return page;
2913}
2914
2915/* Lock and remove page from the per-cpu list */
2916static struct page *rmqueue_pcplist(struct zone *preferred_zone,
44042b44 2917 struct zone *zone, unsigned int order,
663d0cfd 2918 int migratetype, unsigned int alloc_flags)
066b2393
MG
2919{
2920 struct per_cpu_pages *pcp;
2921 struct list_head *list;
066b2393 2922 struct page *page;
4b23a68f 2923 unsigned long __maybe_unused UP_flags;
066b2393 2924
57490774 2925 /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
4b23a68f 2926 pcp_trylock_prepare(UP_flags);
57490774 2927 pcp = pcp_spin_trylock(zone->per_cpu_pageset);
01b44456 2928 if (!pcp) {
4b23a68f 2929 pcp_trylock_finish(UP_flags);
4b23a68f
MG
2930 return NULL;
2931 }
3b12e7e9
MG
2932
2933 /*
2934 * On allocation, reduce the number of pages that are batch freed.
2935 * See nr_pcp_free() where free_factor is increased for subsequent
2936 * frees.
2937 */
3b12e7e9 2938 pcp->free_factor >>= 1;
44042b44
MG
2939 list = &pcp->lists[order_to_pindex(migratetype, order)];
2940 page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
57490774 2941 pcp_spin_unlock(pcp);
4b23a68f 2942 pcp_trylock_finish(UP_flags);
066b2393 2943 if (page) {
15cd9004 2944 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
3e23060b 2945 zone_statistics(preferred_zone, zone, 1);
066b2393 2946 }
066b2393
MG
2947 return page;
2948}
2949
1da177e4 2950/*
a57ae9ef
RX
2951 * Allocate a page from the given zone.
2952 * Use pcplists for THP or "cheap" high-order allocations.
1da177e4 2953 */
b073d7f8
AP
2954
2955/*
2956 * Do not instrument rmqueue() with KMSAN. This function may call
2957 * __msan_poison_alloca() through a call to set_pfnblock_flags_mask().
2958 * If __msan_poison_alloca() attempts to allocate pages for the stack depot, it
2959 * may call rmqueue() again, which will result in a deadlock.
1da177e4 2960 */
b073d7f8 2961__no_sanitize_memory
0a15c3e9 2962static inline
066b2393 2963struct page *rmqueue(struct zone *preferred_zone,
7aeb09f9 2964 struct zone *zone, unsigned int order,
c603844b
MG
2965 gfp_t gfp_flags, unsigned int alloc_flags,
2966 int migratetype)
1da177e4 2967{
689bcebf 2968 struct page *page;
1da177e4 2969
589d9973
MG
2970 /*
2971 * We most definitely don't want callers attempting to
2972 * allocate greater than order-1 page units with __GFP_NOFAIL.
2973 */
2974 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
2975
44042b44 2976 if (likely(pcp_allowed_order(order))) {
1d91df85
JK
2977 /*
2978 * MIGRATE_MOVABLE pcplist could have the pages on CMA area and
2979 * we need to skip it when CMA area isn't allowed.
2980 */
2981 if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
2982 migratetype != MIGRATE_MOVABLE) {
44042b44 2983 page = rmqueue_pcplist(preferred_zone, zone, order,
663d0cfd 2984 migratetype, alloc_flags);
4b23a68f
MG
2985 if (likely(page))
2986 goto out;
1d91df85 2987 }
066b2393 2988 }
83b9355b 2989
589d9973
MG
2990 page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags,
2991 migratetype);
1da177e4 2992
066b2393 2993out:
73444bc4 2994 /* Separate test+clear to avoid unnecessary atomics */
e2a66c21 2995 if (unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) {
73444bc4
MG
2996 clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
2997 wakeup_kswapd(zone, 0, 0, zone_idx(zone));
2998 }
2999
066b2393 3000 VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
1da177e4
LT
3001 return page;
3002}
3003
933e312e
AM
3004#ifdef CONFIG_FAIL_PAGE_ALLOC
3005
b2588c4b 3006static struct {
933e312e
AM
3007 struct fault_attr attr;
3008
621a5f7a 3009 bool ignore_gfp_highmem;
71baba4b 3010 bool ignore_gfp_reclaim;
54114994 3011 u32 min_order;
933e312e
AM
3012} fail_page_alloc = {
3013 .attr = FAULT_ATTR_INITIALIZER,
71baba4b 3014 .ignore_gfp_reclaim = true,
621a5f7a 3015 .ignore_gfp_highmem = true,
54114994 3016 .min_order = 1,
933e312e
AM
3017};
3018
3019static int __init setup_fail_page_alloc(char *str)
3020{
3021 return setup_fault_attr(&fail_page_alloc.attr, str);
3022}
3023__setup("fail_page_alloc=", setup_fail_page_alloc);
3024
af3b8544 3025static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
933e312e 3026{
ea4452de
QZ
3027 int flags = 0;
3028
54114994 3029 if (order < fail_page_alloc.min_order)
deaf386e 3030 return false;
933e312e 3031 if (gfp_mask & __GFP_NOFAIL)
deaf386e 3032 return false;
933e312e 3033 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
deaf386e 3034 return false;
71baba4b
MG
3035 if (fail_page_alloc.ignore_gfp_reclaim &&
3036 (gfp_mask & __GFP_DIRECT_RECLAIM))
deaf386e 3037 return false;
933e312e 3038
ea4452de 3039 /* See comment in __should_failslab() */
3f913fc5 3040 if (gfp_mask & __GFP_NOWARN)
ea4452de 3041 flags |= FAULT_NOWARN;
3f913fc5 3042
ea4452de 3043 return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags);
933e312e
AM
3044}
3045
3046#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
3047
3048static int __init fail_page_alloc_debugfs(void)
3049{
0825a6f9 3050 umode_t mode = S_IFREG | 0600;
933e312e 3051 struct dentry *dir;
933e312e 3052
dd48c085
AM
3053 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
3054 &fail_page_alloc.attr);
b2588c4b 3055
d9f7979c
GKH
3056 debugfs_create_bool("ignore-gfp-wait", mode, dir,
3057 &fail_page_alloc.ignore_gfp_reclaim);
3058 debugfs_create_bool("ignore-gfp-highmem", mode, dir,
3059 &fail_page_alloc.ignore_gfp_highmem);
3060 debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
933e312e 3061
d9f7979c 3062 return 0;
933e312e
AM
3063}
3064
3065late_initcall(fail_page_alloc_debugfs);
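
/*
 * Illustrative usage of the knobs created above (a sketch; paths assume the
 * default debugfs mount point, and the common fault_attr files such as
 * probability, times and verbose come from fault_create_debugfs_attr(),
 * while ignore-gfp-wait, ignore-gfp-highmem and min-order are added here):
 *
 *	# make roughly 10% of order >= 1 allocation attempts fail, verbosely
 *	echo 10 > /sys/kernel/debug/fail_page_alloc/probability
 *	echo -1 > /sys/kernel/debug/fail_page_alloc/times
 *	echo 1  > /sys/kernel/debug/fail_page_alloc/min-order
 *	echo 1  > /sys/kernel/debug/fail_page_alloc/verbose
 */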
3066
3067#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
3068
3069#else /* CONFIG_FAIL_PAGE_ALLOC */
3070
af3b8544 3071static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
933e312e 3072{
deaf386e 3073 return false;
933e312e
AM
3074}
3075
3076#endif /* CONFIG_FAIL_PAGE_ALLOC */
3077
54aa3866 3078noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
af3b8544
BP
3079{
3080 return __should_fail_alloc_page(gfp_mask, order);
3081}
3082ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
3083
f27ce0e1
JK
3084static inline long __zone_watermark_unusable_free(struct zone *z,
3085 unsigned int order, unsigned int alloc_flags)
3086{
f27ce0e1
JK
3087 long unusable_free = (1 << order) - 1;
3088
3089 /*
ab350885
MG
3090 * If the caller does not have rights to reserves below the min
3091 * watermark then subtract the high-atomic reserves. This will
3092 * over-estimate the size of the atomic reserve but it avoids a search.
f27ce0e1 3093 */
ab350885 3094 if (likely(!(alloc_flags & ALLOC_RESERVES)))
f27ce0e1
JK
3095 unusable_free += z->nr_reserved_highatomic;
3096
3097#ifdef CONFIG_CMA
3098 /* If allocation can't use CMA areas don't use free CMA pages */
3099 if (!(alloc_flags & ALLOC_CMA))
3100 unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
3101#endif
3102
3103 return unusable_free;
3104}
3105
1da177e4 3106/*
97a16fc8
MG
3107 * Return true if free base pages are above 'mark'. For high-order checks it
 3108 * will return true if the order-0 watermark is reached and there is at least
3109 * one free page of a suitable size. Checking now avoids taking the zone lock
3110 * to check in the allocation paths if no pages are free.
1da177e4 3111 */
86a294a8 3112bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
97a225e6 3113 int highest_zoneidx, unsigned int alloc_flags,
86a294a8 3114 long free_pages)
1da177e4 3115{
d23ad423 3116 long min = mark;
1da177e4
LT
3117 int o;
3118
0aaa29a5 3119 /* free_pages may go negative - that's OK */
f27ce0e1 3120 free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
0aaa29a5 3121
ab350885
MG
3122 if (unlikely(alloc_flags & ALLOC_RESERVES)) {
3123 /*
3124 * __GFP_HIGH allows access to 50% of the min reserve as well
3125 * as OOM.
3126 */
1ebbb218 3127 if (alloc_flags & ALLOC_MIN_RESERVE) {
ab350885 3128 min -= min / 2;
0aaa29a5 3129
1ebbb218
MG
3130 /*
3131 * Non-blocking allocations (e.g. GFP_ATOMIC) can
3132 * access more reserves than just __GFP_HIGH. Other
3133 * non-blocking allocations requests such as GFP_NOWAIT
3134 * or (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) do not get
3135 * access to the min reserve.
3136 */
3137 if (alloc_flags & ALLOC_NON_BLOCK)
3138 min -= min / 4;
3139 }
0aaa29a5 3140
cd04ae1e 3141 /*
ab350885 3142 * OOM victims can try even harder than the normal reserve
cd04ae1e
MH
3143 * users on the grounds that it's definitely going to be in
3144 * the exit path shortly and free memory. Any allocation it
3145 * makes during the free path will be small and short-lived.
3146 */
3147 if (alloc_flags & ALLOC_OOM)
3148 min -= min / 2;
cd04ae1e
MH
3149 }
3150
97a16fc8
MG
3151 /*
3152 * Check watermarks for an order-0 allocation request. If these
3153 * are not met, then a high-order request also cannot go ahead
3154 * even if a suitable page happened to be free.
3155 */
97a225e6 3156 if (free_pages <= min + z->lowmem_reserve[highest_zoneidx])
88f5acf8 3157 return false;
1da177e4 3158
97a16fc8
MG
3159 /* If this is an order-0 request then the watermark is fine */
3160 if (!order)
3161 return true;
3162
3163 /* For a high-order request, check at least one suitable page is free */
23baf831 3164 for (o = order; o <= MAX_ORDER; o++) {
97a16fc8
MG
3165 struct free_area *area = &z->free_area[o];
3166 int mt;
3167
3168 if (!area->nr_free)
3169 continue;
3170
97a16fc8 3171 for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
b03641af 3172 if (!free_area_empty(area, mt))
97a16fc8
MG
3173 return true;
3174 }
3175
3176#ifdef CONFIG_CMA
d883c6cf 3177 if ((alloc_flags & ALLOC_CMA) &&
b03641af 3178 !free_area_empty(area, MIGRATE_CMA)) {
97a16fc8 3179 return true;
d883c6cf 3180 }
97a16fc8 3181#endif
eb2e2b42
MG
3182 if ((alloc_flags & (ALLOC_HIGHATOMIC|ALLOC_OOM)) &&
3183 !free_area_empty(area, MIGRATE_HIGHATOMIC)) {
b050e376 3184 return true;
eb2e2b42 3185 }
1da177e4 3186 }
97a16fc8 3187 return false;
88f5acf8
MG
3188}
3189
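
The reserve scaling above is cumulative: ALLOC_MIN_RESERVE lowers the min watermark by half, ALLOC_NON_BLOCK then drops a further quarter of that halved value (leaving roughly 37.5% of the original), and ALLOC_OOM halves whatever is left. A small userspace sketch of that arithmetic follows; the F_* flags and the 1024-page watermark are invented for the example.

/* Userspace model of the watermark scaling above; not kernel code. */
#include <stdio.h>

#define F_MIN_RESERVE  0x1      /* hypothetical stand-ins for the ALLOC_* flags */
#define F_NON_BLOCK    0x2
#define F_OOM          0x4

static long scaled_min_watermark(long min, unsigned int flags)
{
        if (flags & F_MIN_RESERVE) {
                min -= min / 2;         /* __GFP_HIGH: dip into 50% of the reserve */
                if (flags & F_NON_BLOCK)
                        min -= min / 4; /* GFP_ATOMIC: a further quarter of that */
        }
        if (flags & F_OOM)
                min -= min / 2;         /* OOM victims: half of whatever is left */
        return min;
}

int main(void)
{
        long min = 1024;        /* hypothetical min watermark in pages */

        printf("plain min        : %ld\n", scaled_min_watermark(min, 0));
        printf("__GFP_HIGH       : %ld\n", scaled_min_watermark(min, F_MIN_RESERVE));
        printf("GFP_ATOMIC       : %ld\n",
               scaled_min_watermark(min, F_MIN_RESERVE | F_NON_BLOCK));
        printf("atomic OOM victim: %ld\n",
               scaled_min_watermark(min, F_MIN_RESERVE | F_NON_BLOCK | F_OOM));
        return 0;
}
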
7aeb09f9 3190bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
97a225e6 3191 int highest_zoneidx, unsigned int alloc_flags)
88f5acf8 3192{
97a225e6 3193 return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
88f5acf8
MG
3194 zone_page_state(z, NR_FREE_PAGES));
3195}
3196
48ee5f36 3197static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
97a225e6 3198 unsigned long mark, int highest_zoneidx,
f80b08fc 3199 unsigned int alloc_flags, gfp_t gfp_mask)
48ee5f36 3200{
f27ce0e1 3201 long free_pages;
d883c6cf 3202
f27ce0e1 3203 free_pages = zone_page_state(z, NR_FREE_PAGES);
48ee5f36
MG
3204
3205 /*
3206 * Fast check for order-0 only. If this fails then the reserves
f27ce0e1 3207 * need to be calculated.
48ee5f36 3208 */
f27ce0e1 3209 if (!order) {
9282012f
JK
3210 long usable_free;
3211 long reserved;
f27ce0e1 3212
9282012f
JK
3213 usable_free = free_pages;
3214 reserved = __zone_watermark_unusable_free(z, 0, alloc_flags);
3215
3216 /* reserved may over estimate high-atomic reserves. */
3217 usable_free -= min(usable_free, reserved);
3218 if (usable_free > mark + z->lowmem_reserve[highest_zoneidx])
f27ce0e1
JK
3219 return true;
3220 }
48ee5f36 3221
f80b08fc
CTR
3222 if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
3223 free_pages))
3224 return true;
2973d822 3225
f80b08fc 3226 /*
2973d822 3227 * Ignore watermark boosting for __GFP_HIGH order-0 allocations
f80b08fc
CTR
3228 * when checking the min watermark. The min watermark is the
3229 * point where boosting is ignored so that kswapd is woken up
3230 * when below the low watermark.
3231 */
2973d822 3232 if (unlikely(!order && (alloc_flags & ALLOC_MIN_RESERVE) && z->watermark_boost
f80b08fc
CTR
3233 && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
3234 mark = z->_watermark[WMARK_MIN];
3235 return __zone_watermark_ok(z, order, mark, highest_zoneidx,
3236 alloc_flags, free_pages);
3237 }
3238
3239 return false;
48ee5f36
MG
3240}
3241
7aeb09f9 3242bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
97a225e6 3243 unsigned long mark, int highest_zoneidx)
88f5acf8
MG
3244{
3245 long free_pages = zone_page_state(z, NR_FREE_PAGES);
3246
3247 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
3248 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
3249
97a225e6 3250 return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
88f5acf8 3251 free_pages);
1da177e4
LT
3252}
3253
9276b1bc 3254#ifdef CONFIG_NUMA
61bb6cd2
GU
3255int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
3256
957f822a
DR
3257static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
3258{
e02dc017 3259 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
a55c7454 3260 node_reclaim_distance;
957f822a 3261}
9276b1bc 3262#else /* CONFIG_NUMA */
957f822a
DR
3263static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
3264{
3265 return true;
3266}
9276b1bc
PJ
3267#endif /* CONFIG_NUMA */
3268
6bb15450
MG
3269/*
3270 * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
3271 * fragmentation is subtle. If the preferred zone was HIGHMEM then
3272 * premature use of a lower zone may cause lowmem pressure problems that
3273 * are worse than fragmentation. If the next zone is ZONE_DMA then it is
3274 * probably too small. It only makes sense to spread allocations to avoid
3275 * fragmentation between the Normal and DMA32 zones.
3276 */
3277static inline unsigned int
0a79cdad 3278alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
6bb15450 3279{
736838e9 3280 unsigned int alloc_flags;
0a79cdad 3281
736838e9
MN
3282 /*
3283 * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
3284 * to save a branch.
3285 */
3286 alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM);
0a79cdad
MG
3287
3288#ifdef CONFIG_ZONE_DMA32
8139ad04
AR
3289 if (!zone)
3290 return alloc_flags;
3291
6bb15450 3292 if (zone_idx(zone) != ZONE_NORMAL)
8118b82e 3293 return alloc_flags;
6bb15450
MG
3294
3295 /*
3296 * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
3297 * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
3298 * on UMA that if Normal is populated then so is DMA32.
3299 */
3300 BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
3301 if (nr_online_nodes > 1 && !populated_zone(--zone))
8118b82e 3302 return alloc_flags;
6bb15450 3303
8118b82e 3304 alloc_flags |= ALLOC_NOFRAGMENT;
0a79cdad
MG
3305#endif /* CONFIG_ZONE_DMA32 */
3306 return alloc_flags;
6bb15450 3307}
6bb15450 3308
8e3560d9
PT
3309/* Must be called after current_gfp_context() which can change gfp_mask */
3310static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask,
3311 unsigned int alloc_flags)
8510e69c
JK
3312{
3313#ifdef CONFIG_CMA
8e3560d9 3314 if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
8510e69c 3315 alloc_flags |= ALLOC_CMA;
8510e69c
JK
3316#endif
3317 return alloc_flags;
3318}
3319
7fb1d9fc 3320/*
0798e519 3321 * get_page_from_freelist goes through the zonelist trying to allocate
7fb1d9fc
RS
3322 * a page.
3323 */
3324static struct page *
a9263751
VB
3325get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
3326 const struct alloc_context *ac)
753ee728 3327{
6bb15450 3328 struct zoneref *z;
5117f45d 3329 struct zone *zone;
8a87d695
WY
3330 struct pglist_data *last_pgdat = NULL;
3331 bool last_pgdat_dirty_ok = false;
6bb15450 3332 bool no_fallback;
3b8c0be4 3333
6bb15450 3334retry:
7fb1d9fc 3335 /*
9276b1bc 3336 * Scan zonelist, looking for a zone with enough free.
8e464522 3337 * See also cpuset_node_allowed() comment in kernel/cgroup/cpuset.c.
7fb1d9fc 3338 */
6bb15450
MG
3339 no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
3340 z = ac->preferred_zoneref;
30d8ec73
MN
3341 for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
3342 ac->nodemask) {
be06af00 3343 struct page *page;
e085dbc5
JW
3344 unsigned long mark;
3345
664eedde
MG
3346 if (cpusets_enabled() &&
3347 (alloc_flags & ALLOC_CPUSET) &&
002f2906 3348 !__cpuset_zone_allowed(zone, gfp_mask))
cd38b115 3349 continue;
a756cf59
JW
3350 /*
3351 * When allocating a page cache page for writing, we
281e3726
MG
3352 * want to get it from a node that is within its dirty
3353 * limit, such that no single node holds more than its
a756cf59 3354 * proportional share of globally allowed dirty pages.
281e3726 3355 * The dirty limits take into account the node's
a756cf59
JW
3356 * lowmem reserves and high watermark so that kswapd
3357 * should be able to balance it without having to
3358 * write pages from its LRU list.
3359 *
a756cf59 3360 * XXX: For now, allow allocations to potentially
281e3726 3361 * exceed the per-node dirty limit in the slowpath
c9ab0c4f 3362 * (spread_dirty_pages unset) before going into reclaim,
a756cf59 3363 * which is important when on a NUMA setup the allowed
281e3726 3364 * nodes are together not big enough to reach the
a756cf59 3365 * global limit. The proper fix for these situations
281e3726 3366 * will require awareness of nodes in the
a756cf59
JW
3367 * dirty-throttling and the flusher threads.
3368 */
3b8c0be4 3369 if (ac->spread_dirty_pages) {
8a87d695
WY
3370 if (last_pgdat != zone->zone_pgdat) {
3371 last_pgdat = zone->zone_pgdat;
3372 last_pgdat_dirty_ok = node_dirty_ok(zone->zone_pgdat);
3373 }
3b8c0be4 3374
8a87d695 3375 if (!last_pgdat_dirty_ok)
3b8c0be4 3376 continue;
3b8c0be4 3377 }
7fb1d9fc 3378
6bb15450
MG
3379 if (no_fallback && nr_online_nodes > 1 &&
3380 zone != ac->preferred_zoneref->zone) {
3381 int local_nid;
3382
3383 /*
3384 * If moving to a remote node, retry but allow
3385 * fragmenting fallbacks. Locality is more important
3386 * than fragmentation avoidance.
3387 */
3388 local_nid = zone_to_nid(ac->preferred_zoneref->zone);
3389 if (zone_to_nid(zone) != local_nid) {
3390 alloc_flags &= ~ALLOC_NOFRAGMENT;
3391 goto retry;
3392 }
3393 }
3394
a9214443 3395 mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
48ee5f36 3396 if (!zone_watermark_fast(zone, order, mark,
f80b08fc
CTR
3397 ac->highest_zoneidx, alloc_flags,
3398 gfp_mask)) {
fa5e084e
MG
3399 int ret;
3400
c9e97a19
PT
3401#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
3402 /*
3403 * Watermark failed for this zone, but see if we can
3404 * grow this zone if it contains deferred pages.
3405 */
076cf7ea 3406 if (deferred_pages_enabled()) {
c9e97a19
PT
3407 if (_deferred_grow_zone(zone, order))
3408 goto try_this_zone;
3409 }
3410#endif
5dab2911
MG
3411 /* Checked here to keep the fast path fast */
3412 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
3413 if (alloc_flags & ALLOC_NO_WATERMARKS)
3414 goto try_this_zone;
3415
202e35db 3416 if (!node_reclaim_enabled() ||
c33d6c06 3417 !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
cd38b115
MG
3418 continue;
3419
a5f5f91d 3420 ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
fa5e084e 3421 switch (ret) {
a5f5f91d 3422 case NODE_RECLAIM_NOSCAN:
fa5e084e 3423 /* did not scan */
cd38b115 3424 continue;
a5f5f91d 3425 case NODE_RECLAIM_FULL:
fa5e084e 3426 /* scanned but unreclaimable */
cd38b115 3427 continue;
fa5e084e
MG
3428 default:
3429 /* did we reclaim enough */
fed2719e 3430 if (zone_watermark_ok(zone, order, mark,
97a225e6 3431 ac->highest_zoneidx, alloc_flags))
fed2719e
MG
3432 goto try_this_zone;
3433
fed2719e 3434 continue;
0798e519 3435 }
7fb1d9fc
RS
3436 }
3437
fa5e084e 3438try_this_zone:
066b2393 3439 page = rmqueue(ac->preferred_zoneref->zone, zone, order,
0aaa29a5 3440 gfp_mask, alloc_flags, ac->migratetype);
75379191 3441 if (page) {
479f854a 3442 prep_new_page(page, order, gfp_mask, alloc_flags);
0aaa29a5
MG
3443
3444 /*
3445 * If this is a high-order atomic allocation then check
3446 * if the pageblock should be reserved for the future
3447 */
eb2e2b42 3448 if (unlikely(alloc_flags & ALLOC_HIGHATOMIC))
0aaa29a5
MG
3449 reserve_highatomic_pageblock(page, zone, order);
3450
75379191 3451 return page;
c9e97a19
PT
3452 } else {
3453#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
3454 /* Try again if zone has deferred pages */
076cf7ea 3455 if (deferred_pages_enabled()) {
c9e97a19
PT
3456 if (_deferred_grow_zone(zone, order))
3457 goto try_this_zone;
3458 }
3459#endif
75379191 3460 }
54a6eb5c 3461 }
9276b1bc 3462
6bb15450
MG
3463 /*
3464 * It's possible on a UMA machine to get through all zones that are
3465 * fragmented. If avoiding fragmentation, reset and try again.
3466 */
3467 if (no_fallback) {
3468 alloc_flags &= ~ALLOC_NOFRAGMENT;
3469 goto retry;
3470 }
3471
4ffeaf35 3472 return NULL;
753ee728
MH
3473}
3474
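
get_page_from_freelist() above first walks the zonelist with ALLOC_NOFRAGMENT (when set) and, if no zone satisfies the stricter pass or locality would be lost, clears the flag and rescans. The userspace sketch below models only that control-flow shape; the zones, numbers and the "needs more headroom" rule standing in for fragmentation avoidance are hypothetical.

/* Userspace sketch of the "scan, then retry without NOFRAGMENT" shape.
 * Everything here is invented; it only models the retry control flow. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_zone { const char *name; long free; };

static bool try_zone(const struct toy_zone *z, long need, bool no_fragment)
{
        /* Pretend the fragmentation-avoiding pass demands extra headroom. */
        long mark = no_fragment ? need * 2 : need;
        return z->free >= mark;
}

static const char *scan_zonelist(const struct toy_zone *zones, int nr, long need)
{
        bool no_fragment = true;
retry:
        for (int i = 0; i < nr; i++)
                if (try_zone(&zones[i], need, no_fragment))
                        return zones[i].name;
        if (no_fragment) {              /* every zone failed: relax and rescan */
                no_fragment = false;
                goto retry;
        }
        return NULL;
}

int main(void)
{
        struct toy_zone zones[] = { { "Normal", 60 }, { "DMA32", 80 } };
        const char *z = scan_zonelist(zones, 2, 50);

        printf("allocated from: %s\n", z ? z : "nowhere");
        return 0;
}
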
9af744d7 3475static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
a238ab5b 3476{
a238ab5b 3477 unsigned int filter = SHOW_MEM_FILTER_NODES;
a238ab5b
DH
3478
3479 /*
3480 * This documents exceptions given to allocations in certain
3481 * contexts that are allowed to allocate outside current's set
3482 * of allowed nodes.
3483 */
3484 if (!(gfp_mask & __GFP_NOMEMALLOC))
cd04ae1e 3485 if (tsk_is_oom_victim(current) ||
a238ab5b
DH
3486 (current->flags & (PF_MEMALLOC | PF_EXITING)))
3487 filter &= ~SHOW_MEM_FILTER_NODES;
88dc6f20 3488 if (!in_task() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
a238ab5b
DH
3489 filter &= ~SHOW_MEM_FILTER_NODES;
3490
974f4367 3491 __show_mem(filter, nodemask, gfp_zone(gfp_mask));
aa187507
MH
3492}
3493
a8e99259 3494void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
aa187507
MH
3495{
3496 struct va_format vaf;
3497 va_list args;
1be334e5 3498 static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1);
aa187507 3499
c4dc63f0
BH
3500 if ((gfp_mask & __GFP_NOWARN) ||
3501 !__ratelimit(&nopage_rs) ||
3502 ((gfp_mask & __GFP_DMA) && !has_managed_dma()))
aa187507
MH
3503 return;
3504
7877cdcc
MH
3505 va_start(args, fmt);
3506 vaf.fmt = fmt;
3507 vaf.va = &args;
ef8444ea 3508 pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
0205f755
MH
3509 current->comm, &vaf, gfp_mask, &gfp_mask,
3510 nodemask_pr_args(nodemask));
7877cdcc 3511 va_end(args);
3ee9a4f0 3512
a8e99259 3513 cpuset_print_current_mems_allowed();
ef8444ea 3514 pr_cont("\n");
a238ab5b 3515 dump_stack();
685dbf6f 3516 warn_alloc_show_mem(gfp_mask, nodemask);
a238ab5b
DH
3517}
3518
6c18ba7a
MH
3519static inline struct page *
3520__alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
3521 unsigned int alloc_flags,
3522 const struct alloc_context *ac)
3523{
3524 struct page *page;
3525
3526 page = get_page_from_freelist(gfp_mask, order,
3527 alloc_flags|ALLOC_CPUSET, ac);
3528 /*
3529 * fallback to ignore cpuset restriction if our nodes
3530 * are depleted
3531 */
3532 if (!page)
3533 page = get_page_from_freelist(gfp_mask, order,
3534 alloc_flags, ac);
3535
3536 return page;
3537}
3538
11e33f6a
MG
3539static inline struct page *
3540__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
a9263751 3541 const struct alloc_context *ac, unsigned long *did_some_progress)
11e33f6a 3542{
6e0fc46d
DR
3543 struct oom_control oc = {
3544 .zonelist = ac->zonelist,
3545 .nodemask = ac->nodemask,
2a966b77 3546 .memcg = NULL,
6e0fc46d
DR
3547 .gfp_mask = gfp_mask,
3548 .order = order,
6e0fc46d 3549 };
11e33f6a
MG
3550 struct page *page;
3551
9879de73
JW
3552 *did_some_progress = 0;
3553
9879de73 3554 /*
dc56401f
JW
3555 * Acquire the oom lock. If that fails, somebody else is
3556 * making progress for us.
9879de73 3557 */
dc56401f 3558 if (!mutex_trylock(&oom_lock)) {
9879de73 3559 *did_some_progress = 1;
11e33f6a 3560 schedule_timeout_uninterruptible(1);
1da177e4
LT
3561 return NULL;
3562 }
6b1de916 3563
11e33f6a
MG
3564 /*
3565 * Go through the zonelist yet one more time, keep very high watermark
3566 * here, this is only to catch a parallel oom killing, we must fail if
e746bf73
TH
3567 * we're still under heavy pressure. But make sure that this reclaim
3568 * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
3569 * allocation which will never fail due to oom_lock already held.
11e33f6a 3570 */
e746bf73
TH
3571 page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
3572 ~__GFP_DIRECT_RECLAIM, order,
3573 ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
7fb1d9fc 3574 if (page)
11e33f6a
MG
3575 goto out;
3576
06ad276a
MH
3577 /* Coredumps can quickly deplete all memory reserves */
3578 if (current->flags & PF_DUMPCORE)
3579 goto out;
3580 /* The OOM killer will not help higher order allocs */
3581 if (order > PAGE_ALLOC_COSTLY_ORDER)
3582 goto out;
dcda9b04
MH
3583 /*
3584 * We have already exhausted all our reclaim opportunities without any
3585 * success so it is time to admit defeat. We will skip the OOM killer
3586 * because it is very likely that the caller has a more reasonable
3587 * fallback than shooting a random task.
cfb4a541
MN
3588 *
3589 * The OOM killer may not free memory on a specific node.
dcda9b04 3590 */
cfb4a541 3591 if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE))
dcda9b04 3592 goto out;
06ad276a 3593 /* The OOM killer does not needlessly kill tasks for lowmem */
97a225e6 3594 if (ac->highest_zoneidx < ZONE_NORMAL)
06ad276a
MH
3595 goto out;
3596 if (pm_suspended_storage())
3597 goto out;
3598 /*
3599 * XXX: GFP_NOFS allocations should rather fail than rely on
3600 * other request to make a forward progress.
3601 * We are in an unfortunate situation where out_of_memory cannot
3602 * do much for this context but let's try it to at least get
3603 * access to memory reserved if the current task is killed (see
3604 * out_of_memory). Once filesystems are ready to handle allocation
3605 * failures more gracefully we should just bail out here.
3606 */
3607
3c2c6488 3608 /* Exhausted what can be done so it's blame time */
3f913fc5
QZ
3609 if (out_of_memory(&oc) ||
3610 WARN_ON_ONCE_GFP(gfp_mask & __GFP_NOFAIL, gfp_mask)) {
c32b3cbe 3611 *did_some_progress = 1;
5020e285 3612
6c18ba7a
MH
3613 /*
3614 * Help non-failing allocations by giving them access to memory
3615 * reserves
3616 */
3617 if (gfp_mask & __GFP_NOFAIL)
3618 page = __alloc_pages_cpuset_fallback(gfp_mask, order,
5020e285 3619 ALLOC_NO_WATERMARKS, ac);
5020e285 3620 }
11e33f6a 3621out:
dc56401f 3622 mutex_unlock(&oom_lock);
11e33f6a
MG
3623 return page;
3624}
3625
33c2d214 3626/*
baf2f90b 3627 * Maximum number of compaction retries with progress before the OOM
33c2d214
MH
3628 * killer is considered the only way to move forward.
3629 */
3630#define MAX_COMPACT_RETRIES 16
3631
56de7263
MG
3632#ifdef CONFIG_COMPACTION
3633/* Try memory compaction for high-order allocations before reclaim */
3634static struct page *
3635__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
c603844b 3636 unsigned int alloc_flags, const struct alloc_context *ac,
a5508cd8 3637 enum compact_priority prio, enum compact_result *compact_result)
56de7263 3638{
5e1f0f09 3639 struct page *page = NULL;
eb414681 3640 unsigned long pflags;
499118e9 3641 unsigned int noreclaim_flag;
53853e2d
VB
3642
3643 if (!order)
66199712 3644 return NULL;
66199712 3645
eb414681 3646 psi_memstall_enter(&pflags);
5bf18281 3647 delayacct_compact_start();
499118e9 3648 noreclaim_flag = memalloc_noreclaim_save();
eb414681 3649
c5d01d0d 3650 *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
5e1f0f09 3651 prio, &page);
eb414681 3652
499118e9 3653 memalloc_noreclaim_restore(noreclaim_flag);
eb414681 3654 psi_memstall_leave(&pflags);
5bf18281 3655 delayacct_compact_end();
56de7263 3656
06dac2f4
CTR
3657 if (*compact_result == COMPACT_SKIPPED)
3658 return NULL;
98dd3b48
VB
3659 /*
3660 * At least in one zone compaction wasn't deferred or skipped, so let's
3661 * count a compaction stall
3662 */
3663 count_vm_event(COMPACTSTALL);
8fb74b9f 3664
5e1f0f09
MG
3665 /* Prep a captured page if available */
3666 if (page)
3667 prep_new_page(page, order, gfp_mask, alloc_flags);
3668
3669 /* Try get a page from the freelist if available */
3670 if (!page)
3671 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
53853e2d 3672
98dd3b48
VB
3673 if (page) {
3674 struct zone *zone = page_zone(page);
53853e2d 3675
98dd3b48
VB
3676 zone->compact_blockskip_flush = false;
3677 compaction_defer_reset(zone, order, true);
3678 count_vm_event(COMPACTSUCCESS);
3679 return page;
3680 }
56de7263 3681
98dd3b48
VB
3682 /*
3683 * It's bad if compaction run occurs and fails. The most likely reason
3684 * is that pages exist, but not enough to satisfy watermarks.
3685 */
3686 count_vm_event(COMPACTFAIL);
66199712 3687
98dd3b48 3688 cond_resched();
56de7263
MG
3689
3690 return NULL;
3691}
33c2d214 3692
3250845d
VB
3693static inline bool
3694should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
3695 enum compact_result compact_result,
3696 enum compact_priority *compact_priority,
d9436498 3697 int *compaction_retries)
3250845d
VB
3698{
3699 int max_retries = MAX_COMPACT_RETRIES;
c2033b00 3700 int min_priority;
65190cff
MH
3701 bool ret = false;
3702 int retries = *compaction_retries;
3703 enum compact_priority priority = *compact_priority;
3250845d
VB
3704
3705 if (!order)
3706 return false;
3707
691d9497
AT
3708 if (fatal_signal_pending(current))
3709 return false;
3710
d9436498
VB
3711 if (compaction_made_progress(compact_result))
3712 (*compaction_retries)++;
3713
3250845d
VB
3714 /*
3715 * compaction considers all the zones as desperately out of memory
3716 * so it doesn't really make much sense to retry except when the
3717 * failure could be caused by insufficient priority
3718 */
d9436498
VB
3719 if (compaction_failed(compact_result))
3720 goto check_priority;
3250845d 3721
49433085
VB
3722 /*
3723 * compaction was skipped because there are not enough order-0 pages
3724 * to work with, so we retry only if it looks like reclaim can help.
3725 */
3726 if (compaction_needs_reclaim(compact_result)) {
3727 ret = compaction_zonelist_suitable(ac, order, alloc_flags);
3728 goto out;
3729 }
3730
3250845d
VB
3731 /*
3732 * make sure the compaction wasn't deferred or didn't bail out early
3733 * due to lock contention before we declare that we should give up.
49433085
VB
3734 * But the next retry should use a higher priority if allowed, so
3735 * we don't just keep bailing out endlessly.
3250845d 3736 */
65190cff 3737 if (compaction_withdrawn(compact_result)) {
49433085 3738 goto check_priority;
65190cff 3739 }
3250845d
VB
3740
3741 /*
dcda9b04 3742 * !costly requests are much more important than __GFP_RETRY_MAYFAIL
3250845d
VB
3743 * costly ones because they are de facto nofail and invoke OOM
3744 * killer to move on while costly can fail and users are ready
3745 * to cope with that. 1/4 retries is rather arbitrary but we
3746 * would need much more detailed feedback from compaction to
3747 * make a better decision.
3748 */
3749 if (order > PAGE_ALLOC_COSTLY_ORDER)
3750 max_retries /= 4;
65190cff
MH
3751 if (*compaction_retries <= max_retries) {
3752 ret = true;
3753 goto out;
3754 }
3250845d 3755
d9436498
VB
3756 /*
3757 * Make sure there are attempts at the highest priority if we exhausted
3758 * all retries or failed at the lower priorities.
3759 */
3760check_priority:
c2033b00
VB
3761 min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
3762 MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
65190cff 3763
c2033b00 3764 if (*compact_priority > min_priority) {
d9436498
VB
3765 (*compact_priority)--;
3766 *compaction_retries = 0;
65190cff 3767 ret = true;
d9436498 3768 }
65190cff
MH
3769out:
3770 trace_compact_retry(order, priority, compact_result, retries, max_retries, ret);
3771 return ret;
3250845d 3772}
56de7263
MG
3773#else
3774static inline struct page *
3775__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
c603844b 3776 unsigned int alloc_flags, const struct alloc_context *ac,
a5508cd8 3777 enum compact_priority prio, enum compact_result *compact_result)
56de7263 3778{
33c2d214 3779 *compact_result = COMPACT_SKIPPED;
56de7263
MG
3780 return NULL;
3781}
33c2d214
MH
3782
3783static inline bool
86a294a8
MH
3784should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
3785 enum compact_result compact_result,
a5508cd8 3786 enum compact_priority *compact_priority,
d9436498 3787 int *compaction_retries)
33c2d214 3788{
31e49bfd
MH
3789 struct zone *zone;
3790 struct zoneref *z;
3791
3792 if (!order || order > PAGE_ALLOC_COSTLY_ORDER)
3793 return false;
3794
3795 /*
3796 * There are setups with compaction disabled which would prefer to loop
3797 * inside the allocator rather than hit the oom killer prematurely.
3798 * Let's give them a good hope and keep retrying while the order-0
3799 * watermarks are OK.
3800 */
97a225e6
JK
3801 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
3802 ac->highest_zoneidx, ac->nodemask) {
31e49bfd 3803 if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
97a225e6 3804 ac->highest_zoneidx, alloc_flags))
31e49bfd
MH
3805 return true;
3806 }
33c2d214
MH
3807 return false;
3808}
3250845d 3809#endif /* CONFIG_COMPACTION */
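
should_compact_retry() above keeps a retry budget of MAX_COMPACT_RETRIES, cuts it to a quarter for costly orders, and escalates the compaction priority once the budget is spent. The toy model below reproduces only that budgeting; the constants and the single keep_retrying() helper are invented for the example.

/* Userspace model of the compaction retry budget; not kernel code. */
#include <stdbool.h>
#include <stdio.h>

#define MAX_RETRIES     16
#define COSTLY_ORDER    3       /* stands in for PAGE_ALLOC_COSTLY_ORDER */

static bool keep_retrying(int order, int *retries, int *priority, int min_priority)
{
        int budget = MAX_RETRIES;

        if (order > COSTLY_ORDER)
                budget /= 4;            /* costly orders get a quarter of the budget */

        if (++(*retries) <= budget)
                return true;

        if (*priority > min_priority) { /* budget spent: try a harder priority */
                (*priority)--;
                *retries = 0;
                return true;
        }
        return false;                   /* no budget and no harder priority left */
}

int main(void)
{
        int retries = 0, priority = 2, attempts = 0;

        while (keep_retrying(4 /* costly */, &retries, &priority, 0))
                attempts++;
        printf("gave up after %d attempts\n", attempts);  /* prints 14 here */
        return 0;
}
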
56de7263 3810
d92a8cfc 3811#ifdef CONFIG_LOCKDEP
93781325 3812static struct lockdep_map __fs_reclaim_map =
d92a8cfc
PZ
3813 STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
3814
f920e413 3815static bool __need_reclaim(gfp_t gfp_mask)
d92a8cfc 3816{
d92a8cfc
PZ
3817 /* no reclaim without waiting on it */
3818 if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
3819 return false;
3820
3821 /* this guy won't enter reclaim */
2e517d68 3822 if (current->flags & PF_MEMALLOC)
d92a8cfc
PZ
3823 return false;
3824
d92a8cfc
PZ
3825 if (gfp_mask & __GFP_NOLOCKDEP)
3826 return false;
3827
3828 return true;
3829}
3830
4f3eaf45 3831void __fs_reclaim_acquire(unsigned long ip)
93781325 3832{
4f3eaf45 3833 lock_acquire_exclusive(&__fs_reclaim_map, 0, 0, NULL, ip);
93781325
OS
3834}
3835
4f3eaf45 3836void __fs_reclaim_release(unsigned long ip)
93781325 3837{
4f3eaf45 3838 lock_release(&__fs_reclaim_map, ip);
93781325
OS
3839}
3840
d92a8cfc
PZ
3841void fs_reclaim_acquire(gfp_t gfp_mask)
3842{
f920e413
DV
3843 gfp_mask = current_gfp_context(gfp_mask);
3844
3845 if (__need_reclaim(gfp_mask)) {
3846 if (gfp_mask & __GFP_FS)
4f3eaf45 3847 __fs_reclaim_acquire(_RET_IP_);
f920e413
DV
3848
3849#ifdef CONFIG_MMU_NOTIFIER
3850 lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
3851 lock_map_release(&__mmu_notifier_invalidate_range_start_map);
3852#endif
3853
3854 }
d92a8cfc
PZ
3855}
3856EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
3857
3858void fs_reclaim_release(gfp_t gfp_mask)
3859{
f920e413
DV
3860 gfp_mask = current_gfp_context(gfp_mask);
3861
3862 if (__need_reclaim(gfp_mask)) {
3863 if (gfp_mask & __GFP_FS)
4f3eaf45 3864 __fs_reclaim_release(_RET_IP_);
f920e413 3865 }
d92a8cfc
PZ
3866}
3867EXPORT_SYMBOL_GPL(fs_reclaim_release);
3868#endif
3869
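
Beyond the allocator itself, the fs_reclaim map above is sometimes "primed" by callers so lockdep learns, once and for all, which locks their reclaim paths may take. The sketch below shows that pattern in hedged form: toy_lock and prime_reclaim_dependency() are hypothetical, and this is modeled on a driver-side idiom rather than on code in this file.

/* Illustrative lockdep priming using the fs_reclaim map; a sketch of a
 * pattern some drivers use, not code from this file. toy_lock stands in
 * for a lock that a (hypothetical) shrinker would take during reclaim. */
#include <linux/gfp.h>
#include <linux/mutex.h>
#include <linux/sched/mm.h>

static DEFINE_MUTEX(toy_lock);

static void prime_reclaim_dependency(void)
{
        /* Record "reclaim may take toy_lock" once up front, so lockdep can
         * later flag GFP_KERNEL allocations made while holding toy_lock. */
        fs_reclaim_acquire(GFP_KERNEL);
        mutex_lock(&toy_lock);
        mutex_unlock(&toy_lock);
        fs_reclaim_release(GFP_KERNEL);
}
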
3d36424b
MG
3870/*
3871 * Zonelists may change due to hotplug during allocation. Detect when zonelists
3872 * have been rebuilt so the allocation can be retried. Reader side does not lock and
3873 * retries the allocation if zonelist changes. Writer side is protected by the
3874 * embedded spin_lock.
3875 */
3876static DEFINE_SEQLOCK(zonelist_update_seq);
3877
3878static unsigned int zonelist_iter_begin(void)
3879{
3880 if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
3881 return read_seqbegin(&zonelist_update_seq);
3882
3883 return 0;
3884}
3885
3886static unsigned int check_retry_zonelist(unsigned int seq)
3887{
3888 if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
3889 return read_seqretry(&zonelist_update_seq, seq);
3890
3891 return seq;
3892}
3893
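
zonelist_iter_begin()/check_retry_zonelist() above are a thin wrapper over a seqlock read side: sample the sequence before the lockless walk and redo the slow path if a concurrent zonelist rebuild bumped it. The userspace sketch below models that reader pattern with plain atomics; it is not the kernel seqlock API and omits the memory barriers a real implementation needs.

/* Userspace sketch of the "sample, work, recheck" read pattern. Hypothetical
 * code: zonelist_seq and the helpers are invented for the example. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint zonelist_seq;        /* bumped by an even amount per rebuild */

static unsigned int iter_begin(void)
{
        unsigned int seq;

        /* spin while a writer is mid-update (odd sequence) */
        while ((seq = atomic_load(&zonelist_seq)) & 1)
                ;
        return seq;
}

static int iter_retry(unsigned int seq)
{
        return atomic_load(&zonelist_seq) != seq;  /* changed: redo the walk */
}

int main(void)
{
        unsigned int seq = iter_begin();

        /* ... lockless walk of the snapshot would happen here ... */
        atomic_fetch_add(&zonelist_seq, 2);        /* simulate a concurrent rebuild */

        printf(iter_retry(seq) ? "zonelist changed, restart\n"
                               : "walk was consistent\n");
        return 0;
}
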
bba90710 3894/* Perform direct synchronous page reclaim */
2187e17b 3895static unsigned long
a9263751
VB
3896__perform_reclaim(gfp_t gfp_mask, unsigned int order,
3897 const struct alloc_context *ac)
11e33f6a 3898{
499118e9 3899 unsigned int noreclaim_flag;
fa7fc75f 3900 unsigned long progress;
11e33f6a
MG
3901
3902 cond_resched();
3903
3904 /* We now go into synchronous reclaim */
3905 cpuset_memory_pressure_bump();
d92a8cfc 3906 fs_reclaim_acquire(gfp_mask);
93781325 3907 noreclaim_flag = memalloc_noreclaim_save();
11e33f6a 3908
a9263751
VB
3909 progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
3910 ac->nodemask);
11e33f6a 3911
499118e9 3912 memalloc_noreclaim_restore(noreclaim_flag);
93781325 3913 fs_reclaim_release(gfp_mask);
11e33f6a
MG
3914
3915 cond_resched();
3916
bba90710
MS
3917 return progress;
3918}
3919
3920/* The really slow allocator path where we enter direct reclaim */
3921static inline struct page *
3922__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
c603844b 3923 unsigned int alloc_flags, const struct alloc_context *ac,
a9263751 3924 unsigned long *did_some_progress)
bba90710
MS
3925{
3926 struct page *page = NULL;
fa7fc75f 3927 unsigned long pflags;
bba90710
MS
3928 bool drained = false;
3929
fa7fc75f 3930 psi_memstall_enter(&pflags);
a9263751 3931 *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
9ee493ce 3932 if (unlikely(!(*did_some_progress)))
fa7fc75f 3933 goto out;
11e33f6a 3934
9ee493ce 3935retry:
31a6c190 3936 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
9ee493ce
MG
3937
3938 /*
3939 * If an allocation failed after direct reclaim, it could be because
0aaa29a5 3940 * pages are pinned on the per-cpu lists or in high alloc reserves.
047b9967 3941 * Shrink them and try again
9ee493ce
MG
3942 */
3943 if (!page && !drained) {
29fac03b 3944 unreserve_highatomic_pageblock(ac, false);
93481ff0 3945 drain_all_pages(NULL);
9ee493ce
MG
3946 drained = true;
3947 goto retry;
3948 }
fa7fc75f
SB
3949out:
3950 psi_memstall_leave(&pflags);
9ee493ce 3951
11e33f6a
MG
3952 return page;
3953}
3954
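
The drained flag above implements a retry-exactly-once-after-flushing pattern: if reclaim made progress but the freelist attempt still fails, drain the per-CPU lists and the highatomic reserve a single time and try the same allocation again. The sketch below only models that shape; alloc_attempt(), drain_caches() and the page counts are hypothetical.

/* Userspace sketch of the "retry once after draining" control flow. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

static int cached_pages = 3;    /* pretend pages are stuck on per-CPU lists */
static int free_pages;

static void *alloc_attempt(void)
{
        /* return a non-NULL token when a free page is available */
        return free_pages > 0 ? (free_pages--, (void *)0x1) : NULL;
}

static void drain_caches(void)
{
        free_pages += cached_pages;     /* flush cached pages back to the freelist */
        cached_pages = 0;
}

static void *alloc_after_reclaim(void)
{
        bool drained = false;
        void *page;
retry:
        page = alloc_attempt();
        if (!page && !drained) {        /* maybe the pages are merely cached */
                drain_caches();
                drained = true;
                goto retry;
        }
        return page;
}

int main(void)
{
        printf("got page: %s\n", alloc_after_reclaim() ? "yes" : "no");
        return 0;
}
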
5ecd9d40
DR
3955static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
3956 const struct alloc_context *ac)
3a025760
JW
3957{
3958 struct zoneref *z;
3959 struct zone *zone;
e1a55637 3960 pg_data_t *last_pgdat = NULL;
97a225e6 3961 enum zone_type highest_zoneidx = ac->highest_zoneidx;
3a025760 3962
97a225e6 3963 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
5ecd9d40 3964 ac->nodemask) {
bc53008e
WY
3965 if (!managed_zone(zone))
3966 continue;
d137a7cb 3967 if (last_pgdat != zone->zone_pgdat) {
97a225e6 3968 wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
d137a7cb
CW
3969 last_pgdat = zone->zone_pgdat;
3970 }
e1a55637 3971 }
3a025760
JW
3972}
3973
c603844b 3974static inline unsigned int
eb2e2b42 3975gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
341ce06f 3976{
c603844b 3977 unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
1da177e4 3978
736838e9 3979 /*
524c4807 3980 * __GFP_HIGH is assumed to be the same as ALLOC_MIN_RESERVE
736838e9
MN
3981 * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
3982 * to save two branches.
3983 */
524c4807 3984 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_MIN_RESERVE);
736838e9 3985 BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD);
933e312e 3986
341ce06f
PZ
3987 /*
3988 * The caller may dip into page reserves a bit more if the caller
3989 * cannot run direct reclaim, or if the caller has realtime scheduling
3990 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1ebbb218 3991 * set both ALLOC_NON_BLOCK and ALLOC_MIN_RESERVE(__GFP_HIGH).
341ce06f 3992 */
736838e9
MN
3993 alloc_flags |= (__force int)
3994 (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
1da177e4 3995
1ebbb218 3996 if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
5c3240d9 3997 /*
b104a35d
DR
3998 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
3999 * if it can't schedule.
5c3240d9 4000 */
eb2e2b42 4001 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
1ebbb218 4002 alloc_flags |= ALLOC_NON_BLOCK;
eb2e2b42
MG
4003
4004 if (order > 0)
4005 alloc_flags |= ALLOC_HIGHATOMIC;
4006 }
4007
523b9458 4008 /*
1ebbb218
MG
4009 * Ignore cpuset mems for non-blocking __GFP_HIGH (probably
4010 * GFP_ATOMIC) rather than fail, see the comment for
8e464522 4011 * cpuset_node_allowed().
523b9458 4012 */
1ebbb218
MG
4013 if (alloc_flags & ALLOC_MIN_RESERVE)
4014 alloc_flags &= ~ALLOC_CPUSET;
88dc6f20 4015 } else if (unlikely(rt_task(current)) && in_task())
c988dcbe 4016 alloc_flags |= ALLOC_MIN_RESERVE;
341ce06f 4017
8e3560d9 4018 alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);
8510e69c 4019
341ce06f
PZ
4020 return alloc_flags;
4021}
4022
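
The BUILD_BUG_ON()s above are what let gfp_to_alloc_flags() copy __GFP_HIGH and __GFP_KSWAPD_RECLAIM straight into alloc_flags with one mask instead of two test-and-set branches: it only works because the bit values coincide. The userspace illustration below uses invented bit values to show the same trick.

/* Userspace illustration of the "reuse the gfp bit value" branch-saving
 * trick; all flag values below are invented for the example. */
#include <assert.h>
#include <stdio.h>

#define GFP_HIGH                0x20u   /* hypothetical gfp bit */
#define GFP_KSWAPD_RECLAIM      0x400u  /* hypothetical gfp bit */

#define ALLOC_MIN_RESERVE       0x20u   /* must equal GFP_HIGH */
#define ALLOC_KSWAPD            0x400u  /* must equal GFP_KSWAPD_RECLAIM */

static unsigned int gfp_to_flags(unsigned int gfp)
{
        /* The kernel checks this at compile time (BUILD_BUG_ON); assert here. */
        assert(GFP_HIGH == ALLOC_MIN_RESERVE);
        assert(GFP_KSWAPD_RECLAIM == ALLOC_KSWAPD);

        /* One mask-and-or instead of two test-and-set branches. */
        return gfp & (GFP_HIGH | GFP_KSWAPD_RECLAIM);
}

int main(void)
{
        unsigned int flags = gfp_to_flags(GFP_HIGH | GFP_KSWAPD_RECLAIM);

        printf("min_reserve=%d kswapd=%d\n",
               !!(flags & ALLOC_MIN_RESERVE), !!(flags & ALLOC_KSWAPD));
        return 0;
}
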
cd04ae1e 4023static bool oom_reserves_allowed(struct task_struct *tsk)
072bb0aa 4024{
cd04ae1e
MH
4025 if (!tsk_is_oom_victim(tsk))
4026 return false;
4027
4028 /*
4029 * !MMU doesn't have oom reaper so give access to memory reserves
4030 * only to the thread with TIF_MEMDIE set
4031 */
4032 if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
31a6c190
VB
4033 return false;
4034
cd04ae1e
MH
4035 return true;
4036}
4037
4038/*
4039 * Distinguish requests which really need access to full memory
4040 * reserves from oom victims which can live with a portion of it
4041 */
4042static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
4043{
4044 if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
4045 return 0;
31a6c190 4046 if (gfp_mask & __GFP_MEMALLOC)
cd04ae1e 4047 return ALLOC_NO_WATERMARKS;
31a6c190 4048 if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
cd04ae1e
MH
4049 return ALLOC_NO_WATERMARKS;
4050 if (!in_interrupt()) {
4051 if (current->flags & PF_MEMALLOC)
4052 return ALLOC_NO_WATERMARKS;
4053 else if (oom_reserves_allowed(current))
4054 return ALLOC_OOM;
4055 }
31a6c190 4056
cd04ae1e
MH
4057 return 0;
4058}
4059
4060bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
4061{
4062 return !!__gfp_pfmemalloc_flags(gfp_mask);
072bb0aa
MG
4063}
4064
0a0337e0
MH
4065/*
4066 * Checks whether it makes sense to retry the reclaim to make a forward progress
4067 * for the given allocation request.
491d79ae
JW
4068 *
4069 * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
4070 * without success, or when we couldn't even meet the watermark if we
4071 * reclaimed all remaining pages on the LRU lists.
0a0337e0
MH
4072 *
4073 * Returns true if a retry is viable or false to enter the oom path.
4074 */
4075static inline bool
4076should_reclaim_retry(gfp_t gfp_mask, unsigned order,
4077 struct alloc_context *ac, int alloc_flags,
423b452e 4078 bool did_some_progress, int *no_progress_loops)
0a0337e0
MH
4079{
4080 struct zone *zone;
4081 struct zoneref *z;
15f570bf 4082 bool ret = false;
0a0337e0 4083
423b452e
VB
4084 /*
4085 * Costly allocations might have made progress but this doesn't mean
4086 * their order will become available due to high fragmentation so
4087 * always increment the no progress counter for them
4088 */
4089 if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
4090 *no_progress_loops = 0;
4091 else
4092 (*no_progress_loops)++;
4093
0a0337e0
MH
4094 /*
4095 * Make sure we converge to OOM if we cannot make any progress
4096 * several times in the row.
4097 */
04c8716f
MK
4098 if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
4099 /* Before OOM, exhaust highatomic_reserve */
29fac03b 4100 return unreserve_highatomic_pageblock(ac, true);
04c8716f 4101 }
0a0337e0 4102
bca67592
MG
4103 /*
4104 * Keep reclaiming pages while there is a chance this will lead
4105 * somewhere. If none of the target zones can satisfy our allocation
4106 * request even if all reclaimable pages are considered then we are
4107 * screwed and have to go OOM.
0a0337e0 4108 */
97a225e6
JK
4109 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
4110 ac->highest_zoneidx, ac->nodemask) {
0a0337e0 4111 unsigned long available;
ede37713 4112 unsigned long reclaimable;
d379f01d
MH
4113 unsigned long min_wmark = min_wmark_pages(zone);
4114 bool wmark;
0a0337e0 4115
5a1c84b4 4116 available = reclaimable = zone_reclaimable_pages(zone);
5a1c84b4 4117 available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
0a0337e0
MH
4118
4119 /*
491d79ae
JW
4120 * Would the allocation succeed if we reclaimed all
4121 * reclaimable pages?
0a0337e0 4122 */
d379f01d 4123 wmark = __zone_watermark_ok(zone, order, min_wmark,
97a225e6 4124 ac->highest_zoneidx, alloc_flags, available);
d379f01d
MH
4125 trace_reclaim_retry_zone(z, order, reclaimable,
4126 available, min_wmark, *no_progress_loops, wmark);
4127 if (wmark) {
15f570bf 4128 ret = true;
132b0d21 4129 break;
0a0337e0
MH
4130 }
4131 }
4132
15f570bf
MH
4133 /*
4134 * Memory allocation/reclaim might be called from a WQ context and the
4135 * current implementation of the WQ concurrency control doesn't
4136 * recognize that a particular WQ is congested if the worker thread is
4137 * looping without ever sleeping. Therefore we have to do a short sleep
4138 * here rather than calling cond_resched().
4139 */
4140 if (current->flags & PF_WQ_WORKER)
4141 schedule_timeout_uninterruptible(1);
4142 else
4143 cond_resched();
4144 return ret;
0a0337e0
MH
4145}
4146
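
The per-zone test above boils down to: if every reclaimable page were freed, would the min watermark check pass? Only then is another reclaim round worth it. The userspace model below captures that question with hypothetical per-zone numbers and folds the (1 << order) - 1 discount into a single subtraction.

/* Userspace model of the "would full reclaim be enough?" test; not kernel code. */
#include <stdbool.h>
#include <stdio.h>

struct toy_zone { long free, reclaimable, min_wmark, lowmem_reserve; };

static bool retry_is_useful(const struct toy_zone *zones, int nr, long need)
{
        for (int i = 0; i < nr; i++) {
                const struct toy_zone *z = &zones[i];
                /* best case: every reclaimable page becomes free */
                long available = z->free + z->reclaimable - (need - 1);

                if (available > z->min_wmark + z->lowmem_reserve)
                        return true;    /* reclaim could still get us there */
        }
        return false;                   /* hopeless: time to go OOM */
}

int main(void)
{
        struct toy_zone zones[] = {
                { .free = 10, .reclaimable = 5,   .min_wmark = 100, .lowmem_reserve = 0 },
                { .free = 40, .reclaimable = 200, .min_wmark = 100, .lowmem_reserve = 32 },
        };

        printf("keep reclaiming: %s\n",
               retry_is_useful(zones, 2, 1L << 0) ? "yes" : "no");
        return 0;
}
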
902b6281
VB
4147static inline bool
4148check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
4149{
4150 /*
4151 * It's possible that cpuset's mems_allowed and the nodemask from
4152 * mempolicy don't intersect. This should be normally dealt with by
4153 * policy_nodemask(), but it's possible to race with cpuset update in
4154 * such a way the check therein was true, and then it became false
4155 * before we got our cpuset_mems_cookie here.
4156 * This assumes that for all allocations, ac->nodemask can come only
4157 * from MPOL_BIND mempolicy (whose documented semantics is to be ignored
4158 * when it does not intersect with the cpuset restrictions) or the
4159 * caller can deal with a violated nodemask.
4160 */
4161 if (cpusets_enabled() && ac->nodemask &&
4162 !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) {
4163 ac->nodemask = NULL;
4164 return true;
4165 }
4166
4167 /*
4168 * When updating a task's mems_allowed or mempolicy nodemask, it is
4169 * possible to race with parallel threads in such a way that our
4170 * allocation can fail while the mask is being updated. If we are about
4171 * to fail, check if the cpuset changed during allocation and if so,
4172 * retry.
4173 */
4174 if (read_mems_allowed_retry(cpuset_mems_cookie))
4175 return true;
4176
4177 return false;
4178}
4179
11e33f6a
MG
4180static inline struct page *
4181__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
a9263751 4182 struct alloc_context *ac)
11e33f6a 4183{
d0164adc 4184 bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
282722b0 4185 const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
11e33f6a 4186 struct page *page = NULL;
c603844b 4187 unsigned int alloc_flags;
11e33f6a 4188 unsigned long did_some_progress;
5ce9bfef 4189 enum compact_priority compact_priority;
c5d01d0d 4190 enum compact_result compact_result;
5ce9bfef
VB
4191 int compaction_retries;
4192 int no_progress_loops;
5ce9bfef 4193 unsigned int cpuset_mems_cookie;
3d36424b 4194 unsigned int zonelist_iter_cookie;
cd04ae1e 4195 int reserve_flags;
1da177e4 4196
3d36424b 4197restart:
5ce9bfef
VB
4198 compaction_retries = 0;
4199 no_progress_loops = 0;
4200 compact_priority = DEF_COMPACT_PRIORITY;
4201 cpuset_mems_cookie = read_mems_allowed_begin();
3d36424b 4202 zonelist_iter_cookie = zonelist_iter_begin();
9a67f648
MH
4203
4204 /*
4205 * The fast path uses conservative alloc_flags to succeed only until
4206 * kswapd needs to be woken up, and to avoid the cost of setting up
4207 * alloc_flags precisely. So we do that now.
4208 */
eb2e2b42 4209 alloc_flags = gfp_to_alloc_flags(gfp_mask, order);
9a67f648 4210
e47483bc
VB
4211 /*
4212 * We need to recalculate the starting point for the zonelist iterator
4213 * because we might have used different nodemask in the fast path, or
4214 * there was a cpuset modification and we are retrying - otherwise we
4215 * could end up iterating over non-eligible zones endlessly.
4216 */
4217 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
97a225e6 4218 ac->highest_zoneidx, ac->nodemask);
e47483bc
VB
4219 if (!ac->preferred_zoneref->zone)
4220 goto nopage;
4221
8ca1b5a4
FT
4222 /*
4223 * Check for insane configurations where the cpuset doesn't contain
4224 * any suitable zone to satisfy the request - e.g. non-movable
4225 * GFP_HIGHUSER allocations from MOVABLE nodes only.
4226 */
4227 if (cpusets_insane_config() && (gfp_mask & __GFP_HARDWALL)) {
4228 struct zoneref *z = first_zones_zonelist(ac->zonelist,
4229 ac->highest_zoneidx,
4230 &cpuset_current_mems_allowed);
4231 if (!z->zone)
4232 goto nopage;
4233 }
4234
0a79cdad 4235 if (alloc_flags & ALLOC_KSWAPD)
5ecd9d40 4236 wake_all_kswapds(order, gfp_mask, ac);
23771235
VB
4237
4238 /*
4239 * The adjusted alloc_flags might result in immediate success, so try
4240 * that first
4241 */
4242 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
4243 if (page)
4244 goto got_pg;
4245
a8161d1e
VB
4246 /*
4247 * For costly allocations, try direct compaction first, as it's likely
282722b0
VB
4248 * that we have enough base pages and don't need to reclaim. For non-
4249 * movable high-order allocations, do that as well, as compaction will
4250 * try to prevent permanent fragmentation by migrating from blocks of the
4251 * same migratetype.
4252 * Don't try this for allocations that are allowed to ignore
4253 * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
a8161d1e 4254 */
282722b0
VB
4255 if (can_direct_reclaim &&
4256 (costly_order ||
4257 (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
4258 && !gfp_pfmemalloc_allowed(gfp_mask)) {
a8161d1e
VB
4259 page = __alloc_pages_direct_compact(gfp_mask, order,
4260 alloc_flags, ac,
a5508cd8 4261 INIT_COMPACT_PRIORITY,
a8161d1e
VB
4262 &compact_result);
4263 if (page)
4264 goto got_pg;
4265
cc638f32
VB
4266 /*
4267 * Checks for costly allocations with __GFP_NORETRY, which
4268 * includes some THP page fault allocations
4269 */
4270 if (costly_order && (gfp_mask & __GFP_NORETRY)) {
b39d0ee2
DR
4271 /*
4272 * If allocating entire pageblock(s) and compaction
4273 * failed because all zones are below low watermarks
4274 * or was prohibited because it recently failed at this
3f36d866
DR
4275 * order, fail immediately unless the allocator has
4276 * requested compaction and reclaim retry.
b39d0ee2
DR
4277 *
4278 * Reclaim is
4279 * - potentially very expensive because zones are far
4280 * below their low watermarks or this is part of very
4281 * bursty high order allocations,
4282 * - not guaranteed to help because isolate_freepages()
4283 * may not iterate over freed pages as part of its
4284 * linear scan, and
4285 * - unlikely to make entire pageblocks free on its
4286 * own.
4287 */
4288 if (compact_result == COMPACT_SKIPPED ||
4289 compact_result == COMPACT_DEFERRED)
4290 goto nopage;
a8161d1e 4291
a8161d1e 4292 /*
3eb2771b
VB
4293 * Looks like reclaim/compaction is worth trying, but
4294 * sync compaction could be very expensive, so keep
25160354 4295 * using async compaction.
a8161d1e 4296 */
a5508cd8 4297 compact_priority = INIT_COMPACT_PRIORITY;
a8161d1e
VB
4298 }
4299 }
23771235 4300
31a6c190 4301retry:
23771235 4302 /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
0a79cdad 4303 if (alloc_flags & ALLOC_KSWAPD)
5ecd9d40 4304 wake_all_kswapds(order, gfp_mask, ac);
31a6c190 4305
cd04ae1e
MH
4306 reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
4307 if (reserve_flags)
ce96fa62
ML
4308 alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags) |
4309 (alloc_flags & ALLOC_KSWAPD);
23771235 4310
e46e7b77 4311 /*
d6a24df0
VB
4312 * Reset the nodemask and zonelist iterators if memory policies can be
4313 * ignored. These allocations are high priority and system rather than
4314 * user oriented.
e46e7b77 4315 */
cd04ae1e 4316 if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
d6a24df0 4317 ac->nodemask = NULL;
e46e7b77 4318 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
97a225e6 4319 ac->highest_zoneidx, ac->nodemask);
e46e7b77
MG
4320 }
4321
23771235 4322 /* Attempt with potentially adjusted zonelist and alloc_flags */
31a6c190 4323 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
7fb1d9fc
RS
4324 if (page)
4325 goto got_pg;
1da177e4 4326
d0164adc 4327 /* Caller is not willing to reclaim, we can't balance anything */
9a67f648 4328 if (!can_direct_reclaim)
1da177e4
LT
4329 goto nopage;
4330
9a67f648
MH
4331 /* Avoid recursion of direct reclaim */
4332 if (current->flags & PF_MEMALLOC)
6583bb64
DR
4333 goto nopage;
4334
a8161d1e
VB
4335 /* Try direct reclaim and then allocating */
4336 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
4337 &did_some_progress);
4338 if (page)
4339 goto got_pg;
4340
4341 /* Try direct compaction and then allocating */
a9263751 4342 page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
a5508cd8 4343 compact_priority, &compact_result);
56de7263
MG
4344 if (page)
4345 goto got_pg;
75f30861 4346
9083905a
JW
4347 /* Do not loop if specifically requested */
4348 if (gfp_mask & __GFP_NORETRY)
a8161d1e 4349 goto nopage;
9083905a 4350
0a0337e0
MH
4351 /*
4352 * Do not retry costly high order allocations unless they are
dcda9b04 4353 * __GFP_RETRY_MAYFAIL
0a0337e0 4354 */
dcda9b04 4355 if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
a8161d1e 4356 goto nopage;
0a0337e0 4357
0a0337e0 4358 if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
423b452e 4359 did_some_progress > 0, &no_progress_loops))
0a0337e0
MH
4360 goto retry;
4361
33c2d214
MH
4362 /*
4363 * It doesn't make any sense to retry for the compaction if the order-0
4364 * reclaim is not able to make any progress because the current
4365 * implementation of the compaction depends on the sufficient amount
4366 * of free memory (see __compaction_suitable)
4367 */
4368 if (did_some_progress > 0 &&
86a294a8 4369 should_compact_retry(ac, order, alloc_flags,
a5508cd8 4370 compact_result, &compact_priority,
d9436498 4371 &compaction_retries))
33c2d214
MH
4372 goto retry;
4373
902b6281 4374
3d36424b
MG
4375 /*
4376 * Deal with possible cpuset update races or zonelist updates to avoid
4377 * an unnecessary OOM kill.
4378 */
4379 if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
4380 check_retry_zonelist(zonelist_iter_cookie))
4381 goto restart;
e47483bc 4382
9083905a
JW
4383 /* Reclaim has failed us, start killing things */
4384 page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
4385 if (page)
4386 goto got_pg;
4387
9a67f648 4388 /* Avoid allocations with no watermarks from looping endlessly */
cd04ae1e 4389 if (tsk_is_oom_victim(current) &&
8510e69c 4390 (alloc_flags & ALLOC_OOM ||
c288983d 4391 (gfp_mask & __GFP_NOMEMALLOC)))
9a67f648
MH
4392 goto nopage;
4393
9083905a 4394 /* Retry as long as the OOM killer is making progress */
0a0337e0
MH
4395 if (did_some_progress) {
4396 no_progress_loops = 0;
9083905a 4397 goto retry;
0a0337e0 4398 }
9083905a 4399
1da177e4 4400nopage:
3d36424b
MG
4401 /*
4402 * Deal with possible cpuset update races or zonelist updates to avoid
4403 * an unnecessary OOM kill.
4404 */
4405 if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
4406 check_retry_zonelist(zonelist_iter_cookie))
4407 goto restart;
5ce9bfef 4408
9a67f648
MH
4409 /*
4410 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
4411 * we always retry
4412 */
4413 if (gfp_mask & __GFP_NOFAIL) {
4414 /*
4415 * All existing users of __GFP_NOFAIL are blockable, so warn
4416 * of any new users that actually require GFP_NOWAIT
4417 */
3f913fc5 4418 if (WARN_ON_ONCE_GFP(!can_direct_reclaim, gfp_mask))
9a67f648
MH
4419 goto fail;
4420
4421 /*
4422 * PF_MEMALLOC request from this context is rather bizarre
4423 * because we cannot reclaim anything and can only loop waiting
4424 * for somebody to do the work for us
4425 */
3f913fc5 4426 WARN_ON_ONCE_GFP(current->flags & PF_MEMALLOC, gfp_mask);
9a67f648
MH
4427
4428 /*
4429 * Non-failing costly orders are a hard requirement which we
4430 * are not well prepared for, so let's warn about these users
4431 * so that we can identify them and convert them to something
4432 * else.
4433 */
896c4d52 4434 WARN_ON_ONCE_GFP(costly_order, gfp_mask);
9a67f648 4435
6c18ba7a 4436 /*
1ebbb218
MG
4437 * Help non-failing allocations by giving some access to memory
4438 * reserves normally used for high priority non-blocking
4439 * allocations but do not use ALLOC_NO_WATERMARKS because this
6c18ba7a 4440 * could deplete whole memory reserves which would just make
1ebbb218 4441 * the situation worse.
6c18ba7a 4442 */
1ebbb218 4443 page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_MIN_RESERVE, ac);
6c18ba7a
MH
4444 if (page)
4445 goto got_pg;
4446
9a67f648
MH
4447 cond_resched();
4448 goto retry;
4449 }
4450fail:
a8e99259 4451 warn_alloc(gfp_mask, ac->nodemask,
7877cdcc 4452 "page allocation failure: order:%u", order);
1da177e4 4453got_pg:
072bb0aa 4454 return page;
1da177e4 4455}
11e33f6a 4456
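
Stripped of the policy details, the slow path above is a loop: retry the freelist, direct-reclaim, try compaction, ask the retry heuristics whether to go around again, and only then fall back to the OOM killer or give up. The sketch below is a heavily simplified userspace rendering of that loop; every helper is a stub returning canned results, not the kernel logic.

/* Heavily simplified sketch of the slow-path control flow; every helper
 * below is a hypothetical stub, not the kernel implementation. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

static int budget = 3;                        /* stands in for the retry heuristics */

static void *try_freelist(void)       { return NULL; }
static void *try_direct_reclaim(void) { return budget == 1 ? (void *)0x1 : NULL; }
static void *try_compaction(void)     { return NULL; }
static bool  should_retry(void)       { return --budget > 0; }
static void *try_oom_kill(void)       { return NULL; }

static void *slowpath(void)
{
        void *page;
retry:
        if ((page = try_freelist()))        return page;
        if ((page = try_direct_reclaim()))  return page;
        if ((page = try_compaction()))      return page;
        if (should_retry())
                goto retry;                 /* reclaim/compaction may still help */
        if ((page = try_oom_kill()))        return page;
        return NULL;                        /* finally admit defeat */
}

int main(void)
{
        printf("slowpath result: %p\n", slowpath());
        return 0;
}
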
9cd75558 4457static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
04ec6264 4458 int preferred_nid, nodemask_t *nodemask,
8e6a930b 4459 struct alloc_context *ac, gfp_t *alloc_gfp,
9cd75558 4460 unsigned int *alloc_flags)
11e33f6a 4461{
97a225e6 4462 ac->highest_zoneidx = gfp_zone(gfp_mask);
04ec6264 4463 ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
9cd75558 4464 ac->nodemask = nodemask;
01c0bfe0 4465 ac->migratetype = gfp_migratetype(gfp_mask);
11e33f6a 4466
682a3385 4467 if (cpusets_enabled()) {
8e6a930b 4468 *alloc_gfp |= __GFP_HARDWALL;
182f3d7a
MS
4469 /*
4470 * When we are in interrupt context, the allocation is unrelated
4471 * to the current task's context, which means that any node is ok.
4472 */
88dc6f20 4473 if (in_task() && !ac->nodemask)
9cd75558 4474 ac->nodemask = &cpuset_current_mems_allowed;
51047820
VB
4475 else
4476 *alloc_flags |= ALLOC_CPUSET;
682a3385
MG
4477 }
4478
446ec838 4479 might_alloc(gfp_mask);
11e33f6a
MG
4480
4481 if (should_fail_alloc_page(gfp_mask, order))
9cd75558 4482 return false;
11e33f6a 4483
8e3560d9 4484 *alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags);
d883c6cf 4485
c9ab0c4f 4486 /* Dirty zone balancing only done in the fast path */
9cd75558 4487 ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
c9ab0c4f 4488
e46e7b77
MG
4489 /*
4490 * The preferred zone is used for statistics but crucially it is
4491 * also used as the starting point for the zonelist iterator. It
4492 * may get reset for allocations that ignore memory policies.
4493 */
9cd75558 4494 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
97a225e6 4495 ac->highest_zoneidx, ac->nodemask);
a0622d05
MN
4496
4497 return true;
9cd75558
MG
4498}
4499
387ba26f 4500/*
0f87d9d3 4501 * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array
387ba26f
MG
4502 * @gfp: GFP flags for the allocation
4503 * @preferred_nid: The preferred NUMA node ID to allocate from
4504 * @nodemask: Set of nodes to allocate from, may be NULL
0f87d9d3
MG
4505 * @nr_pages: The number of pages desired on the list or array
4506 * @page_list: Optional list to store the allocated pages
4507 * @page_array: Optional array to store the pages
387ba26f
MG
4508 *
4509 * This is a batched version of the page allocator that attempts to
0f87d9d3
MG
4510 * allocate nr_pages quickly. Pages are added to page_list if page_list
4511 * is not NULL, otherwise it is assumed that the page_array is valid.
387ba26f 4512 *
0f87d9d3
MG
4513 * For lists, nr_pages is the number of pages that should be allocated.
4514 *
4515 * For arrays, only NULL elements are populated with pages and nr_pages
4516 * is the maximum number of pages that will be stored in the array.
4517 *
4518 * Returns the number of pages on the list or array.
387ba26f
MG
4519 */
4520unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
4521 nodemask_t *nodemask, int nr_pages,
0f87d9d3
MG
4522 struct list_head *page_list,
4523 struct page **page_array)
387ba26f
MG
4524{
4525 struct page *page;
4b23a68f 4526 unsigned long __maybe_unused UP_flags;
387ba26f
MG
4527 struct zone *zone;
4528 struct zoneref *z;
4529 struct per_cpu_pages *pcp;
4530 struct list_head *pcp_list;
4531 struct alloc_context ac;
4532 gfp_t alloc_gfp;
4533 unsigned int alloc_flags = ALLOC_WMARK_LOW;
3e23060b 4534 int nr_populated = 0, nr_account = 0;
387ba26f 4535
0f87d9d3
MG
4536 /*
4537 * Skip populated array elements to determine if any pages need
4538 * to be allocated before disabling IRQs.
4539 */
b08e50dd 4540 while (page_array && nr_populated < nr_pages && page_array[nr_populated])
0f87d9d3
MG
4541 nr_populated++;
4542
06147843
CL
4543 /* No pages requested? */
4544 if (unlikely(nr_pages <= 0))
4545 goto out;
4546
b3b64ebd
MG
4547 /* Already populated array? */
4548 if (unlikely(page_array && nr_pages - nr_populated == 0))
06147843 4549 goto out;
b3b64ebd 4550
8dcb3060 4551 /* Bulk allocator does not support memcg accounting. */
f7a449f7 4552 if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT))
8dcb3060
SB
4553 goto failed;
4554
387ba26f 4555 /* Use the single page allocator for one page. */
0f87d9d3 4556 if (nr_pages - nr_populated == 1)
387ba26f
MG
4557 goto failed;
4558
187ad460
MG
4559#ifdef CONFIG_PAGE_OWNER
4560 /*
4561 * PAGE_OWNER may recurse into the allocator to allocate space to
4562 * save the stack with pagesets.lock held. Releasing/reacquiring
4563 * removes much of the performance benefit of bulk allocation so
4564 * force the caller to allocate one page at a time as it'll have
4565 * similar performance to added complexity to the bulk allocator.
4566 */
4567 if (static_branch_unlikely(&page_owner_inited))
4568 goto failed;
4569#endif
4570
387ba26f
MG
4571 /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */
4572 gfp &= gfp_allowed_mask;
4573 alloc_gfp = gfp;
4574 if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags))
06147843 4575 goto out;
387ba26f
MG
4576 gfp = alloc_gfp;
4577
4578 /* Find an allowed local zone that meets the low watermark. */
4579 for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) {
4580 unsigned long mark;
4581
4582 if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) &&
4583 !__cpuset_zone_allowed(zone, gfp)) {
4584 continue;
4585 }
4586
4587 if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone &&
4588 zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) {
4589 goto failed;
4590 }
4591
4592 mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
4593 if (zone_watermark_fast(zone, 0, mark,
4594 zonelist_zone_idx(ac.preferred_zoneref),
4595 alloc_flags, gfp)) {
4596 break;
4597 }
4598 }
4599
4600 /*
4601 * If there are no allowed local zones that meets the watermarks then
4602 * try to allocate a single page and reclaim if necessary.
4603 */
ce76f9a1 4604 if (unlikely(!zone))
387ba26f
MG
4605 goto failed;
4606
57490774 4607 /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
4b23a68f 4608 pcp_trylock_prepare(UP_flags);
57490774 4609 pcp = pcp_spin_trylock(zone->per_cpu_pageset);
01b44456 4610 if (!pcp)
4b23a68f 4611 goto failed_irq;
387ba26f 4612
387ba26f 4613 /* Attempt the batch allocation */
44042b44 4614 pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
0f87d9d3
MG
4615 while (nr_populated < nr_pages) {
4616
4617 /* Skip existing pages */
4618 if (page_array && page_array[nr_populated]) {
4619 nr_populated++;
4620 continue;
4621 }
4622
44042b44 4623 page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags,
387ba26f 4624 pcp, pcp_list);
ce76f9a1 4625 if (unlikely(!page)) {
c572e488 4626 /* Try and allocate at least one page */
4b23a68f 4627 if (!nr_account) {
57490774 4628 pcp_spin_unlock(pcp);
387ba26f 4629 goto failed_irq;
4b23a68f 4630 }
387ba26f
MG
4631 break;
4632 }
3e23060b 4633 nr_account++;
387ba26f
MG
4634
4635 prep_new_page(page, 0, gfp, 0);
0f87d9d3
MG
4636 if (page_list)
4637 list_add(&page->lru, page_list);
4638 else
4639 page_array[nr_populated] = page;
4640 nr_populated++;
387ba26f
MG
4641 }
4642
57490774 4643 pcp_spin_unlock(pcp);
4b23a68f 4644 pcp_trylock_finish(UP_flags);
43c95bcc 4645
3e23060b
MG
4646 __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
4647 zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
387ba26f 4648
06147843 4649out:
0f87d9d3 4650 return nr_populated;
387ba26f
MG
4651
4652failed_irq:
4b23a68f 4653 pcp_trylock_finish(UP_flags);
387ba26f
MG
4654
4655failed:
4656 page = __alloc_pages(gfp, 0, preferred_nid, nodemask);
4657 if (page) {
0f87d9d3
MG
4658 if (page_list)
4659 list_add(&page->lru, page_list);
4660 else
4661 page_array[nr_populated] = page;
4662 nr_populated++;
387ba26f
MG
4663 }
4664
06147843 4665 goto out;
387ba26f
MG
4666}
4667EXPORT_SYMBOL_GPL(__alloc_pages_bulk);
4668
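
A hedged usage sketch of the array form documented above: only NULL slots are filled and the return value counts all populated slots, so callers loop until the array is full (or no progress is made). fill_page_array() is illustrative, not an in-tree caller, and assumes a sleepable GFP_KERNEL context.

/* Illustrative (not in-tree) caller of the bulk API above; a sketch only,
 * with error handling kept to a minimum. */
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/topology.h>

static struct page **fill_page_array(unsigned int nr)
{
        struct page **pages = kcalloc(nr, sizeof(*pages), GFP_KERNEL);
        unsigned long filled = 0, prev;

        if (!pages)
                return NULL;

        /* Only NULL slots are populated; the return value counts all
         * populated slots, so keep going until the array is full. */
        do {
                prev = filled;
                filled = __alloc_pages_bulk(GFP_KERNEL, numa_mem_id(), NULL,
                                            nr, NULL, pages);
        } while (filled < nr && filled != prev);   /* stop if no progress */

        return pages;
}
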
9cd75558
MG
4669/*
4670 * This is the 'heart' of the zoned buddy allocator.
4671 */
84172f4b 4672struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
04ec6264 4673 nodemask_t *nodemask)
9cd75558
MG
4674{
4675 struct page *page;
4676 unsigned int alloc_flags = ALLOC_WMARK_LOW;
8e6a930b 4677 gfp_t alloc_gfp; /* The gfp_t that was actually used for allocation */
9cd75558
MG
4678 struct alloc_context ac = { };
4679
c63ae43b
MH
4680 /*
4681 * There are several places where we assume that the order value is sane
4682 * so bail out early if the request is out of bound.
4683 */
23baf831 4684 if (WARN_ON_ONCE_GFP(order > MAX_ORDER, gfp))
c63ae43b 4685 return NULL;
c63ae43b 4686
6e5e0f28 4687 gfp &= gfp_allowed_mask;
da6df1b0
PT
4688 /*
4689 * Apply scoped allocation constraints. This is mainly about GFP_NOFS
4690 * resp. GFP_NOIO which has to be inherited for all allocation requests
4691 * from a particular context which has been marked by
8e3560d9
PT
4692 * memalloc_no{fs,io}_{save,restore}. And PF_MEMALLOC_PIN which ensures
4693 * movable zones are not used during allocation.
da6df1b0
PT
4694 */
4695 gfp = current_gfp_context(gfp);
6e5e0f28
MWO
4696 alloc_gfp = gfp;
4697 if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac,
8e6a930b 4698 &alloc_gfp, &alloc_flags))
9cd75558
MG
4699 return NULL;
4700
6bb15450
MG
4701 /*
4702 * Forbid the first pass from falling back to types that fragment
4703 * memory until all local zones are considered.
4704 */
6e5e0f28 4705 alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp);
6bb15450 4706
5117f45d 4707 /* First allocation attempt */
8e6a930b 4708 page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
4fcb0971
MG
4709 if (likely(page))
4710 goto out;
11e33f6a 4711
da6df1b0 4712 alloc_gfp = gfp;
4fcb0971 4713 ac.spread_dirty_pages = false;
23f086f9 4714
4741526b
MG
4715 /*
4716 * Restore the original nodemask if it was potentially replaced with
4717 * &cpuset_current_mems_allowed to optimize the fast-path attempt.
4718 */
97ce86f9 4719 ac.nodemask = nodemask;
16096c25 4720
8e6a930b 4721 page = __alloc_pages_slowpath(alloc_gfp, order, &ac);
cc9a6c87 4722
4fcb0971 4723out:
f7a449f7 4724 if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page &&
6e5e0f28 4725 unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) {
c4159a75
VD
4726 __free_pages(page, order);
4727 page = NULL;
4949148a
VD
4728 }
4729
8e6a930b 4730 trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
b073d7f8 4731 kmsan_alloc_page(page, order, alloc_gfp);
4fcb0971 4732
11e33f6a 4733 return page;
1da177e4 4734}
84172f4b 4735EXPORT_SYMBOL(__alloc_pages);
1da177e4 4736
cc09cb13
MWO
4737struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid,
4738 nodemask_t *nodemask)
4739{
4740 struct page *page = __alloc_pages(gfp | __GFP_COMP, order,
4741 preferred_nid, nodemask);
4742
4743 if (page && order > 1)
4744 prep_transhuge_page(page);
4745 return (struct folio *)page;
4746}
4747EXPORT_SYMBOL(__folio_alloc);
4748
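/*
 * Illustrative sketch (editorial addition, not part of the kernel source):
 * allocating and freeing a folio via the folio_alloc() wrapper, which funnels
 * into __folio_alloc() above. The demo function name and order are
 * assumptions for this example.
 */
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/string.h>

static int demo_folio_alloc(void)
{
	/* An order-2 folio: four contiguous pages managed as one unit. */
	struct folio *folio = folio_alloc(GFP_KERNEL, 2);

	if (!folio)
		return -ENOMEM;

	memset(folio_address(folio), 0, folio_size(folio));

	/* Drop the single reference taken at allocation; this frees the folio. */
	folio_put(folio);
	return 0;
}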
1da177e4 4749/*
9ea9a680
MH
4750 * Common helper functions. Never use with __GFP_HIGHMEM because the returned
4751 * address cannot represent highmem pages. Use alloc_pages and then kmap if
4752 * you need to access high mem.
1da177e4 4753 */
920c7a5d 4754unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
1da177e4 4755{
945a1113
AM
4756 struct page *page;
4757
9ea9a680 4758 page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order);
1da177e4
LT
4759 if (!page)
4760 return 0;
4761 return (unsigned long) page_address(page);
4762}
1da177e4
LT
4763EXPORT_SYMBOL(__get_free_pages);
4764
920c7a5d 4765unsigned long get_zeroed_page(gfp_t gfp_mask)
1da177e4 4766{
dcc1be11 4767 return __get_free_page(gfp_mask | __GFP_ZERO);
1da177e4 4768}
1da177e4
LT
4769EXPORT_SYMBOL(get_zeroed_page);
4770
7f194fbb
MWO
4771/**
4772 * __free_pages - Free pages allocated with alloc_pages().
4773 * @page: The page pointer returned from alloc_pages().
4774 * @order: The order of the allocation.
4775 *
4776 * This function can free multi-page allocations that are not compound
4777 * pages. It does not check that the @order passed in matches that of
4778 * the allocation, so it is easy to leak memory. Freeing more memory
4779 * than was allocated will probably emit a warning.
4780 *
4781 * If the last reference to this page is speculative, it will be released
4782 * by put_page() which only frees the first page of a non-compound
4783 * allocation. To prevent the remaining pages from being leaked, we free
4784 * the subsequent pages here. If you want to use the page's reference
4785 * count to decide when to free the allocation, you should allocate a
4786 * compound page, and use put_page() instead of __free_pages().
4787 *
4788 * Context: May be called in interrupt context or while holding a normal
4789 * spinlock, but not in NMI context or while holding a raw spinlock.
4790 */
742aa7fb
AL
4791void __free_pages(struct page *page, unsigned int order)
4792{
462a8e08
DC
4793 /* get PageHead before we drop reference */
4794 int head = PageHead(page);
4795
742aa7fb
AL
4796 if (put_page_testzero(page))
4797 free_the_page(page, order);
462a8e08 4798 else if (!head)
e320d301
MWO
4799 while (order-- > 0)
4800 free_the_page(page + (1 << order), order);
742aa7fb 4801}
1da177e4
LT
4802EXPORT_SYMBOL(__free_pages);
4803
920c7a5d 4804void free_pages(unsigned long addr, unsigned int order)
1da177e4
LT
4805{
4806 if (addr != 0) {
725d704e 4807 VM_BUG_ON(!virt_addr_valid((void *)addr));
1da177e4
LT
4808 __free_pages(virt_to_page((void *)addr), order);
4809 }
4810}
4811
4812EXPORT_SYMBOL(free_pages);
4813
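/*
 * Illustrative sketch (editorial addition, not part of the kernel source):
 * typical pairing of the helpers above. __get_free_pages()/get_zeroed_page()
 * hand back a kernel virtual address (never combine them with __GFP_HIGHMEM)
 * and are released with free_pages()/free_page(). The demo function name is
 * an assumption.
 */
#include <linux/gfp.h>

static int demo_page_helpers(void)
{
	unsigned long buf = __get_free_pages(GFP_KERNEL, 1);	/* two contiguous pages */
	unsigned long zeroed = get_zeroed_page(GFP_KERNEL);	/* one zero-filled page */

	if (!buf || !zeroed) {
		if (buf)
			free_pages(buf, 1);
		if (zeroed)
			free_page(zeroed);
		return -ENOMEM;
	}

	/* ... use (void *)buf and (void *)zeroed ... */

	free_pages(buf, 1);
	free_page(zeroed);
	return 0;
}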
b63ae8ca
AD
4814/*
4815 * Page Fragment:
4816 * An arbitrary-length arbitrary-offset area of memory which resides
4817 * within a 0 or higher order page. Multiple fragments within that page
4818 * are individually refcounted, in the page's reference counter.
4819 *
4820 * The page_frag functions below provide a simple allocation framework for
4821 * page fragments. This is used by the network stack and network device
4822 * drivers to provide a backing region of memory for use either as an
4823 * sk_buff->head or in the "frags" portion of skb_shared_info.
4824 */
2976db80
AD
4825static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
4826 gfp_t gfp_mask)
b63ae8ca
AD
4827{
4828 struct page *page = NULL;
4829 gfp_t gfp = gfp_mask;
4830
4831#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
4832 gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
4833 __GFP_NOMEMALLOC;
4834 page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
4835 PAGE_FRAG_CACHE_MAX_ORDER);
4836 nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
4837#endif
4838 if (unlikely(!page))
4839 page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
4840
4841 nc->va = page ? page_address(page) : NULL;
4842
4843 return page;
4844}
4845
2976db80 4846void __page_frag_cache_drain(struct page *page, unsigned int count)
44fdffd7
AD
4847{
4848 VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
4849
742aa7fb
AL
4850 if (page_ref_sub_and_test(page, count))
4851 free_the_page(page, compound_order(page));
44fdffd7 4852}
2976db80 4853EXPORT_SYMBOL(__page_frag_cache_drain);
44fdffd7 4854
b358e212
KH
4855void *page_frag_alloc_align(struct page_frag_cache *nc,
4856 unsigned int fragsz, gfp_t gfp_mask,
4857 unsigned int align_mask)
b63ae8ca
AD
4858{
4859 unsigned int size = PAGE_SIZE;
4860 struct page *page;
4861 int offset;
4862
4863 if (unlikely(!nc->va)) {
4864refill:
2976db80 4865 page = __page_frag_cache_refill(nc, gfp_mask);
b63ae8ca
AD
4866 if (!page)
4867 return NULL;
4868
4869#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
4870 /* if size can vary use size else just use PAGE_SIZE */
4871 size = nc->size;
4872#endif
4873 /* Even if we own the page, we do not use atomic_set().
4874 * This would break get_page_unless_zero() users.
4875 */
86447726 4876 page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
b63ae8ca
AD
4877
4878 /* reset page count bias and offset to start of new frag */
2f064f34 4879 nc->pfmemalloc = page_is_pfmemalloc(page);
86447726 4880 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
b63ae8ca
AD
4881 nc->offset = size;
4882 }
4883
4884 offset = nc->offset - fragsz;
4885 if (unlikely(offset < 0)) {
4886 page = virt_to_page(nc->va);
4887
fe896d18 4888 if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
b63ae8ca
AD
4889 goto refill;
4890
d8c19014
DZ
4891 if (unlikely(nc->pfmemalloc)) {
4892 free_the_page(page, compound_order(page));
4893 goto refill;
4894 }
4895
b63ae8ca
AD
4896#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
4897 /* if size can vary use size else just use PAGE_SIZE */
4898 size = nc->size;
4899#endif
4900 /* OK, page count is 0, we can safely set it */
86447726 4901 set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
b63ae8ca
AD
4902
4903 /* reset page count bias and offset to start of new frag */
86447726 4904 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
b63ae8ca 4905 offset = size - fragsz;
dac22531
ML
4906 if (unlikely(offset < 0)) {
4907 /*
4908			 * The caller is trying to allocate a fragment
4909			 * with fragsz > PAGE_SIZE but the cache isn't big
4910			 * enough to satisfy the request; this may
4911			 * happen under low-memory conditions.
4912			 * We don't release the cache page because
4913			 * that could make memory pressure worse,
4914			 * so we simply return NULL here.
4915 */
4916 return NULL;
4917 }
b63ae8ca
AD
4918 }
4919
4920 nc->pagecnt_bias--;
b358e212 4921 offset &= align_mask;
b63ae8ca
AD
4922 nc->offset = offset;
4923
4924 return nc->va + offset;
4925}
b358e212 4926EXPORT_SYMBOL(page_frag_alloc_align);
b63ae8ca
AD
4927
4928/*
4929 * Frees a page fragment allocated out of either a compound or order 0 page.
4930 */
8c2dd3e4 4931void page_frag_free(void *addr)
b63ae8ca
AD
4932{
4933 struct page *page = virt_to_head_page(addr);
4934
742aa7fb
AL
4935 if (unlikely(put_page_testzero(page)))
4936 free_the_page(page, compound_order(page));
b63ae8ca 4937}
8c2dd3e4 4938EXPORT_SYMBOL(page_frag_free);
b63ae8ca 4939
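/*
 * Illustrative sketch (editorial addition, not part of the kernel source):
 * using the page_frag API above. page_frag_alloc() is the <linux/gfp.h>
 * wrapper around page_frag_alloc_align() with no alignment constraint; the
 * cache must start out zeroed (nc->va == NULL). The names below are
 * assumptions for this example.
 */
#include <linux/gfp.h>
#include <linux/mm_types.h>

static struct page_frag_cache demo_frag_cache;	/* zeroed by static storage */

static void *demo_frag_get(unsigned int len)
{
	/* Carve 'len' bytes out of the cache's current page, refilling as needed. */
	return page_frag_alloc(&demo_frag_cache, len, GFP_ATOMIC);
}

static void demo_frag_put(void *frag)
{
	/* Drop one fragment reference; the backing page is freed once all are gone. */
	page_frag_free(frag);
}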
d00181b9
KS
4940static void *make_alloc_exact(unsigned long addr, unsigned int order,
4941 size_t size)
ee85c2e1
AK
4942{
4943 if (addr) {
df48a5f7
LH
4944 unsigned long nr = DIV_ROUND_UP(size, PAGE_SIZE);
4945 struct page *page = virt_to_page((void *)addr);
4946 struct page *last = page + nr;
4947
4948 split_page_owner(page, 1 << order);
4949 split_page_memcg(page, 1 << order);
4950 while (page < --last)
4951 set_page_refcounted(last);
4952
4953 last = page + (1UL << order);
4954 for (page += nr; page < last; page++)
4955 __free_pages_ok(page, 0, FPI_TO_TAIL);
ee85c2e1
AK
4956 }
4957 return (void *)addr;
4958}
4959
2be0ffe2
TT
4960/**
4961 * alloc_pages_exact - allocate an exact number physically-contiguous pages.
4962 * @size: the number of bytes to allocate
63931eb9 4963 * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
2be0ffe2
TT
4964 *
4965 * This function is similar to alloc_pages(), except that it allocates the
4966 * minimum number of pages to satisfy the request. alloc_pages() can only
4967 * allocate memory in power-of-two pages.
4968 *
4969 * This function is also limited by MAX_ORDER.
4970 *
4971 * Memory allocated by this function must be released by free_pages_exact().
a862f68a
MR
4972 *
4973 * Return: pointer to the allocated area or %NULL in case of error.
2be0ffe2
TT
4974 */
4975void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
4976{
4977 unsigned int order = get_order(size);
4978 unsigned long addr;
4979
ba7f1b9e
ML
4980 if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
4981 gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
63931eb9 4982
2be0ffe2 4983 addr = __get_free_pages(gfp_mask, order);
ee85c2e1 4984 return make_alloc_exact(addr, order, size);
2be0ffe2
TT
4985}
4986EXPORT_SYMBOL(alloc_pages_exact);
4987
ee85c2e1
AK
4988/**
4989 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
4990 * pages on a node.
b5e6ab58 4991 * @nid: the preferred node ID where memory should be allocated
ee85c2e1 4992 * @size: the number of bytes to allocate
63931eb9 4993 * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
ee85c2e1
AK
4994 *
4995 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
4996 * back.
a862f68a
MR
4997 *
4998 * Return: pointer to the allocated area or %NULL in case of error.
ee85c2e1 4999 */
e1931811 5000void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
ee85c2e1 5001{
d00181b9 5002 unsigned int order = get_order(size);
63931eb9
VB
5003 struct page *p;
5004
ba7f1b9e
ML
5005 if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
5006 gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
63931eb9
VB
5007
5008 p = alloc_pages_node(nid, gfp_mask, order);
ee85c2e1
AK
5009 if (!p)
5010 return NULL;
5011 return make_alloc_exact((unsigned long)page_address(p), order, size);
5012}
ee85c2e1 5013
2be0ffe2
TT
5014/**
5015 * free_pages_exact - release memory allocated via alloc_pages_exact()
5016 * @virt: the value returned by alloc_pages_exact.
5017 * @size: size of allocation, same value as passed to alloc_pages_exact().
5018 *
5019 * Release the memory allocated by a previous call to alloc_pages_exact.
5020 */
5021void free_pages_exact(void *virt, size_t size)
5022{
5023 unsigned long addr = (unsigned long)virt;
5024 unsigned long end = addr + PAGE_ALIGN(size);
5025
5026 while (addr < end) {
5027 free_page(addr);
5028 addr += PAGE_SIZE;
5029 }
5030}
5031EXPORT_SYMBOL(free_pages_exact);
5032
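/*
 * Illustrative sketch (editorial addition, not part of the kernel source):
 * alloc_pages_exact() rounds the request up to whole pages but, unlike
 * alloc_pages(), returns the unused tail of the power-of-two block to the
 * buddy allocator. The size and function names are assumptions for this
 * example.
 */
#include <linux/gfp.h>

static void *demo_exact_alloc(void)
{
	/* Five pages are kept; the other three pages of the order-3 block are freed. */
	return alloc_pages_exact(5 * PAGE_SIZE, GFP_KERNEL);
}

static void demo_exact_free(void *buf)
{
	free_pages_exact(buf, 5 * PAGE_SIZE);
}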
e0fb5815
ZY
5033/**
5034 * nr_free_zone_pages - count number of pages beyond high watermark
5035 * @offset: The zone index of the highest zone
5036 *
a862f68a 5037 * nr_free_zone_pages() counts the number of pages which are beyond the
e0fb5815
ZY
5038 * high watermark within all zones at or below a given zone index. For each
5039 * zone, the number of pages is calculated as:
0e056eb5
MCC
5040 *
5041 * nr_free_zone_pages = managed_pages - high_pages
a862f68a
MR
5042 *
5043 * Return: number of pages beyond high watermark.
e0fb5815 5044 */
ebec3862 5045static unsigned long nr_free_zone_pages(int offset)
1da177e4 5046{
dd1a239f 5047 struct zoneref *z;
54a6eb5c
MG
5048 struct zone *zone;
5049
e310fd43 5050 /* Just pick one node, since fallback list is circular */
ebec3862 5051 unsigned long sum = 0;
1da177e4 5052
0e88460d 5053 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
1da177e4 5054
54a6eb5c 5055 for_each_zone_zonelist(zone, z, zonelist, offset) {
9705bea5 5056 unsigned long size = zone_managed_pages(zone);
41858966 5057 unsigned long high = high_wmark_pages(zone);
e310fd43
MB
5058 if (size > high)
5059 sum += size - high;
1da177e4
LT
5060 }
5061
5062 return sum;
5063}
5064
e0fb5815
ZY
5065/**
5066 * nr_free_buffer_pages - count number of pages beyond high watermark
5067 *
5068 * nr_free_buffer_pages() counts the number of pages which are beyond the high
5069 * watermark within ZONE_DMA and ZONE_NORMAL.
a862f68a
MR
5070 *
5071 * Return: number of pages beyond high watermark within ZONE_DMA and
5072 * ZONE_NORMAL.
1da177e4 5073 */
ebec3862 5074unsigned long nr_free_buffer_pages(void)
1da177e4 5075{
af4ca457 5076 return nr_free_zone_pages(gfp_zone(GFP_USER));
1da177e4 5077}
c2f1a551 5078EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
1da177e4 5079
19770b32
MG
5080static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
5081{
5082 zoneref->zone = zone;
5083 zoneref->zone_idx = zone_idx(zone);
5084}
5085
1da177e4
LT
5086/*
5087 * Builds allocation fallback zone lists.
1a93205b
CL
5088 *
5089 * Add all populated zones of a node to the zonelist.
1da177e4 5090 */
9d3be21b 5091static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
1da177e4 5092{
1a93205b 5093 struct zone *zone;
bc732f1d 5094 enum zone_type zone_type = MAX_NR_ZONES;
9d3be21b 5095 int nr_zones = 0;
02a68a5e
CL
5096
5097 do {
2f6726e5 5098 zone_type--;
070f8032 5099 zone = pgdat->node_zones + zone_type;
e553f62f 5100 if (populated_zone(zone)) {
9d3be21b 5101 zoneref_set_zone(zone, &zonerefs[nr_zones++]);
070f8032 5102 check_highest_zone(zone_type);
1da177e4 5103 }
2f6726e5 5104 } while (zone_type);
bc732f1d 5105
070f8032 5106 return nr_zones;
1da177e4
LT
5107}
5108
5109#ifdef CONFIG_NUMA
f0c0b2b8
KH
5110
5111static int __parse_numa_zonelist_order(char *s)
5112{
c9bff3ee 5113 /*
f0953a1b 5114	 * We used to support different zonelist modes, but they turned
c9bff3ee
MH
5115	 * out not to be useful. Let's keep the warning in place
5116	 * in case somebody still uses the command-line parameter, so that we do
5117	 * not fail silently.
5118 */
5119 if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
5120 pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s);
f0c0b2b8
KH
5121 return -EINVAL;
5122 }
5123 return 0;
5124}
5125
c9bff3ee
MH
5126char numa_zonelist_order[] = "Node";
5127
f0c0b2b8
KH
5128/*
5129 * sysctl handler for numa_zonelist_order
5130 */
cccad5b9 5131int numa_zonelist_order_handler(struct ctl_table *table, int write,
32927393 5132 void *buffer, size_t *length, loff_t *ppos)
f0c0b2b8 5133{
32927393
CH
5134 if (write)
5135 return __parse_numa_zonelist_order(buffer);
5136 return proc_dostring(table, write, buffer, length, ppos);
f0c0b2b8
KH
5137}
5138
5139
f0c0b2b8
KH
5140static int node_load[MAX_NUMNODES];
5141
1da177e4 5142/**
4dc3b16b 5143 * find_next_best_node - find the next node that should appear in a given node's fallback list
1da177e4
LT
5144 * @node: node whose fallback list we're appending
5145 * @used_node_mask: nodemask_t of already used nodes
5146 *
5147 * We use a number of factors to determine which is the next node that should
5148 * appear on a given node's fallback list. The node should not have appeared
5149 * already in @node's fallback list, and it should be the next closest node
5150 * according to the distance array (which contains arbitrary distance values
5151 * from each node to each node in the system), and should also prefer nodes
5152 * with no CPUs, since presumably they'll have very little allocation pressure
5153 * on them otherwise.
a862f68a
MR
5154 *
5155 * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
1da177e4 5156 */
79c28a41 5157int find_next_best_node(int node, nodemask_t *used_node_mask)
1da177e4 5158{
4cf808eb 5159 int n, val;
1da177e4 5160 int min_val = INT_MAX;
00ef2d2f 5161 int best_node = NUMA_NO_NODE;
1da177e4 5162
4cf808eb
LT
5163 /* Use the local node if we haven't already */
5164 if (!node_isset(node, *used_node_mask)) {
5165 node_set(node, *used_node_mask);
5166 return node;
5167 }
1da177e4 5168
4b0ef1fe 5169 for_each_node_state(n, N_MEMORY) {
1da177e4
LT
5170
5171 /* Don't want a node to appear more than once */
5172 if (node_isset(n, *used_node_mask))
5173 continue;
5174
1da177e4
LT
5175 /* Use the distance array to find the distance */
5176 val = node_distance(node, n);
5177
4cf808eb
LT
5178 /* Penalize nodes under us ("prefer the next node") */
5179 val += (n < node);
5180
1da177e4 5181 /* Give preference to headless and unused nodes */
b630749f 5182 if (!cpumask_empty(cpumask_of_node(n)))
1da177e4
LT
5183 val += PENALTY_FOR_NODE_WITH_CPUS;
5184
5185 /* Slight preference for less loaded node */
37931324 5186 val *= MAX_NUMNODES;
1da177e4
LT
5187 val += node_load[n];
5188
5189 if (val < min_val) {
5190 min_val = val;
5191 best_node = n;
5192 }
5193 }
5194
5195 if (best_node >= 0)
5196 node_set(best_node, *used_node_mask);
5197
5198 return best_node;
5199}
5200
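/*
 * Illustrative model (editorial addition, not part of the kernel source) of
 * the scoring performed by find_next_best_node() above: the NUMA distance
 * (plus the "prefer the next node" and CPU penalties) dominates because the
 * score is scaled by MAX_NUMNODES before the round-robin load term is added.
 * Lower scores win. All names below are local to this sketch.
 */
#include <linux/types.h>

static int demo_node_score(int distance, bool below_local, bool has_cpus,
			   int load, int max_numnodes, int cpu_penalty)
{
	int val = distance;

	val += below_local ? 1 : 0;		/* penalize nodes numbered below us */
	val += has_cpus ? cpu_penalty : 0;	/* prefer headless/unused nodes */
	val *= max_numnodes;			/* make the terms above dominate ... */
	val += load;				/* ... the per-node load */

	return val;
}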
f0c0b2b8
KH
5201
5202/*
5203 * Build zonelists ordered by node and zones within node.
5204 * This results in maximum locality--normal zone overflows into local
5205 * DMA zone, if any--but risks exhausting DMA zone.
5206 */
9d3be21b
MH
5207static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
5208 unsigned nr_nodes)
1da177e4 5209{
9d3be21b
MH
5210 struct zoneref *zonerefs;
5211 int i;
5212
5213 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
5214
5215 for (i = 0; i < nr_nodes; i++) {
5216 int nr_zones;
5217
5218 pg_data_t *node = NODE_DATA(node_order[i]);
f0c0b2b8 5219
9d3be21b
MH
5220 nr_zones = build_zonerefs_node(node, zonerefs);
5221 zonerefs += nr_zones;
5222 }
5223 zonerefs->zone = NULL;
5224 zonerefs->zone_idx = 0;
f0c0b2b8
KH
5225}
5226
523b9458
CL
5227/*
5228 * Build gfp_thisnode zonelists
5229 */
5230static void build_thisnode_zonelists(pg_data_t *pgdat)
5231{
9d3be21b
MH
5232 struct zoneref *zonerefs;
5233 int nr_zones;
523b9458 5234
9d3be21b
MH
5235 zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
5236 nr_zones = build_zonerefs_node(pgdat, zonerefs);
5237 zonerefs += nr_zones;
5238 zonerefs->zone = NULL;
5239 zonerefs->zone_idx = 0;
523b9458
CL
5240}
5241
f0c0b2b8
KH
5242/*
5243 * Build zonelists ordered by zone and nodes within zones.
5244 * This results in conserving DMA zone[s] until all Normal memory is
5245 * exhausted, but results in overflowing to remote node while memory
5246 * may still exist in local DMA zone.
5247 */
f0c0b2b8 5248
f0c0b2b8
KH
5249static void build_zonelists(pg_data_t *pgdat)
5250{
9d3be21b 5251 static int node_order[MAX_NUMNODES];
37931324 5252 int node, nr_nodes = 0;
d0ddf49b 5253 nodemask_t used_mask = NODE_MASK_NONE;
f0c0b2b8 5254 int local_node, prev_node;
1da177e4
LT
5255
5256 /* NUMA-aware ordering of nodes */
5257 local_node = pgdat->node_id;
1da177e4 5258 prev_node = local_node;
f0c0b2b8 5259
f0c0b2b8 5260 memset(node_order, 0, sizeof(node_order));
1da177e4
LT
5261 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
5262 /*
5263 * We don't want to pressure a particular node.
5264 * So adding penalty to the first node in same
5265 * distance group to make it round-robin.
5266 */
957f822a
DR
5267 if (node_distance(local_node, node) !=
5268 node_distance(local_node, prev_node))
37931324 5269 node_load[node] += 1;
f0c0b2b8 5270
9d3be21b 5271 node_order[nr_nodes++] = node;
1da177e4 5272 prev_node = node;
1da177e4 5273 }
523b9458 5274
9d3be21b 5275 build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
523b9458 5276 build_thisnode_zonelists(pgdat);
6cf25392
BR
5277 pr_info("Fallback order for Node %d: ", local_node);
5278 for (node = 0; node < nr_nodes; node++)
5279 pr_cont("%d ", node_order[node]);
5280 pr_cont("\n");
1da177e4
LT
5281}
5282
7aac7898
LS
5283#ifdef CONFIG_HAVE_MEMORYLESS_NODES
5284/*
5285 * Return node id of node used for "local" allocations.
5286 * I.e., first node id of first zone in arg node's generic zonelist.
5287 * Used for initializing percpu 'numa_mem', which is used primarily
5288 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
5289 */
5290int local_memory_node(int node)
5291{
c33d6c06 5292 struct zoneref *z;
7aac7898 5293
c33d6c06 5294 z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
7aac7898 5295 gfp_zone(GFP_KERNEL),
c33d6c06 5296 NULL);
c1093b74 5297 return zone_to_nid(z->zone);
7aac7898
LS
5298}
5299#endif
f0c0b2b8 5300
6423aa81
JK
5301static void setup_min_unmapped_ratio(void);
5302static void setup_min_slab_ratio(void);
1da177e4
LT
5303#else /* CONFIG_NUMA */
5304
f0c0b2b8 5305static void build_zonelists(pg_data_t *pgdat)
1da177e4 5306{
19655d34 5307 int node, local_node;
9d3be21b
MH
5308 struct zoneref *zonerefs;
5309 int nr_zones;
1da177e4
LT
5310
5311 local_node = pgdat->node_id;
1da177e4 5312
9d3be21b
MH
5313 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
5314 nr_zones = build_zonerefs_node(pgdat, zonerefs);
5315 zonerefs += nr_zones;
1da177e4 5316
54a6eb5c
MG
5317 /*
5318 * Now we build the zonelist so that it contains the zones
5319 * of all the other nodes.
5320 * We don't want to pressure a particular node, so when
5321 * building the zones for node N, we make sure that the
5322 * zones coming right after the local ones are those from
5323	 * node N+1 (modulo the number of nodes).
5324 */
5325 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
5326 if (!node_online(node))
5327 continue;
9d3be21b
MH
5328 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
5329 zonerefs += nr_zones;
1da177e4 5330 }
54a6eb5c
MG
5331 for (node = 0; node < local_node; node++) {
5332 if (!node_online(node))
5333 continue;
9d3be21b
MH
5334 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
5335 zonerefs += nr_zones;
54a6eb5c
MG
5336 }
5337
9d3be21b
MH
5338 zonerefs->zone = NULL;
5339 zonerefs->zone_idx = 0;
1da177e4
LT
5340}
5341
5342#endif /* CONFIG_NUMA */
5343
99dcc3e5
CL
5344/*
5345 * Boot pageset table. One per cpu which is going to be used for all
5346 * zones and all nodes. The parameters will be set in such a way
5347 * that an item put on a list will immediately be handed over to
5348 * the buddy list. This is safe since pageset manipulation is done
5349 * with interrupts disabled.
5350 *
5351 * The boot_pagesets must be kept even after bootup is complete for
5352 * unused processors and/or zones. They do play a role for bootstrapping
5353 * hotplugged processors.
5354 *
5355 * zoneinfo_show() and maybe other functions do
5356 * not check if the processor is online before following the pageset pointer.
5357 * Other parts of the kernel may not check if the zone is available.
5358 */
28f836b6 5359static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats);
952eaf81
VB
5360/* These effectively disable the pcplists in the boot pageset completely */
5361#define BOOT_PAGESET_HIGH 0
5362#define BOOT_PAGESET_BATCH 1
28f836b6
MG
5363static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset);
5364static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats);
99dcc3e5 5365
11cd8638 5366static void __build_all_zonelists(void *data)
1da177e4 5367{
6811378e 5368 int nid;
afb6ebb3 5369 int __maybe_unused cpu;
9adb62a5 5370 pg_data_t *self = data;
1007843a 5371 unsigned long flags;
b93e0f32 5372
1007843a
TH
5373 /*
5374 * Explicitly disable this CPU's interrupts before taking seqlock
5375 * to prevent any IRQ handler from calling into the page allocator
5376 * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock.
5377 */
5378 local_irq_save(flags);
5379 /*
5380 * Explicitly disable this CPU's synchronous printk() before taking
5381 * seqlock to prevent any printk() from trying to hold port->lock, for
5382 * tty_insert_flip_string_and_push_buffer() on other CPU might be
5383 * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held.
5384 */
5385 printk_deferred_enter();
3d36424b 5386 write_seqlock(&zonelist_update_seq);
9276b1bc 5387
7f9cfb31
BL
5388#ifdef CONFIG_NUMA
5389 memset(node_load, 0, sizeof(node_load));
5390#endif
9adb62a5 5391
c1152583
WY
5392 /*
5393 * This node is hotadded and no memory is yet present. So just
5394 * building zonelists is fine - no need to touch other nodes.
5395 */
9adb62a5
JL
5396 if (self && !node_online(self->node_id)) {
5397 build_zonelists(self);
c1152583 5398 } else {
09f49dca
MH
5399 /*
5400 * All possible nodes have pgdat preallocated
5401 * in free_area_init
5402 */
5403 for_each_node(nid) {
c1152583 5404 pg_data_t *pgdat = NODE_DATA(nid);
7ea1530a 5405
c1152583
WY
5406 build_zonelists(pgdat);
5407 }
99dcc3e5 5408
7aac7898
LS
5409#ifdef CONFIG_HAVE_MEMORYLESS_NODES
5410 /*
5411 * We now know the "local memory node" for each node--
5412 * i.e., the node of the first zone in the generic zonelist.
5413 * Set up numa_mem percpu variable for on-line cpus. During
5414 * boot, only the boot cpu should be on-line; we'll init the
5415 * secondary cpus' numa_mem as they come on-line. During
5416 * node/memory hotplug, we'll fixup all on-line cpus.
5417 */
d9c9a0b9 5418 for_each_online_cpu(cpu)
7aac7898 5419 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
afb6ebb3 5420#endif
d9c9a0b9 5421 }
b93e0f32 5422
3d36424b 5423 write_sequnlock(&zonelist_update_seq);
1007843a
TH
5424 printk_deferred_exit();
5425 local_irq_restore(flags);
6811378e
YG
5426}
5427
061f67bc
RV
5428static noinline void __init
5429build_all_zonelists_init(void)
5430{
afb6ebb3
MH
5431 int cpu;
5432
061f67bc 5433 __build_all_zonelists(NULL);
afb6ebb3
MH
5434
5435 /*
5436 * Initialize the boot_pagesets that are going to be used
5437 * for bootstrapping processors. The real pagesets for
5438 * each zone will be allocated later when the per cpu
5439 * allocator is available.
5440 *
5441 * boot_pagesets are used also for bootstrapping offline
5442 * cpus if the system is already booted because the pagesets
5443 * are needed to initialize allocators on a specific cpu too.
5444 * F.e. the percpu allocator needs the page allocator which
5445 * needs the percpu allocator in order to allocate its pagesets
5446 * (a chicken-egg dilemma).
5447 */
5448 for_each_possible_cpu(cpu)
28f836b6 5449 per_cpu_pages_init(&per_cpu(boot_pageset, cpu), &per_cpu(boot_zonestats, cpu));
afb6ebb3 5450
061f67bc
RV
5451 mminit_verify_zonelist();
5452 cpuset_init_current_mems_allowed();
5453}
5454
4eaf3f64 5455/*
4eaf3f64 5456 * Build or rebuild the zonelists; the __init boot path below is taken only
 * while system_state == SYSTEM_BOOTING.
061f67bc 5457 *
72675e13 5458 * __ref due to call of __init annotated helper build_all_zonelists_init
061f67bc 5459 * [protected by SYSTEM_BOOTING].
4eaf3f64 5460 */
72675e13 5461void __ref build_all_zonelists(pg_data_t *pgdat)
6811378e 5462{
0a18e607
DH
5463 unsigned long vm_total_pages;
5464
6811378e 5465 if (system_state == SYSTEM_BOOTING) {
061f67bc 5466 build_all_zonelists_init();
6811378e 5467 } else {
11cd8638 5468 __build_all_zonelists(pgdat);
6811378e
YG
5469 /* cpuset refresh routine should be here */
5470 }
56b9413b
DH
5471 /* Get the number of free pages beyond high watermark in all zones. */
5472 vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
9ef9acb0
MG
5473 /*
5474 * Disable grouping by mobility if the number of pages in the
5475 * system is too low to allow the mechanism to work. It would be
5476 * more accurate, but expensive to check per-zone. This check is
5477 * made on memory-hotadd so a system can start with mobility
5478 * disabled and enable it later
5479 */
d9c23400 5480 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
9ef9acb0
MG
5481 page_group_by_mobility_disabled = 1;
5482 else
5483 page_group_by_mobility_disabled = 0;
5484
ce0725f7 5485 pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n",
756a025f 5486 nr_online_nodes,
756a025f
JP
5487 page_group_by_mobility_disabled ? "off" : "on",
5488 vm_total_pages);
f0c0b2b8 5489#ifdef CONFIG_NUMA
f88dfff5 5490 pr_info("Policy zone: %s\n", zone_names[policy_zone]);
f0c0b2b8 5491#endif
1da177e4
LT
5492}
5493
9420f89d 5494static int zone_batchsize(struct zone *zone)
1da177e4 5495{
9420f89d
MRI
5496#ifdef CONFIG_MMU
5497 int batch;
1da177e4 5498
9420f89d
MRI
5499 /*
5500 * The number of pages to batch allocate is either ~0.1%
5501 * of the zone or 1MB, whichever is smaller. The batch
5502 * size is striking a balance between allocation latency
5503 * and zone lock contention.
5504 */
5505 batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE);
5506 batch /= 4; /* We effectively *= 4 below */
5507 if (batch < 1)
5508 batch = 1;
22b31eec 5509
4b94ffdc 5510 /*
9420f89d
MRI
5511 * Clamp the batch to a 2^n - 1 value. Having a power
5512 * of 2 value was found to be more likely to have
5513 * suboptimal cache aliasing properties in some cases.
5514 *
5515 * For example if 2 tasks are alternately allocating
5516 * batches of pages, one task can end up with a lot
5517 * of pages of one half of the possible page colors
5518 * and the other with pages of the other colors.
4b94ffdc 5519 */
9420f89d 5520 batch = rounddown_pow_of_two(batch + batch/2) - 1;
966cf44f 5521
9420f89d 5522 return batch;
3a6be87f
DH
5523
5524#else
5525 /* The deferral and batching of frees should be suppressed under NOMMU
5526 * conditions.
5527 *
5528 * The problem is that NOMMU needs to be able to allocate large chunks
5529 * of contiguous memory as there's no hardware page translation to
5530 * assemble apparent contiguous memory from discontiguous pages.
5531 *
5532 * Queueing large contiguous runs of pages for batching, however,
5533 * causes the pages to actually be freed in smaller chunks. As there
5534 * can be a significant delay between the individual batches being
5535 * recycled, this leads to the once large chunks of space being
5536 * fragmented and becoming unavailable for high-order allocations.
5537 */
5538 return 0;
5539#endif
e7c8d5c9
CL
5540}
5541
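/*
 * Worked example (editorial addition, not part of the kernel source) for the
 * zone_batchsize() calculation above, assuming a 4 GiB zone with 4 KiB pages:
 *
 *   zone_managed_pages                   = 4 GiB / 4 KiB        = 1048576
 *   min(managed >> 10, SZ_1M/PAGE_SIZE)  = min(1024, 256)       = 256
 *   batch /= 4                           = 256 / 4              = 64
 *   rounddown_pow_of_two(64 + 64/2) - 1  = 64 - 1               = 63
 *
 * so such a zone moves pages between the pcp lists and the buddy allocator
 * in batches of 63.
 */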
04f8cfea 5542static int zone_highsize(struct zone *zone, int batch, int cpu_online)
b92ca18e 5543{
9420f89d
MRI
5544#ifdef CONFIG_MMU
5545 int high;
5546 int nr_split_cpus;
5547 unsigned long total_pages;
c13291a5 5548
9420f89d 5549 if (!percpu_pagelist_high_fraction) {
2a1e274a 5550 /*
9420f89d
MRI
5551 * By default, the high value of the pcp is based on the zone
5552 * low watermark so that if they are full then background
5553 * reclaim will not be started prematurely.
2a1e274a 5554 */
9420f89d
MRI
5555 total_pages = low_wmark_pages(zone);
5556 } else {
2a1e274a 5557 /*
9420f89d
MRI
5558 * If percpu_pagelist_high_fraction is configured, the high
5559 * value is based on a fraction of the managed pages in the
5560 * zone.
2a1e274a 5561 */
9420f89d 5562 total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction;
2a1e274a
MG
5563 }
5564
5565 /*
9420f89d
MRI
5566 * Split the high value across all online CPUs local to the zone. Note
5567 * that early in boot that CPUs may not be online yet and that during
5568 * CPU hotplug that the cpumask is not yet updated when a CPU is being
5569 * onlined. For memory nodes that have no CPUs, split pcp->high across
5570 * all online CPUs to mitigate the risk that reclaim is triggered
5571 * prematurely due to pages stored on pcp lists.
2a1e274a 5572 */
9420f89d
MRI
5573 nr_split_cpus = cpumask_weight(cpumask_of_node(zone_to_nid(zone))) + cpu_online;
5574 if (!nr_split_cpus)
5575 nr_split_cpus = num_online_cpus();
5576 high = total_pages / nr_split_cpus;
2a1e274a 5577
9420f89d
MRI
5578 /*
5579 * Ensure high is at least batch*4. The multiple is based on the
5580 * historical relationship between high and batch.
5581 */
5582 high = max(high, batch << 2);
37b07e41 5583
9420f89d
MRI
5584 return high;
5585#else
5586 return 0;
5587#endif
37b07e41
LS
5588}
5589
51930df5 5590/*
9420f89d
MRI
5591 * pcp->high and pcp->batch values are related and generally batch is lower
5592 * than high. They are also related to pcp->count such that count is lower
5593 * than high, and as soon as it reaches high, the pcplist is flushed.
5594 *
5595 * However, guaranteeing these relations at all times would require e.g. write
5596 * barriers here but also careful usage of read barriers at the read side, and
5597 * thus be prone to error and bad for performance. Thus the update only prevents
5598 * store tearing. Any new users of pcp->batch and pcp->high should ensure they
5599 * can cope with those fields changing asynchronously, and fully trust only the
5600 * pcp->count field on the local CPU with interrupts disabled.
5601 *
5602 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
5603 * outside of boot time (or some other assurance that no concurrent updaters
5604 * exist).
51930df5 5605 */
9420f89d
MRI
5606static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
5607 unsigned long batch)
51930df5 5608{
9420f89d
MRI
5609 WRITE_ONCE(pcp->batch, batch);
5610 WRITE_ONCE(pcp->high, high);
51930df5
MR
5611}
5612
9420f89d 5613static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats)
c713216d 5614{
9420f89d 5615 int pindex;
90cae1fe 5616
9420f89d
MRI
5617 memset(pcp, 0, sizeof(*pcp));
5618 memset(pzstats, 0, sizeof(*pzstats));
90cae1fe 5619
9420f89d
MRI
5620 spin_lock_init(&pcp->lock);
5621 for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)
5622 INIT_LIST_HEAD(&pcp->lists[pindex]);
2a1e274a 5623
9420f89d
MRI
5624 /*
5625 * Set batch and high values safe for a boot pageset. A true percpu
5626 * pageset's initialization will update them subsequently. Here we don't
5627 * need to be as careful as pageset_update() as nobody can access the
5628 * pageset yet.
5629 */
5630 pcp->high = BOOT_PAGESET_HIGH;
5631 pcp->batch = BOOT_PAGESET_BATCH;
5632 pcp->free_factor = 0;
5633}
c713216d 5634
9420f89d
MRI
5635static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high,
5636 unsigned long batch)
5637{
5638 struct per_cpu_pages *pcp;
5639 int cpu;
2a1e274a 5640
9420f89d
MRI
5641 for_each_possible_cpu(cpu) {
5642 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
5643 pageset_update(pcp, high, batch);
2a1e274a 5644 }
9420f89d 5645}
c713216d 5646
9420f89d
MRI
5647/*
5648 * Calculate and set new high and batch values for all per-cpu pagesets of a
5649 * zone based on the zone's size.
5650 */
5651static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online)
5652{
5653 int new_high, new_batch;
09f49dca 5654
9420f89d
MRI
5655 new_batch = max(1, zone_batchsize(zone));
5656 new_high = zone_highsize(zone, new_batch, cpu_online);
09f49dca 5657
9420f89d
MRI
5658 if (zone->pageset_high == new_high &&
5659 zone->pageset_batch == new_batch)
5660 return;
37b07e41 5661
9420f89d
MRI
5662 zone->pageset_high = new_high;
5663 zone->pageset_batch = new_batch;
122e093c 5664
9420f89d 5665 __zone_set_pageset_high_and_batch(zone, new_high, new_batch);
c713216d 5666}
2a1e274a 5667
9420f89d 5668void __meminit setup_zone_pageset(struct zone *zone)
2a1e274a 5669{
9420f89d 5670 int cpu;
2a1e274a 5671
9420f89d
MRI
5672 /* Size may be 0 on !SMP && !NUMA */
5673 if (sizeof(struct per_cpu_zonestat) > 0)
5674 zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat);
2a1e274a 5675
9420f89d
MRI
5676 zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages);
5677 for_each_possible_cpu(cpu) {
5678 struct per_cpu_pages *pcp;
5679 struct per_cpu_zonestat *pzstats;
2a1e274a 5680
9420f89d
MRI
5681 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
5682 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
5683 per_cpu_pages_init(pcp, pzstats);
a5c6d650 5684 }
9420f89d
MRI
5685
5686 zone_set_pageset_high_and_batch(zone, 0);
2a1e274a 5687}
ed7ed365 5688
7e63efef 5689/*
9420f89d
MRI
5690 * The zone indicated has a new number of managed_pages; batch sizes and percpu
5691 * page high values need to be recalculated.
7e63efef 5692 */
9420f89d 5693static void zone_pcp_update(struct zone *zone, int cpu_online)
7e63efef 5694{
9420f89d
MRI
5695 mutex_lock(&pcp_batch_high_lock);
5696 zone_set_pageset_high_and_batch(zone, cpu_online);
5697 mutex_unlock(&pcp_batch_high_lock);
7e63efef
MG
5698}
5699
5700/*
9420f89d
MRI
5701 * Allocate per cpu pagesets and initialize them.
5702 * Before this call only boot pagesets were available.
7e63efef 5703 */
9420f89d 5704void __init setup_per_cpu_pageset(void)
7e63efef 5705{
9420f89d
MRI
5706 struct pglist_data *pgdat;
5707 struct zone *zone;
5708 int __maybe_unused cpu;
5709
5710 for_each_populated_zone(zone)
5711 setup_zone_pageset(zone);
5712
5713#ifdef CONFIG_NUMA
5714 /*
5715 * Unpopulated zones continue using the boot pagesets.
5716 * The numa stats for these pagesets need to be reset.
5717 * Otherwise, they will end up skewing the stats of
5718 * the nodes these zones are associated with.
5719 */
5720 for_each_possible_cpu(cpu) {
5721 struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu);
5722 memset(pzstats->vm_numa_event, 0,
5723 sizeof(pzstats->vm_numa_event));
5724 }
5725#endif
5726
5727 for_each_online_pgdat(pgdat)
5728 pgdat->per_cpu_nodestats =
5729 alloc_percpu(struct per_cpu_nodestat);
7e63efef
MG
5730}
5731
9420f89d
MRI
5732__meminit void zone_pcp_init(struct zone *zone)
5733{
5734 /*
5735 * per cpu subsystem is not up at this point. The following code
5736 * relies on the ability of the linker to provide the
5737 * offset of a (static) per cpu variable into the per cpu area.
5738 */
5739 zone->per_cpu_pageset = &boot_pageset;
5740 zone->per_cpu_zonestats = &boot_zonestats;
5741 zone->pageset_high = BOOT_PAGESET_HIGH;
5742 zone->pageset_batch = BOOT_PAGESET_BATCH;
5743
5744 if (populated_zone(zone))
5745 pr_debug(" %s zone: %lu pages, LIFO batch:%u\n", zone->name,
5746 zone->present_pages, zone_batchsize(zone));
5747}
ed7ed365 5748
c3d5f5f0
JL
5749void adjust_managed_page_count(struct page *page, long count)
5750{
9705bea5 5751 atomic_long_add(count, &page_zone(page)->managed_pages);
ca79b0c2 5752 totalram_pages_add(count);
3dcc0571
JL
5753#ifdef CONFIG_HIGHMEM
5754 if (PageHighMem(page))
ca79b0c2 5755 totalhigh_pages_add(count);
3dcc0571 5756#endif
c3d5f5f0 5757}
3dcc0571 5758EXPORT_SYMBOL(adjust_managed_page_count);
c3d5f5f0 5759
e5cb113f 5760unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
69afade7 5761{
11199692
JL
5762 void *pos;
5763 unsigned long pages = 0;
69afade7 5764
11199692
JL
5765 start = (void *)PAGE_ALIGN((unsigned long)start);
5766 end = (void *)((unsigned long)end & PAGE_MASK);
5767 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
0d834328
DH
5768 struct page *page = virt_to_page(pos);
5769 void *direct_map_addr;
5770
5771 /*
5772 * 'direct_map_addr' might be different from 'pos'
5773 * because some architectures' virt_to_page()
5774 * work with aliases. Getting the direct map
5775 * address ensures that we get a _writeable_
5776 * alias for the memset().
5777 */
5778 direct_map_addr = page_address(page);
c746170d
VF
5779 /*
5780 * Perform a kasan-unchecked memset() since this memory
5781 * has not been initialized.
5782 */
5783 direct_map_addr = kasan_reset_tag(direct_map_addr);
dbe67df4 5784 if ((unsigned int)poison <= 0xFF)
0d834328
DH
5785 memset(direct_map_addr, poison, PAGE_SIZE);
5786
5787 free_reserved_page(page);
69afade7
JL
5788 }
5789
5790 if (pages && s)
ff7ed9e4 5791 pr_info("Freeing %s memory: %ldK\n", s, K(pages));
69afade7
JL
5792
5793 return pages;
5794}
5795
005fd4bb 5796static int page_alloc_cpu_dead(unsigned int cpu)
1da177e4 5797{
04f8cfea 5798 struct zone *zone;
1da177e4 5799
005fd4bb 5800 lru_add_drain_cpu(cpu);
96f97c43 5801 mlock_drain_remote(cpu);
005fd4bb 5802 drain_pages(cpu);
9f8f2172 5803
005fd4bb
SAS
5804 /*
5805 * Spill the event counters of the dead processor
5806 * into the current processors event counters.
5807 * This artificially elevates the count of the current
5808 * processor.
5809 */
5810 vm_events_fold_cpu(cpu);
9f8f2172 5811
005fd4bb
SAS
5812 /*
5813 * Zero the differential counters of the dead processor
5814 * so that the vm statistics are consistent.
5815 *
5816 * This is only okay since the processor is dead and cannot
5817 * race with what we are doing.
5818 */
5819 cpu_vm_stats_fold(cpu);
04f8cfea
MG
5820
5821 for_each_populated_zone(zone)
5822 zone_pcp_update(zone, 0);
5823
5824 return 0;
5825}
5826
5827static int page_alloc_cpu_online(unsigned int cpu)
5828{
5829 struct zone *zone;
5830
5831 for_each_populated_zone(zone)
5832 zone_pcp_update(zone, 1);
005fd4bb 5833 return 0;
1da177e4 5834}
1da177e4 5835
c4fbed4b 5836void __init page_alloc_init_cpuhp(void)
1da177e4 5837{
005fd4bb
SAS
5838 int ret;
5839
04f8cfea
MG
5840 ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC,
5841 "mm/page_alloc:pcp",
5842 page_alloc_cpu_online,
005fd4bb
SAS
5843 page_alloc_cpu_dead);
5844 WARN_ON(ret < 0);
1da177e4
LT
5845}
5846
cb45b0e9 5847/*
34b10060 5848 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
cb45b0e9
HA
5849 * or min_free_kbytes changes.
5850 */
5851static void calculate_totalreserve_pages(void)
5852{
5853 struct pglist_data *pgdat;
5854 unsigned long reserve_pages = 0;
2f6726e5 5855 enum zone_type i, j;
cb45b0e9
HA
5856
5857 for_each_online_pgdat(pgdat) {
281e3726
MG
5858
5859 pgdat->totalreserve_pages = 0;
5860
cb45b0e9
HA
5861 for (i = 0; i < MAX_NR_ZONES; i++) {
5862 struct zone *zone = pgdat->node_zones + i;
3484b2de 5863 long max = 0;
9705bea5 5864 unsigned long managed_pages = zone_managed_pages(zone);
cb45b0e9
HA
5865
5866 /* Find valid and maximum lowmem_reserve in the zone */
5867 for (j = i; j < MAX_NR_ZONES; j++) {
5868 if (zone->lowmem_reserve[j] > max)
5869 max = zone->lowmem_reserve[j];
5870 }
5871
41858966
MG
5872 /* we treat the high watermark as reserved pages. */
5873 max += high_wmark_pages(zone);
cb45b0e9 5874
3d6357de
AK
5875 if (max > managed_pages)
5876 max = managed_pages;
a8d01437 5877
281e3726 5878 pgdat->totalreserve_pages += max;
a8d01437 5879
cb45b0e9
HA
5880 reserve_pages += max;
5881 }
5882 }
5883 totalreserve_pages = reserve_pages;
5884}
5885
1da177e4
LT
5886/*
5887 * setup_per_zone_lowmem_reserve - called whenever
34b10060 5888 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone
1da177e4
LT
5889 * has a correct pages reserved value, so an adequate number of
5890 * pages are left in the zone after a successful __alloc_pages().
5891 */
5892static void setup_per_zone_lowmem_reserve(void)
5893{
5894 struct pglist_data *pgdat;
470c61d7 5895 enum zone_type i, j;
1da177e4 5896
ec936fc5 5897 for_each_online_pgdat(pgdat) {
470c61d7
LS
5898 for (i = 0; i < MAX_NR_ZONES - 1; i++) {
5899 struct zone *zone = &pgdat->node_zones[i];
5900 int ratio = sysctl_lowmem_reserve_ratio[i];
5901 bool clear = !ratio || !zone_managed_pages(zone);
5902 unsigned long managed_pages = 0;
5903
5904 for (j = i + 1; j < MAX_NR_ZONES; j++) {
f7ec1044
LS
5905 struct zone *upper_zone = &pgdat->node_zones[j];
5906
5907 managed_pages += zone_managed_pages(upper_zone);
470c61d7 5908
f7ec1044
LS
5909 if (clear)
5910 zone->lowmem_reserve[j] = 0;
5911 else
470c61d7 5912 zone->lowmem_reserve[j] = managed_pages / ratio;
1da177e4
LT
5913 }
5914 }
5915 }
cb45b0e9
HA
5916
5917 /* update totalreserve_pages */
5918 calculate_totalreserve_pages();
1da177e4
LT
5919}
5920
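/*
 * Worked example (editorial addition, not part of the kernel source) for
 * setup_per_zone_lowmem_reserve() above. Assume ZONE_DMA32 with
 * sysctl_lowmem_reserve_ratio[ZONE_DMA32] = 256 and a ZONE_NORMAL above it
 * with 1048576 managed pages (4 GiB at 4 KiB pages):
 *
 *   DMA32->lowmem_reserve[ZONE_NORMAL] = 1048576 / 256 = 4096 pages (16 MiB)
 *
 * so an allocation that targets ZONE_NORMAL may only fall back into
 * ZONE_DMA32 while that zone still has its watermark plus 4096 pages free.
 */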
cfd3da1e 5921static void __setup_per_zone_wmarks(void)
1da177e4
LT
5922{
5923 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
5924 unsigned long lowmem_pages = 0;
5925 struct zone *zone;
5926 unsigned long flags;
5927
5928 /* Calculate total number of !ZONE_HIGHMEM pages */
5929 for_each_zone(zone) {
5930 if (!is_highmem(zone))
9705bea5 5931 lowmem_pages += zone_managed_pages(zone);
1da177e4
LT
5932 }
5933
5934 for_each_zone(zone) {
ac924c60
AM
5935 u64 tmp;
5936
1125b4e3 5937 spin_lock_irqsave(&zone->lock, flags);
9705bea5 5938 tmp = (u64)pages_min * zone_managed_pages(zone);
ac924c60 5939 do_div(tmp, lowmem_pages);
1da177e4
LT
5940 if (is_highmem(zone)) {
5941 /*
669ed175
NP
5942 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
5943 * need highmem pages, so cap pages_min to a small
5944 * value here.
5945 *
41858966 5946 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
8bb4e7a2 5947 * deltas control async page reclaim, and so should
669ed175 5948 * not be capped for highmem.
1da177e4 5949 */
90ae8d67 5950 unsigned long min_pages;
1da177e4 5951
9705bea5 5952 min_pages = zone_managed_pages(zone) / 1024;
90ae8d67 5953 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
a9214443 5954 zone->_watermark[WMARK_MIN] = min_pages;
1da177e4 5955 } else {
669ed175
NP
5956 /*
5957 * If it's a lowmem zone, reserve a number of pages
1da177e4
LT
5958 * proportionate to the zone's size.
5959 */
a9214443 5960 zone->_watermark[WMARK_MIN] = tmp;
1da177e4
LT
5961 }
5962
795ae7a0
JW
5963 /*
5964 * Set the kswapd watermarks distance according to the
5965 * scale factor in proportion to available memory, but
5966 * ensure a minimum size on small systems.
5967 */
5968 tmp = max_t(u64, tmp >> 2,
9705bea5 5969 mult_frac(zone_managed_pages(zone),
795ae7a0
JW
5970 watermark_scale_factor, 10000));
5971
aa092591 5972 zone->watermark_boost = 0;
a9214443 5973 zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
c574bbe9
HY
5974 zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp;
5975 zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp;
49f223a9 5976
1125b4e3 5977 spin_unlock_irqrestore(&zone->lock, flags);
1da177e4 5978 }
cb45b0e9
HA
5979
5980 /* update totalreserve_pages */
5981 calculate_totalreserve_pages();
1da177e4
LT
5982}
5983
cfd3da1e
MG
5984/**
5985 * setup_per_zone_wmarks - called when min_free_kbytes changes
5986 * or when memory is hot-{added|removed}
5987 *
5988 * Ensures that the watermark[min,low,high] values for each zone are set
5989 * correctly with respect to min_free_kbytes.
5990 */
5991void setup_per_zone_wmarks(void)
5992{
b92ca18e 5993 struct zone *zone;
b93e0f32
MH
5994 static DEFINE_SPINLOCK(lock);
5995
5996 spin_lock(&lock);
cfd3da1e 5997 __setup_per_zone_wmarks();
b93e0f32 5998 spin_unlock(&lock);
b92ca18e
MG
5999
6000 /*
6001 * The watermark size have changed so update the pcpu batch
6002 * and high limits or the limits may be inappropriate.
6003 */
6004 for_each_zone(zone)
04f8cfea 6005 zone_pcp_update(zone, 0);
cfd3da1e
MG
6006}
6007
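/*
 * Worked example (editorial addition, not part of the kernel source) for
 * __setup_per_zone_wmarks() above. Assume a single 1 GiB lowmem zone
 * (262144 pages at 4 KiB), min_free_kbytes = 4096 and the default
 * watermark_scale_factor = 10:
 *
 *   pages_min  = 4096 >> (PAGE_SHIFT - 10) = 4096 >> 2              = 1024 pages
 *   tmp        = pages_min * managed / lowmem_pages                 = 1024
 *   WMARK_MIN  = 1024
 *   delta      = max(tmp >> 2, managed * 10 / 10000) = max(256, 262) = 262
 *   WMARK_LOW  = 1024 + 262 = 1286 pages
 *   WMARK_HIGH = 1286 + 262 = 1548 pages
 */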
1da177e4
LT
6008/*
6009 * Initialise min_free_kbytes.
6010 *
6011 * For small machines we want it small (128k min). For large machines
8beeae86 6012 * we want it large (256MB max). But it is not linear, because network
1da177e4
LT
6013 * bandwidth does not increase linearly with machine size. We use
6014 *
b8af2941 6015 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
1da177e4
LT
6016 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
6017 *
6018 * which yields
6019 *
6020 * 16MB: 512k
6021 * 32MB: 724k
6022 * 64MB: 1024k
6023 * 128MB: 1448k
6024 * 256MB: 2048k
6025 * 512MB: 2896k
6026 * 1024MB: 4096k
6027 * 2048MB: 5792k
6028 * 4096MB: 8192k
6029 * 8192MB: 11584k
6030 * 16384MB: 16384k
6031 */
bd3400ea 6032void calculate_min_free_kbytes(void)
1da177e4
LT
6033{
6034 unsigned long lowmem_kbytes;
5f12733e 6035 int new_min_free_kbytes;
1da177e4
LT
6036
6037 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
5f12733e
MH
6038 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
6039
59d336bd
WS
6040 if (new_min_free_kbytes > user_min_free_kbytes)
6041 min_free_kbytes = clamp(new_min_free_kbytes, 128, 262144);
6042 else
5f12733e
MH
6043 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
6044 new_min_free_kbytes, user_min_free_kbytes);
59d336bd 6045
bd3400ea
LF
6046}
6047
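/*
 * Worked example (editorial addition, not part of the kernel source) of the
 * formula above for a machine with 1 GiB of lowmem:
 *
 *   lowmem_kbytes          = 1048576
 *   int_sqrt(1048576 * 16) = int_sqrt(16777216) = 4096
 *
 * which matches the "1024MB: 4096k" row of the table and is then clamped to
 * the [128, 262144] range before being stored in min_free_kbytes.
 */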
6048int __meminit init_per_zone_wmark_min(void)
6049{
6050 calculate_min_free_kbytes();
bc75d33f 6051 setup_per_zone_wmarks();
a6cccdc3 6052 refresh_zone_stat_thresholds();
1da177e4 6053 setup_per_zone_lowmem_reserve();
6423aa81
JK
6054
6055#ifdef CONFIG_NUMA
6056 setup_min_unmapped_ratio();
6057 setup_min_slab_ratio();
6058#endif
6059
4aab2be0
VB
6060 khugepaged_min_free_kbytes_update();
6061
1da177e4
LT
6062 return 0;
6063}
e08d3fdf 6064postcore_initcall(init_per_zone_wmark_min)
1da177e4
LT
6065
6066/*
b8af2941 6067 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
1da177e4
LT
6068 * that we can call two helper functions whenever min_free_kbytes
6069 * changes.
6070 */
cccad5b9 6071int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
32927393 6072 void *buffer, size_t *length, loff_t *ppos)
1da177e4 6073{
da8c757b
HP
6074 int rc;
6075
6076 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
6077 if (rc)
6078 return rc;
6079
5f12733e
MH
6080 if (write) {
6081 user_min_free_kbytes = min_free_kbytes;
bc75d33f 6082 setup_per_zone_wmarks();
5f12733e 6083 }
1da177e4
LT
6084 return 0;
6085}
6086
795ae7a0 6087int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
32927393 6088 void *buffer, size_t *length, loff_t *ppos)
795ae7a0
JW
6089{
6090 int rc;
6091
6092 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
6093 if (rc)
6094 return rc;
6095
6096 if (write)
6097 setup_per_zone_wmarks();
6098
6099 return 0;
6100}
6101
9614634f 6102#ifdef CONFIG_NUMA
6423aa81 6103static void setup_min_unmapped_ratio(void)
9614634f 6104{
6423aa81 6105 pg_data_t *pgdat;
9614634f 6106 struct zone *zone;
9614634f 6107
a5f5f91d 6108 for_each_online_pgdat(pgdat)
81cbcbc2 6109 pgdat->min_unmapped_pages = 0;
a5f5f91d 6110
9614634f 6111 for_each_zone(zone)
9705bea5
AK
6112 zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) *
6113 sysctl_min_unmapped_ratio) / 100;
9614634f 6114}
0ff38490 6115
6423aa81
JK
6116
6117int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
32927393 6118 void *buffer, size_t *length, loff_t *ppos)
0ff38490 6119{
0ff38490
CL
6120 int rc;
6121
8d65af78 6122 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
0ff38490
CL
6123 if (rc)
6124 return rc;
6125
6423aa81
JK
6126 setup_min_unmapped_ratio();
6127
6128 return 0;
6129}
6130
6131static void setup_min_slab_ratio(void)
6132{
6133 pg_data_t *pgdat;
6134 struct zone *zone;
6135
a5f5f91d
MG
6136 for_each_online_pgdat(pgdat)
6137 pgdat->min_slab_pages = 0;
6138
0ff38490 6139 for_each_zone(zone)
9705bea5
AK
6140 zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) *
6141 sysctl_min_slab_ratio) / 100;
6423aa81
JK
6142}
6143
6144int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
32927393 6145 void *buffer, size_t *length, loff_t *ppos)
6423aa81
JK
6146{
6147 int rc;
6148
6149 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
6150 if (rc)
6151 return rc;
6152
6153 setup_min_slab_ratio();
6154
0ff38490
CL
6155 return 0;
6156}
9614634f
CL
6157#endif
6158
1da177e4
LT
6159/*
6160 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
6161 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
6162 * whenever sysctl_lowmem_reserve_ratio changes.
6163 *
6164 * The reserve ratio obviously has absolutely no relation with the
41858966 6165 * minimum watermarks. The lowmem reserve ratio can only make sense
1da177e4
LT
6166 * if in function of the boot time zone sizes.
6167 */
cccad5b9 6168int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
32927393 6169 void *buffer, size_t *length, loff_t *ppos)
1da177e4 6170{
86aaf255
BH
6171 int i;
6172
8d65af78 6173 proc_dointvec_minmax(table, write, buffer, length, ppos);
86aaf255
BH
6174
6175 for (i = 0; i < MAX_NR_ZONES; i++) {
6176 if (sysctl_lowmem_reserve_ratio[i] < 1)
6177 sysctl_lowmem_reserve_ratio[i] = 0;
6178 }
6179
1da177e4
LT
6180 setup_per_zone_lowmem_reserve();
6181 return 0;
6182}
6183
8ad4b1fb 6184/*
74f44822
MG
6185 * percpu_pagelist_high_fraction - changes the pcp->high for each zone on each
6186 * cpu. It is the fraction of total pages in each zone that a hot per cpu
b8af2941 6187 * pagelist can have before it gets flushed back to the buddy allocator.
8ad4b1fb 6188 */
74f44822
MG
6189int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table,
6190 int write, void *buffer, size_t *length, loff_t *ppos)
8ad4b1fb
RS
6191{
6192 struct zone *zone;
74f44822 6193 int old_percpu_pagelist_high_fraction;
8ad4b1fb
RS
6194 int ret;
6195
7cd2b0a3 6196 mutex_lock(&pcp_batch_high_lock);
74f44822 6197 old_percpu_pagelist_high_fraction = percpu_pagelist_high_fraction;
7cd2b0a3 6198
8d65af78 6199 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
7cd2b0a3
DR
6200 if (!write || ret < 0)
6201 goto out;
6202
6203 /* Sanity checking to avoid pcp imbalance */
74f44822
MG
6204 if (percpu_pagelist_high_fraction &&
6205 percpu_pagelist_high_fraction < MIN_PERCPU_PAGELIST_HIGH_FRACTION) {
6206 percpu_pagelist_high_fraction = old_percpu_pagelist_high_fraction;
7cd2b0a3
DR
6207 ret = -EINVAL;
6208 goto out;
6209 }
6210
6211 /* No change? */
74f44822 6212 if (percpu_pagelist_high_fraction == old_percpu_pagelist_high_fraction)
7cd2b0a3 6213 goto out;
c8e251fa 6214
cb1ef534 6215 for_each_populated_zone(zone)
74f44822 6216 zone_set_pageset_high_and_batch(zone, 0);
7cd2b0a3 6217out:
c8e251fa 6218 mutex_unlock(&pcp_batch_high_lock);
7cd2b0a3 6219 return ret;
8ad4b1fb
RS
6220}
6221
8df995f6 6222#ifdef CONFIG_CONTIG_ALLOC
a1394bdd
MK
6223#if defined(CONFIG_DYNAMIC_DEBUG) || \
6224 (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
6225/* Usage: See admin-guide/dynamic-debug-howto.rst */
6226static void alloc_contig_dump_pages(struct list_head *page_list)
6227{
6228 DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure");
6229
6230 if (DYNAMIC_DEBUG_BRANCH(descriptor)) {
6231 struct page *page;
6232
6233 dump_stack();
6234 list_for_each_entry(page, page_list, lru)
6235 dump_page(page, "migration failure");
6236 }
6237}
6238#else
6239static inline void alloc_contig_dump_pages(struct list_head *page_list)
6240{
6241}
6242#endif
6243
041d3a8c 6244/* [start, end) must belong to a single zone. */
b2c9e2fb 6245int __alloc_contig_migrate_range(struct compact_control *cc,
bb13ffeb 6246 unsigned long start, unsigned long end)
041d3a8c
MN
6247{
6248 /* This function is based on compact_zone() from compaction.c. */
730ec8c0 6249 unsigned int nr_reclaimed;
041d3a8c
MN
6250 unsigned long pfn = start;
6251 unsigned int tries = 0;
6252 int ret = 0;
8b94e0b8
JK
6253 struct migration_target_control mtc = {
6254 .nid = zone_to_nid(cc->zone),
6255 .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
6256 };
041d3a8c 6257
361a2a22 6258 lru_cache_disable();
041d3a8c 6259
bb13ffeb 6260 while (pfn < end || !list_empty(&cc->migratepages)) {
041d3a8c
MN
6261 if (fatal_signal_pending(current)) {
6262 ret = -EINTR;
6263 break;
6264 }
6265
bb13ffeb
MG
6266 if (list_empty(&cc->migratepages)) {
6267 cc->nr_migratepages = 0;
c2ad7a1f
OS
6268 ret = isolate_migratepages_range(cc, pfn, end);
6269 if (ret && ret != -EAGAIN)
041d3a8c 6270 break;
c2ad7a1f 6271 pfn = cc->migrate_pfn;
041d3a8c
MN
6272 tries = 0;
6273 } else if (++tries == 5) {
c8e28b47 6274 ret = -EBUSY;
041d3a8c
MN
6275 break;
6276 }
6277
beb51eaa
MK
6278 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
6279 &cc->migratepages);
6280 cc->nr_migratepages -= nr_reclaimed;
02c6de8d 6281
8b94e0b8 6282 ret = migrate_pages(&cc->migratepages, alloc_migration_target,
5ac95884 6283 NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL);
c8e28b47
OS
6284
6285 /*
6286 * On -ENOMEM, migrate_pages() bails out right away. It is pointless
6287 * to retry again over this error, so do the same here.
6288 */
6289 if (ret == -ENOMEM)
6290 break;
041d3a8c 6291 }
d479960e 6292
361a2a22 6293 lru_cache_enable();
2a6f5124 6294 if (ret < 0) {
3f913fc5 6295 if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY)
151e084a 6296 alloc_contig_dump_pages(&cc->migratepages);
2a6f5124
SP
6297 putback_movable_pages(&cc->migratepages);
6298 return ret;
6299 }
6300 return 0;
041d3a8c
MN
6301}
6302
6303/**
6304 * alloc_contig_range() -- tries to allocate given range of pages
6305 * @start: start PFN to allocate
6306 * @end: one-past-the-last PFN to allocate
f0953a1b 6307 * @migratetype: migratetype of the underlying pageblocks (either
0815f3d8
MN
6308 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
6309 * in range must have the same migratetype and it must
6310 * be either of the two.
ca96b625 6311 * @gfp_mask: GFP mask to use during compaction
041d3a8c 6312 *
11ac3e87
ZY
6313 * The PFN range does not have to be pageblock aligned. The PFN range must
6314 * belong to a single zone.
041d3a8c 6315 *
2c7452a0
MK
6316 * The first thing this routine does is attempt to MIGRATE_ISOLATE all
6317 * pageblocks in the range. Once isolated, the pageblocks should not
6318 * be modified by others.
041d3a8c 6319 *
a862f68a 6320 * Return: zero on success or negative error code. On success all
041d3a8c
MN
 6321 * pages whose PFN is in [start, end) are allocated for the caller and
6322 * need to be freed with free_contig_range().
6323 */
0815f3d8 6324int alloc_contig_range(unsigned long start, unsigned long end,
ca96b625 6325 unsigned migratetype, gfp_t gfp_mask)
041d3a8c 6326{
041d3a8c 6327 unsigned long outer_start, outer_end;
b2c9e2fb 6328 int order;
d00181b9 6329 int ret = 0;
041d3a8c 6330
bb13ffeb
MG
6331 struct compact_control cc = {
6332 .nr_migratepages = 0,
6333 .order = -1,
6334 .zone = page_zone(pfn_to_page(start)),
e0b9daeb 6335 .mode = MIGRATE_SYNC,
bb13ffeb 6336 .ignore_skip_hint = true,
2583d671 6337 .no_set_skip_hint = true,
7dea19f9 6338 .gfp_mask = current_gfp_context(gfp_mask),
b06eda09 6339 .alloc_contig = true,
bb13ffeb
MG
6340 };
6341 INIT_LIST_HEAD(&cc.migratepages);
6342
041d3a8c
MN
6343 /*
 6344 * What we do here is mark all pageblocks in the range as
 6345 * MIGRATE_ISOLATE. Because pageblock and max order pages may
 6346 * have different sizes, and due to the way the page allocator
b2c9e2fb 6347 * works, start_isolate_page_range() has special handling for this.
041d3a8c
MN
6348 *
6349 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
 6350 * migrate the pages from an unaligned range (i.e. pages that
b2c9e2fb 6351 * we are interested in). This will put all the pages in
041d3a8c
MN
 6352 * range back to the page allocator as MIGRATE_ISOLATE.
6353 *
 6354 * When this is done, we take the pages in range from the page
 6355 * allocator, removing them from the buddy system. This way the
 6356 * page allocator will never consider using them.
6357 *
6358 * This lets us mark the pageblocks back as
6359 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
6360 * aligned range but not in the unaligned, original range are
 6361 * put back to the page allocator so that buddy can use them.
6362 */
6363
6e263fff 6364 ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask);
3fa0c7c7 6365 if (ret)
b2c9e2fb 6366 goto done;
041d3a8c 6367
7612921f
VB
6368 drain_all_pages(cc.zone);
6369
8ef5849f
JK
6370 /*
 6371 * In case of -EBUSY, we'd like to know which page causes the problem.
63cd4489
MK
6372 * So, just fall through. test_pages_isolated() has a tracepoint
6373 * which will report the busy page.
6374 *
6375 * It is possible that busy pages could become available before
6376 * the call to test_pages_isolated, and the range will actually be
 6377 * allocated. So, if we fall through, be sure to clear ret so that
 6378 * -EBUSY is not accidentally used or returned to the caller.
8ef5849f 6379 */
bb13ffeb 6380 ret = __alloc_contig_migrate_range(&cc, start, end);
8ef5849f 6381 if (ret && ret != -EBUSY)
041d3a8c 6382 goto done;
68d68ff6 6383 ret = 0;
041d3a8c
MN
6384
6385 /*
b2c9e2fb 6386 * Pages from [start, end) are within pageblock_nr_pages
041d3a8c
MN
6387 * aligned blocks that are marked as MIGRATE_ISOLATE. What's
 6388 * more, all pages in [start, end) are free in the page allocator.
 6389 * What we are going to do is to allocate all pages from
 6390 * [start, end) (that is, remove them from the page allocator).
6391 *
6392 * The only problem is that pages at the beginning and at the
 6393 * end of the interesting range may not be aligned with pages that
 6394 * the page allocator holds, i.e. they can be part of higher order
 6395 * pages. Because of this, we reserve the bigger range and
 6396 * once this is done, free the pages we are not interested in.
6397 *
6398 * We don't have to hold zone->lock here because the pages are
 6399 * isolated, thus they won't get removed from the buddy allocator.
6400 */
6401
041d3a8c
MN
6402 order = 0;
6403 outer_start = start;
6404 while (!PageBuddy(pfn_to_page(outer_start))) {
23baf831 6405 if (++order > MAX_ORDER) {
8ef5849f
JK
6406 outer_start = start;
6407 break;
041d3a8c
MN
6408 }
6409 outer_start &= ~0UL << order;
6410 }
6411
8ef5849f 6412 if (outer_start != start) {
ab130f91 6413 order = buddy_order(pfn_to_page(outer_start));
8ef5849f
JK
6414
6415 /*
 6416 * The outer_start page could be a small-order buddy page that
 6417 * doesn't include the start page. Adjust outer_start
 6418 * in this case to report the failed page properly
 6419 * on the tracepoint in test_pages_isolated().
6420 */
6421 if (outer_start + (1UL << order) <= start)
6422 outer_start = start;
6423 }
6424
041d3a8c 6425 /* Make sure the range is really isolated. */
756d25be 6426 if (test_pages_isolated(outer_start, end, 0)) {
041d3a8c
MN
6427 ret = -EBUSY;
6428 goto done;
6429 }
6430
49f223a9 6431 /* Grab isolated pages from freelists. */
bb13ffeb 6432 outer_end = isolate_freepages_range(&cc, outer_start, end);
041d3a8c
MN
6433 if (!outer_end) {
6434 ret = -EBUSY;
6435 goto done;
6436 }
6437
6438 /* Free head and tail (if any) */
6439 if (start != outer_start)
6440 free_contig_range(outer_start, start - outer_start);
6441 if (end != outer_end)
6442 free_contig_range(end, outer_end - end);
6443
6444done:
6e263fff 6445 undo_isolate_page_range(start, end, migratetype);
041d3a8c
MN
6446 return ret;
6447}
255f5985 6448EXPORT_SYMBOL(alloc_contig_range);
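To make the kernel-doc above concrete, here is a minimal, hypothetical sketch of a caller pairing alloc_contig_range() with free_contig_range(); the helper name, the use of GFP_KERNEL, and the assumption that the range is already MIGRATE_MOVABLE are illustrative only, not part of this file:

/* Hypothetical sketch of an alloc_contig_range() caller. */
static int example_claim_pfn_range(unsigned long start_pfn, unsigned long nr_pages)
{
	unsigned long end_pfn = start_pfn + nr_pages;	/* one past the last PFN */
	int ret;

	/* All pageblocks in [start_pfn, end_pfn) are assumed MIGRATE_MOVABLE. */
	ret = alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, GFP_KERNEL);
	if (ret)
		return ret;	/* e.g. -EBUSY or -EINTR; nothing was allocated */

	/* ... use pfn_to_page(start_pfn) .. pfn_to_page(end_pfn - 1) ... */

	free_contig_range(start_pfn, nr_pages);	/* give the range back */
	return 0;
}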
5e27a2df
AK
6449
6450static int __alloc_contig_pages(unsigned long start_pfn,
6451 unsigned long nr_pages, gfp_t gfp_mask)
6452{
6453 unsigned long end_pfn = start_pfn + nr_pages;
6454
6455 return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
6456 gfp_mask);
6457}
6458
6459static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
6460 unsigned long nr_pages)
6461{
6462 unsigned long i, end_pfn = start_pfn + nr_pages;
6463 struct page *page;
6464
6465 for (i = start_pfn; i < end_pfn; i++) {
6466 page = pfn_to_online_page(i);
6467 if (!page)
6468 return false;
6469
6470 if (page_zone(page) != z)
6471 return false;
6472
6473 if (PageReserved(page))
4d73ba5f
MG
6474 return false;
6475
6476 if (PageHuge(page))
5e27a2df 6477 return false;
5e27a2df
AK
6478 }
6479 return true;
6480}
6481
6482static bool zone_spans_last_pfn(const struct zone *zone,
6483 unsigned long start_pfn, unsigned long nr_pages)
6484{
6485 unsigned long last_pfn = start_pfn + nr_pages - 1;
6486
6487 return zone_spans_pfn(zone, last_pfn);
6488}
6489
6490/**
6491 * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
6492 * @nr_pages: Number of contiguous pages to allocate
6493 * @gfp_mask: GFP mask to limit search and used during compaction
6494 * @nid: Target node
6495 * @nodemask: Mask for other possible nodes
6496 *
6497 * This routine is a wrapper around alloc_contig_range(). It scans over zones
6498 * on an applicable zonelist to find a contiguous pfn range which can then be
6499 * tried for allocation with alloc_contig_range(). This routine is intended
 6500 * for allocation requests which cannot be fulfilled with the buddy allocator.
6501 *
6502 * The allocated memory is always aligned to a page boundary. If nr_pages is a
eaab8e75
AK
 6503 * power of two, then the allocated range is also guaranteed to be aligned to the
 6504 * same nr_pages (e.g. a 1GB request would be aligned to 1GB).
5e27a2df
AK
6505 *
6506 * Allocated pages can be freed with free_contig_range() or by manually calling
6507 * __free_page() on each allocated page.
6508 *
6509 * Return: pointer to contiguous pages on success, or NULL if not successful.
6510 */
6511struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
6512 int nid, nodemask_t *nodemask)
6513{
6514 unsigned long ret, pfn, flags;
6515 struct zonelist *zonelist;
6516 struct zone *zone;
6517 struct zoneref *z;
6518
6519 zonelist = node_zonelist(nid, gfp_mask);
6520 for_each_zone_zonelist_nodemask(zone, z, zonelist,
6521 gfp_zone(gfp_mask), nodemask) {
6522 spin_lock_irqsave(&zone->lock, flags);
6523
6524 pfn = ALIGN(zone->zone_start_pfn, nr_pages);
6525 while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
6526 if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
6527 /*
6528 * We release the zone lock here because
6529 * alloc_contig_range() will also lock the zone
6530 * at some point. If there's an allocation
6531 * spinning on this lock, it may win the race
6532 * and cause alloc_contig_range() to fail...
6533 */
6534 spin_unlock_irqrestore(&zone->lock, flags);
6535 ret = __alloc_contig_pages(pfn, nr_pages,
6536 gfp_mask);
6537 if (!ret)
6538 return pfn_to_page(pfn);
6539 spin_lock_irqsave(&zone->lock, flags);
6540 }
6541 pfn += nr_pages;
6542 }
6543 spin_unlock_irqrestore(&zone->lock, flags);
6544 }
6545 return NULL;
6546}
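A hedged illustration of the wrapper documented above: a caller that only needs some contiguous range, not a specific one, might do the following. The helper name and the choice of GFP_KERNEL on the local node are assumptions:

/* Hypothetical sketch of an alloc_contig_pages() caller. */
static struct page *example_alloc_contig_buffer(unsigned long nr_pages)
{
	struct page *page;

	/* Scan the local node's zonelist; no extra nodemask restriction. */
	page = alloc_contig_pages(nr_pages, GFP_KERNEL, numa_node_id(), NULL);
	if (!page)
		return NULL;

	/*
	 * The pages are later released with
	 * free_contig_range(page_to_pfn(page), nr_pages);
	 */
	return page;
}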
4eb0716e 6547#endif /* CONFIG_CONTIG_ALLOC */
041d3a8c 6548
78fa5150 6549void free_contig_range(unsigned long pfn, unsigned long nr_pages)
041d3a8c 6550{
78fa5150 6551 unsigned long count = 0;
bcc2b02f
MS
6552
6553 for (; nr_pages--; pfn++) {
6554 struct page *page = pfn_to_page(pfn);
6555
6556 count += page_count(page) != 1;
6557 __free_page(page);
6558 }
78fa5150 6559 WARN(count != 0, "%lu pages are still in use!\n", count);
041d3a8c 6560}
255f5985 6561EXPORT_SYMBOL(free_contig_range);
041d3a8c 6562
ec6e8c7e
VB
6563/*
6564 * Effectively disable pcplists for the zone by setting the high limit to 0
 6565 * and draining all CPUs. A concurrent page free on another CPU that's about
 6566 * to put the page on the pcplist will either finish before the drain, in which
 6567 * case the page will be drained, or observe the new high limit and skip the pcplist.
6568 *
6569 * Must be paired with a call to zone_pcp_enable().
6570 */
6571void zone_pcp_disable(struct zone *zone)
6572{
6573 mutex_lock(&pcp_batch_high_lock);
6574 __zone_set_pageset_high_and_batch(zone, 0, 1);
6575 __drain_all_pages(zone, true);
6576}
6577
6578void zone_pcp_enable(struct zone *zone)
6579{
6580 __zone_set_pageset_high_and_batch(zone, zone->pageset_high, zone->pageset_batch);
6581 mutex_unlock(&pcp_batch_high_lock);
6582}
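The comment above implies a strict bracket: disable, do the work that must not race with the pcplists, then enable. A minimal sketch of that pairing, with the work in between left as a placeholder assumption:

/* Hypothetical sketch: run a section with the zone's pcplists disabled. */
static void example_with_pcp_disabled(struct zone *zone)
{
	zone_pcp_disable(zone);		/* high limit -> 0, all CPUs drained, mutex held */

	/*
	 * ... inspect or isolate free pages here; pages freed on other CPUs
	 * now bypass the pcplists because the high limit is 0 ...
	 */

	zone_pcp_enable(zone);		/* restore high/batch and release the mutex */
}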
6583
340175b7
JL
6584void zone_pcp_reset(struct zone *zone)
6585{
5a883813 6586 int cpu;
28f836b6 6587 struct per_cpu_zonestat *pzstats;
340175b7 6588
28f836b6 6589 if (zone->per_cpu_pageset != &boot_pageset) {
5a883813 6590 for_each_online_cpu(cpu) {
28f836b6
MG
6591 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
6592 drain_zonestat(zone, pzstats);
5a883813 6593 }
28f836b6 6594 free_percpu(zone->per_cpu_pageset);
28f836b6 6595 zone->per_cpu_pageset = &boot_pageset;
022e7fa0
ML
6596 if (zone->per_cpu_zonestats != &boot_zonestats) {
6597 free_percpu(zone->per_cpu_zonestats);
6598 zone->per_cpu_zonestats = &boot_zonestats;
6599 }
340175b7 6600 }
340175b7
JL
6601}
6602
6dcd73d7 6603#ifdef CONFIG_MEMORY_HOTREMOVE
0c0e6195 6604/*
257bea71
DH
 6605 * The range must lie within a single zone, must not contain holes, must span
 6606 * full sections, and all pages in it must be isolated before calling this function.
0c0e6195 6607 */
257bea71 6608void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
0c0e6195 6609{
257bea71 6610 unsigned long pfn = start_pfn;
0c0e6195
KH
6611 struct page *page;
6612 struct zone *zone;
0ee5f4f3 6613 unsigned int order;
0c0e6195 6614 unsigned long flags;
5557c766 6615
2d070eab 6616 offline_mem_sections(pfn, end_pfn);
0c0e6195
KH
6617 zone = page_zone(pfn_to_page(pfn));
6618 spin_lock_irqsave(&zone->lock, flags);
0c0e6195 6619 while (pfn < end_pfn) {
0c0e6195 6620 page = pfn_to_page(pfn);
b023f468
WC
6621 /*
 6622 * The HWPoisoned page may not be in the buddy system, and its
 6623 * page_count() is not 0.
6624 */
6625 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
6626 pfn++;
b023f468
WC
6627 continue;
6628 }
aa218795
DH
6629 /*
6630 * At this point all remaining PageOffline() pages have a
6631 * reference count of 0 and can simply be skipped.
6632 */
6633 if (PageOffline(page)) {
6634 BUG_ON(page_count(page));
6635 BUG_ON(PageBuddy(page));
6636 pfn++;
aa218795
DH
6637 continue;
6638 }
b023f468 6639
0c0e6195
KH
6640 BUG_ON(page_count(page));
6641 BUG_ON(!PageBuddy(page));
ab130f91 6642 order = buddy_order(page);
6ab01363 6643 del_page_from_free_list(page, zone, order);
0c0e6195
KH
6644 pfn += (1 << order);
6645 }
6646 spin_unlock_irqrestore(&zone->lock, flags);
6647}
6648#endif
8d22ba1b 6649
8446b59b
ED
6650/*
 6651 * This function returns a stable result only if called under the zone lock.
6652 */
8d22ba1b
WF
6653bool is_free_buddy_page(struct page *page)
6654{
8d22ba1b 6655 unsigned long pfn = page_to_pfn(page);
7aeb09f9 6656 unsigned int order;
8d22ba1b 6657
23baf831 6658 for (order = 0; order <= MAX_ORDER; order++) {
8d22ba1b
WF
6659 struct page *page_head = page - (pfn & ((1 << order) - 1));
6660
8446b59b
ED
6661 if (PageBuddy(page_head) &&
6662 buddy_order_unsafe(page_head) >= order)
8d22ba1b
WF
6663 break;
6664 }
8d22ba1b 6665
23baf831 6666 return order <= MAX_ORDER;
8d22ba1b 6667}
a581865e 6668EXPORT_SYMBOL(is_free_buddy_page);
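Because the result is only stable under the zone lock (see the comment above), a caller that needs a reliable answer could take the lock around the check; this is a hedged sketch and the helper name is hypothetical:

/* Hypothetical sketch: query buddy state while holding zone->lock. */
static bool example_page_is_free_stable(struct page *page)
{
	struct zone *zone = page_zone(page);
	unsigned long flags;
	bool free;

	spin_lock_irqsave(&zone->lock, flags);
	free = is_free_buddy_page(page);	/* stable while zone->lock is held */
	spin_unlock_irqrestore(&zone->lock, flags);

	return free;
}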
d4ae9916
NH
6669
6670#ifdef CONFIG_MEMORY_FAILURE
6671/*
06be6ff3
OS
 6672 * Break down a higher-order page into sub-pages, and keep our target page out
 6673 * of the buddy allocator.
d4ae9916 6674 */
06be6ff3
OS
6675static void break_down_buddy_pages(struct zone *zone, struct page *page,
6676 struct page *target, int low, int high,
6677 int migratetype)
6678{
6679 unsigned long size = 1 << high;
6680 struct page *current_buddy, *next_page;
6681
6682 while (high > low) {
6683 high--;
6684 size >>= 1;
6685
6686 if (target >= &page[size]) {
6687 next_page = page + size;
6688 current_buddy = page;
6689 } else {
6690 next_page = page;
6691 current_buddy = page + size;
6692 }
6693
6694 if (set_page_guard(zone, current_buddy, high, migratetype))
6695 continue;
6696
6697 if (current_buddy != target) {
6698 add_to_free_list(current_buddy, zone, high, migratetype);
ab130f91 6699 set_buddy_order(current_buddy, high);
06be6ff3
OS
6700 page = next_page;
6701 }
6702 }
6703}
6704
6705/*
6706 * Take a page that will be marked as poisoned off the buddy allocator.
6707 */
6708bool take_page_off_buddy(struct page *page)
d4ae9916
NH
6709{
6710 struct zone *zone = page_zone(page);
6711 unsigned long pfn = page_to_pfn(page);
6712 unsigned long flags;
6713 unsigned int order;
06be6ff3 6714 bool ret = false;
d4ae9916
NH
6715
6716 spin_lock_irqsave(&zone->lock, flags);
23baf831 6717 for (order = 0; order <= MAX_ORDER; order++) {
d4ae9916 6718 struct page *page_head = page - (pfn & ((1 << order) - 1));
ab130f91 6719 int page_order = buddy_order(page_head);
d4ae9916 6720
ab130f91 6721 if (PageBuddy(page_head) && page_order >= order) {
06be6ff3
OS
6722 unsigned long pfn_head = page_to_pfn(page_head);
6723 int migratetype = get_pfnblock_migratetype(page_head,
6724 pfn_head);
6725
ab130f91 6726 del_page_from_free_list(page_head, zone, page_order);
06be6ff3 6727 break_down_buddy_pages(zone, page_head, page, 0,
ab130f91 6728 page_order, migratetype);
bf181c58 6729 SetPageHWPoisonTakenOff(page);
bac9c6fa
DH
6730 if (!is_migrate_isolate(migratetype))
6731 __mod_zone_freepage_state(zone, -1, migratetype);
06be6ff3 6732 ret = true;
d4ae9916
NH
6733 break;
6734 }
06be6ff3
OS
6735 if (page_count(page_head) > 0)
6736 break;
d4ae9916
NH
6737 }
6738 spin_unlock_irqrestore(&zone->lock, flags);
06be6ff3 6739 return ret;
d4ae9916 6740}
bf181c58
NH
6741
6742/*
6743 * Cancel takeoff done by take_page_off_buddy().
6744 */
6745bool put_page_back_buddy(struct page *page)
6746{
6747 struct zone *zone = page_zone(page);
6748 unsigned long pfn = page_to_pfn(page);
6749 unsigned long flags;
6750 int migratetype = get_pfnblock_migratetype(page, pfn);
6751 bool ret = false;
6752
6753 spin_lock_irqsave(&zone->lock, flags);
6754 if (put_page_testzero(page)) {
6755 ClearPageHWPoisonTakenOff(page);
6756 __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE);
6757 if (TestClearPageHWPoison(page)) {
bf181c58
NH
6758 ret = true;
6759 }
6760 }
6761 spin_unlock_irqrestore(&zone->lock, flags);
6762
6763 return ret;
6764}
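take_page_off_buddy() and put_page_back_buddy() are designed as a pair for the memory-failure path. The sketch below is loosely modelled on that usage; the helper name is hypothetical, and the page_ref_inc() call is an assumption that mirrors how the poison path pins the page so that the put_page_testzero() above can drop the last reference:

/* Hypothetical sketch: pull a free page off the buddy lists, then undo it. */
static bool example_soft_isolate_page(struct page *page)
{
	if (!take_page_off_buddy(page))
		return false;		/* page was not a free buddy page */

	/* Pin the now-isolated page, as the memory-failure path does. */
	page_ref_inc(page);

	/* ... decide not to poison it after all ... */

	/* Drops our reference and hands the page back to the buddy allocator. */
	put_page_back_buddy(page);
	return true;
}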
d4ae9916 6765#endif
62b31070
BH
6766
6767#ifdef CONFIG_ZONE_DMA
6768bool has_managed_dma(void)
6769{
6770 struct pglist_data *pgdat;
6771
6772 for_each_online_pgdat(pgdat) {
6773 struct zone *zone = &pgdat->node_zones[ZONE_DMA];
6774
6775 if (managed_zone(zone))
6776 return true;
6777 }
6778 return false;
6779}
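As a hedged example of why this helper exists, a caller could use it to avoid asking for ZONE_DMA memory on systems where no zone actually manages DMA pages; the helper name and the GFP_DMA fallback policy are assumptions, not taken from this file:

/* Hypothetical sketch: only request ZONE_DMA memory when it is managed. */
static struct page *example_alloc_low_pages(unsigned int order)
{
	gfp_t gfp = GFP_KERNEL;

	if (has_managed_dma())
		gfp |= GFP_DMA;	/* otherwise fall back to the normal zones */

	return alloc_pages(gfp, order);
}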
6780#endif /* CONFIG_ZONE_DMA */