457c8996 1// SPDX-License-Identifier: GPL-2.0-only
1da177e4
LT
2/*
3 * linux/mm/page_alloc.c
4 *
5 * Manages the free list, the system allocates free pages here.
6 * Note that kmalloc() lives in slab.c
7 *
8 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
9 * Swap reorganised 29.12.95, Stephen Tweedie
10 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
11 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
12 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
13 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
14 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
15 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
16 */
17
1da177e4
LT
18#include <linux/stddef.h>
19#include <linux/mm.h>
ca79b0c2 20#include <linux/highmem.h>
1da177e4
LT
21#include <linux/swap.h>
22#include <linux/interrupt.h>
23#include <linux/pagemap.h>
10ed273f 24#include <linux/jiffies.h>
edbe7d23 25#include <linux/memblock.h>
1da177e4 26#include <linux/compiler.h>
9f158333 27#include <linux/kernel.h>
b8c73fc2 28#include <linux/kasan.h>
1da177e4
LT
29#include <linux/module.h>
30#include <linux/suspend.h>
31#include <linux/pagevec.h>
32#include <linux/blkdev.h>
33#include <linux/slab.h>
a238ab5b 34#include <linux/ratelimit.h>
5a3135c2 35#include <linux/oom.h>
1da177e4
LT
36#include <linux/topology.h>
37#include <linux/sysctl.h>
38#include <linux/cpu.h>
39#include <linux/cpuset.h>
bdc8cb98 40#include <linux/memory_hotplug.h>
1da177e4
LT
41#include <linux/nodemask.h>
42#include <linux/vmalloc.h>
a6cccdc3 43#include <linux/vmstat.h>
4be38e35 44#include <linux/mempolicy.h>
4b94ffdc 45#include <linux/memremap.h>
6811378e 46#include <linux/stop_machine.h>
97500a4a 47#include <linux/random.h>
c713216d
MG
48#include <linux/sort.h>
49#include <linux/pfn.h>
3fcfab16 50#include <linux/backing-dev.h>
933e312e 51#include <linux/fault-inject.h>
a5d76b54 52#include <linux/page-isolation.h>
3ac7fe5a 53#include <linux/debugobjects.h>
dbb1f81c 54#include <linux/kmemleak.h>
56de7263 55#include <linux/compaction.h>
0d3d062a 56#include <trace/events/kmem.h>
d379f01d 57#include <trace/events/oom.h>
268bb0ce 58#include <linux/prefetch.h>
6e543d57 59#include <linux/mm_inline.h>
f920e413 60#include <linux/mmu_notifier.h>
041d3a8c 61#include <linux/migrate.h>
949f7ec5 62#include <linux/hugetlb.h>
8bd75c77 63#include <linux/sched/rt.h>
5b3cc15a 64#include <linux/sched/mm.h>
48c96a36 65#include <linux/page_owner.h>
0e1cc95b 66#include <linux/kthread.h>
4949148a 67#include <linux/memcontrol.h>
42c269c8 68#include <linux/ftrace.h>
d92a8cfc 69#include <linux/lockdep.h>
556b969a 70#include <linux/nmi.h>
eb414681 71#include <linux/psi.h>
e4443149 72#include <linux/padata.h>
4aab2be0 73#include <linux/khugepaged.h>
ba8f3587 74#include <linux/buffer_head.h>
7ee3d4e8 75#include <asm/sections.h>
1da177e4 76#include <asm/tlbflush.h>
ac924c60 77#include <asm/div64.h>
1da177e4 78#include "internal.h"
e900a918 79#include "shuffle.h"
36e66c55 80#include "page_reporting.h"
1da177e4 81
f04a5d5d
DH
82/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
83typedef int __bitwise fpi_t;
84
85/* No special request */
86#define FPI_NONE ((__force fpi_t)0)
87
88/*
89 * Skip free page reporting notification for the (possibly merged) page.
90 * This does not hinder free page reporting from grabbing the page,
91 * reporting it and marking it "reported" - it only skips notifying
92 * the free page reporting infrastructure about a newly freed page. For
93 * example, used when temporarily pulling a page from a freelist and
94 * putting it back unmodified.
95 */
96#define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0))
97
47b6a24a
DH
98/*
99 * Place the (possibly merged) page at the tail of the freelist. Will ignore
100 * page shuffling (relevant code - e.g., memory onlining - is expected to
101 * shuffle the whole zone).
102 *
103 * Note: No code should rely on this flag for correctness - it's purely
104 * to allow for optimizations when handing back either fresh pages
105 * (memory onlining) or untouched pages (page isolation, free page
106 * reporting).
107 */
108#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
109
2c335680
AK
110/*
111 * Don't poison memory with KASAN (only for the tag-based modes).
112 * During boot, all non-reserved memblock memory is exposed to page_alloc.
113 * Poisoning all that memory lengthens boot time, especially on systems with
114 * a large amount of RAM. This flag is used to skip that poisoning.
115 * This is only done for the tag-based KASAN modes, as those are able to
116 * detect memory corruptions with the memory tags assigned by default.
117 * All memory allocated normally after boot gets poisoned as usual.
118 */
119#define FPI_SKIP_KASAN_POISON ((__force fpi_t)BIT(2))
120
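/*
 * fpi_t flags are plain bits and can be OR-ed together; for example,
 * __free_pages_core() below frees with FPI_TO_TAIL | FPI_SKIP_KASAN_POISON.
 */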
c8e251fa
CS
121/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
122static DEFINE_MUTEX(pcp_batch_high_lock);
7cd2b0a3 123#define MIN_PERCPU_PAGELIST_FRACTION (8)
c8e251fa 124
72812019
LS
125#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
126DEFINE_PER_CPU(int, numa_node);
127EXPORT_PER_CPU_SYMBOL(numa_node);
128#endif
129
4518085e
KW
130DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);
131
7aac7898
LS
132#ifdef CONFIG_HAVE_MEMORYLESS_NODES
133/*
134 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
135 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
136 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
137 * defined in <linux/topology.h>.
138 */
139DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
140EXPORT_PER_CPU_SYMBOL(_numa_mem_);
141#endif
142
bd233f53 143/* work_structs for global per-cpu drains */
d9367bd0
WY
144struct pcpu_drain {
145 struct zone *zone;
146 struct work_struct work;
147};
8b885f53
JY
148static DEFINE_MUTEX(pcpu_drain_mutex);
149static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
bd233f53 150
38addce8 151#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
58bea414 152volatile unsigned long latent_entropy __latent_entropy;
38addce8
ER
153EXPORT_SYMBOL(latent_entropy);
154#endif
155
1da177e4 156/*
13808910 157 * Array of node states.
1da177e4 158 */
13808910
CL
159nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
160 [N_POSSIBLE] = NODE_MASK_ALL,
161 [N_ONLINE] = { { [0] = 1UL } },
162#ifndef CONFIG_NUMA
163 [N_NORMAL_MEMORY] = { { [0] = 1UL } },
164#ifdef CONFIG_HIGHMEM
165 [N_HIGH_MEMORY] = { { [0] = 1UL } },
20b2f52b 166#endif
20b2f52b 167 [N_MEMORY] = { { [0] = 1UL } },
13808910
CL
168 [N_CPU] = { { [0] = 1UL } },
169#endif /* NUMA */
170};
171EXPORT_SYMBOL(node_states);
172
ca79b0c2
AK
173atomic_long_t _totalram_pages __read_mostly;
174EXPORT_SYMBOL(_totalram_pages);
cb45b0e9 175unsigned long totalreserve_pages __read_mostly;
e48322ab 176unsigned long totalcma_pages __read_mostly;
ab8fabd4 177
1b76b02f 178int percpu_pagelist_fraction;
dcce284a 179gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
51cba1eb 180DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
6471384a
AP
181EXPORT_SYMBOL(init_on_alloc);
182
51cba1eb 183DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
6471384a
AP
184EXPORT_SYMBOL(init_on_free);
185
04013513
VB
186static bool _init_on_alloc_enabled_early __read_mostly
187 = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
6471384a
AP
188static int __init early_init_on_alloc(char *buf)
189{
6471384a 190
04013513 191 return kstrtobool(buf, &_init_on_alloc_enabled_early);
6471384a
AP
192}
193early_param("init_on_alloc", early_init_on_alloc);
194
04013513
VB
195static bool _init_on_free_enabled_early __read_mostly
196 = IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
6471384a
AP
197static int __init early_init_on_free(char *buf)
198{
04013513 199 return kstrtobool(buf, &_init_on_free_enabled_early);
6471384a
AP
200}
201early_param("init_on_free", early_init_on_free);
1da177e4 202
bb14c2c7
VB
203/*
204 * A cached value of the page's pageblock's migratetype, used when the page is
205 * put on a pcplist. Used to avoid the pageblock migratetype lookup when
206 * freeing from pcplists in most cases, at the cost of possibly becoming stale.
207 * Also the migratetype set in the page does not necessarily match the pcplist
208 * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
209 * other index - this ensures that it will be put on the correct CMA freelist.
210 */
211static inline int get_pcppage_migratetype(struct page *page)
212{
213 return page->index;
214}
215
216static inline void set_pcppage_migratetype(struct page *page, int migratetype)
217{
218 page->index = migratetype;
219}
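/*
 * Note: page->index is not used for anything else while a free page sits
 * on a pcplist, so it can double as storage for the cached migratetype.
 */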
220
452aa699
RW
221#ifdef CONFIG_PM_SLEEP
222/*
223 * The following functions are used by the suspend/hibernate code to temporarily
224 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
225 * while devices are suspended. To avoid races with the suspend/hibernate code,
55f2503c
PL
226 * they should always be called with system_transition_mutex held
227 * (gfp_allowed_mask also should only be modified with system_transition_mutex
228 * held, unless the suspend/hibernate code is guaranteed not to run in parallel
229 * with that modification).
452aa699 230 */
c9e664f1
RW
231
232static gfp_t saved_gfp_mask;
233
234void pm_restore_gfp_mask(void)
452aa699 235{
55f2503c 236 WARN_ON(!mutex_is_locked(&system_transition_mutex));
c9e664f1
RW
237 if (saved_gfp_mask) {
238 gfp_allowed_mask = saved_gfp_mask;
239 saved_gfp_mask = 0;
240 }
452aa699
RW
241}
242
c9e664f1 243void pm_restrict_gfp_mask(void)
452aa699 244{
55f2503c 245 WARN_ON(!mutex_is_locked(&system_transition_mutex));
c9e664f1
RW
246 WARN_ON(saved_gfp_mask);
247 saved_gfp_mask = gfp_allowed_mask;
d0164adc 248 gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
452aa699 249}
f90ac398
MG
250
251bool pm_suspended_storage(void)
252{
d0164adc 253 if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
f90ac398
MG
254 return false;
255 return true;
256}
452aa699
RW
257#endif /* CONFIG_PM_SLEEP */
258
d9c23400 259#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
d00181b9 260unsigned int pageblock_order __read_mostly;
d9c23400
MG
261#endif
262
7fef431b
DH
263static void __free_pages_ok(struct page *page, unsigned int order,
264 fpi_t fpi_flags);
a226f6c8 265
1da177e4
LT
266/*
267 * results with 256, 32 in the lowmem_reserve sysctl:
268 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
269 * 1G machine -> (16M dma, 784M normal, 224M high)
270 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
271 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
84109e15 272 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
a2f1b424
AK
273 *
274 * TBD: should special case ZONE_DMA32 machines here - in those we normally
275 * don't need any ZONE_NORMAL reservation
1da177e4 276 */
d3cda233 277int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
4b51d669 278#ifdef CONFIG_ZONE_DMA
d3cda233 279 [ZONE_DMA] = 256,
4b51d669 280#endif
fb0e7942 281#ifdef CONFIG_ZONE_DMA32
d3cda233 282 [ZONE_DMA32] = 256,
fb0e7942 283#endif
d3cda233 284 [ZONE_NORMAL] = 32,
e53ef38d 285#ifdef CONFIG_HIGHMEM
d3cda233 286 [ZONE_HIGHMEM] = 0,
e53ef38d 287#endif
d3cda233 288 [ZONE_MOVABLE] = 0,
2f1b6248 289};
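/*
 * The ratio acts as a divisor: roughly (pages of the zones above) / ratio
 * is kept reserved in a lower zone against allocations that could have been
 * satisfied from a higher zone, as in the 784M/256 and 224M/32 examples above.
 */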
1da177e4 290
15ad7cdc 291static char * const zone_names[MAX_NR_ZONES] = {
4b51d669 292#ifdef CONFIG_ZONE_DMA
2f1b6248 293 "DMA",
4b51d669 294#endif
fb0e7942 295#ifdef CONFIG_ZONE_DMA32
2f1b6248 296 "DMA32",
fb0e7942 297#endif
2f1b6248 298 "Normal",
e53ef38d 299#ifdef CONFIG_HIGHMEM
2a1e274a 300 "HighMem",
e53ef38d 301#endif
2a1e274a 302 "Movable",
033fbae9
DW
303#ifdef CONFIG_ZONE_DEVICE
304 "Device",
305#endif
2f1b6248
CL
306};
307
c999fbd3 308const char * const migratetype_names[MIGRATE_TYPES] = {
60f30350
VB
309 "Unmovable",
310 "Movable",
311 "Reclaimable",
312 "HighAtomic",
313#ifdef CONFIG_CMA
314 "CMA",
315#endif
316#ifdef CONFIG_MEMORY_ISOLATION
317 "Isolate",
318#endif
319};
320
ae70eddd
AK
321compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
322 [NULL_COMPOUND_DTOR] = NULL,
323 [COMPOUND_PAGE_DTOR] = free_compound_page,
f1e61557 324#ifdef CONFIG_HUGETLB_PAGE
ae70eddd 325 [HUGETLB_PAGE_DTOR] = free_huge_page,
f1e61557 326#endif
9a982250 327#ifdef CONFIG_TRANSPARENT_HUGEPAGE
ae70eddd 328 [TRANSHUGE_PAGE_DTOR] = free_transhuge_page,
9a982250 329#endif
f1e61557
KS
330};
331
1da177e4 332int min_free_kbytes = 1024;
42aa83cb 333int user_min_free_kbytes = -1;
24512228
MG
334#ifdef CONFIG_DISCONTIGMEM
335/*
336 * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges
337 * are not on separate NUMA nodes. Functionally this works but with
338 * watermark_boost_factor, it can reclaim prematurely as the ranges can be
339 * quite small. By default, do not boost watermarks on discontigmem as in
340 * many cases very high-order allocations like THP are likely to be
341 * unsupported and the premature reclaim offsets the advantage of long-term
342 * fragmentation avoidance.
343 */
344int watermark_boost_factor __read_mostly;
345#else
1c30844d 346int watermark_boost_factor __read_mostly = 15000;
24512228 347#endif
795ae7a0 348int watermark_scale_factor = 10;
1da177e4 349
bbe5d993
OS
350static unsigned long nr_kernel_pages __initdata;
351static unsigned long nr_all_pages __initdata;
352static unsigned long dma_reserve __initdata;
1da177e4 353
bbe5d993
OS
354static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
355static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
7f16f91f 356static unsigned long required_kernelcore __initdata;
a5c6d650 357static unsigned long required_kernelcore_percent __initdata;
7f16f91f 358static unsigned long required_movablecore __initdata;
a5c6d650 359static unsigned long required_movablecore_percent __initdata;
bbe5d993 360static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
7f16f91f 361static bool mirrored_kernelcore __meminitdata;
0ee332c1
TH
362
363/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
364int movable_zone;
365EXPORT_SYMBOL(movable_zone);
c713216d 366
418508c1 367#if MAX_NUMNODES > 1
b9726c26 368unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
ce0725f7 369unsigned int nr_online_nodes __read_mostly = 1;
418508c1 370EXPORT_SYMBOL(nr_node_ids);
62bc62a8 371EXPORT_SYMBOL(nr_online_nodes);
418508c1
MS
372#endif
373
9ef9acb0
MG
374int page_group_by_mobility_disabled __read_mostly;
375
3a80a7fa 376#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
3c0c12cc
WL
377/*
378 * During boot we initialize deferred pages on-demand, as needed, but once
379 * page_alloc_init_late() has finished, the deferred pages are all initialized,
380 * and we can permanently disable that path.
381 */
382static DEFINE_STATIC_KEY_TRUE(deferred_pages);
383
384/*
385 * Call kasan_free_pages() only after deferred memory initialization
386 * has completed. Poisoning pages during deferred memory init would greatly
387 * lengthen the process and cause problems on large memory systems, as the
388 * deferred page initialization is done with interrupts disabled.
389 *
390 * Assuming that there will be no reference to those newly initialized
391 * pages before they are ever allocated, this should have no effect on
392 * KASAN memory tracking as the poison will be properly inserted at page
393 * allocation time. The only corner case is when pages are allocated by
394 * on-demand allocation and then freed again before the deferred pages
395 * initialization is done, but this is not likely to happen.
396 */
2c335680 397static inline void kasan_free_nondeferred_pages(struct page *page, int order,
1bb5eab3 398 bool init, fpi_t fpi_flags)
3c0c12cc 399{
2c335680
AK
400 if (static_branch_unlikely(&deferred_pages))
401 return;
402 if (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
403 (fpi_flags & FPI_SKIP_KASAN_POISON))
404 return;
1bb5eab3 405 kasan_free_pages(page, order, init);
3c0c12cc
WL
406}
407
3a80a7fa 408/* Returns true if the struct page for the pfn is uninitialised */
0e1cc95b 409static inline bool __meminit early_page_uninitialised(unsigned long pfn)
3a80a7fa 410{
ef70b6f4
MG
411 int nid = early_pfn_to_nid(pfn);
412
413 if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
3a80a7fa
MG
414 return true;
415
416 return false;
417}
418
419/*
d3035be4 420 * Returns true when the remaining initialisation should be deferred until
3a80a7fa
MG
421 * later in the boot cycle when it can be parallelised.
422 */
d3035be4
PT
423static bool __meminit
424defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
3a80a7fa 425{
d3035be4
PT
426 static unsigned long prev_end_pfn, nr_initialised;
427
428 /*
429 * The prev_end_pfn static contains the end of the previous zone.
430 * No locking is needed because this runs very early in boot, before smp_init.
431 */
432 if (prev_end_pfn != end_pfn) {
433 prev_end_pfn = end_pfn;
434 nr_initialised = 0;
435 }
436
3c2c6488 437 /* Always populate low zones for address-constrained allocations */
d3035be4 438 if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
3a80a7fa 439 return false;
23b68cfa 440
dc2da7b4
BH
441 if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
442 return true;
23b68cfa
WY
443 /*
444 * We start only with one section of pages, more pages are added as
445 * needed until the rest of deferred pages are initialized.
446 */
d3035be4 447 nr_initialised++;
23b68cfa 448 if ((nr_initialised > PAGES_PER_SECTION) &&
d3035be4
PT
449 (pfn & (PAGES_PER_SECTION - 1)) == 0) {
450 NODE_DATA(nid)->first_deferred_pfn = pfn;
451 return true;
3a80a7fa 452 }
d3035be4 453 return false;
3a80a7fa
MG
454}
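/*
 * In short: low (address-constrained) zones and roughly the first section's
 * worth of pages of each node are always initialised eagerly; once
 * first_deferred_pfn is set, everything from that pfn onwards is left to
 * page_alloc_init_late().
 */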
455#else
2c335680 456static inline void kasan_free_nondeferred_pages(struct page *page, int order,
1bb5eab3 457 bool init, fpi_t fpi_flags)
2c335680
AK
458{
459 if (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
460 (fpi_flags & FPI_SKIP_KASAN_POISON))
461 return;
1bb5eab3 462 kasan_free_pages(page, order, init);
2c335680 463}
3c0c12cc 464
3a80a7fa
MG
465static inline bool early_page_uninitialised(unsigned long pfn)
466{
467 return false;
468}
469
d3035be4 470static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
3a80a7fa 471{
d3035be4 472 return false;
3a80a7fa
MG
473}
474#endif
475
0b423ca2
MG
476/* Return a pointer to the bitmap storing bits affecting a block of pages */
477static inline unsigned long *get_pageblock_bitmap(struct page *page,
478 unsigned long pfn)
479{
480#ifdef CONFIG_SPARSEMEM
f1eca35a 481 return section_to_usemap(__pfn_to_section(pfn));
0b423ca2
MG
482#else
483 return page_zone(page)->pageblock_flags;
484#endif /* CONFIG_SPARSEMEM */
485}
486
487static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
488{
489#ifdef CONFIG_SPARSEMEM
490 pfn &= (PAGES_PER_SECTION-1);
0b423ca2
MG
491#else
492 pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
0b423ca2 493#endif /* CONFIG_SPARSEMEM */
399b795b 494 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
0b423ca2
MG
495}
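/*
 * Each pageblock owns NR_PAGEBLOCK_BITS consecutive bits in that bitmap;
 * pfn_to_bitidx() returns the index of the block's first bit, with the pfn
 * taken relative to the section (SPARSEMEM) or to the zone start otherwise.
 */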
496
535b81e2
WY
497static __always_inline
498unsigned long __get_pfnblock_flags_mask(struct page *page,
0b423ca2 499 unsigned long pfn,
0b423ca2
MG
500 unsigned long mask)
501{
502 unsigned long *bitmap;
503 unsigned long bitidx, word_bitidx;
504 unsigned long word;
505
506 bitmap = get_pageblock_bitmap(page, pfn);
507 bitidx = pfn_to_bitidx(page, pfn);
508 word_bitidx = bitidx / BITS_PER_LONG;
509 bitidx &= (BITS_PER_LONG-1);
510
511 word = bitmap[word_bitidx];
d93d5ab9 512 return (word >> bitidx) & mask;
0b423ca2
MG
513}
514
a00cda3f
MCC
515/**
516 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
517 * @page: The page within the block of interest
518 * @pfn: The target page frame number
519 * @mask: mask of bits that the caller is interested in
520 *
521 * Return: pageblock_bits flags
522 */
0b423ca2 523unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
0b423ca2
MG
524 unsigned long mask)
525{
535b81e2 526 return __get_pfnblock_flags_mask(page, pfn, mask);
0b423ca2
MG
527}
528
529static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
530{
535b81e2 531 return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
0b423ca2
MG
532}
533
534/**
535 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
536 * @page: The page within the block of interest
537 * @flags: The flags to set
538 * @pfn: The target page frame number
0b423ca2
MG
539 * @mask: mask of bits that the caller is interested in
540 */
541void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
542 unsigned long pfn,
0b423ca2
MG
543 unsigned long mask)
544{
545 unsigned long *bitmap;
546 unsigned long bitidx, word_bitidx;
547 unsigned long old_word, word;
548
549 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
125b860b 550 BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
0b423ca2
MG
551
552 bitmap = get_pageblock_bitmap(page, pfn);
553 bitidx = pfn_to_bitidx(page, pfn);
554 word_bitidx = bitidx / BITS_PER_LONG;
555 bitidx &= (BITS_PER_LONG-1);
556
557 VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
558
d93d5ab9
WY
559 mask <<= bitidx;
560 flags <<= bitidx;
0b423ca2
MG
561
562 word = READ_ONCE(bitmap[word_bitidx]);
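	/*
	 * Lock-free update: if another CPU changed the word between the
	 * READ_ONCE() above and the cmpxchg() below, old_word differs from
	 * word and we retry with the value cmpxchg() returned.
	 */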
563 for (;;) {
564 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
565 if (word == old_word)
566 break;
567 word = old_word;
568 }
569}
3a80a7fa 570
ee6f509c 571void set_pageblock_migratetype(struct page *page, int migratetype)
b2a0ac88 572{
5d0f3f72
KM
573 if (unlikely(page_group_by_mobility_disabled &&
574 migratetype < MIGRATE_PCPTYPES))
49255c61
MG
575 migratetype = MIGRATE_UNMOVABLE;
576
d93d5ab9 577 set_pfnblock_flags_mask(page, (unsigned long)migratetype,
535b81e2 578 page_to_pfn(page), MIGRATETYPE_MASK);
b2a0ac88
MG
579}
580
13e7444b 581#ifdef CONFIG_DEBUG_VM
c6a57e19 582static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
1da177e4 583{
bdc8cb98
DH
584 int ret = 0;
585 unsigned seq;
586 unsigned long pfn = page_to_pfn(page);
b5e6a5a2 587 unsigned long sp, start_pfn;
c6a57e19 588
bdc8cb98
DH
589 do {
590 seq = zone_span_seqbegin(zone);
b5e6a5a2
CS
591 start_pfn = zone->zone_start_pfn;
592 sp = zone->spanned_pages;
108bcc96 593 if (!zone_spans_pfn(zone, pfn))
bdc8cb98
DH
594 ret = 1;
595 } while (zone_span_seqretry(zone, seq));
596
b5e6a5a2 597 if (ret)
613813e8
DH
598 pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
599 pfn, zone_to_nid(zone), zone->name,
600 start_pfn, start_pfn + sp);
b5e6a5a2 601
bdc8cb98 602 return ret;
c6a57e19
DH
603}
604
605static int page_is_consistent(struct zone *zone, struct page *page)
606{
14e07298 607 if (!pfn_valid_within(page_to_pfn(page)))
c6a57e19 608 return 0;
1da177e4 609 if (zone != page_zone(page))
c6a57e19
DH
610 return 0;
611
612 return 1;
613}
614/*
615 * Temporary debugging check for pages not lying within a given zone.
616 */
d73d3c9f 617static int __maybe_unused bad_range(struct zone *zone, struct page *page)
c6a57e19
DH
618{
619 if (page_outside_zone_boundaries(zone, page))
1da177e4 620 return 1;
c6a57e19
DH
621 if (!page_is_consistent(zone, page))
622 return 1;
623
1da177e4
LT
624 return 0;
625}
13e7444b 626#else
d73d3c9f 627static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
13e7444b
NP
628{
629 return 0;
630}
631#endif
632
82a3241a 633static void bad_page(struct page *page, const char *reason)
1da177e4 634{
d936cf9b
HD
635 static unsigned long resume;
636 static unsigned long nr_shown;
637 static unsigned long nr_unshown;
638
639 /*
640 * Allow a burst of 60 reports, then keep quiet for that minute;
641 * or allow a steady drip of one report per second.
642 */
643 if (nr_shown == 60) {
644 if (time_before(jiffies, resume)) {
645 nr_unshown++;
646 goto out;
647 }
648 if (nr_unshown) {
ff8e8116 649 pr_alert(
1e9e6365 650 "BUG: Bad page state: %lu messages suppressed\n",
d936cf9b
HD
651 nr_unshown);
652 nr_unshown = 0;
653 }
654 nr_shown = 0;
655 }
656 if (nr_shown++ == 0)
657 resume = jiffies + 60 * HZ;
658
ff8e8116 659 pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
3dc14741 660 current->comm, page_to_pfn(page));
d2f07ec0 661 dump_page(page, reason);
3dc14741 662
4f31888c 663 print_modules();
1da177e4 664 dump_stack();
d936cf9b 665out:
8cc3b392 666 /* Leave bad fields for debug, except PageBuddy could make trouble */
22b751c3 667 page_mapcount_reset(page); /* remove PageBuddy */
373d4d09 668 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
1da177e4
LT
669}
670
1da177e4
LT
671/*
672 * Higher-order pages are called "compound pages". They are structured thusly:
673 *
1d798ca3 674 * The first PAGE_SIZE page is called the "head page" and has PG_head set.
1da177e4 675 *
1d798ca3
KS
676 * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
677 * in bit 0 of page->compound_head. The rest of the bits point to the head page.
1da177e4 678 *
1d798ca3
KS
679 * The first tail page's ->compound_dtor holds the offset into the array of
680 * compound page destructors. See compound_page_dtors.
1da177e4 681 *
1d798ca3 682 * The first tail page's ->compound_order holds the order of allocation.
41d78ba5 683 * This usage means that zero-order pages may not be compound.
1da177e4 684 */
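/*
 * Example: an order-2 compound page covers four struct pages. page[0] has
 * PG_head set; page[1..3] store the head pointer with bit 0 set in
 * ->compound_head. page[1] additionally carries ->compound_dtor,
 * ->compound_order and the compound mapcount, as set up by
 * prep_compound_page() below.
 */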
d98c7a09 685
9a982250 686void free_compound_page(struct page *page)
d98c7a09 687{
7ae88534 688 mem_cgroup_uncharge(page);
7fef431b 689 __free_pages_ok(page, compound_order(page), FPI_NONE);
d98c7a09
HD
690}
691
d00181b9 692void prep_compound_page(struct page *page, unsigned int order)
18229df5
AW
693{
694 int i;
695 int nr_pages = 1 << order;
696
18229df5
AW
697 __SetPageHead(page);
698 for (i = 1; i < nr_pages; i++) {
699 struct page *p = page + i;
58a84aa9 700 set_page_count(p, 0);
1c290f64 701 p->mapping = TAIL_MAPPING;
1d798ca3 702 set_compound_head(p, page);
18229df5 703 }
1378a5ee
MWO
704
705 set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
706 set_compound_order(page, order);
53f9263b 707 atomic_set(compound_mapcount_ptr(page), -1);
47e29d32
JH
708 if (hpage_pincount_available(page))
709 atomic_set(compound_pincount_ptr(page), 0);
18229df5
AW
710}
711
c0a32fc5
SG
712#ifdef CONFIG_DEBUG_PAGEALLOC
713unsigned int _debug_guardpage_minorder;
96a2b03f 714
8e57f8ac
VB
715bool _debug_pagealloc_enabled_early __read_mostly
716 = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
717EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
96a2b03f 718DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
505f6d22 719EXPORT_SYMBOL(_debug_pagealloc_enabled);
96a2b03f
VB
720
721DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
e30825f1 722
031bc574
JK
723static int __init early_debug_pagealloc(char *buf)
724{
8e57f8ac 725 return kstrtobool(buf, &_debug_pagealloc_enabled_early);
031bc574
JK
726}
727early_param("debug_pagealloc", early_debug_pagealloc);
728
c0a32fc5
SG
729static int __init debug_guardpage_minorder_setup(char *buf)
730{
731 unsigned long res;
732
733 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
1170532b 734 pr_err("Bad debug_guardpage_minorder value\n");
c0a32fc5
SG
735 return 0;
736 }
737 _debug_guardpage_minorder = res;
1170532b 738 pr_info("Setting debug_guardpage_minorder to %lu\n", res);
c0a32fc5
SG
739 return 0;
740}
f1c1e9f7 741early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
c0a32fc5 742
acbc15a4 743static inline bool set_page_guard(struct zone *zone, struct page *page,
2847cf95 744 unsigned int order, int migratetype)
c0a32fc5 745{
e30825f1 746 if (!debug_guardpage_enabled())
acbc15a4
JK
747 return false;
748
749 if (order >= debug_guardpage_minorder())
750 return false;
e30825f1 751
3972f6bb 752 __SetPageGuard(page);
2847cf95
JK
753 INIT_LIST_HEAD(&page->lru);
754 set_page_private(page, order);
755 /* Guard pages are not available for any usage */
756 __mod_zone_freepage_state(zone, -(1 << order), migratetype);
acbc15a4
JK
757
758 return true;
c0a32fc5
SG
759}
760
2847cf95
JK
761static inline void clear_page_guard(struct zone *zone, struct page *page,
762 unsigned int order, int migratetype)
c0a32fc5 763{
e30825f1
JK
764 if (!debug_guardpage_enabled())
765 return;
766
3972f6bb 767 __ClearPageGuard(page);
e30825f1 768
2847cf95
JK
769 set_page_private(page, 0);
770 if (!is_migrate_isolate(migratetype))
771 __mod_zone_freepage_state(zone, (1 << order), migratetype);
c0a32fc5
SG
772}
773#else
acbc15a4
JK
774static inline bool set_page_guard(struct zone *zone, struct page *page,
775 unsigned int order, int migratetype) { return false; }
2847cf95
JK
776static inline void clear_page_guard(struct zone *zone, struct page *page,
777 unsigned int order, int migratetype) {}
c0a32fc5
SG
778#endif
779
04013513
VB
780/*
781 * Enable static keys related to various memory debugging and hardening options.
782 * Some override others, and depend on early params that are evaluated in the
783 * order of appearance. So we need to first gather the full picture of what was
784 * enabled, and then make decisions.
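 * For example, if both page_poisoning=on and init_on_alloc=1 are passed,
 * only poisoning is enabled and the init_on_alloc request is merely
 * reported as overridden.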
785 */
786void init_mem_debugging_and_hardening(void)
787{
9df65f52
ST
788 bool page_poisoning_requested = false;
789
790#ifdef CONFIG_PAGE_POISONING
791 /*
792 * Page poisoning is debug page alloc for some arches. If
793 * either of those options are enabled, enable poisoning.
794 */
795 if (page_poisoning_enabled() ||
796 (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
797 debug_pagealloc_enabled())) {
798 static_branch_enable(&_page_poisoning_enabled);
799 page_poisoning_requested = true;
800 }
801#endif
802
04013513 803 if (_init_on_alloc_enabled_early) {
9df65f52 804 if (page_poisoning_requested)
04013513
VB
805 pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
806 "will take precedence over init_on_alloc\n");
807 else
808 static_branch_enable(&init_on_alloc);
809 }
810 if (_init_on_free_enabled_early) {
9df65f52 811 if (page_poisoning_requested)
04013513
VB
812 pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
813 "will take precedence over init_on_free\n");
814 else
815 static_branch_enable(&init_on_free);
816 }
817
818#ifdef CONFIG_DEBUG_PAGEALLOC
819 if (!debug_pagealloc_enabled())
820 return;
821
822 static_branch_enable(&_debug_pagealloc_enabled);
823
824 if (!debug_guardpage_minorder())
825 return;
826
827 static_branch_enable(&_debug_guardpage_enabled);
828#endif
829}
830
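/* Record the free page's order in page_private() and mark it PageBuddy. */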
ab130f91 831static inline void set_buddy_order(struct page *page, unsigned int order)
6aa3001b 832{
4c21e2f2 833 set_page_private(page, order);
676165a8 834 __SetPageBuddy(page);
1da177e4
LT
835}
836
1da177e4
LT
837/*
838 * This function checks whether a page is free && is the buddy.
6e292b9b 839 * We can coalesce a page and its buddy if
13ad59df 840 * (a) the buddy is not in a hole (check before calling!) &&
676165a8 841 * (b) the buddy is in the buddy system &&
cb2b95e1
AW
842 * (c) a page and its buddy have the same order &&
843 * (d) a page and its buddy are in the same zone.
676165a8 844 *
6e292b9b
MW
845 * For recording whether a page is in the buddy system, we set PageBuddy.
846 * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
1da177e4 847 *
676165a8 848 * For recording a page's order, we use page_private(page).
1da177e4 849 */
fe925c0c 850static inline bool page_is_buddy(struct page *page, struct page *buddy,
7aeb09f9 851 unsigned int order)
1da177e4 852{
fe925c0c 853 if (!page_is_guard(buddy) && !PageBuddy(buddy))
854 return false;
4c5018ce 855
ab130f91 856 if (buddy_order(buddy) != order)
fe925c0c 857 return false;
c0a32fc5 858
fe925c0c 859 /*
860 * zone check is done late to avoid uselessly calculating
861 * zone/node ids for pages that could never merge.
862 */
863 if (page_zone_id(page) != page_zone_id(buddy))
864 return false;
d34c5fa0 865
fe925c0c 866 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
4c5018ce 867
fe925c0c 868 return true;
1da177e4
LT
869}
870
5e1f0f09
MG
871#ifdef CONFIG_COMPACTION
872static inline struct capture_control *task_capc(struct zone *zone)
873{
874 struct capture_control *capc = current->capture_control;
875
deba0487 876 return unlikely(capc) &&
5e1f0f09
MG
877 !(current->flags & PF_KTHREAD) &&
878 !capc->page &&
deba0487 879 capc->cc->zone == zone ? capc : NULL;
5e1f0f09
MG
880}
881
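/*
 * If the page being freed is of exactly the order the captured compaction
 * context wants, hand it over via capc->page instead of returning it to the
 * free lists (see the call in __free_one_page()).
 */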
882static inline bool
883compaction_capture(struct capture_control *capc, struct page *page,
884 int order, int migratetype)
885{
886 if (!capc || order != capc->cc->order)
887 return false;
888
889 /* Do not accidentally pollute CMA or isolated regions */
890 if (is_migrate_cma(migratetype) ||
891 is_migrate_isolate(migratetype))
892 return false;
893
894 /*
f0953a1b 895 * Do not let lower order allocations pollute a movable pageblock.
5e1f0f09
MG
896 * This might let an unmovable request use a reclaimable pageblock
897 * and vice-versa but no more than normal fallback logic which can
898 * have trouble finding a high-order free page.
899 */
900 if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
901 return false;
902
903 capc->page = page;
904 return true;
905}
906
907#else
908static inline struct capture_control *task_capc(struct zone *zone)
909{
910 return NULL;
911}
912
913static inline bool
914compaction_capture(struct capture_control *capc, struct page *page,
915 int order, int migratetype)
916{
917 return false;
918}
919#endif /* CONFIG_COMPACTION */
920
6ab01363
AD
921/* Used for pages not on another list */
922static inline void add_to_free_list(struct page *page, struct zone *zone,
923 unsigned int order, int migratetype)
924{
925 struct free_area *area = &zone->free_area[order];
926
927 list_add(&page->lru, &area->free_list[migratetype]);
928 area->nr_free++;
929}
930
931/* Used for pages not on another list */
932static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
933 unsigned int order, int migratetype)
934{
935 struct free_area *area = &zone->free_area[order];
936
937 list_add_tail(&page->lru, &area->free_list[migratetype]);
938 area->nr_free++;
939}
940
293ffa5e
DH
941/*
942 * Used for pages which are on another list. Move the pages to the tail
943 * of the list - so the moved pages won't immediately be considered for
944 * allocation again (e.g., optimization for memory onlining).
945 */
6ab01363
AD
946static inline void move_to_free_list(struct page *page, struct zone *zone,
947 unsigned int order, int migratetype)
948{
949 struct free_area *area = &zone->free_area[order];
950
293ffa5e 951 list_move_tail(&page->lru, &area->free_list[migratetype]);
6ab01363
AD
952}
953
954static inline void del_page_from_free_list(struct page *page, struct zone *zone,
955 unsigned int order)
956{
36e66c55
AD
957 /* clear reported state and update reported page count */
958 if (page_reported(page))
959 __ClearPageReported(page);
960
6ab01363
AD
961 list_del(&page->lru);
962 __ClearPageBuddy(page);
963 set_page_private(page, 0);
964 zone->free_area[order].nr_free--;
965}
966
a2129f24
AD
967/*
968 * If this is not the largest possible page, check if the buddy
969 * of the next-highest order is free. If it is, it's possible
970 * that pages are being freed that will coalesce soon. In case that is
971 * happening, add the free page to the tail of the list so it's less
972 * likely to be used soon and more likely to be merged as a higher order
973 * page.
974 */
975static inline bool
976buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
977 struct page *page, unsigned int order)
978{
979 struct page *higher_page, *higher_buddy;
980 unsigned long combined_pfn;
981
982 if (order >= MAX_ORDER - 2)
983 return false;
984
985 if (!pfn_valid_within(buddy_pfn))
986 return false;
987
988 combined_pfn = buddy_pfn & pfn;
989 higher_page = page + (combined_pfn - pfn);
990 buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
991 higher_buddy = higher_page + (buddy_pfn - combined_pfn);
992
993 return pfn_valid_within(buddy_pfn) &&
994 page_is_buddy(higher_page, higher_buddy, order + 1);
995}
996
1da177e4
LT
997/*
998 * Freeing function for a buddy system allocator.
999 *
1000 * The concept of a buddy system is to maintain direct-mapped table
1001 * (containing bit values) for memory blocks of various "orders".
1002 * The bottom level table contains the map for the smallest allocatable
1003 * units of memory (here, pages), and each level above it describes
1004 * pairs of units from the levels below, hence, "buddies".
1005 * At a high level, all that happens here is marking the table entry
1006 * at the bottom level available, and propagating the changes upward
1007 * as necessary, plus some accounting needed to play nicely with other
1008 * parts of the VM system.
1009 * At each level, we keep a list of pages, which are heads of contiguous
6e292b9b
MW
1010 * free pages of length (1 << order) and marked with PageBuddy.
1011 * Page's order is recorded in page_private(page) field.
1da177e4 1012 * So when we are allocating or freeing one, we can derive the state of the
5f63b720
MN
1013 * other. That is, if we allocate a small block, and both were
1014 * free, the remainder of the region must be split into blocks.
1da177e4 1015 * If a block is freed, and its buddy is also free, then this
5f63b720 1016 * triggers coalescing into a block of larger size.
1da177e4 1017 *
6d49e352 1018 * -- nyc
1da177e4
LT
1019 */
1020
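/*
 * Example: freeing order-0 pfn 8. Its buddy is pfn 9 (__find_buddy_pfn()
 * in mm/internal.h computes pfn ^ (1 << order)). If pfn 9 is free too,
 * they merge into the order-1 block at combined_pfn = 8 & 9 = 8, whose
 * order-1 buddy is the block at pfn 10, and so on up the orders.
 */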
48db57f8 1021static inline void __free_one_page(struct page *page,
dc4b0caf 1022 unsigned long pfn,
ed0ae21d 1023 struct zone *zone, unsigned int order,
f04a5d5d 1024 int migratetype, fpi_t fpi_flags)
1da177e4 1025{
a2129f24 1026 struct capture_control *capc = task_capc(zone);
3f649ab7 1027 unsigned long buddy_pfn;
a2129f24 1028 unsigned long combined_pfn;
d9dddbf5 1029 unsigned int max_order;
a2129f24
AD
1030 struct page *buddy;
1031 bool to_tail;
d9dddbf5 1032
7ad69832 1033 max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order);
1da177e4 1034
d29bb978 1035 VM_BUG_ON(!zone_is_initialized(zone));
6e9f0d58 1036 VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
1da177e4 1037
ed0ae21d 1038 VM_BUG_ON(migratetype == -1);
d9dddbf5 1039 if (likely(!is_migrate_isolate(migratetype)))
8f82b55d 1040 __mod_zone_freepage_state(zone, 1 << order, migratetype);
ed0ae21d 1041
76741e77 1042 VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
309381fe 1043 VM_BUG_ON_PAGE(bad_range(zone, page), page);
1da177e4 1044
d9dddbf5 1045continue_merging:
7ad69832 1046 while (order < max_order) {
5e1f0f09
MG
1047 if (compaction_capture(capc, page, order, migratetype)) {
1048 __mod_zone_freepage_state(zone, -(1 << order),
1049 migratetype);
1050 return;
1051 }
76741e77
VB
1052 buddy_pfn = __find_buddy_pfn(pfn, order);
1053 buddy = page + (buddy_pfn - pfn);
13ad59df
VB
1054
1055 if (!pfn_valid_within(buddy_pfn))
1056 goto done_merging;
cb2b95e1 1057 if (!page_is_buddy(page, buddy, order))
d9dddbf5 1058 goto done_merging;
c0a32fc5
SG
1059 /*
1060 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
1061 * merge with it and move up one order.
1062 */
b03641af 1063 if (page_is_guard(buddy))
2847cf95 1064 clear_page_guard(zone, buddy, order, migratetype);
b03641af 1065 else
6ab01363 1066 del_page_from_free_list(buddy, zone, order);
76741e77
VB
1067 combined_pfn = buddy_pfn & pfn;
1068 page = page + (combined_pfn - pfn);
1069 pfn = combined_pfn;
1da177e4
LT
1070 order++;
1071 }
7ad69832 1072 if (order < MAX_ORDER - 1) {
d9dddbf5
VB
1073 /* If we are here, it means order is >= pageblock_order.
1074 * We want to prevent merge between freepages on isolate
1075 * pageblock and normal pageblock. Without this, pageblock
1076 * isolation could cause incorrect freepage or CMA accounting.
1077 *
1078 * We don't want to hit this code for the more frequent
1079 * low-order merging.
1080 */
1081 if (unlikely(has_isolate_pageblock(zone))) {
1082 int buddy_mt;
1083
76741e77
VB
1084 buddy_pfn = __find_buddy_pfn(pfn, order);
1085 buddy = page + (buddy_pfn - pfn);
d9dddbf5
VB
1086 buddy_mt = get_pageblock_migratetype(buddy);
1087
1088 if (migratetype != buddy_mt
1089 && (is_migrate_isolate(migratetype) ||
1090 is_migrate_isolate(buddy_mt)))
1091 goto done_merging;
1092 }
7ad69832 1093 max_order = order + 1;
d9dddbf5
VB
1094 goto continue_merging;
1095 }
1096
1097done_merging:
ab130f91 1098 set_buddy_order(page, order);
6dda9d55 1099
47b6a24a
DH
1100 if (fpi_flags & FPI_TO_TAIL)
1101 to_tail = true;
1102 else if (is_shuffle_order(order))
a2129f24 1103 to_tail = shuffle_pick_tail();
97500a4a 1104 else
a2129f24 1105 to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
97500a4a 1106
a2129f24 1107 if (to_tail)
6ab01363 1108 add_to_free_list_tail(page, zone, order, migratetype);
a2129f24 1109 else
6ab01363 1110 add_to_free_list(page, zone, order, migratetype);
36e66c55
AD
1111
1112 /* Notify page reporting subsystem of freed page */
f04a5d5d 1113 if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
36e66c55 1114 page_reporting_notify_free(order);
1da177e4
LT
1115}
1116
7bfec6f4
MG
1117/*
1118 * A bad page could be due to a number of fields. Instead of multiple branches,
1119 * try and check multiple fields with one check. The caller must do a detailed
1120 * check if necessary.
1121 */
1122static inline bool page_expected_state(struct page *page,
1123 unsigned long check_flags)
1124{
1125 if (unlikely(atomic_read(&page->_mapcount) != -1))
1126 return false;
1127
1128 if (unlikely((unsigned long)page->mapping |
1129 page_ref_count(page) |
1130#ifdef CONFIG_MEMCG
48060834 1131 page->memcg_data |
7bfec6f4
MG
1132#endif
1133 (page->flags & check_flags)))
1134 return false;
1135
1136 return true;
1137}
1138
58b7f119 1139static const char *page_bad_reason(struct page *page, unsigned long flags)
1da177e4 1140{
82a3241a 1141 const char *bad_reason = NULL;
f0b791a3 1142
53f9263b 1143 if (unlikely(atomic_read(&page->_mapcount) != -1))
f0b791a3
DH
1144 bad_reason = "nonzero mapcount";
1145 if (unlikely(page->mapping != NULL))
1146 bad_reason = "non-NULL mapping";
fe896d18 1147 if (unlikely(page_ref_count(page) != 0))
0139aa7b 1148 bad_reason = "nonzero _refcount";
58b7f119
WY
1149 if (unlikely(page->flags & flags)) {
1150 if (flags == PAGE_FLAGS_CHECK_AT_PREP)
1151 bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
1152 else
1153 bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
f0b791a3 1154 }
9edad6ea 1155#ifdef CONFIG_MEMCG
48060834 1156 if (unlikely(page->memcg_data))
9edad6ea
JW
1157 bad_reason = "page still charged to cgroup";
1158#endif
58b7f119
WY
1159 return bad_reason;
1160}
1161
1162static void check_free_page_bad(struct page *page)
1163{
1164 bad_page(page,
1165 page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
bb552ac6
MG
1166}
1167
534fe5e3 1168static inline int check_free_page(struct page *page)
bb552ac6 1169{
da838d4f 1170 if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
bb552ac6 1171 return 0;
bb552ac6
MG
1172
1173 /* Something has gone sideways, find it */
0d0c48a2 1174 check_free_page_bad(page);
7bfec6f4 1175 return 1;
1da177e4
LT
1176}
1177
4db7548c
MG
1178static int free_tail_pages_check(struct page *head_page, struct page *page)
1179{
1180 int ret = 1;
1181
1182 /*
1183 * We rely on page->lru.next never having bit 0 set, unless the page
1184 * is PageTail(). Let's make sure that's true even for poisoned ->lru.
1185 */
1186 BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
1187
1188 if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
1189 ret = 0;
1190 goto out;
1191 }
1192 switch (page - head_page) {
1193 case 1:
4da1984e 1194 /* the first tail page: ->mapping may be compound_mapcount() */
4db7548c 1195 if (unlikely(compound_mapcount(page))) {
82a3241a 1196 bad_page(page, "nonzero compound_mapcount");
4db7548c
MG
1197 goto out;
1198 }
1199 break;
1200 case 2:
1201 /*
1202 * the second tail page: ->mapping is
fa3015b7 1203 * deferred_list.next -- ignore value.
4db7548c
MG
1204 */
1205 break;
1206 default:
1207 if (page->mapping != TAIL_MAPPING) {
82a3241a 1208 bad_page(page, "corrupted mapping in tail page");
4db7548c
MG
1209 goto out;
1210 }
1211 break;
1212 }
1213 if (unlikely(!PageTail(page))) {
82a3241a 1214 bad_page(page, "PageTail not set");
4db7548c
MG
1215 goto out;
1216 }
1217 if (unlikely(compound_head(page) != head_page)) {
82a3241a 1218 bad_page(page, "compound_head not consistent");
4db7548c
MG
1219 goto out;
1220 }
1221 ret = 0;
1222out:
1223 page->mapping = NULL;
1224 clear_compound_head(page);
1225 return ret;
1226}
1227
6471384a
AP
1228static void kernel_init_free_pages(struct page *page, int numpages)
1229{
1230 int i;
1231
9e15afa5
QC
1232 /* s390's use of memset() could override KASAN redzones. */
1233 kasan_disable_current();
aa1ef4d7 1234 for (i = 0; i < numpages; i++) {
acb35b17 1235 u8 tag = page_kasan_tag(page + i);
aa1ef4d7 1236 page_kasan_tag_reset(page + i);
6471384a 1237 clear_highpage(page + i);
acb35b17 1238 page_kasan_tag_set(page + i, tag);
aa1ef4d7 1239 }
9e15afa5 1240 kasan_enable_current();
6471384a
AP
1241}
1242
e2769dbd 1243static __always_inline bool free_pages_prepare(struct page *page,
2c335680 1244 unsigned int order, bool check_free, fpi_t fpi_flags)
4db7548c 1245{
e2769dbd 1246 int bad = 0;
1bb5eab3 1247 bool init;
4db7548c 1248
4db7548c
MG
1249 VM_BUG_ON_PAGE(PageTail(page), page);
1250
e2769dbd 1251 trace_mm_page_free(page, order);
e2769dbd 1252
79f5f8fa
OS
1253 if (unlikely(PageHWPoison(page)) && !order) {
1254 /*
1255 * Do not let hwpoison pages hit pcplists/buddy
1256 * Untie memcg state and reset page's owner
1257 */
18b2db3b 1258 if (memcg_kmem_enabled() && PageMemcgKmem(page))
79f5f8fa
OS
1259 __memcg_kmem_uncharge_page(page, order);
1260 reset_page_owner(page, order);
1261 return false;
1262 }
1263
e2769dbd
MG
1264 /*
1265 * Check tail pages before head page information is cleared to
1266 * avoid checking PageCompound for order-0 pages.
1267 */
1268 if (unlikely(order)) {
1269 bool compound = PageCompound(page);
1270 int i;
1271
1272 VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
4db7548c 1273
9a73f61b
KS
1274 if (compound)
1275 ClearPageDoubleMap(page);
e2769dbd
MG
1276 for (i = 1; i < (1 << order); i++) {
1277 if (compound)
1278 bad += free_tail_pages_check(page, page + i);
534fe5e3 1279 if (unlikely(check_free_page(page + i))) {
e2769dbd
MG
1280 bad++;
1281 continue;
1282 }
1283 (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1284 }
1285 }
bda807d4 1286 if (PageMappingFlags(page))
4db7548c 1287 page->mapping = NULL;
18b2db3b 1288 if (memcg_kmem_enabled() && PageMemcgKmem(page))
f4b00eab 1289 __memcg_kmem_uncharge_page(page, order);
e2769dbd 1290 if (check_free)
534fe5e3 1291 bad += check_free_page(page);
e2769dbd
MG
1292 if (bad)
1293 return false;
4db7548c 1294
e2769dbd
MG
1295 page_cpupid_reset_last(page);
1296 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1297 reset_page_owner(page, order);
4db7548c
MG
1298
1299 if (!PageHighMem(page)) {
1300 debug_check_no_locks_freed(page_address(page),
e2769dbd 1301 PAGE_SIZE << order);
4db7548c 1302 debug_check_no_obj_freed(page_address(page),
e2769dbd 1303 PAGE_SIZE << order);
4db7548c 1304 }
6471384a 1305
8db26a3d
VB
1306 kernel_poison_pages(page, 1 << order);
1307
f9d79e8d 1308 /*
1bb5eab3
AK
1309 * As memory initialization might be integrated into KASAN,
1310 * kasan_free_pages and kernel_init_free_pages must be
1311 * kept together to avoid discrepancies in behavior.
1312 *
f9d79e8d
AK
1313 * With hardware tag-based KASAN, memory tags must be set before the
1314 * page becomes unavailable via debug_pagealloc or arch_free_page.
1315 */
1bb5eab3
AK
1316 init = want_init_on_free();
1317 if (init && !kasan_has_integrated_init())
1318 kernel_init_free_pages(page, 1 << order);
1319 kasan_free_nondeferred_pages(page, order, init, fpi_flags);
f9d79e8d 1320
234fdce8
QC
1321 /*
1322 * arch_free_page() can make the page's contents inaccessible. s390
1323 * does this. So nothing which can access the page's contents should
1324 * happen after this.
1325 */
1326 arch_free_page(page, order);
1327
77bc7fd6 1328 debug_pagealloc_unmap_pages(page, 1 << order);
d6332692 1329
4db7548c
MG
1330 return true;
1331}
1332
e2769dbd 1333#ifdef CONFIG_DEBUG_VM
4462b32c
VB
1334/*
1335 * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed
1336 * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when
1337 * moved from pcp lists to free lists.
1338 */
1339static bool free_pcp_prepare(struct page *page)
e2769dbd 1340{
2c335680 1341 return free_pages_prepare(page, 0, true, FPI_NONE);
e2769dbd
MG
1342}
1343
4462b32c 1344static bool bulkfree_pcp_prepare(struct page *page)
e2769dbd 1345{
8e57f8ac 1346 if (debug_pagealloc_enabled_static())
534fe5e3 1347 return check_free_page(page);
4462b32c
VB
1348 else
1349 return false;
e2769dbd
MG
1350}
1351#else
4462b32c
VB
1352/*
1353 * With DEBUG_VM disabled, order-0 pages being freed are checked only when
1354 * moving from pcp lists to free list in order to reduce overhead. With
1355 * debug_pagealloc enabled, they are checked also immediately when being freed
1356 * to the pcp lists.
1357 */
e2769dbd
MG
1358static bool free_pcp_prepare(struct page *page)
1359{
8e57f8ac 1360 if (debug_pagealloc_enabled_static())
2c335680 1361 return free_pages_prepare(page, 0, true, FPI_NONE);
4462b32c 1362 else
2c335680 1363 return free_pages_prepare(page, 0, false, FPI_NONE);
e2769dbd
MG
1364}
1365
4db7548c
MG
1366static bool bulkfree_pcp_prepare(struct page *page)
1367{
534fe5e3 1368 return check_free_page(page);
4db7548c
MG
1369}
1370#endif /* CONFIG_DEBUG_VM */
1371
97334162
AL
1372static inline void prefetch_buddy(struct page *page)
1373{
1374 unsigned long pfn = page_to_pfn(page);
1375 unsigned long buddy_pfn = __find_buddy_pfn(pfn, 0);
1376 struct page *buddy = page + (buddy_pfn - pfn);
1377
1378 prefetch(buddy);
1379}
1380
1da177e4 1381/*
5f8dcc21 1382 * Frees a number of pages from the PCP lists
1da177e4 1383 * Assumes all pages on list are in same zone, and of same order.
207f36ee 1384 * count is the number of pages to free.
1da177e4
LT
1385 *
1386 * If the zone was previously in an "all pages pinned" state then look to
1387 * see if this freeing clears that state.
1388 *
1389 * And clear the zone's pages_scanned counter, to hold off the "all pages are
1390 * pinned" detection logic.
1391 */
5f8dcc21
MG
1392static void free_pcppages_bulk(struct zone *zone, int count,
1393 struct per_cpu_pages *pcp)
1da177e4 1394{
5f8dcc21 1395 int migratetype = 0;
a6f9edd6 1396 int batch_free = 0;
5c3ad2eb 1397 int prefetch_nr = READ_ONCE(pcp->batch);
3777999d 1398 bool isolated_pageblocks;
0a5f4e5b
AL
1399 struct page *page, *tmp;
1400 LIST_HEAD(head);
f2260e6b 1401
88e8ac11
CTR
1402 /*
1403 * Ensure a proper count is passed; otherwise we would get stuck in the
1404 * while (list_empty(list)) loop below.
1405 */
1406 count = min(pcp->count, count);
e5b31ac2 1407 while (count) {
5f8dcc21
MG
1408 struct list_head *list;
1409
1410 /*
a6f9edd6
MG
1411 * Remove pages from lists in a round-robin fashion. A
1412 * batch_free count is maintained that is incremented when an
1413 * empty list is encountered. This is so more pages are freed
1414 * off fuller lists instead of spinning excessively around empty
1415 * lists.
5f8dcc21
MG
1416 */
1417 do {
a6f9edd6 1418 batch_free++;
5f8dcc21
MG
1419 if (++migratetype == MIGRATE_PCPTYPES)
1420 migratetype = 0;
1421 list = &pcp->lists[migratetype];
1422 } while (list_empty(list));
48db57f8 1423
1d16871d
NK
1424 /* This is the only non-empty list. Free them all. */
1425 if (batch_free == MIGRATE_PCPTYPES)
e5b31ac2 1426 batch_free = count;
1d16871d 1427
a6f9edd6 1428 do {
a16601c5 1429 page = list_last_entry(list, struct page, lru);
0a5f4e5b 1430 /* must delete to avoid corrupting pcp list */
a6f9edd6 1431 list_del(&page->lru);
77ba9062 1432 pcp->count--;
aa016d14 1433
4db7548c
MG
1434 if (bulkfree_pcp_prepare(page))
1435 continue;
1436
0a5f4e5b 1437 list_add_tail(&page->lru, &head);
97334162
AL
1438
1439 /*
1440 * We are going to put the page back to the global
1441 * pool, prefetch its buddy to speed up later access
1442 * under zone->lock. It is believed the overhead of
1443 * an additional test and calculating buddy_pfn here
1444 * can be offset by reduced memory latency later. To
1445 * avoid excessive prefetching due to large count, only
1446 * prefetch buddy for the first pcp->batch nr of pages.
1447 */
5c3ad2eb 1448 if (prefetch_nr) {
97334162 1449 prefetch_buddy(page);
5c3ad2eb
VB
1450 prefetch_nr--;
1451 }
e5b31ac2 1452 } while (--count && --batch_free && !list_empty(list));
1da177e4 1453 }
0a5f4e5b
AL
1454
1455 spin_lock(&zone->lock);
1456 isolated_pageblocks = has_isolate_pageblock(zone);
1457
1458 /*
1459 * Use safe version since after __free_one_page(),
1460 * page->lru.next will not point to original list.
1461 */
1462 list_for_each_entry_safe(page, tmp, &head, lru) {
1463 int mt = get_pcppage_migratetype(page);
1464 /* MIGRATE_ISOLATE page should not go to pcplists */
1465 VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
1466 /* Pageblock could have been isolated meanwhile */
1467 if (unlikely(isolated_pageblocks))
1468 mt = get_pageblock_migratetype(page);
1469
f04a5d5d 1470 __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE);
0a5f4e5b
AL
1471 trace_mm_page_pcpu_drain(page, 0, mt);
1472 }
d34b0733 1473 spin_unlock(&zone->lock);
1da177e4
LT
1474}
1475
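/*
 * Free a single (possibly high-order) page directly to the buddy lists,
 * taking zone->lock.
 */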
dc4b0caf
MG
1476static void free_one_page(struct zone *zone,
1477 struct page *page, unsigned long pfn,
7aeb09f9 1478 unsigned int order,
7fef431b 1479 int migratetype, fpi_t fpi_flags)
1da177e4 1480{
d34b0733 1481 spin_lock(&zone->lock);
ad53f92e
JK
1482 if (unlikely(has_isolate_pageblock(zone) ||
1483 is_migrate_isolate(migratetype))) {
1484 migratetype = get_pfnblock_migratetype(page, pfn);
ad53f92e 1485 }
7fef431b 1486 __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
d34b0733 1487 spin_unlock(&zone->lock);
48db57f8
NP
1488}
1489
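/*
 * One-time initialisation of a struct page: zero it, link it to its zone
 * and node, and initialise the refcount, mapcount, cpupid and KASAN tag.
 */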
1e8ce83c 1490static void __meminit __init_single_page(struct page *page, unsigned long pfn,
d0dc12e8 1491 unsigned long zone, int nid)
1e8ce83c 1492{
d0dc12e8 1493 mm_zero_struct_page(page);
1e8ce83c 1494 set_page_links(page, zone, nid, pfn);
1e8ce83c
RH
1495 init_page_count(page);
1496 page_mapcount_reset(page);
1497 page_cpupid_reset_last(page);
2813b9c0 1498 page_kasan_tag_reset(page);
1e8ce83c 1499
1e8ce83c
RH
1500 INIT_LIST_HEAD(&page->lru);
1501#ifdef WANT_PAGE_VIRTUAL
1502 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
1503 if (!is_highmem_idx(zone))
1504 set_page_address(page, __va(pfn << PAGE_SHIFT));
1505#endif
1506}
1507
7e18adb4 1508#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
57148a64 1509static void __meminit init_reserved_page(unsigned long pfn)
7e18adb4
MG
1510{
1511 pg_data_t *pgdat;
1512 int nid, zid;
1513
1514 if (!early_page_uninitialised(pfn))
1515 return;
1516
1517 nid = early_pfn_to_nid(pfn);
1518 pgdat = NODE_DATA(nid);
1519
1520 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1521 struct zone *zone = &pgdat->node_zones[zid];
1522
1523 if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
1524 break;
1525 }
d0dc12e8 1526 __init_single_page(pfn_to_page(pfn), pfn, zid, nid);
7e18adb4
MG
1527}
1528#else
1529static inline void init_reserved_page(unsigned long pfn)
1530{
1531}
1532#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
1533
92923ca3
NZ
1534/*
1535 * Initialised pages do not have PageReserved set. This function is
1536 * called for each range allocated by the bootmem allocator and
1537 * marks the pages PageReserved. The remaining valid pages are later
1538 * sent to the buddy page allocator.
1539 */
4b50bcc7 1540void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
92923ca3
NZ
1541{
1542 unsigned long start_pfn = PFN_DOWN(start);
1543 unsigned long end_pfn = PFN_UP(end);
1544
7e18adb4
MG
1545 for (; start_pfn < end_pfn; start_pfn++) {
1546 if (pfn_valid(start_pfn)) {
1547 struct page *page = pfn_to_page(start_pfn);
1548
1549 init_reserved_page(start_pfn);
1d798ca3
KS
1550
1551 /* Avoid false-positive PageTail() */
1552 INIT_LIST_HEAD(&page->lru);
1553
d483da5b
AD
1554 /*
1555 * no need for atomic set_bit because the struct
1556 * page is not visible yet so nobody should
1557 * access it yet.
1558 */
1559 __SetPageReserved(page);
7e18adb4
MG
1560 }
1561 }
92923ca3
NZ
1562}
1563
7fef431b
DH
1564static void __free_pages_ok(struct page *page, unsigned int order,
1565 fpi_t fpi_flags)
ec95f53a 1566{
d34b0733 1567 unsigned long flags;
95e34412 1568 int migratetype;
dc4b0caf 1569 unsigned long pfn = page_to_pfn(page);
ec95f53a 1570
2c335680 1571 if (!free_pages_prepare(page, order, true, fpi_flags))
ec95f53a
KM
1572 return;
1573
cfc47a28 1574 migratetype = get_pfnblock_migratetype(page, pfn);
d34b0733
MG
1575 local_irq_save(flags);
1576 __count_vm_events(PGFREE, 1 << order);
7fef431b
DH
1577 free_one_page(page_zone(page), page, pfn, order, migratetype,
1578 fpi_flags);
d34b0733 1579 local_irq_restore(flags);
1da177e4
LT
1580}
1581
a9cd410a 1582void __free_pages_core(struct page *page, unsigned int order)
a226f6c8 1583{
c3993076 1584 unsigned int nr_pages = 1 << order;
e2d0bd2b 1585 struct page *p = page;
c3993076 1586 unsigned int loop;
a226f6c8 1587
7fef431b
DH
1588 /*
1589 * When initializing the memmap, __init_single_page() sets the refcount
1590 * of all pages to 1 ("allocated"/"not free"). We have to set the
1591 * refcount of all involved pages to 0.
1592 */
e2d0bd2b
YL
1593 prefetchw(p);
1594 for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
1595 prefetchw(p + 1);
c3993076
JW
1596 __ClearPageReserved(p);
1597 set_page_count(p, 0);
a226f6c8 1598 }
e2d0bd2b
YL
1599 __ClearPageReserved(p);
1600 set_page_count(p, 0);
c3993076 1601
9705bea5 1602 atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
7fef431b
DH
1603
1604 /*
1605 * Bypass PCP and place fresh pages right to the tail, primarily
1606 * relevant for memory onlining.
1607 */
2c335680 1608 __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON);
a226f6c8
DH
1609}
1610
3f08a302 1611#ifdef CONFIG_NEED_MULTIPLE_NODES
7ace9917 1612
03e92a5e
MR
1613/*
 1614 * During memory init, memblocks map pfns to nids. The search is expensive and
1615 * this caches recent lookups. The implementation of __early_pfn_to_nid
1616 * treats start/end as pfns.
1617 */
1618struct mminit_pfnnid_cache {
1619 unsigned long last_start;
1620 unsigned long last_end;
1621 int last_nid;
1622};
75a592a4 1623
03e92a5e 1624static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
6f24fbd3
MR
1625
1626/*
1627 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
1628 */
03e92a5e 1629static int __meminit __early_pfn_to_nid(unsigned long pfn,
6f24fbd3 1630 struct mminit_pfnnid_cache *state)
75a592a4 1631{
6f24fbd3 1632 unsigned long start_pfn, end_pfn;
75a592a4
MG
1633 int nid;
1634
6f24fbd3
MR
1635 if (state->last_start <= pfn && pfn < state->last_end)
1636 return state->last_nid;
1637
1638 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
1639 if (nid != NUMA_NO_NODE) {
1640 state->last_start = start_pfn;
1641 state->last_end = end_pfn;
1642 state->last_nid = nid;
1643 }
7ace9917
MG
1644
1645 return nid;
75a592a4 1646}
75a592a4 1647
75a592a4 1648int __meminit early_pfn_to_nid(unsigned long pfn)
75a592a4 1649{
7ace9917 1650 static DEFINE_SPINLOCK(early_pfn_lock);
75a592a4
MG
1651 int nid;
1652
7ace9917 1653 spin_lock(&early_pfn_lock);
56ec43d8 1654 nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
7ace9917 1655 if (nid < 0)
e4568d38 1656 nid = first_online_node;
7ace9917 1657 spin_unlock(&early_pfn_lock);
75a592a4 1658
7ace9917 1659 return nid;
75a592a4 1660}
3f08a302 1661#endif /* CONFIG_NEED_MULTIPLE_NODES */
75a592a4 1662
7c2ee349 1663void __init memblock_free_pages(struct page *page, unsigned long pfn,
3a80a7fa
MG
1664 unsigned int order)
1665{
1666 if (early_page_uninitialised(pfn))
1667 return;
a9cd410a 1668 __free_pages_core(page, order);
3a80a7fa
MG
1669}
1670
7cf91a98
JK
1671/*
 1672 * Check that the whole of (or a subset of) a pageblock given by the interval of
1673 * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
 1674 * with the migration or free compaction scanners. The scanners then need to
1675 * use only pfn_valid_within() check for arches that allow holes within
1676 * pageblocks.
1677 *
1678 * Return struct page pointer of start_pfn, or NULL if checks were not passed.
1679 *
1680 * It's possible on some configurations to have a setup like node0 node1 node0
 1681 * i.e. it's possible that all pages within a zone's range of pages do not
1682 * belong to a single zone. We assume that a border between node0 and node1
1683 * can occur within a single pageblock, but not a node0 node1 node0
1684 * interleaving within a single pageblock. It is therefore sufficient to check
1685 * the first and last page of a pageblock and avoid checking each individual
1686 * page in a pageblock.
1687 */
1688struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
1689 unsigned long end_pfn, struct zone *zone)
1690{
1691 struct page *start_page;
1692 struct page *end_page;
1693
1694 /* end_pfn is one past the range we are checking */
1695 end_pfn--;
1696
1697 if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
1698 return NULL;
1699
2d070eab
MH
1700 start_page = pfn_to_online_page(start_pfn);
1701 if (!start_page)
1702 return NULL;
7cf91a98
JK
1703
1704 if (page_zone(start_page) != zone)
1705 return NULL;
1706
1707 end_page = pfn_to_page(end_pfn);
1708
1709 /* This gives a shorter code than deriving page_zone(end_page) */
1710 if (page_zone_id(start_page) != page_zone_id(end_page))
1711 return NULL;
1712
1713 return start_page;
1714}
1715
1716void set_zone_contiguous(struct zone *zone)
1717{
1718 unsigned long block_start_pfn = zone->zone_start_pfn;
1719 unsigned long block_end_pfn;
1720
1721 block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages);
1722 for (; block_start_pfn < zone_end_pfn(zone);
1723 block_start_pfn = block_end_pfn,
1724 block_end_pfn += pageblock_nr_pages) {
1725
1726 block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
1727
1728 if (!__pageblock_pfn_to_page(block_start_pfn,
1729 block_end_pfn, zone))
1730 return;
e84fe99b 1731 cond_resched();
7cf91a98
JK
1732 }
1733
1734 /* We confirm that there is no hole */
1735 zone->contiguous = true;
1736}
1737
1738void clear_zone_contiguous(struct zone *zone)
1739{
1740 zone->contiguous = false;
1741}
1742
7e18adb4 1743#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
2f47a91f
PT
1744static void __init deferred_free_range(unsigned long pfn,
1745 unsigned long nr_pages)
a4de83dd 1746{
2f47a91f
PT
1747 struct page *page;
1748 unsigned long i;
a4de83dd 1749
2f47a91f 1750 if (!nr_pages)
a4de83dd
MG
1751 return;
1752
2f47a91f
PT
1753 page = pfn_to_page(pfn);
1754
a4de83dd 1755 /* Free a large naturally-aligned chunk if possible */
e780149b
XQ
1756 if (nr_pages == pageblock_nr_pages &&
1757 (pfn & (pageblock_nr_pages - 1)) == 0) {
ac5d2539 1758 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
a9cd410a 1759 __free_pages_core(page, pageblock_order);
a4de83dd
MG
1760 return;
1761 }
1762
e780149b
XQ
1763 for (i = 0; i < nr_pages; i++, page++, pfn++) {
1764 if ((pfn & (pageblock_nr_pages - 1)) == 0)
1765 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
a9cd410a 1766 __free_pages_core(page, 0);
e780149b 1767 }
a4de83dd
MG
1768}
1769
d3cd131d
NS
1770/* Completion tracking for deferred_init_memmap() threads */
1771static atomic_t pgdat_init_n_undone __initdata;
1772static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
1773
1774static inline void __init pgdat_init_report_one_done(void)
1775{
1776 if (atomic_dec_and_test(&pgdat_init_n_undone))
1777 complete(&pgdat_init_all_done_comp);
1778}
0e1cc95b 1779
2f47a91f 1780/*
80b1f41c
PT
 1781 * Returns true if the page needs to be initialized or freed to the buddy allocator.
1782 *
1783 * First we check if pfn is valid on architectures where it is possible to have
1784 * holes within pageblock_nr_pages. On systems where it is not possible, this
1785 * function is optimized out.
1786 *
1787 * Then, we check if a current large page is valid by only checking the validity
1788 * of the head pfn.
2f47a91f 1789 */
56ec43d8 1790static inline bool __init deferred_pfn_valid(unsigned long pfn)
2f47a91f 1791{
80b1f41c
PT
1792 if (!pfn_valid_within(pfn))
1793 return false;
1794 if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
1795 return false;
80b1f41c
PT
1796 return true;
1797}
2f47a91f 1798
80b1f41c
PT
1799/*
 1800 * Free pages to the buddy allocator. Try to free them in aligned chunks of
 1801 * pageblock_nr_pages pages.
1802 */
56ec43d8 1803static void __init deferred_free_pages(unsigned long pfn,
80b1f41c
PT
1804 unsigned long end_pfn)
1805{
80b1f41c
PT
1806 unsigned long nr_pgmask = pageblock_nr_pages - 1;
1807 unsigned long nr_free = 0;
2f47a91f 1808
80b1f41c 1809 for (; pfn < end_pfn; pfn++) {
56ec43d8 1810 if (!deferred_pfn_valid(pfn)) {
80b1f41c
PT
1811 deferred_free_range(pfn - nr_free, nr_free);
1812 nr_free = 0;
1813 } else if (!(pfn & nr_pgmask)) {
1814 deferred_free_range(pfn - nr_free, nr_free);
1815 nr_free = 1;
80b1f41c
PT
1816 } else {
1817 nr_free++;
1818 }
1819 }
1820 /* Free the last block of pages to allocator */
1821 deferred_free_range(pfn - nr_free, nr_free);
2f47a91f
PT
1822}
1823
80b1f41c
PT
1824/*
1825 * Initialize struct pages. We minimize pfn page lookups and scheduler checks
 1826 * by performing them only once every pageblock_nr_pages.
 1827 * Return the number of pages initialized.
1828 */
56ec43d8 1829static unsigned long __init deferred_init_pages(struct zone *zone,
80b1f41c
PT
1830 unsigned long pfn,
1831 unsigned long end_pfn)
2f47a91f 1832{
2f47a91f 1833 unsigned long nr_pgmask = pageblock_nr_pages - 1;
56ec43d8 1834 int nid = zone_to_nid(zone);
2f47a91f 1835 unsigned long nr_pages = 0;
56ec43d8 1836 int zid = zone_idx(zone);
2f47a91f 1837 struct page *page = NULL;
2f47a91f 1838
80b1f41c 1839 for (; pfn < end_pfn; pfn++) {
56ec43d8 1840 if (!deferred_pfn_valid(pfn)) {
80b1f41c 1841 page = NULL;
2f47a91f 1842 continue;
80b1f41c 1843 } else if (!page || !(pfn & nr_pgmask)) {
2f47a91f 1844 page = pfn_to_page(pfn);
80b1f41c
PT
1845 } else {
1846 page++;
2f47a91f 1847 }
d0dc12e8 1848 __init_single_page(page, pfn, zid, nid);
80b1f41c 1849 nr_pages++;
2f47a91f 1850 }
 80b1f41c 1851 return nr_pages;
2f47a91f
PT
1852}
1853
0e56acae
AD
1854/*
1855 * This function is meant to pre-load the iterator for the zone init.
1856 * Specifically it walks through the ranges until we are caught up to the
1857 * first_init_pfn value and exits there. If we never encounter the value we
1858 * return false indicating there are no valid ranges left.
1859 */
1860static bool __init
1861deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
1862 unsigned long *spfn, unsigned long *epfn,
1863 unsigned long first_init_pfn)
1864{
1865 u64 j;
1866
1867 /*
1868 * Start out by walking through the ranges in this zone that have
1869 * already been initialized. We don't need to do anything with them
1870 * so we just need to flush them out of the system.
1871 */
1872 for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
1873 if (*epfn <= first_init_pfn)
1874 continue;
1875 if (*spfn < first_init_pfn)
1876 *spfn = first_init_pfn;
1877 *i = j;
1878 return true;
1879 }
1880
1881 return false;
1882}
1883
1884/*
1885 * Initialize and free pages. We do it in two loops: first we initialize
1886 * struct page, then free to buddy allocator, because while we are
1887 * freeing pages we can access pages that are ahead (computing buddy
1888 * page in __free_one_page()).
1889 *
 1890 * To try to keep some memory in the cache, the loop is broken along
 1891 * max page order boundaries. This way we will not cause
1892 * any issues with the buddy page computation.
1893 */
1894static unsigned long __init
1895deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
1896 unsigned long *end_pfn)
1897{
1898 unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
1899 unsigned long spfn = *start_pfn, epfn = *end_pfn;
1900 unsigned long nr_pages = 0;
1901 u64 j = *i;
1902
1903 /* First we loop through and initialize the page values */
1904 for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
1905 unsigned long t;
1906
1907 if (mo_pfn <= *start_pfn)
1908 break;
1909
1910 t = min(mo_pfn, *end_pfn);
1911 nr_pages += deferred_init_pages(zone, *start_pfn, t);
1912
1913 if (mo_pfn < *end_pfn) {
1914 *start_pfn = mo_pfn;
1915 break;
1916 }
1917 }
1918
1919 /* Reset values and now loop through freeing pages as needed */
1920 swap(j, *i);
1921
1922 for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
1923 unsigned long t;
1924
1925 if (mo_pfn <= spfn)
1926 break;
1927
1928 t = min(mo_pfn, epfn);
1929 deferred_free_pages(spfn, t);
1930
1931 if (mo_pfn <= epfn)
1932 break;
1933 }
1934
1935 return nr_pages;
1936}
1937
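Editor's note: a small userspace sketch of the MAX_ORDER alignment that bounds each init/free pass in deferred_init_maxorder() above. MAX_ORDER_NR_PAGES is assumed to be 1024 here (MAX_ORDER 11 with 4K pages); the real value is configuration dependent.

#include <stdio.h>

#define MAX_ORDER_NR_PAGES      1024UL          /* assumed, config dependent */
#define ALIGN(x, a)             (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        unsigned long spfn = 5000;              /* hypothetical start pfn */

        /*
         * Same computation as deferred_init_maxorder(): stop at the next
         * MAX_ORDER boundary so the buddy-page computation in the free pass
         * never reaches into memory that has not been initialised yet.
         */
        unsigned long mo_pfn = ALIGN(spfn + 1, MAX_ORDER_NR_PAGES);

        printf("this pass covers pfns [%lu, %lu)\n", spfn, mo_pfn);
        return 0;
}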
e4443149
DJ
1938static void __init
1939deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
1940 void *arg)
1941{
1942 unsigned long spfn, epfn;
1943 struct zone *zone = arg;
1944 u64 i;
1945
1946 deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
1947
1948 /*
1949 * Initialize and free pages in MAX_ORDER sized increments so that we
1950 * can avoid introducing any issues with the buddy allocator.
1951 */
1952 while (spfn < end_pfn) {
1953 deferred_init_maxorder(&i, zone, &spfn, &epfn);
1954 cond_resched();
1955 }
1956}
1957
ecd09650
DJ
1958/* An arch may override for more concurrency. */
1959__weak int __init
1960deferred_page_init_max_threads(const struct cpumask *node_cpumask)
1961{
1962 return 1;
1963}
1964
7e18adb4 1965/* Initialise remaining memory on a node */
0e1cc95b 1966static int __init deferred_init_memmap(void *data)
7e18adb4 1967{
0e1cc95b 1968 pg_data_t *pgdat = data;
0e56acae 1969 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
89c7c402 1970 unsigned long spfn = 0, epfn = 0;
0e56acae 1971 unsigned long first_init_pfn, flags;
7e18adb4 1972 unsigned long start = jiffies;
7e18adb4 1973 struct zone *zone;
e4443149 1974 int zid, max_threads;
2f47a91f 1975 u64 i;
7e18adb4 1976
3a2d7fa8
PT
1977 /* Bind memory initialisation thread to a local node if possible */
1978 if (!cpumask_empty(cpumask))
1979 set_cpus_allowed_ptr(current, cpumask);
1980
1981 pgdat_resize_lock(pgdat, &flags);
1982 first_init_pfn = pgdat->first_deferred_pfn;
0e1cc95b 1983 if (first_init_pfn == ULONG_MAX) {
3a2d7fa8 1984 pgdat_resize_unlock(pgdat, &flags);
d3cd131d 1985 pgdat_init_report_one_done();
0e1cc95b
MG
1986 return 0;
1987 }
1988
7e18adb4
MG
1989 /* Sanity check boundaries */
1990 BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
1991 BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
1992 pgdat->first_deferred_pfn = ULONG_MAX;
1993
3d060856
PT
1994 /*
1995 * Once we unlock here, the zone cannot be grown anymore, thus if an
 1996 * interrupt thread must allocate this early in boot, the zone must be
 1997 * pre-grown prior to the start of deferred page initialization.
1998 */
1999 pgdat_resize_unlock(pgdat, &flags);
2000
7e18adb4
MG
2001 /* Only the highest zone is deferred so find it */
2002 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
2003 zone = pgdat->node_zones + zid;
2004 if (first_init_pfn < zone_end_pfn(zone))
2005 break;
2006 }
0e56acae
AD
2007
2008 /* If the zone is empty somebody else may have cleared out the zone */
2009 if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
2010 first_init_pfn))
2011 goto zone_empty;
7e18adb4 2012
ecd09650 2013 max_threads = deferred_page_init_max_threads(cpumask);
7e18adb4 2014
117003c3 2015 while (spfn < epfn) {
e4443149
DJ
2016 unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
2017 struct padata_mt_job job = {
2018 .thread_fn = deferred_init_memmap_chunk,
2019 .fn_arg = zone,
2020 .start = spfn,
2021 .size = epfn_align - spfn,
2022 .align = PAGES_PER_SECTION,
2023 .min_chunk = PAGES_PER_SECTION,
2024 .max_threads = max_threads,
2025 };
2026
2027 padata_do_multithreaded(&job);
2028 deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
2029 epfn_align);
117003c3 2030 }
0e56acae 2031zone_empty:
7e18adb4
MG
2032 /* Sanity check that the next zone really is unpopulated */
2033 WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
2034
89c7c402
DJ
2035 pr_info("node %d deferred pages initialised in %ums\n",
2036 pgdat->node_id, jiffies_to_msecs(jiffies - start));
d3cd131d
NS
2037
2038 pgdat_init_report_one_done();
0e1cc95b
MG
2039 return 0;
2040}
c9e97a19 2041
c9e97a19
PT
2042/*
2043 * If this zone has deferred pages, try to grow it by initializing enough
2044 * deferred pages to satisfy the allocation specified by order, rounded up to
2045 * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
2046 * of SECTION_SIZE bytes by initializing struct pages in increments of
2047 * PAGES_PER_SECTION * sizeof(struct page) bytes.
2048 *
2049 * Return true when zone was grown, otherwise return false. We return true even
2050 * when we grow less than requested, to let the caller decide if there are
2051 * enough pages to satisfy the allocation.
2052 *
2053 * Note: We use noinline because this function is needed only during boot, and
2054 * it is called from a __ref function _deferred_grow_zone. This way we are
2055 * making sure that it is not inlined into permanent text section.
2056 */
2057static noinline bool __init
2058deferred_grow_zone(struct zone *zone, unsigned int order)
2059{
c9e97a19 2060 unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
837566e7 2061 pg_data_t *pgdat = zone->zone_pgdat;
c9e97a19 2062 unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
0e56acae
AD
2063 unsigned long spfn, epfn, flags;
2064 unsigned long nr_pages = 0;
c9e97a19
PT
2065 u64 i;
2066
2067 /* Only the last zone may have deferred pages */
2068 if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
2069 return false;
2070
2071 pgdat_resize_lock(pgdat, &flags);
2072
c9e97a19
PT
2073 /*
2074 * If someone grew this zone while we were waiting for spinlock, return
2075 * true, as there might be enough pages already.
2076 */
2077 if (first_deferred_pfn != pgdat->first_deferred_pfn) {
2078 pgdat_resize_unlock(pgdat, &flags);
2079 return true;
2080 }
2081
0e56acae
AD
2082 /* If the zone is empty somebody else may have cleared out the zone */
2083 if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
2084 first_deferred_pfn)) {
2085 pgdat->first_deferred_pfn = ULONG_MAX;
c9e97a19 2086 pgdat_resize_unlock(pgdat, &flags);
b9705d87
JG
2087 /* Retry only once. */
2088 return first_deferred_pfn != ULONG_MAX;
c9e97a19
PT
2089 }
2090
0e56acae
AD
2091 /*
2092 * Initialize and free pages in MAX_ORDER sized increments so
2093 * that we can avoid introducing any issues with the buddy
2094 * allocator.
2095 */
2096 while (spfn < epfn) {
2097 /* update our first deferred PFN for this section */
2098 first_deferred_pfn = spfn;
2099
2100 nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
117003c3 2101 touch_nmi_watchdog();
c9e97a19 2102
0e56acae
AD
2103 /* We should only stop along section boundaries */
2104 if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
2105 continue;
c9e97a19 2106
0e56acae 2107 /* If our quota has been met we can stop here */
c9e97a19
PT
2108 if (nr_pages >= nr_pages_needed)
2109 break;
2110 }
2111
0e56acae 2112 pgdat->first_deferred_pfn = spfn;
c9e97a19
PT
2113 pgdat_resize_unlock(pgdat, &flags);
2114
2115 return nr_pages > 0;
2116}
2117
2118/*
2119 * deferred_grow_zone() is __init, but it is called from
2120 * get_page_from_freelist() during early boot until deferred_pages permanently
 2121 * disables this call. This is why we have the __ref wrapper: it avoids a
 2122 * section mismatch warning and ensures that the function body gets unloaded.
2123 */
2124static bool __ref
2125_deferred_grow_zone(struct zone *zone, unsigned int order)
2126{
2127 return deferred_grow_zone(zone, order);
2128}
2129
7cf91a98 2130#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
0e1cc95b
MG
2131
2132void __init page_alloc_init_late(void)
2133{
7cf91a98 2134 struct zone *zone;
e900a918 2135 int nid;
7cf91a98
JK
2136
2137#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
0e1cc95b 2138
d3cd131d
NS
2139 /* There will be num_node_state(N_MEMORY) threads */
2140 atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
0e1cc95b 2141 for_each_node_state(nid, N_MEMORY) {
0e1cc95b
MG
2142 kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
2143 }
2144
2145 /* Block until all are initialised */
d3cd131d 2146 wait_for_completion(&pgdat_init_all_done_comp);
4248b0da 2147
3e8fc007
MG
2148 /*
2149 * The number of managed pages has changed due to the initialisation
 2150 * so the pcpu batch and high limits need to be updated or the limits
2151 * will be artificially small.
2152 */
2153 for_each_populated_zone(zone)
2154 zone_pcp_update(zone);
2155
c9e97a19
PT
2156 /*
2157 * We initialized the rest of the deferred pages. Permanently disable
2158 * on-demand struct page initialization.
2159 */
2160 static_branch_disable(&deferred_pages);
2161
4248b0da
MG
2162 /* Reinit limits that are based on free pages after the kernel is up */
2163 files_maxfiles_init();
7cf91a98 2164#endif
350e88ba 2165
ba8f3587
LF
2166 buffer_init();
2167
3010f876
PT
2168 /* Discard memblock private memory */
2169 memblock_discard();
7cf91a98 2170
e900a918
DW
2171 for_each_node_state(nid, N_MEMORY)
2172 shuffle_free_memory(NODE_DATA(nid));
2173
7cf91a98
JK
2174 for_each_populated_zone(zone)
2175 set_zone_contiguous(zone);
7e18adb4 2176}
7e18adb4 2177
47118af0 2178#ifdef CONFIG_CMA
9cf510a5 2179/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
47118af0
MN
2180void __init init_cma_reserved_pageblock(struct page *page)
2181{
2182 unsigned i = pageblock_nr_pages;
2183 struct page *p = page;
2184
2185 do {
2186 __ClearPageReserved(p);
2187 set_page_count(p, 0);
d883c6cf 2188 } while (++p, --i);
47118af0 2189
47118af0 2190 set_pageblock_migratetype(page, MIGRATE_CMA);
dc78327c
MN
2191
2192 if (pageblock_order >= MAX_ORDER) {
2193 i = pageblock_nr_pages;
2194 p = page;
2195 do {
2196 set_page_refcounted(p);
2197 __free_pages(p, MAX_ORDER - 1);
2198 p += MAX_ORDER_NR_PAGES;
2199 } while (i -= MAX_ORDER_NR_PAGES);
2200 } else {
2201 set_page_refcounted(page);
2202 __free_pages(page, pageblock_order);
2203 }
2204
3dcc0571 2205 adjust_managed_page_count(page, pageblock_nr_pages);
3c381db1 2206 page_zone(page)->cma_pages += pageblock_nr_pages;
47118af0
MN
2207}
2208#endif
1da177e4
LT
2209
2210/*
2211 * The order of subdivision here is critical for the IO subsystem.
2212 * Please do not alter this order without good reasons and regression
2213 * testing. Specifically, as large blocks of memory are subdivided,
2214 * the order in which smaller blocks are delivered depends on the order
2215 * they're subdivided in this function. This is the primary factor
2216 * influencing the order in which pages are delivered to the IO
2217 * subsystem according to empirical testing, and this is also justified
2218 * by considering the behavior of a buddy system containing a single
2219 * large block of memory acted on by a series of small allocations.
2220 * This behavior is a critical factor in sglist merging's success.
2221 *
6d49e352 2222 * -- nyc
1da177e4 2223 */
085cc7d5 2224static inline void expand(struct zone *zone, struct page *page,
6ab01363 2225 int low, int high, int migratetype)
1da177e4
LT
2226{
2227 unsigned long size = 1 << high;
2228
2229 while (high > low) {
1da177e4
LT
2230 high--;
2231 size >>= 1;
309381fe 2232 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
c0a32fc5 2233
acbc15a4
JK
2234 /*
 2235 * Mark as guard page(s); this allows the block to be
 2236 * merged back into the allocator when the buddy is freed.
 2237 * The corresponding page table entries are not touched, so
 2238 * the pages stay not present in the virtual address space.
2239 */
2240 if (set_page_guard(zone, &page[size], high, migratetype))
c0a32fc5 2241 continue;
acbc15a4 2242
6ab01363 2243 add_to_free_list(&page[size], zone, high, migratetype);
ab130f91 2244 set_buddy_order(&page[size], high);
1da177e4 2245 }
1da177e4
LT
2246}
2247
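Editor's note: a minimal userspace model of the split performed by expand() above, using only the arithmetic and none of the kernel structures. Splitting an order-3 block to satisfy an order-0 request returns one block each of order 2, 1 and 0 to the free lists and hands the remaining order-0 page to the caller, which is exactly the delivery order the comment before expand() describes.

#include <stdio.h>

int main(void)
{
        unsigned int low = 0, high = 3;         /* requested vs. found order */
        unsigned long size = 1UL << high;       /* pages in the found block  */

        while (high > low) {
                high--;
                size >>= 1;
                /* The upper half goes back on the order-'high' free list. */
                printf("free the half at page offset %lu: order %u (%lu pages)\n",
                       size, high, size);
        }
        printf("the order-%u page at offset 0 goes to the caller\n", low);
        return 0;
}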
4e611801 2248static void check_new_page_bad(struct page *page)
1da177e4 2249{
f4c18e6f 2250 if (unlikely(page->flags & __PG_HWPOISON)) {
e570f56c
NH
2251 /* Don't complain about hwpoisoned pages */
2252 page_mapcount_reset(page); /* remove PageBuddy */
2253 return;
f4c18e6f 2254 }
58b7f119
WY
2255
2256 bad_page(page,
2257 page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
4e611801
VB
2258}
2259
2260/*
2261 * This page is about to be returned from the page allocator
2262 */
2263static inline int check_new_page(struct page *page)
2264{
2265 if (likely(page_expected_state(page,
2266 PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
2267 return 0;
2268
2269 check_new_page_bad(page);
2270 return 1;
2a7684a2
WF
2271}
2272
479f854a 2273#ifdef CONFIG_DEBUG_VM
4462b32c
VB
2274/*
2275 * With DEBUG_VM enabled, order-0 pages are checked for expected state when
2276 * being allocated from pcp lists. With debug_pagealloc also enabled, they are
2277 * also checked when pcp lists are refilled from the free lists.
2278 */
2279static inline bool check_pcp_refill(struct page *page)
479f854a 2280{
8e57f8ac 2281 if (debug_pagealloc_enabled_static())
4462b32c
VB
2282 return check_new_page(page);
2283 else
2284 return false;
479f854a
MG
2285}
2286
4462b32c 2287static inline bool check_new_pcp(struct page *page)
479f854a
MG
2288{
2289 return check_new_page(page);
2290}
2291#else
4462b32c
VB
2292/*
2293 * With DEBUG_VM disabled, free order-0 pages are checked for expected state
2294 * when pcp lists are being refilled from the free lists. With debug_pagealloc
2295 * enabled, they are also checked when being allocated from the pcp lists.
2296 */
2297static inline bool check_pcp_refill(struct page *page)
479f854a
MG
2298{
2299 return check_new_page(page);
2300}
4462b32c 2301static inline bool check_new_pcp(struct page *page)
479f854a 2302{
8e57f8ac 2303 if (debug_pagealloc_enabled_static())
4462b32c
VB
2304 return check_new_page(page);
2305 else
2306 return false;
479f854a
MG
2307}
2308#endif /* CONFIG_DEBUG_VM */
2309
2310static bool check_new_pages(struct page *page, unsigned int order)
2311{
2312 int i;
2313 for (i = 0; i < (1 << order); i++) {
2314 struct page *p = page + i;
2315
2316 if (unlikely(check_new_page(p)))
2317 return true;
2318 }
2319
2320 return false;
2321}
2322
46f24fd8
JK
2323inline void post_alloc_hook(struct page *page, unsigned int order,
2324 gfp_t gfp_flags)
2325{
1bb5eab3
AK
2326 bool init;
2327
46f24fd8
JK
2328 set_page_private(page, 0);
2329 set_page_refcounted(page);
2330
2331 arch_alloc_page(page, order);
77bc7fd6 2332 debug_pagealloc_map_pages(page, 1 << order);
1bb5eab3
AK
2333
2334 /*
2335 * Page unpoisoning must happen before memory initialization.
2336 * Otherwise, the poison pattern will be overwritten for __GFP_ZERO
2337 * allocations and the page unpoisoning code will complain.
2338 */
8db26a3d 2339 kernel_unpoison_pages(page, 1 << order);
862b6dee 2340
1bb5eab3
AK
2341 /*
2342 * As memory initialization might be integrated into KASAN,
2343 * kasan_alloc_pages and kernel_init_free_pages must be
2344 * kept together to avoid discrepancies in behavior.
2345 */
2346 init = !want_init_on_free() && want_init_on_alloc(gfp_flags);
2347 kasan_alloc_pages(page, order, init);
2348 if (init && !kasan_has_integrated_init())
862b6dee 2349 kernel_init_free_pages(page, 1 << order);
1bb5eab3
AK
2350
2351 set_page_owner(page, order, gfp_flags);
46f24fd8
JK
2352}
2353
479f854a 2354static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
c603844b 2355 unsigned int alloc_flags)
2a7684a2 2356{
46f24fd8 2357 post_alloc_hook(page, order, gfp_flags);
17cf4406 2358
17cf4406
NP
2359 if (order && (gfp_flags & __GFP_COMP))
2360 prep_compound_page(page, order);
2361
75379191 2362 /*
2f064f34 2363 * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
75379191
VB
2364 * allocate the page. The expectation is that the caller is taking
2365 * steps that will free more memory. The caller should avoid the page
2366 * being used for !PFMEMALLOC purposes.
2367 */
2f064f34
MH
2368 if (alloc_flags & ALLOC_NO_WATERMARKS)
2369 set_page_pfmemalloc(page);
2370 else
2371 clear_page_pfmemalloc(page);
1da177e4
LT
2372}
2373
56fd56b8
MG
2374/*
2375 * Go through the free lists for the given migratetype and remove
2376 * the smallest available page from the freelists
2377 */
85ccc8fa 2378static __always_inline
728ec980 2379struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
56fd56b8
MG
2380 int migratetype)
2381{
2382 unsigned int current_order;
b8af2941 2383 struct free_area *area;
56fd56b8
MG
2384 struct page *page;
2385
2386 /* Find a page of the appropriate size in the preferred list */
2387 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
2388 area = &(zone->free_area[current_order]);
b03641af 2389 page = get_page_from_free_area(area, migratetype);
a16601c5
GT
2390 if (!page)
2391 continue;
6ab01363
AD
2392 del_page_from_free_list(page, zone, current_order);
2393 expand(zone, page, order, current_order, migratetype);
bb14c2c7 2394 set_pcppage_migratetype(page, migratetype);
56fd56b8
MG
2395 return page;
2396 }
2397
2398 return NULL;
2399}
2400
2401
b2a0ac88
MG
2402/*
 2403 * This array describes the order in which free lists are fallen back on
 2404 * when the free lists for the desired migratetype are depleted
2405 */
da415663 2406static int fallbacks[MIGRATE_TYPES][3] = {
974a786e 2407 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
974a786e 2408 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
7ead3342 2409 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
47118af0 2410#ifdef CONFIG_CMA
974a786e 2411 [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */
47118af0 2412#endif
194159fb 2413#ifdef CONFIG_MEMORY_ISOLATION
974a786e 2414 [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */
194159fb 2415#endif
b2a0ac88
MG
2416};
2417
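Editor's note: a self-contained userspace sketch of how the fallbacks[] table above is walked, in the spirit of find_suitable_fallback(). The enum values are local stand-ins for the kernel's migratetype enum, and MIGRATE_TYPES terminates each per-type list just as it does in the real table.

#include <stdio.h>

enum { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_TYPES };

static const int fallbacks[MIGRATE_TYPES][3] = {
        [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_TYPES },
        [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
        [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_TYPES },
};

static const char *name[] = { "UNMOVABLE", "MOVABLE", "RECLAIMABLE" };

int main(void)
{
        int start = MIGRATE_UNMOVABLE;

        /* Walk the candidates in order until the MIGRATE_TYPES terminator. */
        for (int i = 0; fallbacks[start][i] != MIGRATE_TYPES; i++)
                printf("fallback %d for %s: %s\n",
                       i, name[start], name[fallbacks[start][i]]);
        return 0;
}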
dc67647b 2418#ifdef CONFIG_CMA
85ccc8fa 2419static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone,
dc67647b
JK
2420 unsigned int order)
2421{
2422 return __rmqueue_smallest(zone, order, MIGRATE_CMA);
2423}
2424#else
2425static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
2426 unsigned int order) { return NULL; }
2427#endif
2428
c361be55 2429/*
293ffa5e 2430 * Move the free pages in a range to the freelist tail of the requested type.
 d9c23400 2431 * Note that start_pfn and end_pfn are not aligned on a pageblock
c361be55
MG
2432 * boundary. If alignment is required, use move_freepages_block()
2433 */
02aa0cdd 2434static int move_freepages(struct zone *zone,
39ddb991 2435 unsigned long start_pfn, unsigned long end_pfn,
02aa0cdd 2436 int migratetype, int *num_movable)
c361be55
MG
2437{
2438 struct page *page;
39ddb991 2439 unsigned long pfn;
d00181b9 2440 unsigned int order;
d100313f 2441 int pages_moved = 0;
c361be55 2442
39ddb991
KW
2443 for (pfn = start_pfn; pfn <= end_pfn;) {
2444 if (!pfn_valid_within(pfn)) {
2445 pfn++;
c361be55
MG
2446 continue;
2447 }
2448
39ddb991 2449 page = pfn_to_page(pfn);
c361be55 2450 if (!PageBuddy(page)) {
02aa0cdd
VB
2451 /*
2452 * We assume that pages that could be isolated for
2453 * migration are movable. But we don't actually try
2454 * isolating, as that would be expensive.
2455 */
2456 if (num_movable &&
2457 (PageLRU(page) || __PageMovable(page)))
2458 (*num_movable)++;
39ddb991 2459 pfn++;
c361be55
MG
2460 continue;
2461 }
2462
cd961038
DR
2463 /* Make sure we are not inadvertently changing nodes */
2464 VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
2465 VM_BUG_ON_PAGE(page_zone(page) != zone, page);
2466
ab130f91 2467 order = buddy_order(page);
6ab01363 2468 move_to_free_list(page, zone, order, migratetype);
39ddb991 2469 pfn += 1 << order;
d100313f 2470 pages_moved += 1 << order;
c361be55
MG
2471 }
2472
d100313f 2473 return pages_moved;
c361be55
MG
2474}
2475
ee6f509c 2476int move_freepages_block(struct zone *zone, struct page *page,
02aa0cdd 2477 int migratetype, int *num_movable)
c361be55 2478{
39ddb991 2479 unsigned long start_pfn, end_pfn, pfn;
c361be55 2480
4a222127
DR
2481 if (num_movable)
2482 *num_movable = 0;
2483
39ddb991
KW
2484 pfn = page_to_pfn(page);
2485 start_pfn = pfn & ~(pageblock_nr_pages - 1);
d9c23400 2486 end_pfn = start_pfn + pageblock_nr_pages - 1;
c361be55
MG
2487
2488 /* Do not cross zone boundaries */
108bcc96 2489 if (!zone_spans_pfn(zone, start_pfn))
39ddb991 2490 start_pfn = pfn;
108bcc96 2491 if (!zone_spans_pfn(zone, end_pfn))
c361be55
MG
2492 return 0;
2493
39ddb991 2494 return move_freepages(zone, start_pfn, end_pfn, migratetype,
02aa0cdd 2495 num_movable);
c361be55
MG
2496}
2497
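Editor's note: a tiny userspace illustration of the pageblock rounding done at the top of move_freepages_block() above. pageblock_nr_pages is assumed to be 512 (order-9 pageblocks, typical for x86-64 with 4K base pages); the pfn is hypothetical.

#include <stdio.h>

int main(void)
{
        unsigned long pageblock_nr_pages = 512; /* assumed, config dependent */
        unsigned long pfn = 123456;             /* hypothetical page frame   */

        /* Same mask arithmetic as move_freepages_block(). */
        unsigned long start_pfn = pfn & ~(pageblock_nr_pages - 1);
        unsigned long end_pfn = start_pfn + pageblock_nr_pages - 1;

        printf("pfn %lu lies in pageblock [%lu, %lu]\n", pfn, start_pfn, end_pfn);
        return 0;
}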
2f66a68f
MG
2498static void change_pageblock_range(struct page *pageblock_page,
2499 int start_order, int migratetype)
2500{
2501 int nr_pageblocks = 1 << (start_order - pageblock_order);
2502
2503 while (nr_pageblocks--) {
2504 set_pageblock_migratetype(pageblock_page, migratetype);
2505 pageblock_page += pageblock_nr_pages;
2506 }
2507}
2508
fef903ef 2509/*
9c0415eb
VB
2510 * When we are falling back to another migratetype during allocation, try to
2511 * steal extra free pages from the same pageblocks to satisfy further
2512 * allocations, instead of polluting multiple pageblocks.
2513 *
2514 * If we are stealing a relatively large buddy page, it is likely there will
2515 * be more free pages in the pageblock, so try to steal them all. For
2516 * reclaimable and unmovable allocations, we steal regardless of page size,
2517 * as fragmentation caused by those allocations polluting movable pageblocks
2518 * is worse than movable allocations stealing from unmovable and reclaimable
2519 * pageblocks.
fef903ef 2520 */
4eb7dce6
JK
2521static bool can_steal_fallback(unsigned int order, int start_mt)
2522{
2523 /*
 2524 * This order check is intentionally kept even though the check
 2525 * below is more relaxed. The reason is that we can actually steal
 2526 * a whole pageblock if this condition is met, but the check below
 2527 * doesn't guarantee it and is just a heuristic, so it could be
 2528 * changed at any time.
2529 */
2530 if (order >= pageblock_order)
2531 return true;
2532
2533 if (order >= pageblock_order / 2 ||
2534 start_mt == MIGRATE_RECLAIMABLE ||
2535 start_mt == MIGRATE_UNMOVABLE ||
2536 page_group_by_mobility_disabled)
2537 return true;
2538
2539 return false;
2540}
2541
597c8920 2542static inline bool boost_watermark(struct zone *zone)
1c30844d
MG
2543{
2544 unsigned long max_boost;
2545
2546 if (!watermark_boost_factor)
597c8920 2547 return false;
14f69140
HW
2548 /*
2549 * Don't bother in zones that are unlikely to produce results.
2550 * On small machines, including kdump capture kernels running
2551 * in a small area, boosting the watermark can cause an out of
2552 * memory situation immediately.
2553 */
2554 if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
597c8920 2555 return false;
1c30844d
MG
2556
2557 max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
2558 watermark_boost_factor, 10000);
94b3334c
MG
2559
2560 /*
2561 * high watermark may be uninitialised if fragmentation occurs
2562 * very early in boot so do not boost. We do not fall
2563 * through and boost by pageblock_nr_pages as failing
2564 * allocations that early means that reclaim is not going
2565 * to help and it may even be impossible to reclaim the
2566 * boosted watermark resulting in a hang.
2567 */
2568 if (!max_boost)
597c8920 2569 return false;
94b3334c 2570
1c30844d
MG
2571 max_boost = max(pageblock_nr_pages, max_boost);
2572
2573 zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
2574 max_boost);
597c8920
JW
2575
2576 return true;
1c30844d
MG
2577}
2578
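Editor's note: a userspace sketch of the boost arithmetic in boost_watermark() above. The watermark and pageblock numbers are made up, watermark_boost_factor is taken at its default of 15000 (i.e. 150%), and mult_frac() is re-implemented in simplified form only so the snippet compiles on its own.

#include <stdio.h>

/* Simplified stand-in for the kernel's mult_frac() (avoids x * num overflow). */
static unsigned long mult_frac(unsigned long x, unsigned long num,
                               unsigned long den)
{
        return (x / den) * num + (x % den) * num / den;
}

int main(void)
{
        unsigned long wmark_high = 10240;               /* pages, hypothetical */
        unsigned long watermark_boost_factor = 15000;   /* default sysctl      */
        unsigned long pageblock_nr_pages = 512;         /* assumed             */
        unsigned long boost = 0;

        unsigned long max_boost = mult_frac(wmark_high,
                                            watermark_boost_factor, 10000);
        if (max_boost < pageblock_nr_pages)
                max_boost = pageblock_nr_pages;

        /* Each fallback event raises the boost by one pageblock, clamped. */
        for (int i = 0; i < 40; i++) {
                unsigned long next = boost + pageblock_nr_pages;

                boost = next < max_boost ? next : max_boost;
        }

        printf("max_boost = %lu pages, boost after 40 fallbacks = %lu\n",
               max_boost, boost);
        return 0;
}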
4eb7dce6
JK
2579/*
 2580 * This function implements the actual steal behaviour. If the order is large
 2581 * enough, we can steal the whole pageblock. If not, we first move the free pages in this
02aa0cdd
VB
2582 * pageblock to our migratetype and determine how many already-allocated pages
2583 * are there in the pageblock with a compatible migratetype. If at least half
 2584 * of the pages are free or compatible, we can change the migratetype of the pageblock
2585 * itself, so pages freed in the future will be put on the correct free list.
4eb7dce6
JK
2586 */
2587static void steal_suitable_fallback(struct zone *zone, struct page *page,
1c30844d 2588 unsigned int alloc_flags, int start_type, bool whole_block)
fef903ef 2589{
ab130f91 2590 unsigned int current_order = buddy_order(page);
02aa0cdd
VB
2591 int free_pages, movable_pages, alike_pages;
2592 int old_block_type;
2593
2594 old_block_type = get_pageblock_migratetype(page);
fef903ef 2595
3bc48f96
VB
2596 /*
2597 * This can happen due to races and we want to prevent broken
2598 * highatomic accounting.
2599 */
02aa0cdd 2600 if (is_migrate_highatomic(old_block_type))
3bc48f96
VB
2601 goto single_page;
2602
fef903ef
SB
2603 /* Take ownership for orders >= pageblock_order */
2604 if (current_order >= pageblock_order) {
2605 change_pageblock_range(page, current_order, start_type);
3bc48f96 2606 goto single_page;
fef903ef
SB
2607 }
2608
1c30844d
MG
2609 /*
2610 * Boost watermarks to increase reclaim pressure to reduce the
2611 * likelihood of future fallbacks. Wake kswapd now as the node
2612 * may be balanced overall and kswapd will not wake naturally.
2613 */
597c8920 2614 if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
73444bc4 2615 set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
1c30844d 2616
3bc48f96
VB
2617 /* We are not allowed to try stealing from the whole block */
2618 if (!whole_block)
2619 goto single_page;
2620
02aa0cdd
VB
2621 free_pages = move_freepages_block(zone, page, start_type,
2622 &movable_pages);
2623 /*
2624 * Determine how many pages are compatible with our allocation.
2625 * For movable allocation, it's the number of movable pages which
2626 * we just obtained. For other types it's a bit more tricky.
2627 */
2628 if (start_type == MIGRATE_MOVABLE) {
2629 alike_pages = movable_pages;
2630 } else {
2631 /*
2632 * If we are falling back a RECLAIMABLE or UNMOVABLE allocation
2633 * to MOVABLE pageblock, consider all non-movable pages as
2634 * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
2635 * vice versa, be conservative since we can't distinguish the
2636 * exact migratetype of non-movable pages.
2637 */
2638 if (old_block_type == MIGRATE_MOVABLE)
2639 alike_pages = pageblock_nr_pages
2640 - (free_pages + movable_pages);
2641 else
2642 alike_pages = 0;
2643 }
2644
3bc48f96 2645 /* moving whole block can fail due to zone boundary conditions */
02aa0cdd 2646 if (!free_pages)
3bc48f96 2647 goto single_page;
fef903ef 2648
02aa0cdd
VB
2649 /*
2650 * If a sufficient number of pages in the block are either free or of
2651 * comparable migratability as our allocation, claim the whole block.
2652 */
2653 if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
4eb7dce6
JK
2654 page_group_by_mobility_disabled)
2655 set_pageblock_migratetype(page, start_type);
3bc48f96
VB
2656
2657 return;
2658
2659single_page:
6ab01363 2660 move_to_free_list(page, zone, current_order, start_type);
4eb7dce6
JK
2661}
2662
2149cdae
JK
2663/*
 2664 * Check whether there is a suitable fallback freepage of the requested order.
 2665 * If only_stealable is true, this function returns fallback_mt only if
 2666 * we can steal other freepages altogether. This would help to reduce
2667 * fragmentation due to mixed migratetype pages in one pageblock.
2668 */
2669int find_suitable_fallback(struct free_area *area, unsigned int order,
2670 int migratetype, bool only_stealable, bool *can_steal)
4eb7dce6
JK
2671{
2672 int i;
2673 int fallback_mt;
2674
2675 if (area->nr_free == 0)
2676 return -1;
2677
2678 *can_steal = false;
2679 for (i = 0;; i++) {
2680 fallback_mt = fallbacks[migratetype][i];
974a786e 2681 if (fallback_mt == MIGRATE_TYPES)
4eb7dce6
JK
2682 break;
2683
b03641af 2684 if (free_area_empty(area, fallback_mt))
4eb7dce6 2685 continue;
fef903ef 2686
4eb7dce6
JK
2687 if (can_steal_fallback(order, migratetype))
2688 *can_steal = true;
2689
2149cdae
JK
2690 if (!only_stealable)
2691 return fallback_mt;
2692
2693 if (*can_steal)
2694 return fallback_mt;
fef903ef 2695 }
4eb7dce6
JK
2696
2697 return -1;
fef903ef
SB
2698}
2699
0aaa29a5
MG
2700/*
2701 * Reserve a pageblock for exclusive use of high-order atomic allocations if
2702 * there are no empty page blocks that contain a page with a suitable order
2703 */
2704static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
2705 unsigned int alloc_order)
2706{
2707 int mt;
2708 unsigned long max_managed, flags;
2709
2710 /*
2711 * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
 2712 * The check is race-prone but harmless.
2713 */
9705bea5 2714 max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages;
0aaa29a5
MG
2715 if (zone->nr_reserved_highatomic >= max_managed)
2716 return;
2717
2718 spin_lock_irqsave(&zone->lock, flags);
2719
2720 /* Recheck the nr_reserved_highatomic limit under the lock */
2721 if (zone->nr_reserved_highatomic >= max_managed)
2722 goto out_unlock;
2723
2724 /* Yoink! */
2725 mt = get_pageblock_migratetype(page);
a6ffdc07
XQ
2726 if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt)
2727 && !is_migrate_cma(mt)) {
0aaa29a5
MG
2728 zone->nr_reserved_highatomic += pageblock_nr_pages;
2729 set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
02aa0cdd 2730 move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
0aaa29a5
MG
2731 }
2732
2733out_unlock:
2734 spin_unlock_irqrestore(&zone->lock, flags);
2735}
2736
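Editor's note: the roughly-1% cap computed at the top of reserve_highatomic_pageblock() above, worked through in userspace for a hypothetical 4 GiB zone with 4K pages and order-9 (512-page) pageblocks.

#include <stdio.h>

int main(void)
{
        unsigned long managed_pages = 1UL << 20;        /* 4 GiB of 4K pages, hypothetical */
        unsigned long pageblock_nr_pages = 512;         /* assumed, config dependent       */

        /* max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages */
        unsigned long max_managed = managed_pages / 100 + pageblock_nr_pages;

        printf("at most %lu pages (~%lu pageblocks) may be reserved highatomic\n",
               max_managed, max_managed / pageblock_nr_pages);
        return 0;
}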
2737/*
2738 * Used when an allocation is about to fail under memory pressure. This
2739 * potentially hurts the reliability of high-order allocations when under
2740 * intense memory pressure but failed atomic allocations should be easier
2741 * to recover from than an OOM.
29fac03b
MK
2742 *
2743 * If @force is true, try to unreserve a pageblock even though highatomic
2744 * pageblock is exhausted.
0aaa29a5 2745 */
29fac03b
MK
2746static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
2747 bool force)
0aaa29a5
MG
2748{
2749 struct zonelist *zonelist = ac->zonelist;
2750 unsigned long flags;
2751 struct zoneref *z;
2752 struct zone *zone;
2753 struct page *page;
2754 int order;
04c8716f 2755 bool ret;
0aaa29a5 2756
97a225e6 2757 for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
0aaa29a5 2758 ac->nodemask) {
29fac03b
MK
2759 /*
2760 * Preserve at least one pageblock unless memory pressure
2761 * is really high.
2762 */
2763 if (!force && zone->nr_reserved_highatomic <=
2764 pageblock_nr_pages)
0aaa29a5
MG
2765 continue;
2766
2767 spin_lock_irqsave(&zone->lock, flags);
2768 for (order = 0; order < MAX_ORDER; order++) {
2769 struct free_area *area = &(zone->free_area[order]);
2770
b03641af 2771 page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
a16601c5 2772 if (!page)
0aaa29a5
MG
2773 continue;
2774
0aaa29a5 2775 /*
4855e4a7
MK
2776 * In page freeing path, migratetype change is racy so
 2777 * we can encounter several free pages in a pageblock
f0953a1b 2778 * in this loop although we changed the pageblock type
4855e4a7
MK
2779 * from highatomic to ac->migratetype. So we should
2780 * adjust the count once.
0aaa29a5 2781 */
a6ffdc07 2782 if (is_migrate_highatomic_page(page)) {
4855e4a7
MK
2783 /*
2784 * It should never happen but changes to
2785 * locking could inadvertently allow a per-cpu
2786 * drain to add pages to MIGRATE_HIGHATOMIC
2787 * while unreserving so be safe and watch for
2788 * underflows.
2789 */
2790 zone->nr_reserved_highatomic -= min(
2791 pageblock_nr_pages,
2792 zone->nr_reserved_highatomic);
2793 }
0aaa29a5
MG
2794
2795 /*
2796 * Convert to ac->migratetype and avoid the normal
2797 * pageblock stealing heuristics. Minimally, the caller
2798 * is doing the work and needs the pages. More
2799 * importantly, if the block was always converted to
2800 * MIGRATE_UNMOVABLE or another type then the number
2801 * of pageblocks that cannot be completely freed
2802 * may increase.
2803 */
2804 set_pageblock_migratetype(page, ac->migratetype);
02aa0cdd
VB
2805 ret = move_freepages_block(zone, page, ac->migratetype,
2806 NULL);
29fac03b
MK
2807 if (ret) {
2808 spin_unlock_irqrestore(&zone->lock, flags);
2809 return ret;
2810 }
0aaa29a5
MG
2811 }
2812 spin_unlock_irqrestore(&zone->lock, flags);
2813 }
04c8716f
MK
2814
2815 return false;
0aaa29a5
MG
2816}
2817
3bc48f96
VB
2818/*
2819 * Try finding a free buddy page on the fallback list and put it on the free
2820 * list of requested migratetype, possibly along with other pages from the same
2821 * block, depending on fragmentation avoidance heuristics. Returns true if
2822 * fallback was found so that __rmqueue_smallest() can grab it.
b002529d
RV
2823 *
2824 * The use of signed ints for order and current_order is a deliberate
2825 * deviation from the rest of this file, to make the for loop
2826 * condition simpler.
3bc48f96 2827 */
85ccc8fa 2828static __always_inline bool
6bb15450
MG
2829__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
2830 unsigned int alloc_flags)
b2a0ac88 2831{
b8af2941 2832 struct free_area *area;
b002529d 2833 int current_order;
6bb15450 2834 int min_order = order;
b2a0ac88 2835 struct page *page;
4eb7dce6
JK
2836 int fallback_mt;
2837 bool can_steal;
b2a0ac88 2838
6bb15450
MG
2839 /*
2840 * Do not steal pages from freelists belonging to other pageblocks
2841 * i.e. orders < pageblock_order. If there are no local zones free,
2842 * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
2843 */
2844 if (alloc_flags & ALLOC_NOFRAGMENT)
2845 min_order = pageblock_order;
2846
7a8f58f3
VB
2847 /*
2848 * Find the largest available free page in the other list. This roughly
2849 * approximates finding the pageblock with the most free pages, which
2850 * would be too costly to do exactly.
2851 */
6bb15450 2852 for (current_order = MAX_ORDER - 1; current_order >= min_order;
7aeb09f9 2853 --current_order) {
4eb7dce6
JK
2854 area = &(zone->free_area[current_order]);
2855 fallback_mt = find_suitable_fallback(area, current_order,
2149cdae 2856 start_migratetype, false, &can_steal);
4eb7dce6
JK
2857 if (fallback_mt == -1)
2858 continue;
b2a0ac88 2859
7a8f58f3
VB
2860 /*
2861 * We cannot steal all free pages from the pageblock and the
2862 * requested migratetype is movable. In that case it's better to
2863 * steal and split the smallest available page instead of the
2864 * largest available page, because even if the next movable
2865 * allocation falls back into a different pageblock than this
2866 * one, it won't cause permanent fragmentation.
2867 */
2868 if (!can_steal && start_migratetype == MIGRATE_MOVABLE
2869 && current_order > order)
2870 goto find_smallest;
b2a0ac88 2871
7a8f58f3
VB
2872 goto do_steal;
2873 }
e0fff1bd 2874
7a8f58f3 2875 return false;
e0fff1bd 2876
7a8f58f3
VB
2877find_smallest:
2878 for (current_order = order; current_order < MAX_ORDER;
2879 current_order++) {
2880 area = &(zone->free_area[current_order]);
2881 fallback_mt = find_suitable_fallback(area, current_order,
2882 start_migratetype, false, &can_steal);
2883 if (fallback_mt != -1)
2884 break;
b2a0ac88
MG
2885 }
2886
7a8f58f3
VB
2887 /*
2888 * This should not happen - we already found a suitable fallback
2889 * when looking for the largest page.
2890 */
2891 VM_BUG_ON(current_order == MAX_ORDER);
2892
2893do_steal:
b03641af 2894 page = get_page_from_free_area(area, fallback_mt);
7a8f58f3 2895
1c30844d
MG
2896 steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
2897 can_steal);
7a8f58f3
VB
2898
2899 trace_mm_page_alloc_extfrag(page, order, current_order,
2900 start_migratetype, fallback_mt);
2901
2902 return true;
2903
b2a0ac88
MG
2904}
2905
56fd56b8 2906/*
1da177e4
LT
2907 * Do the hard work of removing an element from the buddy allocator.
2908 * Call me with the zone->lock already held.
2909 */
85ccc8fa 2910static __always_inline struct page *
6bb15450
MG
2911__rmqueue(struct zone *zone, unsigned int order, int migratetype,
2912 unsigned int alloc_flags)
1da177e4 2913{
1da177e4
LT
2914 struct page *page;
2915
ce8f86ee
H
2916 if (IS_ENABLED(CONFIG_CMA)) {
2917 /*
2918 * Balance movable allocations between regular and CMA areas by
2919 * allocating from CMA when over half of the zone's free memory
2920 * is in the CMA area.
2921 */
2922 if (alloc_flags & ALLOC_CMA &&
2923 zone_page_state(zone, NR_FREE_CMA_PAGES) >
2924 zone_page_state(zone, NR_FREE_PAGES) / 2) {
2925 page = __rmqueue_cma_fallback(zone, order);
2926 if (page)
2927 goto out;
2928 }
16867664 2929 }
3bc48f96 2930retry:
56fd56b8 2931 page = __rmqueue_smallest(zone, order, migratetype);
974a786e 2932 if (unlikely(!page)) {
8510e69c 2933 if (alloc_flags & ALLOC_CMA)
dc67647b
JK
2934 page = __rmqueue_cma_fallback(zone, order);
2935
6bb15450
MG
2936 if (!page && __rmqueue_fallback(zone, order, migratetype,
2937 alloc_flags))
3bc48f96 2938 goto retry;
728ec980 2939 }
ce8f86ee
H
2940out:
2941 if (page)
2942 trace_mm_page_alloc_zone_locked(page, order, migratetype);
b2a0ac88 2943 return page;
1da177e4
LT
2944}
2945
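Editor's note: a trivial userspace restatement of the CMA balancing test in __rmqueue() above, with hypothetical counters. The rule is simply that once more than half of a zone's free pages sit in the CMA area, an ALLOC_CMA allocation tries the CMA free lists first.

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
        /* Hypothetical per-zone vmstat counters. */
        unsigned long nr_free_pages = 200000;
        unsigned long nr_free_cma   = 120000;

        /* Same condition as __rmqueue(): prefer CMA when it holds more than
         * half of the zone's free memory. */
        bool try_cma_first = nr_free_cma > nr_free_pages / 2;

        printf("free=%lu free_cma=%lu -> try CMA first: %s\n",
               nr_free_pages, nr_free_cma, try_cma_first ? "yes" : "no");
        return 0;
}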
5f63b720 2946/*
1da177e4
LT
2947 * Obtain a specified number of elements from the buddy allocator, all under
2948 * a single hold of the lock, for efficiency. Add them to the supplied list.
2949 * Returns the number of new pages which were placed at *list.
2950 */
5f63b720 2951static int rmqueue_bulk(struct zone *zone, unsigned int order,
b2a0ac88 2952 unsigned long count, struct list_head *list,
6bb15450 2953 int migratetype, unsigned int alloc_flags)
1da177e4 2954{
cb66bede 2955 int i, allocated = 0;
5f63b720 2956
d34b0733 2957 spin_lock(&zone->lock);
1da177e4 2958 for (i = 0; i < count; ++i) {
6bb15450
MG
2959 struct page *page = __rmqueue(zone, order, migratetype,
2960 alloc_flags);
085cc7d5 2961 if (unlikely(page == NULL))
1da177e4 2962 break;
81eabcbe 2963
479f854a
MG
2964 if (unlikely(check_pcp_refill(page)))
2965 continue;
2966
81eabcbe 2967 /*
0fac3ba5
VB
2968 * Split buddy pages returned by expand() are received here in
2969 * physical page order. The page is added to the tail of
 2970 * the caller's list. From the caller's perspective, the linked list
 2971 * is ordered by page number under some conditions. This is
 2972 * useful for IO devices that can only move forward from the
 2973 * head, and thus also walk the pages in physical order; such
 2974 * devices can merge IO requests if the physical pages are
 2975 * ordered properly.
81eabcbe 2976 */
0fac3ba5 2977 list_add_tail(&page->lru, list);
cb66bede 2978 allocated++;
bb14c2c7 2979 if (is_migrate_cma(get_pcppage_migratetype(page)))
d1ce749a
BZ
2980 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
2981 -(1 << order));
1da177e4 2982 }
a6de734b
MG
2983
2984 /*
 2985 * i pages were removed from the buddy list even if some leaked due
 2986 * to check_pcp_refill() failing, so adjust NR_FREE_PAGES based
 cb66bede 2987 * on i. Do not confuse this with 'allocated', which is the number of
a6de734b
MG
2988 * pages added to the pcp list.
2989 */
f2260e6b 2990 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
d34b0733 2991 spin_unlock(&zone->lock);
cb66bede 2992 return allocated;
1da177e4
LT
2993}
2994
4ae7c039 2995#ifdef CONFIG_NUMA
8fce4d8e 2996/*
4037d452
CL
2997 * Called from the vmstat counter updater to drain pagesets of this
2998 * currently executing processor on remote nodes after they have
2999 * expired.
3000 *
879336c3
CL
3001 * Note that this function must be called with the thread pinned to
3002 * a single processor.
8fce4d8e 3003 */
4037d452 3004void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
4ae7c039 3005{
4ae7c039 3006 unsigned long flags;
7be12fc9 3007 int to_drain, batch;
4ae7c039 3008
4037d452 3009 local_irq_save(flags);
4db0c3c2 3010 batch = READ_ONCE(pcp->batch);
7be12fc9 3011 to_drain = min(pcp->count, batch);
77ba9062 3012 if (to_drain > 0)
2a13515c 3013 free_pcppages_bulk(zone, to_drain, pcp);
4037d452 3014 local_irq_restore(flags);
4ae7c039
CL
3015}
3016#endif
3017
9f8f2172 3018/*
93481ff0 3019 * Drain pcplists of the indicated processor and zone.
9f8f2172
CL
3020 *
3021 * The processor must either be the current processor and the
3022 * thread pinned to the current processor or a processor that
3023 * is not online.
3024 */
93481ff0 3025static void drain_pages_zone(unsigned int cpu, struct zone *zone)
1da177e4 3026{
c54ad30c 3027 unsigned long flags;
93481ff0
VB
3028 struct per_cpu_pageset *pset;
3029 struct per_cpu_pages *pcp;
1da177e4 3030
93481ff0
VB
3031 local_irq_save(flags);
3032 pset = per_cpu_ptr(zone->pageset, cpu);
1da177e4 3033
93481ff0 3034 pcp = &pset->pcp;
77ba9062 3035 if (pcp->count)
93481ff0 3036 free_pcppages_bulk(zone, pcp->count, pcp);
93481ff0
VB
3037 local_irq_restore(flags);
3038}
3dfa5721 3039
93481ff0
VB
3040/*
3041 * Drain pcplists of all zones on the indicated processor.
3042 *
3043 * The processor must either be the current processor and the
3044 * thread pinned to the current processor or a processor that
3045 * is not online.
3046 */
3047static void drain_pages(unsigned int cpu)
3048{
3049 struct zone *zone;
3050
3051 for_each_populated_zone(zone) {
3052 drain_pages_zone(cpu, zone);
1da177e4
LT
3053 }
3054}
1da177e4 3055
9f8f2172
CL
3056/*
3057 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
93481ff0
VB
3058 *
3059 * The CPU has to be pinned. When zone parameter is non-NULL, spill just
3060 * the single zone's pages.
9f8f2172 3061 */
93481ff0 3062void drain_local_pages(struct zone *zone)
9f8f2172 3063{
93481ff0
VB
3064 int cpu = smp_processor_id();
3065
3066 if (zone)
3067 drain_pages_zone(cpu, zone);
3068 else
3069 drain_pages(cpu);
9f8f2172
CL
3070}
3071
0ccce3b9
MG
3072static void drain_local_pages_wq(struct work_struct *work)
3073{
d9367bd0
WY
3074 struct pcpu_drain *drain;
3075
3076 drain = container_of(work, struct pcpu_drain, work);
3077
a459eeb7
MH
3078 /*
3079 * drain_all_pages doesn't use proper cpu hotplug protection so
3080 * we can race with cpu offline when the WQ can move this from
 3081 * a cpu pinned worker to an unbound one. Starting out on a different
 f0953a1b 3082 * cpu is alright, but we also have to make sure not to migrate to
a459eeb7
MH
3083 * a different one.
3084 */
3085 preempt_disable();
d9367bd0 3086 drain_local_pages(drain->zone);
a459eeb7 3087 preempt_enable();
0ccce3b9
MG
3088}
3089
9f8f2172 3090/*
ec6e8c7e
VB
3091 * The implementation of drain_all_pages(), exposing an extra parameter to
3092 * drain on all cpus.
93481ff0 3093 *
ec6e8c7e
VB
3094 * drain_all_pages() is optimized to only execute on cpus where pcplists are
3095 * not empty. The check for non-emptiness can however race with a free to
3096 * pcplist that has not yet increased the pcp->count from 0 to 1. Callers
3097 * that need the guarantee that every CPU has drained can disable the
3098 * optimizing racy check.
9f8f2172 3099 */
3b1f3658 3100static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
9f8f2172 3101{
74046494 3102 int cpu;
74046494
GBY
3103
3104 /*
 3105 * Allocate in the BSS so we won't require allocation in
3106 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
3107 */
3108 static cpumask_t cpus_with_pcps;
3109
ce612879
MH
3110 /*
3111 * Make sure nobody triggers this path before mm_percpu_wq is fully
3112 * initialized.
3113 */
3114 if (WARN_ON_ONCE(!mm_percpu_wq))
3115 return;
3116
bd233f53
MG
3117 /*
3118 * Do not drain if one is already in progress unless it's specific to
3119 * a zone. Such callers are primarily CMA and memory hotplug and need
3120 * the drain to be complete when the call returns.
3121 */
3122 if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
3123 if (!zone)
3124 return;
3125 mutex_lock(&pcpu_drain_mutex);
3126 }
0ccce3b9 3127
74046494
GBY
3128 /*
3129 * We don't care about racing with CPU hotplug event
3130 * as offline notification will cause the notified
3131 * cpu to drain that CPU pcps and on_each_cpu_mask
3132 * disables preemption as part of its processing
3133 */
3134 for_each_online_cpu(cpu) {
93481ff0
VB
3135 struct per_cpu_pageset *pcp;
3136 struct zone *z;
74046494 3137 bool has_pcps = false;
93481ff0 3138
ec6e8c7e
VB
3139 if (force_all_cpus) {
3140 /*
3141 * The pcp.count check is racy, some callers need a
3142 * guarantee that no cpu is missed.
3143 */
3144 has_pcps = true;
3145 } else if (zone) {
74046494 3146 pcp = per_cpu_ptr(zone->pageset, cpu);
93481ff0 3147 if (pcp->pcp.count)
74046494 3148 has_pcps = true;
93481ff0
VB
3149 } else {
3150 for_each_populated_zone(z) {
3151 pcp = per_cpu_ptr(z->pageset, cpu);
3152 if (pcp->pcp.count) {
3153 has_pcps = true;
3154 break;
3155 }
74046494
GBY
3156 }
3157 }
93481ff0 3158
74046494
GBY
3159 if (has_pcps)
3160 cpumask_set_cpu(cpu, &cpus_with_pcps);
3161 else
3162 cpumask_clear_cpu(cpu, &cpus_with_pcps);
3163 }
0ccce3b9 3164
bd233f53 3165 for_each_cpu(cpu, &cpus_with_pcps) {
d9367bd0
WY
3166 struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
3167
3168 drain->zone = zone;
3169 INIT_WORK(&drain->work, drain_local_pages_wq);
3170 queue_work_on(cpu, mm_percpu_wq, &drain->work);
0ccce3b9 3171 }
bd233f53 3172 for_each_cpu(cpu, &cpus_with_pcps)
d9367bd0 3173 flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
bd233f53
MG
3174
3175 mutex_unlock(&pcpu_drain_mutex);
9f8f2172
CL
3176}
3177
ec6e8c7e
VB
3178/*
3179 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
3180 *
3181 * When zone parameter is non-NULL, spill just the single zone's pages.
3182 *
3183 * Note that this can be extremely slow as the draining happens in a workqueue.
3184 */
3185void drain_all_pages(struct zone *zone)
3186{
3187 __drain_all_pages(zone, false);
3188}
3189
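/*
 * Illustrative sketch, not upstream code: how the two drain entry points
 * above differ. example_drain() is a hypothetical caller; the
 * force_all_cpus variant is only reachable from within this file.
 */
static void example_drain(struct zone *zone)
{
	/* Racy-but-fast variant: skips CPUs whose pcplists look empty. */
	drain_all_pages(zone);

	/* Strict variant: queues drain work on every online CPU. */
	__drain_all_pages(zone, true);
}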
296699de 3190#ifdef CONFIG_HIBERNATION
1da177e4 3191
556b969a
CY
3192/*
3193 * Touch the watchdog for every WD_PAGE_COUNT pages.
3194 */
3195#define WD_PAGE_COUNT (128*1024)
3196
1da177e4
LT
3197void mark_free_pages(struct zone *zone)
3198{
556b969a 3199 unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
f623f0db 3200 unsigned long flags;
7aeb09f9 3201 unsigned int order, t;
86760a2c 3202 struct page *page;
1da177e4 3203
8080fc03 3204 if (zone_is_empty(zone))
1da177e4
LT
3205 return;
3206
3207 spin_lock_irqsave(&zone->lock, flags);
f623f0db 3208
108bcc96 3209 max_zone_pfn = zone_end_pfn(zone);
f623f0db
RW
3210 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
3211 if (pfn_valid(pfn)) {
86760a2c 3212 page = pfn_to_page(pfn);
ba6b0979 3213
556b969a
CY
3214 if (!--page_count) {
3215 touch_nmi_watchdog();
3216 page_count = WD_PAGE_COUNT;
3217 }
3218
ba6b0979
JK
3219 if (page_zone(page) != zone)
3220 continue;
3221
7be98234
RW
3222 if (!swsusp_page_is_forbidden(page))
3223 swsusp_unset_page_free(page);
f623f0db 3224 }
1da177e4 3225
b2a0ac88 3226 for_each_migratetype_order(order, t) {
86760a2c
GT
3227 list_for_each_entry(page,
3228 &zone->free_area[order].free_list[t], lru) {
f623f0db 3229 unsigned long i;
1da177e4 3230
86760a2c 3231 pfn = page_to_pfn(page);
556b969a
CY
3232 for (i = 0; i < (1UL << order); i++) {
3233 if (!--page_count) {
3234 touch_nmi_watchdog();
3235 page_count = WD_PAGE_COUNT;
3236 }
7be98234 3237 swsusp_set_page_free(pfn_to_page(pfn + i));
556b969a 3238 }
f623f0db 3239 }
b2a0ac88 3240 }
1da177e4
LT
3241 spin_unlock_irqrestore(&zone->lock, flags);
3242}
e2c55dc8 3243#endif /* CONFIG_HIBERNATION */
1da177e4 3244
2d4894b5 3245static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
1da177e4 3246{
5f8dcc21 3247 int migratetype;
1da177e4 3248
4db7548c 3249 if (!free_pcp_prepare(page))
9cca35d4 3250 return false;
689bcebf 3251
dc4b0caf 3252 migratetype = get_pfnblock_migratetype(page, pfn);
bb14c2c7 3253 set_pcppage_migratetype(page, migratetype);
9cca35d4
MG
3254 return true;
3255}
3256
2d4894b5 3257static void free_unref_page_commit(struct page *page, unsigned long pfn)
9cca35d4
MG
3258{
3259 struct zone *zone = page_zone(page);
3260 struct per_cpu_pages *pcp;
3261 int migratetype;
3262
3263 migratetype = get_pcppage_migratetype(page);
d34b0733 3264 __count_vm_event(PGFREE);
da456f14 3265
5f8dcc21
MG
3266 /*
3267 * We only track unmovable, reclaimable and movable on pcp lists.
3268 * Free ISOLATE pages back to the allocator because they are being
a6ffdc07 3269 * offlined but treat HIGHATOMIC as movable pages so we can get those
5f8dcc21
MG
3270 * areas back if necessary. Otherwise, we may have to free
3271 * excessively into the page allocator
3272 */
3273 if (migratetype >= MIGRATE_PCPTYPES) {
194159fb 3274 if (unlikely(is_migrate_isolate(migratetype))) {
7fef431b
DH
3275 free_one_page(zone, page, pfn, 0, migratetype,
3276 FPI_NONE);
9cca35d4 3277 return;
5f8dcc21
MG
3278 }
3279 migratetype = MIGRATE_MOVABLE;
3280 }
3281
99dcc3e5 3282 pcp = &this_cpu_ptr(zone->pageset)->pcp;
2d4894b5 3283 list_add(&page->lru, &pcp->lists[migratetype]);
1da177e4 3284 pcp->count++;
5c3ad2eb
VB
3285 if (pcp->count >= READ_ONCE(pcp->high))
3286 free_pcppages_bulk(zone, READ_ONCE(pcp->batch), pcp);
9cca35d4 3287}
5f8dcc21 3288
9cca35d4
MG
3289/*
3290 * Free a 0-order page
9cca35d4 3291 */
2d4894b5 3292void free_unref_page(struct page *page)
9cca35d4
MG
3293{
3294 unsigned long flags;
3295 unsigned long pfn = page_to_pfn(page);
3296
2d4894b5 3297 if (!free_unref_page_prepare(page, pfn))
9cca35d4
MG
3298 return;
3299
3300 local_irq_save(flags);
2d4894b5 3301 free_unref_page_commit(page, pfn);
d34b0733 3302 local_irq_restore(flags);
1da177e4
LT
3303}
3304
cc59850e
KK
3305/*
3306 * Free a list of 0-order pages
3307 */
2d4894b5 3308void free_unref_page_list(struct list_head *list)
cc59850e
KK
3309{
3310 struct page *page, *next;
9cca35d4 3311 unsigned long flags, pfn;
c24ad77d 3312 int batch_count = 0;
9cca35d4
MG
3313
3314 /* Prepare pages for freeing */
3315 list_for_each_entry_safe(page, next, list, lru) {
3316 pfn = page_to_pfn(page);
2d4894b5 3317 if (!free_unref_page_prepare(page, pfn))
9cca35d4
MG
3318 list_del(&page->lru);
3319 set_page_private(page, pfn);
3320 }
cc59850e 3321
9cca35d4 3322 local_irq_save(flags);
cc59850e 3323 list_for_each_entry_safe(page, next, list, lru) {
9cca35d4
MG
3324 unsigned long pfn = page_private(page);
3325
3326 set_page_private(page, 0);
2d4894b5
MG
3327 trace_mm_page_free_batched(page);
3328 free_unref_page_commit(page, pfn);
c24ad77d
LS
3329
3330 /*
3331 * Guard against excessive IRQ disabled times when we get
3332 * a large list of pages to free.
3333 */
3334 if (++batch_count == SWAP_CLUSTER_MAX) {
3335 local_irq_restore(flags);
3336 batch_count = 0;
3337 local_irq_save(flags);
3338 }
cc59850e 3339 }
9cca35d4 3340 local_irq_restore(flags);
cc59850e
KK
3341}
3342
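/*
 * Illustrative sketch (hypothetical helper, not part of page_alloc.c):
 * batching order-0 frees so free_unref_page_list() above can amortize the
 * IRQ-disabled section, re-enabling IRQs every SWAP_CLUSTER_MAX pages.
 * The pages are assumed to be order-0 and to have had their last
 * reference dropped already, as in release_pages().
 */
static void example_free_batch(struct page **pages, unsigned int nr)
{
	LIST_HEAD(pages_to_free);
	unsigned int i;

	for (i = 0; i < nr; i++)
		list_add(&pages[i]->lru, &pages_to_free);

	free_unref_page_list(&pages_to_free);
}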
8dfcc9ba
NP
3343/*
3344 * split_page takes a non-compound higher-order page, and splits it into
3345 * n (1<<order) sub-pages: page[0..n]
3346 * Each sub-page must be freed individually.
3347 *
3348 * Note: this is probably too low level an operation for use in drivers.
3349 * Please consult with lkml before using this in your driver.
3350 */
3351void split_page(struct page *page, unsigned int order)
3352{
3353 int i;
3354
309381fe
SL
3355 VM_BUG_ON_PAGE(PageCompound(page), page);
3356 VM_BUG_ON_PAGE(!page_count(page), page);
b1eeab67 3357
a9627bc5 3358 for (i = 1; i < (1 << order); i++)
7835e98b 3359 set_page_refcounted(page + i);
8fb156c9 3360 split_page_owner(page, 1 << order);
e1baddf8 3361 split_page_memcg(page, 1 << order);
8dfcc9ba 3362}
5853ff23 3363EXPORT_SYMBOL_GPL(split_page);
8dfcc9ba 3364
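/*
 * Illustrative sketch, not upstream code: a hypothetical caller turning
 * one non-compound order-N allocation into independent order-0 pages via
 * split_page(), then freeing each sub-page on its own as the comment
 * above requires.
 */
static struct page *example_split(unsigned int order)
{
	struct page *page = alloc_pages(GFP_KERNEL, order);
	unsigned int i;

	if (!page)
		return NULL;

	/* Give every sub-page its own reference count. */
	split_page(page, order);

	/* Keep page[0], release the rest individually. */
	for (i = 1; i < (1U << order); i++)
		__free_page(page + i);

	return page;	/* caller eventually frees it with __free_page() */
}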
3c605096 3365int __isolate_free_page(struct page *page, unsigned int order)
748446bb 3366{
748446bb
MG
3367 unsigned long watermark;
3368 struct zone *zone;
2139cbe6 3369 int mt;
748446bb
MG
3370
3371 BUG_ON(!PageBuddy(page));
3372
3373 zone = page_zone(page);
2e30abd1 3374 mt = get_pageblock_migratetype(page);
748446bb 3375
194159fb 3376 if (!is_migrate_isolate(mt)) {
8348faf9
VB
3377 /*
3378 * Obey watermarks as if the page was being allocated. We can
3379 * emulate a high-order watermark check with a raised order-0
3380 * watermark, because we already know our high-order page
3381 * exists.
3382 */
fd1444b2 3383 watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
d883c6cf 3384 if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
2e30abd1
MS
3385 return 0;
3386
8fb74b9f 3387 __mod_zone_freepage_state(zone, -(1UL << order), mt);
2e30abd1 3388 }
748446bb
MG
3389
3390 /* Remove page from free list */
b03641af 3391
6ab01363 3392 del_page_from_free_list(page, zone, order);
2139cbe6 3393
400bc7fd 3394 /*
3395 * Set the pageblock's migratetype if the isolated page spans at least
3396 * half of a pageblock.
3397 */
748446bb
MG
3398 if (order >= pageblock_order - 1) {
3399 struct page *endpage = page + (1 << order) - 1;
47118af0
MN
3400 for (; page < endpage; page += pageblock_nr_pages) {
3401 int mt = get_pageblock_migratetype(page);
88ed365e 3402 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
a6ffdc07 3403 && !is_migrate_highatomic(mt))
47118af0
MN
3404 set_pageblock_migratetype(page,
3405 MIGRATE_MOVABLE);
3406 }
748446bb
MG
3407 }
3408
f3a14ced 3409
8fb74b9f 3410 return 1UL << order;
1fb3f8ca
MG
3411}
3412
624f58d8
AD
3413/**
3414 * __putback_isolated_page - Return a now-isolated page back where we got it
3415 * @page: Page that was isolated
3416 * @order: Order of the isolated page
e6a0a7ad 3417 * @mt: The page's pageblock's migratetype
624f58d8
AD
3418 *
3419 * This function is meant to return a page pulled from the free lists via
3420 * __isolate_free_page() back to the free list it was pulled from.
3421 */
3422void __putback_isolated_page(struct page *page, unsigned int order, int mt)
3423{
3424 struct zone *zone = page_zone(page);
3425
3426 /* zone lock should be held when this function is called */
3427 lockdep_assert_held(&zone->lock);
3428
3429 /* Return isolated page to tail of freelist. */
f04a5d5d 3430 __free_one_page(page, page_to_pfn(page), zone, order, mt,
47b6a24a 3431 FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL);
624f58d8
AD
3432}
3433
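/*
 * Illustrative sketch (hypothetical helper, locking simplified): the
 * isolate/putback pairing used by callers such as page reporting. The
 * page is assumed to be a PageBuddy page of the given order, both calls
 * run under zone->lock, and the migratetype handed back is the
 * pageblock's current one.
 */
static int example_isolate_then_putback(struct page *page, unsigned int order)
{
	struct zone *zone = page_zone(page);
	unsigned long flags;
	int isolated;

	spin_lock_irqsave(&zone->lock, flags);
	isolated = __isolate_free_page(page, order);
	if (isolated)
		__putback_isolated_page(page, order,
					get_pageblock_migratetype(page));
	spin_unlock_irqrestore(&zone->lock, flags);

	return isolated;
}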
060e7417
MG
3434/*
3435 * Update NUMA hit/miss statistics
3436 *
3437 * Must be called with interrupts disabled.
060e7417 3438 */
41b6167e 3439static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
060e7417
MG
3440{
3441#ifdef CONFIG_NUMA
3a321d2a 3442 enum numa_stat_item local_stat = NUMA_LOCAL;
060e7417 3443
4518085e
KW
3444 /* skip numa counters update if numa stats is disabled */
3445 if (!static_branch_likely(&vm_numa_stat_key))
3446 return;
3447
c1093b74 3448 if (zone_to_nid(z) != numa_node_id())
060e7417 3449 local_stat = NUMA_OTHER;
060e7417 3450
c1093b74 3451 if (zone_to_nid(z) == zone_to_nid(preferred_zone))
3a321d2a 3452 __inc_numa_state(z, NUMA_HIT);
2df26639 3453 else {
3a321d2a
KW
3454 __inc_numa_state(z, NUMA_MISS);
3455 __inc_numa_state(preferred_zone, NUMA_FOREIGN);
060e7417 3456 }
3a321d2a 3457 __inc_numa_state(z, local_stat);
060e7417
MG
3458#endif
3459}
3460
066b2393 3461/* Remove page from the per-cpu list, caller must protect the list */
3b822017
JDB
3462static inline
3463struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
6bb15450 3464 unsigned int alloc_flags,
453f85d4 3465 struct per_cpu_pages *pcp,
066b2393
MG
3466 struct list_head *list)
3467{
3468 struct page *page;
3469
3470 do {
3471 if (list_empty(list)) {
3472 pcp->count += rmqueue_bulk(zone, 0,
5c3ad2eb 3473 READ_ONCE(pcp->batch), list,
6bb15450 3474 migratetype, alloc_flags);
066b2393
MG
3475 if (unlikely(list_empty(list)))
3476 return NULL;
3477 }
3478
453f85d4 3479 page = list_first_entry(list, struct page, lru);
066b2393
MG
3480 list_del(&page->lru);
3481 pcp->count--;
3482 } while (check_new_pcp(page));
3483
3484 return page;
3485}
3486
3487/* Lock and remove page from the per-cpu list */
3488static struct page *rmqueue_pcplist(struct zone *preferred_zone,
1c52e6d0
YS
3489 struct zone *zone, gfp_t gfp_flags,
3490 int migratetype, unsigned int alloc_flags)
066b2393
MG
3491{
3492 struct per_cpu_pages *pcp;
3493 struct list_head *list;
066b2393 3494 struct page *page;
d34b0733 3495 unsigned long flags;
066b2393 3496
d34b0733 3497 local_irq_save(flags);
066b2393
MG
3498 pcp = &this_cpu_ptr(zone->pageset)->pcp;
3499 list = &pcp->lists[migratetype];
6bb15450 3500 page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list);
066b2393 3501 if (page) {
1c52e6d0 3502 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
066b2393
MG
3503 zone_statistics(preferred_zone, zone);
3504 }
d34b0733 3505 local_irq_restore(flags);
066b2393
MG
3506 return page;
3507}
3508
1da177e4 3509/*
75379191 3510 * Allocate a page from the given zone. Use pcplists for order-0 allocations.
1da177e4 3511 */
0a15c3e9 3512static inline
066b2393 3513struct page *rmqueue(struct zone *preferred_zone,
7aeb09f9 3514 struct zone *zone, unsigned int order,
c603844b
MG
3515 gfp_t gfp_flags, unsigned int alloc_flags,
3516 int migratetype)
1da177e4
LT
3517{
3518 unsigned long flags;
689bcebf 3519 struct page *page;
1da177e4 3520
d34b0733 3521 if (likely(order == 0)) {
1d91df85
JK
3522 /*
3523 * The MIGRATE_MOVABLE pcplist could have pages from the CMA area, and
3524 * we need to skip it when CMA allocations aren't allowed.
3525 */
3526 if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
3527 migratetype != MIGRATE_MOVABLE) {
3528 page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
1c52e6d0 3529 migratetype, alloc_flags);
1d91df85
JK
3530 goto out;
3531 }
066b2393 3532 }
83b9355b 3533
066b2393
MG
3534 /*
3535 * We most definitely don't want callers attempting to
3536 * allocate greater than order-1 page units with __GFP_NOFAIL.
3537 */
3538 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
3539 spin_lock_irqsave(&zone->lock, flags);
0aaa29a5 3540
066b2393
MG
3541 do {
3542 page = NULL;
1d91df85
JK
3543 /*
3544 * An order-0 request can reach here when the pcplist is skipped
3545 * due to a non-CMA allocation context. The HIGHATOMIC area is
3546 * reserved for high-order atomic allocations, so an order-0
3547 * request should skip it.
3548 */
3549 if (order > 0 && alloc_flags & ALLOC_HARDER) {
066b2393
MG
3550 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
3551 if (page)
3552 trace_mm_page_alloc_zone_locked(page, order, migratetype);
3553 }
a74609fa 3554 if (!page)
6bb15450 3555 page = __rmqueue(zone, order, migratetype, alloc_flags);
066b2393
MG
3556 } while (page && check_new_pages(page, order));
3557 spin_unlock(&zone->lock);
3558 if (!page)
3559 goto failed;
3560 __mod_zone_freepage_state(zone, -(1 << order),
3561 get_pcppage_migratetype(page));
1da177e4 3562
16709d1d 3563 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
41b6167e 3564 zone_statistics(preferred_zone, zone);
a74609fa 3565 local_irq_restore(flags);
1da177e4 3566
066b2393 3567out:
73444bc4
MG
3568 /* Separate test+clear to avoid unnecessary atomics */
3569 if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
3570 clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
3571 wakeup_kswapd(zone, 0, 0, zone_idx(zone));
3572 }
3573
066b2393 3574 VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
1da177e4 3575 return page;
a74609fa
NP
3576
3577failed:
3578 local_irq_restore(flags);
a74609fa 3579 return NULL;
1da177e4
LT
3580}
3581
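/*
 * Illustrative sketch, not upstream code: from a caller's point of view,
 * an order-0 GFP_KERNEL allocation is expected to take the pcplist fast
 * path in rmqueue() above, while a higher-order request falls through to
 * the buddy freelists under zone->lock.
 */
static void example_fast_vs_buddy_path(void)
{
	struct page *fast = alloc_pages(GFP_KERNEL, 0);	/* pcplist path */
	struct page *slow = alloc_pages(GFP_KERNEL, 3);	/* buddy path */

	if (fast)
		__free_pages(fast, 0);
	if (slow)
		__free_pages(slow, 3);
}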
933e312e
AM
3582#ifdef CONFIG_FAIL_PAGE_ALLOC
3583
b2588c4b 3584static struct {
933e312e
AM
3585 struct fault_attr attr;
3586
621a5f7a 3587 bool ignore_gfp_highmem;
71baba4b 3588 bool ignore_gfp_reclaim;
54114994 3589 u32 min_order;
933e312e
AM
3590} fail_page_alloc = {
3591 .attr = FAULT_ATTR_INITIALIZER,
71baba4b 3592 .ignore_gfp_reclaim = true,
621a5f7a 3593 .ignore_gfp_highmem = true,
54114994 3594 .min_order = 1,
933e312e
AM
3595};
3596
3597static int __init setup_fail_page_alloc(char *str)
3598{
3599 return setup_fault_attr(&fail_page_alloc.attr, str);
3600}
3601__setup("fail_page_alloc=", setup_fail_page_alloc);
3602
af3b8544 3603static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
933e312e 3604{
54114994 3605 if (order < fail_page_alloc.min_order)
deaf386e 3606 return false;
933e312e 3607 if (gfp_mask & __GFP_NOFAIL)
deaf386e 3608 return false;
933e312e 3609 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
deaf386e 3610 return false;
71baba4b
MG
3611 if (fail_page_alloc.ignore_gfp_reclaim &&
3612 (gfp_mask & __GFP_DIRECT_RECLAIM))
deaf386e 3613 return false;
933e312e
AM
3614
3615 return should_fail(&fail_page_alloc.attr, 1 << order);
3616}
3617
3618#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
3619
3620static int __init fail_page_alloc_debugfs(void)
3621{
0825a6f9 3622 umode_t mode = S_IFREG | 0600;
933e312e 3623 struct dentry *dir;
933e312e 3624
dd48c085
AM
3625 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
3626 &fail_page_alloc.attr);
b2588c4b 3627
d9f7979c
GKH
3628 debugfs_create_bool("ignore-gfp-wait", mode, dir,
3629 &fail_page_alloc.ignore_gfp_reclaim);
3630 debugfs_create_bool("ignore-gfp-highmem", mode, dir,
3631 &fail_page_alloc.ignore_gfp_highmem);
3632 debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
933e312e 3633
d9f7979c 3634 return 0;
933e312e
AM
3635}
3636
3637late_initcall(fail_page_alloc_debugfs);
3638
3639#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
3640
3641#else /* CONFIG_FAIL_PAGE_ALLOC */
3642
af3b8544 3643static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
933e312e 3644{
deaf386e 3645 return false;
933e312e
AM
3646}
3647
3648#endif /* CONFIG_FAIL_PAGE_ALLOC */
3649
76cd6173 3650noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
af3b8544
BP
3651{
3652 return __should_fail_alloc_page(gfp_mask, order);
3653}
3654ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
3655
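/*
 * Illustrative sketch, not upstream code: with the defaults above
 * (min_order = 1, ignore_gfp_reclaim = true), an order-0 request and any
 * __GFP_DIRECT_RECLAIM request are never failed by fault injection,
 * while e.g. an order-2 GFP_NOWAIT request may be, depending on the
 * probability configured through the fault_attr.
 */
static bool example_injectable(gfp_t gfp_mask, unsigned int order)
{
	return should_fail_alloc_page(gfp_mask, order);
}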
f27ce0e1
JK
3656static inline long __zone_watermark_unusable_free(struct zone *z,
3657 unsigned int order, unsigned int alloc_flags)
3658{
3659 const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
3660 long unusable_free = (1 << order) - 1;
3661
3662 /*
3663 * If the caller does not have rights to ALLOC_HARDER then subtract
3664 * the high-atomic reserves. This will over-estimate the size of the
3665 * atomic reserve but it avoids a search.
3666 */
3667 if (likely(!alloc_harder))
3668 unusable_free += z->nr_reserved_highatomic;
3669
3670#ifdef CONFIG_CMA
3671 /* If allocation can't use CMA areas don't use free CMA pages */
3672 if (!(alloc_flags & ALLOC_CMA))
3673 unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
3674#endif
3675
3676 return unusable_free;
3677}
3678
1da177e4 3679/*
97a16fc8
MG
3680 * Return true if free base pages are above 'mark'. For high-order checks it
3681 * will return true if the order-0 watermark is reached and there is at least
3682 * one free page of a suitable size. Checking now avoids taking the zone lock
3683 * to check in the allocation paths if no pages are free.
1da177e4 3684 */
86a294a8 3685bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
97a225e6 3686 int highest_zoneidx, unsigned int alloc_flags,
86a294a8 3687 long free_pages)
1da177e4 3688{
d23ad423 3689 long min = mark;
1da177e4 3690 int o;
cd04ae1e 3691 const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
1da177e4 3692
0aaa29a5 3693 /* free_pages may go negative - that's OK */
f27ce0e1 3694 free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
0aaa29a5 3695
7fb1d9fc 3696 if (alloc_flags & ALLOC_HIGH)
1da177e4 3697 min -= min / 2;
0aaa29a5 3698
f27ce0e1 3699 if (unlikely(alloc_harder)) {
cd04ae1e
MH
3700 /*
3701 * OOM victims can try even harder than normal ALLOC_HARDER
3702 * users on the grounds that it's definitely going to be in
3703 * the exit path shortly and free memory. Any allocation it
3704 * makes during the free path will be small and short-lived.
3705 */
3706 if (alloc_flags & ALLOC_OOM)
3707 min -= min / 2;
3708 else
3709 min -= min / 4;
3710 }
3711
97a16fc8
MG
3712 /*
3713 * Check watermarks for an order-0 allocation request. If these
3714 * are not met, then a high-order request also cannot go ahead
3715 * even if a suitable page happened to be free.
3716 */
97a225e6 3717 if (free_pages <= min + z->lowmem_reserve[highest_zoneidx])
88f5acf8 3718 return false;
1da177e4 3719
97a16fc8
MG
3720 /* If this is an order-0 request then the watermark is fine */
3721 if (!order)
3722 return true;
3723
3724 /* For a high-order request, check at least one suitable page is free */
3725 for (o = order; o < MAX_ORDER; o++) {
3726 struct free_area *area = &z->free_area[o];
3727 int mt;
3728
3729 if (!area->nr_free)
3730 continue;
3731
97a16fc8 3732 for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
b03641af 3733 if (!free_area_empty(area, mt))
97a16fc8
MG
3734 return true;
3735 }
3736
3737#ifdef CONFIG_CMA
d883c6cf 3738 if ((alloc_flags & ALLOC_CMA) &&
b03641af 3739 !free_area_empty(area, MIGRATE_CMA)) {
97a16fc8 3740 return true;
d883c6cf 3741 }
97a16fc8 3742#endif
76089d00 3743 if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC))
b050e376 3744 return true;
1da177e4 3745 }
97a16fc8 3746 return false;
88f5acf8
MG
3747}
3748
7aeb09f9 3749bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
97a225e6 3750 int highest_zoneidx, unsigned int alloc_flags)
88f5acf8 3751{
97a225e6 3752 return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
88f5acf8
MG
3753 zone_page_state(z, NR_FREE_PAGES));
3754}
3755
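/*
 * Illustrative sketch (hypothetical helper): checking a zone against its
 * min watermark for an order-0, ZONE_NORMAL-bound request with no special
 * alloc_flags, the way a balancing heuristic might.
 */
static bool example_zone_has_room(struct zone *zone)
{
	return zone_watermark_ok(zone, 0, min_wmark_pages(zone),
				 ZONE_NORMAL, 0);
}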
48ee5f36 3756static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
97a225e6 3757 unsigned long mark, int highest_zoneidx,
f80b08fc 3758 unsigned int alloc_flags, gfp_t gfp_mask)
48ee5f36 3759{
f27ce0e1 3760 long free_pages;
d883c6cf 3761
f27ce0e1 3762 free_pages = zone_page_state(z, NR_FREE_PAGES);
48ee5f36
MG
3763
3764 /*
3765 * Fast check for order-0 only. If this fails then the reserves
f27ce0e1 3766 * need to be calculated.
48ee5f36 3767 */
f27ce0e1
JK
3768 if (!order) {
3769 long fast_free;
3770
3771 fast_free = free_pages;
3772 fast_free -= __zone_watermark_unusable_free(z, 0, alloc_flags);
3773 if (fast_free > mark + z->lowmem_reserve[highest_zoneidx])
3774 return true;
3775 }
48ee5f36 3776
f80b08fc
CTR
3777 if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
3778 free_pages))
3779 return true;
3780 /*
3781 * Ignore watermark boosting for GFP_ATOMIC order-0 allocations
3782 * when checking the min watermark. The min watermark is the
3783 * point where boosting is ignored so that kswapd is woken up
3784 * when below the low watermark.
3785 */
3786 if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost
3787 && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
3788 mark = z->_watermark[WMARK_MIN];
3789 return __zone_watermark_ok(z, order, mark, highest_zoneidx,
3790 alloc_flags, free_pages);
3791 }
3792
3793 return false;
48ee5f36
MG
3794}
3795
7aeb09f9 3796bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
97a225e6 3797 unsigned long mark, int highest_zoneidx)
88f5acf8
MG
3798{
3799 long free_pages = zone_page_state(z, NR_FREE_PAGES);
3800
3801 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
3802 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
3803
97a225e6 3804 return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
88f5acf8 3805 free_pages);
1da177e4
LT
3806}
3807
9276b1bc 3808#ifdef CONFIG_NUMA
957f822a
DR
3809static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
3810{
e02dc017 3811 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
a55c7454 3812 node_reclaim_distance;
957f822a 3813}
9276b1bc 3814#else /* CONFIG_NUMA */
957f822a
DR
3815static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
3816{
3817 return true;
3818}
9276b1bc
PJ
3819#endif /* CONFIG_NUMA */
3820
6bb15450
MG
3821/*
3822 * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
3823 * fragmentation is subtle. If the preferred zone was HIGHMEM then
3824 * premature use of a lower zone may cause lowmem pressure problems that
3825 * are worse than fragmentation. If the next zone is ZONE_DMA then it is
3826 * probably too small. It only makes sense to spread allocations to avoid
3827 * fragmentation between the Normal and DMA32 zones.
3828 */
3829static inline unsigned int
0a79cdad 3830alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
6bb15450 3831{
736838e9 3832 unsigned int alloc_flags;
0a79cdad 3833
736838e9
MN
3834 /*
3835 * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
3836 * to save a branch.
3837 */
3838 alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM);
0a79cdad
MG
3839
3840#ifdef CONFIG_ZONE_DMA32
8139ad04
AR
3841 if (!zone)
3842 return alloc_flags;
3843
6bb15450 3844 if (zone_idx(zone) != ZONE_NORMAL)
8118b82e 3845 return alloc_flags;
6bb15450
MG
3846
3847 /*
3848 * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
3849 * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
3850 * on UMA that if Normal is populated then so is DMA32.
3851 */
3852 BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
3853 if (nr_online_nodes > 1 && !populated_zone(--zone))
8118b82e 3854 return alloc_flags;
6bb15450 3855
8118b82e 3856 alloc_flags |= ALLOC_NOFRAGMENT;
0a79cdad
MG
3857#endif /* CONFIG_ZONE_DMA32 */
3858 return alloc_flags;
6bb15450 3859}
6bb15450 3860
8e3560d9
PT
3861/* Must be called after current_gfp_context() which can change gfp_mask */
3862static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask,
3863 unsigned int alloc_flags)
8510e69c
JK
3864{
3865#ifdef CONFIG_CMA
8e3560d9 3866 if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
8510e69c 3867 alloc_flags |= ALLOC_CMA;
8510e69c
JK
3868#endif
3869 return alloc_flags;
3870}
3871
7fb1d9fc 3872/*
0798e519 3873 * get_page_from_freelist goes through the zonelist trying to allocate
7fb1d9fc
RS
3874 * a page.
3875 */
3876static struct page *
a9263751
VB
3877get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
3878 const struct alloc_context *ac)
753ee728 3879{
6bb15450 3880 struct zoneref *z;
5117f45d 3881 struct zone *zone;
3b8c0be4 3882 struct pglist_data *last_pgdat_dirty_limit = NULL;
6bb15450 3883 bool no_fallback;
3b8c0be4 3884
6bb15450 3885retry:
7fb1d9fc 3886 /*
9276b1bc 3887 * Scan zonelist, looking for a zone with enough free.
344736f2 3888 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
7fb1d9fc 3889 */
6bb15450
MG
3890 no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
3891 z = ac->preferred_zoneref;
30d8ec73
MN
3892 for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
3893 ac->nodemask) {
be06af00 3894 struct page *page;
e085dbc5
JW
3895 unsigned long mark;
3896
664eedde
MG
3897 if (cpusets_enabled() &&
3898 (alloc_flags & ALLOC_CPUSET) &&
002f2906 3899 !__cpuset_zone_allowed(zone, gfp_mask))
cd38b115 3900 continue;
a756cf59
JW
3901 /*
3902 * When allocating a page cache page for writing, we
281e3726
MG
3903 * want to get it from a node that is within its dirty
3904 * limit, such that no single node holds more than its
a756cf59 3905 * proportional share of globally allowed dirty pages.
281e3726 3906 * The dirty limits take into account the node's
a756cf59
JW
3907 * lowmem reserves and high watermark so that kswapd
3908 * should be able to balance it without having to
3909 * write pages from its LRU list.
3910 *
a756cf59 3911 * XXX: For now, allow allocations to potentially
281e3726 3912 * exceed the per-node dirty limit in the slowpath
c9ab0c4f 3913 * (spread_dirty_pages unset) before going into reclaim,
a756cf59 3914 * which is important when on a NUMA setup the allowed
281e3726 3915 * nodes are together not big enough to reach the
a756cf59 3916 * global limit. The proper fix for these situations
281e3726 3917 * will require awareness of nodes in the
a756cf59
JW
3918 * dirty-throttling and the flusher threads.
3919 */
3b8c0be4
MG
3920 if (ac->spread_dirty_pages) {
3921 if (last_pgdat_dirty_limit == zone->zone_pgdat)
3922 continue;
3923
3924 if (!node_dirty_ok(zone->zone_pgdat)) {
3925 last_pgdat_dirty_limit = zone->zone_pgdat;
3926 continue;
3927 }
3928 }
7fb1d9fc 3929
6bb15450
MG
3930 if (no_fallback && nr_online_nodes > 1 &&
3931 zone != ac->preferred_zoneref->zone) {
3932 int local_nid;
3933
3934 /*
3935 * If moving to a remote node, retry but allow
3936 * fragmenting fallbacks. Locality is more important
3937 * than fragmentation avoidance.
3938 */
3939 local_nid = zone_to_nid(ac->preferred_zoneref->zone);
3940 if (zone_to_nid(zone) != local_nid) {
3941 alloc_flags &= ~ALLOC_NOFRAGMENT;
3942 goto retry;
3943 }
3944 }
3945
a9214443 3946 mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
48ee5f36 3947 if (!zone_watermark_fast(zone, order, mark,
f80b08fc
CTR
3948 ac->highest_zoneidx, alloc_flags,
3949 gfp_mask)) {
fa5e084e
MG
3950 int ret;
3951
c9e97a19
PT
3952#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
3953 /*
3954 * Watermark failed for this zone, but see if we can
3955 * grow this zone if it contains deferred pages.
3956 */
3957 if (static_branch_unlikely(&deferred_pages)) {
3958 if (_deferred_grow_zone(zone, order))
3959 goto try_this_zone;
3960 }
3961#endif
5dab2911
MG
3962 /* Checked here to keep the fast path fast */
3963 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
3964 if (alloc_flags & ALLOC_NO_WATERMARKS)
3965 goto try_this_zone;
3966
202e35db 3967 if (!node_reclaim_enabled() ||
c33d6c06 3968 !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
cd38b115
MG
3969 continue;
3970
a5f5f91d 3971 ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
fa5e084e 3972 switch (ret) {
a5f5f91d 3973 case NODE_RECLAIM_NOSCAN:
fa5e084e 3974 /* did not scan */
cd38b115 3975 continue;
a5f5f91d 3976 case NODE_RECLAIM_FULL:
fa5e084e 3977 /* scanned but unreclaimable */
cd38b115 3978 continue;
fa5e084e
MG
3979 default:
3980 /* did we reclaim enough */
fed2719e 3981 if (zone_watermark_ok(zone, order, mark,
97a225e6 3982 ac->highest_zoneidx, alloc_flags))
fed2719e
MG
3983 goto try_this_zone;
3984
fed2719e 3985 continue;
0798e519 3986 }
7fb1d9fc
RS
3987 }
3988
fa5e084e 3989try_this_zone:
066b2393 3990 page = rmqueue(ac->preferred_zoneref->zone, zone, order,
0aaa29a5 3991 gfp_mask, alloc_flags, ac->migratetype);
75379191 3992 if (page) {
479f854a 3993 prep_new_page(page, order, gfp_mask, alloc_flags);
0aaa29a5
MG
3994
3995 /*
3996 * If this is a high-order atomic allocation then check
3997 * if the pageblock should be reserved for the future
3998 */
3999 if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
4000 reserve_highatomic_pageblock(page, zone, order);
4001
75379191 4002 return page;
c9e97a19
PT
4003 } else {
4004#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
4005 /* Try again if zone has deferred pages */
4006 if (static_branch_unlikely(&deferred_pages)) {
4007 if (_deferred_grow_zone(zone, order))
4008 goto try_this_zone;
4009 }
4010#endif
75379191 4011 }
54a6eb5c 4012 }
9276b1bc 4013
6bb15450
MG
4014 /*
4015 * It's possible on a UMA machine to get through all zones that are
4016 * fragmented. If avoiding fragmentation, reset and try again.
4017 */
4018 if (no_fallback) {
4019 alloc_flags &= ~ALLOC_NOFRAGMENT;
4020 goto retry;
4021 }
4022
4ffeaf35 4023 return NULL;
753ee728
MH
4024}
4025
9af744d7 4026static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
a238ab5b 4027{
a238ab5b 4028 unsigned int filter = SHOW_MEM_FILTER_NODES;
a238ab5b
DH
4029
4030 /*
4031 * This documents exceptions given to allocations in certain
4032 * contexts that are allowed to allocate outside current's set
4033 * of allowed nodes.
4034 */
4035 if (!(gfp_mask & __GFP_NOMEMALLOC))
cd04ae1e 4036 if (tsk_is_oom_victim(current) ||
a238ab5b
DH
4037 (current->flags & (PF_MEMALLOC | PF_EXITING)))
4038 filter &= ~SHOW_MEM_FILTER_NODES;
d0164adc 4039 if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
a238ab5b
DH
4040 filter &= ~SHOW_MEM_FILTER_NODES;
4041
9af744d7 4042 show_mem(filter, nodemask);
aa187507
MH
4043}
4044
a8e99259 4045void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
aa187507
MH
4046{
4047 struct va_format vaf;
4048 va_list args;
1be334e5 4049 static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1);
aa187507 4050
0f7896f1 4051 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
aa187507
MH
4052 return;
4053
7877cdcc
MH
4054 va_start(args, fmt);
4055 vaf.fmt = fmt;
4056 vaf.va = &args;
ef8444ea 4057 pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
0205f755
MH
4058 current->comm, &vaf, gfp_mask, &gfp_mask,
4059 nodemask_pr_args(nodemask));
7877cdcc 4060 va_end(args);
3ee9a4f0 4061
a8e99259 4062 cpuset_print_current_mems_allowed();
ef8444ea 4063 pr_cont("\n");
a238ab5b 4064 dump_stack();
685dbf6f 4065 warn_alloc_show_mem(gfp_mask, nodemask);
a238ab5b
DH
4066}
4067
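/*
 * Illustrative sketch, not upstream code: warn_alloc() takes printf-style
 * arguments after the nodemask, as the failure paths below use it.
 */
static void example_warn(gfp_t gfp_mask, unsigned int order)
{
	warn_alloc(gfp_mask, NULL,
		   "example: page allocation failure: order:%u", order);
}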
6c18ba7a
MH
4068static inline struct page *
4069__alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
4070 unsigned int alloc_flags,
4071 const struct alloc_context *ac)
4072{
4073 struct page *page;
4074
4075 page = get_page_from_freelist(gfp_mask, order,
4076 alloc_flags|ALLOC_CPUSET, ac);
4077 /*
4078 * fallback to ignore cpuset restriction if our nodes
4079 * are depleted
4080 */
4081 if (!page)
4082 page = get_page_from_freelist(gfp_mask, order,
4083 alloc_flags, ac);
4084
4085 return page;
4086}
4087
11e33f6a
MG
4088static inline struct page *
4089__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
a9263751 4090 const struct alloc_context *ac, unsigned long *did_some_progress)
11e33f6a 4091{
6e0fc46d
DR
4092 struct oom_control oc = {
4093 .zonelist = ac->zonelist,
4094 .nodemask = ac->nodemask,
2a966b77 4095 .memcg = NULL,
6e0fc46d
DR
4096 .gfp_mask = gfp_mask,
4097 .order = order,
6e0fc46d 4098 };
11e33f6a
MG
4099 struct page *page;
4100
9879de73
JW
4101 *did_some_progress = 0;
4102
9879de73 4103 /*
dc56401f
JW
4104 * Acquire the oom lock. If that fails, somebody else is
4105 * making progress for us.
9879de73 4106 */
dc56401f 4107 if (!mutex_trylock(&oom_lock)) {
9879de73 4108 *did_some_progress = 1;
11e33f6a 4109 schedule_timeout_uninterruptible(1);
1da177e4
LT
4110 return NULL;
4111 }
6b1de916 4112
11e33f6a
MG
4113 /*
4114 * Go through the zonelist yet one more time, keeping a very high watermark
4115 * here; this is only to catch a parallel oom killing, and we must fail if
e746bf73
TH
4116 * we're still under heavy pressure. But make sure that this reclaim
4117 * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
4118 * allocation which will never fail due to oom_lock already held.
11e33f6a 4119 */
e746bf73
TH
4120 page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
4121 ~__GFP_DIRECT_RECLAIM, order,
4122 ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
7fb1d9fc 4123 if (page)
11e33f6a
MG
4124 goto out;
4125
06ad276a
MH
4126 /* Coredumps can quickly deplete all memory reserves */
4127 if (current->flags & PF_DUMPCORE)
4128 goto out;
4129 /* The OOM killer will not help higher order allocs */
4130 if (order > PAGE_ALLOC_COSTLY_ORDER)
4131 goto out;
dcda9b04
MH
4132 /*
4133 * We have already exhausted all our reclaim opportunities without any
4134 * success so it is time to admit defeat. We will skip the OOM killer
4135 * because it is very likely that the caller has a more reasonable
4136 * fallback than shooting a random task.
cfb4a541
MN
4137 *
4138 * The OOM killer may not free memory on a specific node.
dcda9b04 4139 */
cfb4a541 4140 if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE))
dcda9b04 4141 goto out;
06ad276a 4142 /* The OOM killer does not needlessly kill tasks for lowmem */
97a225e6 4143 if (ac->highest_zoneidx < ZONE_NORMAL)
06ad276a
MH
4144 goto out;
4145 if (pm_suspended_storage())
4146 goto out;
4147 /*
4148 * XXX: GFP_NOFS allocations should rather fail than rely on
4149 * other request to make a forward progress.
4150 * We are in an unfortunate situation where out_of_memory cannot
4151 * do much for this context but let's try it to at least get
4152 * access to memory reserved if the current task is killed (see
4153 * out_of_memory). Once filesystems are ready to handle allocation
4154 * failures more gracefully we should just bail out here.
4155 */
4156
3c2c6488 4157 /* Exhausted what can be done so it's blame time */
5020e285 4158 if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
c32b3cbe 4159 *did_some_progress = 1;
5020e285 4160
6c18ba7a
MH
4161 /*
4162 * Help non-failing allocations by giving them access to memory
4163 * reserves
4164 */
4165 if (gfp_mask & __GFP_NOFAIL)
4166 page = __alloc_pages_cpuset_fallback(gfp_mask, order,
5020e285 4167 ALLOC_NO_WATERMARKS, ac);
5020e285 4168 }
11e33f6a 4169out:
dc56401f 4170 mutex_unlock(&oom_lock);
11e33f6a
MG
4171 return page;
4172}
4173
33c2d214 4174/*
baf2f90b 4175 * Maximum number of compaction retries with progress before the OOM
33c2d214
MH
4176 * killer is considered the only way to move forward.
4177 */
4178#define MAX_COMPACT_RETRIES 16
4179
56de7263
MG
4180#ifdef CONFIG_COMPACTION
4181/* Try memory compaction for high-order allocations before reclaim */
4182static struct page *
4183__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
c603844b 4184 unsigned int alloc_flags, const struct alloc_context *ac,
a5508cd8 4185 enum compact_priority prio, enum compact_result *compact_result)
56de7263 4186{
5e1f0f09 4187 struct page *page = NULL;
eb414681 4188 unsigned long pflags;
499118e9 4189 unsigned int noreclaim_flag;
53853e2d
VB
4190
4191 if (!order)
66199712 4192 return NULL;
66199712 4193
eb414681 4194 psi_memstall_enter(&pflags);
499118e9 4195 noreclaim_flag = memalloc_noreclaim_save();
eb414681 4196
c5d01d0d 4197 *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
5e1f0f09 4198 prio, &page);
eb414681 4199
499118e9 4200 memalloc_noreclaim_restore(noreclaim_flag);
eb414681 4201 psi_memstall_leave(&pflags);
56de7263 4202
06dac2f4
CTR
4203 if (*compact_result == COMPACT_SKIPPED)
4204 return NULL;
98dd3b48
VB
4205 /*
4206 * At least in one zone compaction wasn't deferred or skipped, so let's
4207 * count a compaction stall
4208 */
4209 count_vm_event(COMPACTSTALL);
8fb74b9f 4210
5e1f0f09
MG
4211 /* Prep a captured page if available */
4212 if (page)
4213 prep_new_page(page, order, gfp_mask, alloc_flags);
4214
4215 /* Try get a page from the freelist if available */
4216 if (!page)
4217 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
53853e2d 4218
98dd3b48
VB
4219 if (page) {
4220 struct zone *zone = page_zone(page);
53853e2d 4221
98dd3b48
VB
4222 zone->compact_blockskip_flush = false;
4223 compaction_defer_reset(zone, order, true);
4224 count_vm_event(COMPACTSUCCESS);
4225 return page;
4226 }
56de7263 4227
98dd3b48
VB
4228 /*
4229 * It's bad if a compaction run occurs and fails. The most likely reason
4230 * is that pages exist, but not enough to satisfy watermarks.
4231 */
4232 count_vm_event(COMPACTFAIL);
66199712 4233
98dd3b48 4234 cond_resched();
56de7263
MG
4235
4236 return NULL;
4237}
33c2d214 4238
3250845d
VB
4239static inline bool
4240should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
4241 enum compact_result compact_result,
4242 enum compact_priority *compact_priority,
d9436498 4243 int *compaction_retries)
3250845d
VB
4244{
4245 int max_retries = MAX_COMPACT_RETRIES;
c2033b00 4246 int min_priority;
65190cff
MH
4247 bool ret = false;
4248 int retries = *compaction_retries;
4249 enum compact_priority priority = *compact_priority;
3250845d
VB
4250
4251 if (!order)
4252 return false;
4253
691d9497
AT
4254 if (fatal_signal_pending(current))
4255 return false;
4256
d9436498
VB
4257 if (compaction_made_progress(compact_result))
4258 (*compaction_retries)++;
4259
3250845d
VB
4260 /*
4261 * compaction considers all the zones as desperately out of memory,
4262 * so it doesn't really make much sense to retry except when the
4263 * failure could be caused by insufficient priority.
4264 */
d9436498
VB
4265 if (compaction_failed(compact_result))
4266 goto check_priority;
3250845d 4267
49433085
VB
4268 /*
4269 * compaction was skipped because there are not enough order-0 pages
4270 * to work with, so we retry only if it looks like reclaim can help.
4271 */
4272 if (compaction_needs_reclaim(compact_result)) {
4273 ret = compaction_zonelist_suitable(ac, order, alloc_flags);
4274 goto out;
4275 }
4276
3250845d
VB
4277 /*
4278 * make sure the compaction wasn't deferred or didn't bail out early
4279 * due to locks contention before we declare that we should give up.
49433085
VB
4280 * But the next retry should use a higher priority if allowed, so
4281 * we don't just keep bailing out endlessly.
3250845d 4282 */
65190cff 4283 if (compaction_withdrawn(compact_result)) {
49433085 4284 goto check_priority;
65190cff 4285 }
3250845d
VB
4286
4287 /*
dcda9b04 4288 * !costly requests are much more important than __GFP_RETRY_MAYFAIL
3250845d
VB
4289 * costly ones because they are de facto nofail and invoke the OOM
4290 * killer to move on, while costly ones can fail and users are ready
4291 * to cope with that. 1/4 retries is rather arbitrary but we
4292 * would need much more detailed feedback from compaction to
4293 * make a better decision.
4294 */
4295 if (order > PAGE_ALLOC_COSTLY_ORDER)
4296 max_retries /= 4;
65190cff
MH
4297 if (*compaction_retries <= max_retries) {
4298 ret = true;
4299 goto out;
4300 }
3250845d 4301
d9436498
VB
4302 /*
4303 * Make sure there are attempts at the highest priority if we exhausted
4304 * all retries or failed at the lower priorities.
4305 */
4306check_priority:
c2033b00
VB
4307 min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
4308 MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
65190cff 4309
c2033b00 4310 if (*compact_priority > min_priority) {
d9436498
VB
4311 (*compact_priority)--;
4312 *compaction_retries = 0;
65190cff 4313 ret = true;
d9436498 4314 }
65190cff
MH
4315out:
4316 trace_compact_retry(order, priority, compact_result, retries, max_retries, ret);
4317 return ret;
3250845d 4318}
56de7263
MG
4319#else
4320static inline struct page *
4321__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
c603844b 4322 unsigned int alloc_flags, const struct alloc_context *ac,
a5508cd8 4323 enum compact_priority prio, enum compact_result *compact_result)
56de7263 4324{
33c2d214 4325 *compact_result = COMPACT_SKIPPED;
56de7263
MG
4326 return NULL;
4327}
33c2d214
MH
4328
4329static inline bool
86a294a8
MH
4330should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
4331 enum compact_result compact_result,
a5508cd8 4332 enum compact_priority *compact_priority,
d9436498 4333 int *compaction_retries)
33c2d214 4334{
31e49bfd
MH
4335 struct zone *zone;
4336 struct zoneref *z;
4337
4338 if (!order || order > PAGE_ALLOC_COSTLY_ORDER)
4339 return false;
4340
4341 /*
4342 * There are setups with compaction disabled which would prefer to loop
4343 * inside the allocator rather than hit the oom killer prematurely.
4344 * Let's give them a good hope and keep retrying while the order-0
4345 * watermarks are OK.
4346 */
97a225e6
JK
4347 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
4348 ac->highest_zoneidx, ac->nodemask) {
31e49bfd 4349 if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
97a225e6 4350 ac->highest_zoneidx, alloc_flags))
31e49bfd
MH
4351 return true;
4352 }
33c2d214
MH
4353 return false;
4354}
3250845d 4355#endif /* CONFIG_COMPACTION */
56de7263 4356
d92a8cfc 4357#ifdef CONFIG_LOCKDEP
93781325 4358static struct lockdep_map __fs_reclaim_map =
d92a8cfc
PZ
4359 STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
4360
f920e413 4361static bool __need_reclaim(gfp_t gfp_mask)
d92a8cfc 4362{
d92a8cfc
PZ
4363 /* no reclaim without waiting on it */
4364 if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
4365 return false;
4366
4367 /* this guy won't enter reclaim */
2e517d68 4368 if (current->flags & PF_MEMALLOC)
d92a8cfc
PZ
4369 return false;
4370
d92a8cfc
PZ
4371 if (gfp_mask & __GFP_NOLOCKDEP)
4372 return false;
4373
4374 return true;
4375}
4376
93781325
OS
4377void __fs_reclaim_acquire(void)
4378{
4379 lock_map_acquire(&__fs_reclaim_map);
4380}
4381
4382void __fs_reclaim_release(void)
4383{
4384 lock_map_release(&__fs_reclaim_map);
4385}
4386
d92a8cfc
PZ
4387void fs_reclaim_acquire(gfp_t gfp_mask)
4388{
f920e413
SV
4389 gfp_mask = current_gfp_context(gfp_mask);
4390
4391 if (__need_reclaim(gfp_mask)) {
4392 if (gfp_mask & __GFP_FS)
4393 __fs_reclaim_acquire();
4394
4395#ifdef CONFIG_MMU_NOTIFIER
4396 lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
4397 lock_map_release(&__mmu_notifier_invalidate_range_start_map);
4398#endif
4399
4400 }
d92a8cfc
PZ
4401}
4402EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
4403
4404void fs_reclaim_release(gfp_t gfp_mask)
4405{
f920e413
SV
4406 gfp_mask = current_gfp_context(gfp_mask);
4407
4408 if (__need_reclaim(gfp_mask)) {
4409 if (gfp_mask & __GFP_FS)
4410 __fs_reclaim_release();
4411 }
d92a8cfc
PZ
4412}
4413EXPORT_SYMBOL_GPL(fs_reclaim_release);
4414#endif
4415
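/*
 * Illustrative sketch (hypothetical helper): an acquire/release pair of
 * the lockdep map above simulates "an allocation here could recurse into
 * FS reclaim", which is how allocation entry points teach lockdep about
 * reclaim deadlocks without actually reclaiming.
 */
static void example_might_enter_reclaim(gfp_t gfp_mask)
{
	fs_reclaim_acquire(gfp_mask);
	fs_reclaim_release(gfp_mask);
}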
bba90710 4416/* Perform direct synchronous page reclaim */
2187e17b 4417static unsigned long
a9263751
VB
4418__perform_reclaim(gfp_t gfp_mask, unsigned int order,
4419 const struct alloc_context *ac)
11e33f6a 4420{
499118e9 4421 unsigned int noreclaim_flag;
2187e17b 4422 unsigned long pflags, progress;
11e33f6a
MG
4423
4424 cond_resched();
4425
4426 /* We now go into synchronous reclaim */
4427 cpuset_memory_pressure_bump();
eb414681 4428 psi_memstall_enter(&pflags);
d92a8cfc 4429 fs_reclaim_acquire(gfp_mask);
93781325 4430 noreclaim_flag = memalloc_noreclaim_save();
11e33f6a 4431
a9263751
VB
4432 progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
4433 ac->nodemask);
11e33f6a 4434
499118e9 4435 memalloc_noreclaim_restore(noreclaim_flag);
93781325 4436 fs_reclaim_release(gfp_mask);
eb414681 4437 psi_memstall_leave(&pflags);
11e33f6a
MG
4438
4439 cond_resched();
4440
bba90710
MS
4441 return progress;
4442}
4443
4444/* The really slow allocator path where we enter direct reclaim */
4445static inline struct page *
4446__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
c603844b 4447 unsigned int alloc_flags, const struct alloc_context *ac,
a9263751 4448 unsigned long *did_some_progress)
bba90710
MS
4449{
4450 struct page *page = NULL;
4451 bool drained = false;
4452
a9263751 4453 *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
9ee493ce
MG
4454 if (unlikely(!(*did_some_progress)))
4455 return NULL;
11e33f6a 4456
9ee493ce 4457retry:
31a6c190 4458 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
9ee493ce
MG
4459
4460 /*
4461 * If an allocation failed after direct reclaim, it could be because
0aaa29a5 4462 * pages are pinned on the per-cpu lists or in high alloc reserves.
047b9967 4463 * Shrink them and try again
9ee493ce
MG
4464 */
4465 if (!page && !drained) {
29fac03b 4466 unreserve_highatomic_pageblock(ac, false);
93481ff0 4467 drain_all_pages(NULL);
9ee493ce
MG
4468 drained = true;
4469 goto retry;
4470 }
4471
11e33f6a
MG
4472 return page;
4473}
4474
5ecd9d40
DR
4475static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
4476 const struct alloc_context *ac)
3a025760
JW
4477{
4478 struct zoneref *z;
4479 struct zone *zone;
e1a55637 4480 pg_data_t *last_pgdat = NULL;
97a225e6 4481 enum zone_type highest_zoneidx = ac->highest_zoneidx;
3a025760 4482
97a225e6 4483 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
5ecd9d40 4484 ac->nodemask) {
e1a55637 4485 if (last_pgdat != zone->zone_pgdat)
97a225e6 4486 wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
e1a55637
MG
4487 last_pgdat = zone->zone_pgdat;
4488 }
3a025760
JW
4489}
4490
c603844b 4491static inline unsigned int
341ce06f
PZ
4492gfp_to_alloc_flags(gfp_t gfp_mask)
4493{
c603844b 4494 unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
1da177e4 4495
736838e9
MN
4496 /*
4497 * __GFP_HIGH is assumed to be the same as ALLOC_HIGH
4498 * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
4499 * to save two branches.
4500 */
e6223a3b 4501 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
736838e9 4502 BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD);
933e312e 4503
341ce06f
PZ
4504 /*
4505 * The caller may dip into page reserves a bit more if the caller
4506 * cannot run direct reclaim, or if the caller has a realtime scheduling
4507 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
d0164adc 4508 * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
341ce06f 4509 */
736838e9
MN
4510 alloc_flags |= (__force int)
4511 (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
1da177e4 4512
d0164adc 4513 if (gfp_mask & __GFP_ATOMIC) {
5c3240d9 4514 /*
b104a35d
DR
4515 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
4516 * if it can't schedule.
5c3240d9 4517 */
b104a35d 4518 if (!(gfp_mask & __GFP_NOMEMALLOC))
5c3240d9 4519 alloc_flags |= ALLOC_HARDER;
523b9458 4520 /*
b104a35d 4521 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
344736f2 4522 * comment for __cpuset_node_allowed().
523b9458 4523 */
341ce06f 4524 alloc_flags &= ~ALLOC_CPUSET;
c06b1fca 4525 } else if (unlikely(rt_task(current)) && !in_interrupt())
341ce06f
PZ
4526 alloc_flags |= ALLOC_HARDER;
4527
8e3560d9 4528 alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);
8510e69c 4529
341ce06f
PZ
4530 return alloc_flags;
4531}
4532
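/*
 * Illustrative sketch, not upstream code: expected translation of two
 * common gfp masks by gfp_to_alloc_flags() above, assuming a normal
 * (non-realtime) task context.
 */
static void example_gfp_translation(void)
{
	unsigned int kflags = gfp_to_alloc_flags(GFP_KERNEL);
	unsigned int aflags = gfp_to_alloc_flags(GFP_ATOMIC);

	/* GFP_KERNEL: cpuset honoured, kswapd may be woken, no reserve access. */
	WARN_ON(!(kflags & ALLOC_CPUSET) || !(kflags & ALLOC_KSWAPD));

	/* GFP_ATOMIC: high/harder reserve access, cpuset limits dropped. */
	WARN_ON(!(aflags & ALLOC_HIGH) || !(aflags & ALLOC_HARDER) ||
		(aflags & ALLOC_CPUSET));
}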
cd04ae1e 4533static bool oom_reserves_allowed(struct task_struct *tsk)
072bb0aa 4534{
cd04ae1e
MH
4535 if (!tsk_is_oom_victim(tsk))
4536 return false;
4537
4538 /*
4539 * !MMU doesn't have oom reaper so give access to memory reserves
4540 * only to the thread with TIF_MEMDIE set
4541 */
4542 if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
31a6c190
VB
4543 return false;
4544
cd04ae1e
MH
4545 return true;
4546}
4547
4548/*
4549 * Distinguish requests which really need access to full memory
4550 * reserves from oom victims which can live with a portion of it
4551 */
4552static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
4553{
4554 if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
4555 return 0;
31a6c190 4556 if (gfp_mask & __GFP_MEMALLOC)
cd04ae1e 4557 return ALLOC_NO_WATERMARKS;
31a6c190 4558 if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
cd04ae1e
MH
4559 return ALLOC_NO_WATERMARKS;
4560 if (!in_interrupt()) {
4561 if (current->flags & PF_MEMALLOC)
4562 return ALLOC_NO_WATERMARKS;
4563 else if (oom_reserves_allowed(current))
4564 return ALLOC_OOM;
4565 }
31a6c190 4566
cd04ae1e
MH
4567 return 0;
4568}
4569
4570bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
4571{
4572 return !!__gfp_pfmemalloc_flags(gfp_mask);
072bb0aa
MG
4573}
4574
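/*
 * Illustrative sketch, not upstream code: how the helpers above classify
 * reserve access independently of task state.
 */
static void example_reserve_classes(void)
{
	/* __GFP_MEMALLOC always grants full access to the reserves. */
	WARN_ON(__gfp_pfmemalloc_flags(__GFP_MEMALLOC) != ALLOC_NO_WATERMARKS);

	/* __GFP_NOMEMALLOC opts out, even for otherwise privileged callers. */
	WARN_ON(gfp_pfmemalloc_allowed(GFP_ATOMIC | __GFP_NOMEMALLOC));
}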
0a0337e0
MH
4575/*
4576 * Checks whether it makes sense to retry the reclaim to make a forward progress
4577 * for the given allocation request.
491d79ae
JW
4578 *
4579 * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
4580 * without success, or when we couldn't even meet the watermark if we
4581 * reclaimed all remaining pages on the LRU lists.
0a0337e0
MH
4582 *
4583 * Returns true if a retry is viable or false to enter the oom path.
4584 */
4585static inline bool
4586should_reclaim_retry(gfp_t gfp_mask, unsigned order,
4587 struct alloc_context *ac, int alloc_flags,
423b452e 4588 bool did_some_progress, int *no_progress_loops)
0a0337e0
MH
4589{
4590 struct zone *zone;
4591 struct zoneref *z;
15f570bf 4592 bool ret = false;
0a0337e0 4593
423b452e
VB
4594 /*
4595 * Costly allocations might have made some progress, but that doesn't mean
4596 * their order will become available due to high fragmentation, so
4597 * always increment the no-progress counter for them.
4598 */
4599 if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
4600 *no_progress_loops = 0;
4601 else
4602 (*no_progress_loops)++;
4603
0a0337e0
MH
4604 /*
4605 * Make sure we converge to OOM if we cannot make any progress
4606 * several times in a row.
4607 */
04c8716f
MK
4608 if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
4609 /* Before OOM, exhaust highatomic_reserve */
29fac03b 4610 return unreserve_highatomic_pageblock(ac, true);
04c8716f 4611 }
0a0337e0 4612
bca67592
MG
4613 /*
4614 * Keep reclaiming pages while there is a chance this will lead
4615 * somewhere. If none of the target zones can satisfy our allocation
4616 * request even if all reclaimable pages are considered then we are
4617 * screwed and have to go OOM.
0a0337e0 4618 */
97a225e6
JK
4619 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
4620 ac->highest_zoneidx, ac->nodemask) {
0a0337e0 4621 unsigned long available;
ede37713 4622 unsigned long reclaimable;
d379f01d
MH
4623 unsigned long min_wmark = min_wmark_pages(zone);
4624 bool wmark;
0a0337e0 4625
5a1c84b4 4626 available = reclaimable = zone_reclaimable_pages(zone);
5a1c84b4 4627 available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
0a0337e0
MH
4628
4629 /*
491d79ae
JW
4630 * Would the allocation succeed if we reclaimed all
4631 * reclaimable pages?
0a0337e0 4632 */
d379f01d 4633 wmark = __zone_watermark_ok(zone, order, min_wmark,
97a225e6 4634 ac->highest_zoneidx, alloc_flags, available);
d379f01d
MH
4635 trace_reclaim_retry_zone(z, order, reclaimable,
4636 available, min_wmark, *no_progress_loops, wmark);
4637 if (wmark) {
ede37713
MH
4638 /*
4639 * If we didn't make any progress and have a lot of
4640 * dirty + writeback pages then we should wait for
4641 * an IO to complete to slow down the reclaim and
4642 * prevent a premature OOM.
4643 */
4644 if (!did_some_progress) {
11fb9989 4645 unsigned long write_pending;
ede37713 4646
5a1c84b4
MG
4647 write_pending = zone_page_state_snapshot(zone,
4648 NR_ZONE_WRITE_PENDING);
ede37713 4649
11fb9989 4650 if (2 * write_pending > reclaimable) {
ede37713
MH
4651 congestion_wait(BLK_RW_ASYNC, HZ/10);
4652 return true;
4653 }
4654 }
5a1c84b4 4655
15f570bf
MH
4656 ret = true;
4657 goto out;
0a0337e0
MH
4658 }
4659 }
4660
15f570bf
MH
4661out:
4662 /*
4663 * Memory allocation/reclaim might be called from a WQ context and the
4664 * current implementation of the WQ concurrency control doesn't
4665 * recognize that a particular WQ is congested if the worker thread is
4666 * looping without ever sleeping. Therefore we have to do a short sleep
4667 * here rather than calling cond_resched().
4668 */
4669 if (current->flags & PF_WQ_WORKER)
4670 schedule_timeout_uninterruptible(1);
4671 else
4672 cond_resched();
4673 return ret;
0a0337e0
MH
4674}
4675
902b6281
VB
4676static inline bool
4677check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
4678{
4679 /*
4680 * It's possible that cpuset's mems_allowed and the nodemask from
4681 * mempolicy don't intersect. This should be normally dealt with by
4682 * policy_nodemask(), but it's possible to race with cpuset update in
4683 * such a way the check therein was true, and then it became false
4684 * before we got our cpuset_mems_cookie here.
4685 * This assumes that for all allocations, ac->nodemask can come only
4686 * from MPOL_BIND mempolicy (whose documented semantics is to be ignored
4687 * when it does not intersect with the cpuset restrictions) or the
4688 * caller can deal with a violated nodemask.
4689 */
4690 if (cpusets_enabled() && ac->nodemask &&
4691 !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) {
4692 ac->nodemask = NULL;
4693 return true;
4694 }
4695
4696 /*
4697 * When updating a task's mems_allowed or mempolicy nodemask, it is
4698 * possible to race with parallel threads in such a way that our
4699 * allocation can fail while the mask is being updated. If we are about
4700 * to fail, check if the cpuset changed during allocation and if so,
4701 * retry.
4702 */
4703 if (read_mems_allowed_retry(cpuset_mems_cookie))
4704 return true;
4705
4706 return false;
4707}
4708
11e33f6a
MG
4709static inline struct page *
4710__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
a9263751 4711 struct alloc_context *ac)
11e33f6a 4712{
d0164adc 4713 bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
282722b0 4714 const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
11e33f6a 4715 struct page *page = NULL;
c603844b 4716 unsigned int alloc_flags;
11e33f6a 4717 unsigned long did_some_progress;
5ce9bfef 4718 enum compact_priority compact_priority;
c5d01d0d 4719 enum compact_result compact_result;
5ce9bfef
VB
4720 int compaction_retries;
4721 int no_progress_loops;
5ce9bfef 4722 unsigned int cpuset_mems_cookie;
cd04ae1e 4723 int reserve_flags;
1da177e4 4724
d0164adc
MG
4725 /*
4726 * We also sanity check to catch abuse of atomic reserves being used by
4727 * callers that are not in atomic context.
4728 */
4729 if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
4730 (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
4731 gfp_mask &= ~__GFP_ATOMIC;
4732
5ce9bfef
VB
4733retry_cpuset:
4734 compaction_retries = 0;
4735 no_progress_loops = 0;
4736 compact_priority = DEF_COMPACT_PRIORITY;
4737 cpuset_mems_cookie = read_mems_allowed_begin();
9a67f648
MH
4738
4739 /*
4740 * The fast path uses conservative alloc_flags to succeed only until
4741 * kswapd needs to be woken up, and to avoid the cost of setting up
4742 * alloc_flags precisely. So we do that now.
4743 */
4744 alloc_flags = gfp_to_alloc_flags(gfp_mask);
4745
e47483bc
VB
4746 /*
4747 * We need to recalculate the starting point for the zonelist iterator
4748 * because we might have used different nodemask in the fast path, or
4749 * there was a cpuset modification and we are retrying - otherwise we
4750 * could end up iterating over non-eligible zones endlessly.
4751 */
4752 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
97a225e6 4753 ac->highest_zoneidx, ac->nodemask);
e47483bc
VB
4754 if (!ac->preferred_zoneref->zone)
4755 goto nopage;
4756
0a79cdad 4757 if (alloc_flags & ALLOC_KSWAPD)
5ecd9d40 4758 wake_all_kswapds(order, gfp_mask, ac);
23771235
VB
4759
4760 /*
4761 * The adjusted alloc_flags might result in immediate success, so try
4762 * that first
4763 */
4764 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
4765 if (page)
4766 goto got_pg;
4767
a8161d1e
VB
4768 /*
4769 * For costly allocations, try direct compaction first, as it's likely
282722b0
VB
4770 * that we have enough base pages and don't need to reclaim. For non-
4771 * movable high-order allocations, do that as well, as compaction will
4772 * try to prevent permanent fragmentation by migrating from blocks of the
4773 * same migratetype.
4774 * Don't try this for allocations that are allowed to ignore
4775 * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
a8161d1e 4776 */
282722b0
VB
4777 if (can_direct_reclaim &&
4778 (costly_order ||
4779 (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
4780 && !gfp_pfmemalloc_allowed(gfp_mask)) {
a8161d1e 4781 page = __alloc_pages_direct_compact(gfp_mask, order,
4782 alloc_flags, ac,
a5508cd8 4783 INIT_COMPACT_PRIORITY,
a8161d1e 4784 &compact_result);
4785 if (page)
4786 goto got_pg;
4787
cc638f32 4788 /*
4789 * Checks for costly allocations with __GFP_NORETRY, which
4790 * includes some THP page fault allocations
4791 */
4792 if (costly_order && (gfp_mask & __GFP_NORETRY)) {
b39d0ee2 4793 /*
4794 * If allocating entire pageblock(s) and compaction
4795 * failed because all zones are below their low watermarks,
4796 * or compaction is prohibited because it recently failed at this
3f36d866 4797 * order, fail immediately unless the allocator has
4798 * requested compaction and reclaim retry.
b39d0ee2 4799 *
4800 * Reclaim is
4801 * - potentially very expensive because zones are far
4802 * below their low watermarks or this is part of very
4803 * bursty high order allocations,
4804 * - not guaranteed to help because isolate_freepages()
4805 * may not iterate over freed pages as part of its
4806 * linear scan, and
4807 * - unlikely to make entire pageblocks free on its
4808 * own.
4809 */
4810 if (compact_result == COMPACT_SKIPPED ||
4811 compact_result == COMPACT_DEFERRED)
4812 goto nopage;
a8161d1e 4813
a8161d1e 4814 /*
3eb2771b
VB
4815 * Looks like reclaim/compaction is worth trying, but
4816 * sync compaction could be very expensive, so keep
25160354 4817 * using async compaction.
a8161d1e 4818 */
a5508cd8 4819 compact_priority = INIT_COMPACT_PRIORITY;
a8161d1e
VB
4820 }
4821 }
23771235 4822
31a6c190 4823retry:
23771235 4824 /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
0a79cdad 4825 if (alloc_flags & ALLOC_KSWAPD)
5ecd9d40 4826 wake_all_kswapds(order, gfp_mask, ac);
31a6c190 4827
cd04ae1e
MH
4828 reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
4829 if (reserve_flags)
8e3560d9 4830 alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags);
23771235 4831
e46e7b77 4832 /*
d6a24df0
VB
4833 * Reset the nodemask and zonelist iterators if memory policies can be
4834 * ignored. These allocations are high priority and system rather than
4835 * user oriented.
e46e7b77 4836 */
cd04ae1e 4837 if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
d6a24df0 4838 ac->nodemask = NULL;
e46e7b77 4839 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
97a225e6 4840 ac->highest_zoneidx, ac->nodemask);
e46e7b77
MG
4841 }
4842
23771235 4843 /* Attempt with potentially adjusted zonelist and alloc_flags */
31a6c190 4844 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
7fb1d9fc
RS
4845 if (page)
4846 goto got_pg;
1da177e4 4847
d0164adc 4848 /* Caller is not willing to reclaim, we can't balance anything */
9a67f648 4849 if (!can_direct_reclaim)
1da177e4
LT
4850 goto nopage;
4851
9a67f648
MH
4852 /* Avoid recursion of direct reclaim */
4853 if (current->flags & PF_MEMALLOC)
6583bb64
DR
4854 goto nopage;
4855
a8161d1e
VB
4856 /* Try direct reclaim and then allocating */
4857 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
4858 &did_some_progress);
4859 if (page)
4860 goto got_pg;
4861
4862 /* Try direct compaction and then allocating */
a9263751 4863 page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
a5508cd8 4864 compact_priority, &compact_result);
56de7263
MG
4865 if (page)
4866 goto got_pg;
75f30861 4867
9083905a
JW
4868 /* Do not loop if specifically requested */
4869 if (gfp_mask & __GFP_NORETRY)
a8161d1e 4870 goto nopage;
9083905a 4871
0a0337e0
MH
4872 /*
4873 * Do not retry costly high order allocations unless they are
dcda9b04 4874 * __GFP_RETRY_MAYFAIL
0a0337e0 4875 */
dcda9b04 4876 if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
a8161d1e 4877 goto nopage;
0a0337e0 4878
0a0337e0 4879 if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
423b452e 4880 did_some_progress > 0, &no_progress_loops))
0a0337e0
MH
4881 goto retry;
4882
33c2d214
MH
4883 /*
4884 * It doesn't make any sense to retry compaction if order-0
4885 * reclaim is not able to make any progress because the current
4886 * implementation of compaction depends on a sufficient amount
4887 * of free memory (see __compaction_suitable)
4888 */
4889 if (did_some_progress > 0 &&
86a294a8 4890 should_compact_retry(ac, order, alloc_flags,
a5508cd8 4891 compact_result, &compact_priority,
d9436498 4892 &compaction_retries))
33c2d214
MH
4893 goto retry;
4894
902b6281
VB
4895
4896 /* Deal with possible cpuset update races before we start OOM killing */
4897 if (check_retry_cpuset(cpuset_mems_cookie, ac))
e47483bc
VB
4898 goto retry_cpuset;
4899
9083905a
JW
4900 /* Reclaim has failed us, start killing things */
4901 page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
4902 if (page)
4903 goto got_pg;
4904
9a67f648 4905 /* Avoid allocations with no watermarks from looping endlessly */
cd04ae1e 4906 if (tsk_is_oom_victim(current) &&
8510e69c 4907 (alloc_flags & ALLOC_OOM ||
c288983d 4908 (gfp_mask & __GFP_NOMEMALLOC)))
9a67f648
MH
4909 goto nopage;
4910
9083905a 4911 /* Retry as long as the OOM killer is making progress */
0a0337e0
MH
4912 if (did_some_progress) {
4913 no_progress_loops = 0;
9083905a 4914 goto retry;
0a0337e0 4915 }
9083905a 4916
1da177e4 4917nopage:
902b6281
VB
4918 /* Deal with possible cpuset update races before we fail */
4919 if (check_retry_cpuset(cpuset_mems_cookie, ac))
5ce9bfef
VB
4920 goto retry_cpuset;
4921
9a67f648
MH
4922 /*
4923 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
4924 * we always retry
4925 */
4926 if (gfp_mask & __GFP_NOFAIL) {
4927 /*
4928 * All existing users of __GFP_NOFAIL are blockable, so warn
4929 * of any new users that actually require GFP_NOWAIT
4930 */
4931 if (WARN_ON_ONCE(!can_direct_reclaim))
4932 goto fail;
4933
4934 /*
4935 * PF_MEMALLOC request from this context is rather bizarre
4936 * because we cannot reclaim anything and can only loop waiting
4937 * for somebody to do the work for us
4938 */
4939 WARN_ON_ONCE(current->flags & PF_MEMALLOC);
4940
4941 /*
4942 * Non-failing costly orders are a hard requirement which we
4943 * are not well prepared for, so warn about these users
4944 * so that we can identify them and convert them to something
4945 * else.
4946 */
4947 WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
4948
6c18ba7a
MH
4949 /*
4950 * Help non-failing allocations by giving them access to memory
4951 * reserves but do not use ALLOC_NO_WATERMARKS because this
4952 * could deplete whole memory reserves which would just make
4953 * the situation worse
4954 */
4955 page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
4956 if (page)
4957 goto got_pg;
4958
9a67f648
MH
4959 cond_resched();
4960 goto retry;
4961 }
4962fail:
a8e99259 4963 warn_alloc(gfp_mask, ac->nodemask,
7877cdcc 4964 "page allocation failure: order:%u", order);
1da177e4 4965got_pg:
072bb0aa 4966 return page;
1da177e4 4967}
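/*
 * Illustrative sketch (not part of this file): how common gfp_mask choices
 * steer the slowpath above. The caller and the orders are hypothetical;
 * order 4 is simply "greater than PAGE_ALLOC_COSTLY_ORDER".
 */
static struct page *example_gfp_choices(void)
{
        /* May sleep, reclaim, compact, retry and ultimately OOM-kill. */
        struct page *a = alloc_pages(GFP_KERNEL, 0);

        /* No direct reclaim allowed: bails out at the !can_direct_reclaim check. */
        struct page *b = alloc_pages(GFP_NOWAIT | __GFP_NOWARN, 0);

        /*
         * Costly order with __GFP_NORETRY (THP-style): gives up early when
         * compaction was skipped or deferred instead of looping.
         */
        struct page *c = alloc_pages(GFP_KERNEL | __GFP_NORETRY, 4);

        if (b)
                __free_pages(b, 0);
        if (c)
                __free_pages(c, 4);
        return a;
}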
11e33f6a 4968
9cd75558 4969static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
04ec6264 4970 int preferred_nid, nodemask_t *nodemask,
8e6a930b 4971 struct alloc_context *ac, gfp_t *alloc_gfp,
9cd75558 4972 unsigned int *alloc_flags)
11e33f6a 4973{
97a225e6 4974 ac->highest_zoneidx = gfp_zone(gfp_mask);
04ec6264 4975 ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
9cd75558 4976 ac->nodemask = nodemask;
01c0bfe0 4977 ac->migratetype = gfp_migratetype(gfp_mask);
11e33f6a 4978
682a3385 4979 if (cpusets_enabled()) {
8e6a930b 4980 *alloc_gfp |= __GFP_HARDWALL;
182f3d7a
MS
4981 /*
4982 * When we are in interrupt context, the cpuset of the
4983 * current task is irrelevant, so any node is OK.
4984 */
4985 if (!in_interrupt() && !ac->nodemask)
9cd75558 4986 ac->nodemask = &cpuset_current_mems_allowed;
51047820
VB
4987 else
4988 *alloc_flags |= ALLOC_CPUSET;
682a3385
MG
4989 }
4990
d92a8cfc
PZ
4991 fs_reclaim_acquire(gfp_mask);
4992 fs_reclaim_release(gfp_mask);
11e33f6a 4993
d0164adc 4994 might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
11e33f6a
MG
4995
4996 if (should_fail_alloc_page(gfp_mask, order))
9cd75558 4997 return false;
11e33f6a 4998
8e3560d9 4999 *alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags);
d883c6cf 5000
c9ab0c4f 5001 /* Dirty zone balancing only done in the fast path */
9cd75558 5002 ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
c9ab0c4f 5003
e46e7b77
MG
5004 /*
5005 * The preferred zone is used for statistics but crucially it is
5006 * also used as the starting point for the zonelist iterator. It
5007 * may get reset for allocations that ignore memory policies.
5008 */
9cd75558 5009 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
97a225e6 5010 ac->highest_zoneidx, ac->nodemask);
a0622d05
MN
5011
5012 return true;
9cd75558
MG
5013}
5014
387ba26f 5015/*
0f87d9d3 5016 * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array
387ba26f
MG
5017 * @gfp: GFP flags for the allocation
5018 * @preferred_nid: The preferred NUMA node ID to allocate from
5019 * @nodemask: Set of nodes to allocate from, may be NULL
0f87d9d3
MG
5020 * @nr_pages: The number of pages desired on the list or array
5021 * @page_list: Optional list to store the allocated pages
5022 * @page_array: Optional array to store the pages
387ba26f
MG
5023 *
5024 * This is a batched version of the page allocator that attempts to
0f87d9d3
MG
5025 * allocate nr_pages quickly. Pages are added to page_list if page_list
5026 * is not NULL, otherwise it is assumed that the page_array is valid.
387ba26f 5027 *
0f87d9d3
MG
5028 * For lists, nr_pages is the number of pages that should be allocated.
5029 *
5030 * For arrays, only NULL elements are populated with pages and nr_pages
5031 * is the maximum number of pages that will be stored in the array.
5032 *
5033 * Returns the number of pages on the list or array.
387ba26f
MG
5034 */
5035unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
5036 nodemask_t *nodemask, int nr_pages,
0f87d9d3
MG
5037 struct list_head *page_list,
5038 struct page **page_array)
387ba26f
MG
5039{
5040 struct page *page;
5041 unsigned long flags;
5042 struct zone *zone;
5043 struct zoneref *z;
5044 struct per_cpu_pages *pcp;
5045 struct list_head *pcp_list;
5046 struct alloc_context ac;
5047 gfp_t alloc_gfp;
5048 unsigned int alloc_flags = ALLOC_WMARK_LOW;
0f87d9d3 5049 int nr_populated = 0;
387ba26f 5050
ce76f9a1 5051 if (unlikely(nr_pages <= 0))
387ba26f
MG
5052 return 0;
5053
0f87d9d3
MG
5054 /*
5055 * Skip populated array elements to determine if any pages need
5056 * to be allocated before disabling IRQs.
5057 */
b08e50dd 5058 while (page_array && nr_populated < nr_pages && page_array[nr_populated])
0f87d9d3
MG
5059 nr_populated++;
5060
b3b64ebd
MG
5061 /* Already populated array? */
5062 if (unlikely(page_array && nr_pages - nr_populated == 0))
ff4b2b40 5063 return nr_populated;
b3b64ebd 5064
387ba26f 5065 /* Use the single page allocator for one page. */
0f87d9d3 5066 if (nr_pages - nr_populated == 1)
387ba26f
MG
5067 goto failed;
5068
5069 /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */
5070 gfp &= gfp_allowed_mask;
5071 alloc_gfp = gfp;
5072 if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags))
5073 return 0;
5074 gfp = alloc_gfp;
5075
5076 /* Find an allowed local zone that meets the low watermark. */
5077 for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) {
5078 unsigned long mark;
5079
5080 if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) &&
5081 !__cpuset_zone_allowed(zone, gfp)) {
5082 continue;
5083 }
5084
5085 if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone &&
5086 zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) {
5087 goto failed;
5088 }
5089
5090 mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
5091 if (zone_watermark_fast(zone, 0, mark,
5092 zonelist_zone_idx(ac.preferred_zoneref),
5093 alloc_flags, gfp)) {
5094 break;
5095 }
5096 }
5097
5098 /*
5099 * If there are no allowed local zones that meet the watermarks then
5100 * try to allocate a single page and reclaim if necessary.
5101 */
ce76f9a1 5102 if (unlikely(!zone))
387ba26f
MG
5103 goto failed;
5104
5105 /* Attempt the batch allocation */
5106 local_irq_save(flags);
5107 pcp = &this_cpu_ptr(zone->pageset)->pcp;
5108 pcp_list = &pcp->lists[ac.migratetype];
5109
0f87d9d3
MG
5110 while (nr_populated < nr_pages) {
5111
5112 /* Skip existing pages */
5113 if (page_array && page_array[nr_populated]) {
5114 nr_populated++;
5115 continue;
5116 }
5117
387ba26f
MG
5118 page = __rmqueue_pcplist(zone, ac.migratetype, alloc_flags,
5119 pcp, pcp_list);
ce76f9a1 5120 if (unlikely(!page)) {
387ba26f 5121 /* Try and get at least one page */
0f87d9d3 5122 if (!nr_populated)
387ba26f
MG
5123 goto failed_irq;
5124 break;
5125 }
5126
5127 /*
5128 * Ideally this would be batched, but the best way to do
5129 * that cheaply is to first convert zone_statistics to an
5130 * imprecise per-cpu counter like vm_events (avoiding the
5131 * RMW cycle) and then do the accounting with IRQs enabled.
5132 */
5133 __count_zid_vm_events(PGALLOC, zone_idx(zone), 1);
5134 zone_statistics(ac.preferred_zoneref->zone, zone);
5135
5136 prep_new_page(page, 0, gfp, 0);
0f87d9d3
MG
5137 if (page_list)
5138 list_add(&page->lru, page_list);
5139 else
5140 page_array[nr_populated] = page;
5141 nr_populated++;
387ba26f
MG
5142 }
5143
5144 local_irq_restore(flags);
5145
0f87d9d3 5146 return nr_populated;
387ba26f
MG
5147
5148failed_irq:
5149 local_irq_restore(flags);
5150
5151failed:
5152 page = __alloc_pages(gfp, 0, preferred_nid, nodemask);
5153 if (page) {
0f87d9d3
MG
5154 if (page_list)
5155 list_add(&page->lru, page_list);
5156 else
5157 page_array[nr_populated] = page;
5158 nr_populated++;
387ba26f
MG
5159 }
5160
0f87d9d3 5161 return nr_populated;
387ba26f
MG
5162}
5163EXPORT_SYMBOL_GPL(__alloc_pages_bulk);
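/*
 * Illustrative sketch (not part of this file): filling a caller-provided,
 * NULL-initialised array of order-0 pages with the bulk interface above.
 * The caller and error handling are hypothetical; only NULL slots are
 * populated, and the return value is the number of populated slots.
 */
static int example_bulk_fill(struct page **pages, int nr)
{
        /* Local node, no explicit nodemask. */
        int filled = __alloc_pages_bulk(GFP_KERNEL, numa_mem_id(), NULL,
                                        nr, NULL, pages);

        /* Top up one page at a time if the fast per-cpu lists ran dry. */
        while (filled < nr) {
                pages[filled] = alloc_page(GFP_KERNEL);
                if (!pages[filled])
                        return -ENOMEM;
                filled++;
        }
        return 0;
}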
5164
9cd75558
MG
5165/*
5166 * This is the 'heart' of the zoned buddy allocator.
5167 */
84172f4b 5168struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
04ec6264 5169 nodemask_t *nodemask)
9cd75558
MG
5170{
5171 struct page *page;
5172 unsigned int alloc_flags = ALLOC_WMARK_LOW;
8e6a930b 5173 gfp_t alloc_gfp; /* The gfp_t that was actually used for allocation */
9cd75558
MG
5174 struct alloc_context ac = { };
5175
c63ae43b
MH
5176 /*
5177 * There are several places where we assume that the order value is sane
5178 * so bail out early if the request is out of bound.
5179 */
5180 if (unlikely(order >= MAX_ORDER)) {
6e5e0f28 5181 WARN_ON_ONCE(!(gfp & __GFP_NOWARN));
c63ae43b
MH
5182 return NULL;
5183 }
5184
6e5e0f28 5185 gfp &= gfp_allowed_mask;
da6df1b0
PT
5186 /*
5187 * Apply scoped allocation constraints. This is mainly about GFP_NOFS
5188 * and GFP_NOIO, which have to be inherited by all allocation requests
5189 * from a particular context which has been marked by
8e3560d9
PT
5190 * memalloc_no{fs,io}_{save,restore}. And PF_MEMALLOC_PIN which ensures
5191 * movable zones are not used during allocation.
da6df1b0
PT
5192 */
5193 gfp = current_gfp_context(gfp);
6e5e0f28
MWO
5194 alloc_gfp = gfp;
5195 if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac,
8e6a930b 5196 &alloc_gfp, &alloc_flags))
9cd75558
MG
5197 return NULL;
5198
6bb15450
MG
5199 /*
5200 * Forbid the first pass from falling back to types that fragment
5201 * memory until all local zones are considered.
5202 */
6e5e0f28 5203 alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp);
6bb15450 5204
5117f45d 5205 /* First allocation attempt */
8e6a930b 5206 page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
4fcb0971
MG
5207 if (likely(page))
5208 goto out;
11e33f6a 5209
da6df1b0 5210 alloc_gfp = gfp;
4fcb0971 5211 ac.spread_dirty_pages = false;
23f086f9 5212
4741526b
MG
5213 /*
5214 * Restore the original nodemask if it was potentially replaced with
5215 * &cpuset_current_mems_allowed to optimize the fast-path attempt.
5216 */
97ce86f9 5217 ac.nodemask = nodemask;
16096c25 5218
8e6a930b 5219 page = __alloc_pages_slowpath(alloc_gfp, order, &ac);
cc9a6c87 5220
4fcb0971 5221out:
6e5e0f28
MWO
5222 if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT) && page &&
5223 unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) {
c4159a75
VD
5224 __free_pages(page, order);
5225 page = NULL;
4949148a
VD
5226 }
5227
8e6a930b 5228 trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
4fcb0971 5229
11e33f6a 5230 return page;
1da177e4 5231}
84172f4b 5232EXPORT_SYMBOL(__alloc_pages);
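/*
 * Illustrative sketch (not part of this file): most callers reach
 * __alloc_pages() through the alloc_pages()/alloc_pages_node() wrappers in
 * gfp.h; a direct call looks like this. @nid and the order are hypothetical.
 */
static struct page *example_direct_alloc(int nid)
{
        /*
         * Four contiguous, zeroed pages, preferring @nid but falling back
         * along that node's zonelist; freed later with __free_pages(page, 2).
         */
        return __alloc_pages(GFP_KERNEL | __GFP_ZERO, 2, nid, NULL);
}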
1da177e4
LT
5233
5234/*
9ea9a680
MH
5235 * Common helper functions. Never use with __GFP_HIGHMEM because the returned
5236 * address cannot represent highmem pages. Use alloc_pages and then kmap if
5237 * you need to access high mem.
1da177e4 5238 */
920c7a5d 5239unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
1da177e4 5240{
945a1113
AM
5241 struct page *page;
5242
9ea9a680 5243 page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order);
1da177e4
LT
5244 if (!page)
5245 return 0;
5246 return (unsigned long) page_address(page);
5247}
1da177e4
LT
5248EXPORT_SYMBOL(__get_free_pages);
5249
920c7a5d 5250unsigned long get_zeroed_page(gfp_t gfp_mask)
1da177e4 5251{
945a1113 5252 return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
1da177e4 5253}
1da177e4
LT
5254EXPORT_SYMBOL(get_zeroed_page);
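/*
 * Illustrative sketch (not part of this file): the address-returning helpers
 * above pair with free_page()/free_pages(). The buffer sizes are hypothetical.
 */
static int example_scratch_buffers(void)
{
        unsigned long zeroed = get_zeroed_page(GFP_KERNEL);
        unsigned long buf = __get_free_pages(GFP_KERNEL, 1);    /* two pages */

        if (!zeroed || !buf) {
                /* Freeing address 0 is a no-op, see free_pages() below. */
                free_page(zeroed);
                free_pages(buf, 1);
                return -ENOMEM;
        }
        /* ... use the buffers ... */
        free_page(zeroed);
        free_pages(buf, 1);
        return 0;
}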
5255
742aa7fb 5256static inline void free_the_page(struct page *page, unsigned int order)
1da177e4 5257{
742aa7fb
AL
5258 if (order == 0) /* Via pcp? */
5259 free_unref_page(page);
5260 else
7fef431b 5261 __free_pages_ok(page, order, FPI_NONE);
1da177e4
LT
5262}
5263
7f194fbb
MWO
5264/**
5265 * __free_pages - Free pages allocated with alloc_pages().
5266 * @page: The page pointer returned from alloc_pages().
5267 * @order: The order of the allocation.
5268 *
5269 * This function can free multi-page allocations that are not compound
5270 * pages. It does not check that the @order passed in matches that of
5271 * the allocation, so it is easy to leak memory. Freeing more memory
5272 * than was allocated will probably emit a warning.
5273 *
5274 * If the last reference to this page is speculative, it will be released
5275 * by put_page() which only frees the first page of a non-compound
5276 * allocation. To prevent the remaining pages from being leaked, we free
5277 * the subsequent pages here. If you want to use the page's reference
5278 * count to decide when to free the allocation, you should allocate a
5279 * compound page, and use put_page() instead of __free_pages().
5280 *
5281 * Context: May be called in interrupt context or while holding a normal
5282 * spinlock, but not in NMI context or while holding a raw spinlock.
5283 */
742aa7fb
AL
5284void __free_pages(struct page *page, unsigned int order)
5285{
5286 if (put_page_testzero(page))
5287 free_the_page(page, order);
e320d301
MWO
5288 else if (!PageHead(page))
5289 while (order-- > 0)
5290 free_the_page(page + (1 << order), order);
742aa7fb 5291}
1da177e4
LT
5292EXPORT_SYMBOL(__free_pages);
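/*
 * Illustrative sketch (not part of this file): the two freeing styles the
 * comment above contrasts. The orders are hypothetical.
 */
static void example_free_styles(void)
{
        struct page *plain, *comp;

        /* Non-compound: the caller must remember the order and free explicitly. */
        plain = alloc_pages(GFP_KERNEL, 2);
        if (plain)
                __free_pages(plain, 2);

        /*
         * Compound: one reference count covers the whole allocation, so the
         * final put_page() releases all 1 << 2 pages.
         */
        comp = alloc_pages(GFP_KERNEL | __GFP_COMP, 2);
        if (comp)
                put_page(comp);
}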
5293
920c7a5d 5294void free_pages(unsigned long addr, unsigned int order)
1da177e4
LT
5295{
5296 if (addr != 0) {
725d704e 5297 VM_BUG_ON(!virt_addr_valid((void *)addr));
1da177e4
LT
5298 __free_pages(virt_to_page((void *)addr), order);
5299 }
5300}
5301
5302EXPORT_SYMBOL(free_pages);
5303
b63ae8ca
AD
5304/*
5305 * Page Fragment:
5306 * An arbitrary-length arbitrary-offset area of memory which resides
5307 * within a 0 or higher order page. Multiple fragments within that page
5308 * are individually refcounted, in the page's reference counter.
5309 *
5310 * The page_frag functions below provide a simple allocation framework for
5311 * page fragments. This is used by the network stack and network device
5312 * drivers to provide a backing region of memory for use as either an
5313 * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
5314 */
2976db80
AD
5315static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
5316 gfp_t gfp_mask)
b63ae8ca
AD
5317{
5318 struct page *page = NULL;
5319 gfp_t gfp = gfp_mask;
5320
5321#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
5322 gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
5323 __GFP_NOMEMALLOC;
5324 page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
5325 PAGE_FRAG_CACHE_MAX_ORDER);
5326 nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
5327#endif
5328 if (unlikely(!page))
5329 page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
5330
5331 nc->va = page ? page_address(page) : NULL;
5332
5333 return page;
5334}
5335
2976db80 5336void __page_frag_cache_drain(struct page *page, unsigned int count)
44fdffd7
AD
5337{
5338 VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
5339
742aa7fb
AL
5340 if (page_ref_sub_and_test(page, count))
5341 free_the_page(page, compound_order(page));
44fdffd7 5342}
2976db80 5343EXPORT_SYMBOL(__page_frag_cache_drain);
44fdffd7 5344
b358e212
KH
5345void *page_frag_alloc_align(struct page_frag_cache *nc,
5346 unsigned int fragsz, gfp_t gfp_mask,
5347 unsigned int align_mask)
b63ae8ca
AD
5348{
5349 unsigned int size = PAGE_SIZE;
5350 struct page *page;
5351 int offset;
5352
5353 if (unlikely(!nc->va)) {
5354refill:
2976db80 5355 page = __page_frag_cache_refill(nc, gfp_mask);
b63ae8ca
AD
5356 if (!page)
5357 return NULL;
5358
5359#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
5360 /* if size can vary use size else just use PAGE_SIZE */
5361 size = nc->size;
5362#endif
5363 /* Even if we own the page, we do not use atomic_set().
5364 * This would break get_page_unless_zero() users.
5365 */
86447726 5366 page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
b63ae8ca
AD
5367
5368 /* reset page count bias and offset to start of new frag */
2f064f34 5369 nc->pfmemalloc = page_is_pfmemalloc(page);
86447726 5370 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
b63ae8ca
AD
5371 nc->offset = size;
5372 }
5373
5374 offset = nc->offset - fragsz;
5375 if (unlikely(offset < 0)) {
5376 page = virt_to_page(nc->va);
5377
fe896d18 5378 if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
b63ae8ca
AD
5379 goto refill;
5380
d8c19014
DZ
5381 if (unlikely(nc->pfmemalloc)) {
5382 free_the_page(page, compound_order(page));
5383 goto refill;
5384 }
5385
b63ae8ca
AD
5386#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
5387 /* if size can vary use size else just use PAGE_SIZE */
5388 size = nc->size;
5389#endif
5390 /* OK, page count is 0, we can safely set it */
86447726 5391 set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
b63ae8ca
AD
5392
5393 /* reset page count bias and offset to start of new frag */
86447726 5394 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
b63ae8ca
AD
5395 offset = size - fragsz;
5396 }
5397
5398 nc->pagecnt_bias--;
b358e212 5399 offset &= align_mask;
b63ae8ca
AD
5400 nc->offset = offset;
5401
5402 return nc->va + offset;
5403}
b358e212 5404EXPORT_SYMBOL(page_frag_alloc_align);
b63ae8ca
AD
5405
5406/*
5407 * Frees a page fragment allocated out of either a compound or order 0 page.
5408 */
8c2dd3e4 5409void page_frag_free(void *addr)
b63ae8ca
AD
5410{
5411 struct page *page = virt_to_head_page(addr);
5412
742aa7fb
AL
5413 if (unlikely(put_page_testzero(page)))
5414 free_the_page(page, compound_order(page));
b63ae8ca 5415}
8c2dd3e4 5416EXPORT_SYMBOL(page_frag_free);
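/*
 * Illustrative sketch (not part of this file): carving a small buffer out of
 * a page_frag_cache, the pattern used for sk_buff heads. The cache, the
 * fragment size and the GFP flags are hypothetical.
 */
static void example_frag_roundtrip(struct page_frag_cache *nc)
{
        /* ~0u align_mask requests no alignment beyond what the cache provides. */
        void *buf = page_frag_alloc_align(nc, 256, GFP_ATOMIC, ~0u);

        if (!buf)
                return;
        /* ... fill the 256-byte fragment ... */
        page_frag_free(buf);
}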
b63ae8ca 5417
d00181b9
KS
5418static void *make_alloc_exact(unsigned long addr, unsigned int order,
5419 size_t size)
ee85c2e1
AK
5420{
5421 if (addr) {
5422 unsigned long alloc_end = addr + (PAGE_SIZE << order);
5423 unsigned long used = addr + PAGE_ALIGN(size);
5424
5425 split_page(virt_to_page((void *)addr), order);
5426 while (used < alloc_end) {
5427 free_page(used);
5428 used += PAGE_SIZE;
5429 }
5430 }
5431 return (void *)addr;
5432}
5433
2be0ffe2
TT
5434/**
5435 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
5436 * @size: the number of bytes to allocate
63931eb9 5437 * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
2be0ffe2
TT
5438 *
5439 * This function is similar to alloc_pages(), except that it allocates the
5440 * minimum number of pages to satisfy the request. alloc_pages() can only
5441 * allocate memory in power-of-two pages.
5442 *
5443 * This function is also limited by MAX_ORDER.
5444 *
5445 * Memory allocated by this function must be released by free_pages_exact().
a862f68a
MR
5446 *
5447 * Return: pointer to the allocated area or %NULL in case of error.
2be0ffe2
TT
5448 */
5449void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
5450{
5451 unsigned int order = get_order(size);
5452 unsigned long addr;
5453
63931eb9
VB
5454 if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
5455 gfp_mask &= ~__GFP_COMP;
5456
2be0ffe2 5457 addr = __get_free_pages(gfp_mask, order);
ee85c2e1 5458 return make_alloc_exact(addr, order, size);
2be0ffe2
TT
5459}
5460EXPORT_SYMBOL(alloc_pages_exact);
5461
ee85c2e1
AK
5462/**
5463 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
5464 * pages on a node.
b5e6ab58 5465 * @nid: the preferred node ID where memory should be allocated
ee85c2e1 5466 * @size: the number of bytes to allocate
63931eb9 5467 * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
ee85c2e1
AK
5468 *
5469 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
5470 * back.
a862f68a
MR
5471 *
5472 * Return: pointer to the allocated area or %NULL in case of error.
ee85c2e1 5473 */
e1931811 5474void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
ee85c2e1 5475{
d00181b9 5476 unsigned int order = get_order(size);
63931eb9
VB
5477 struct page *p;
5478
5479 if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
5480 gfp_mask &= ~__GFP_COMP;
5481
5482 p = alloc_pages_node(nid, gfp_mask, order);
ee85c2e1
AK
5483 if (!p)
5484 return NULL;
5485 return make_alloc_exact((unsigned long)page_address(p), order, size);
5486}
ee85c2e1 5487
2be0ffe2
TT
5488/**
5489 * free_pages_exact - release memory allocated via alloc_pages_exact()
5490 * @virt: the value returned by alloc_pages_exact.
5491 * @size: size of allocation, same value as passed to alloc_pages_exact().
5492 *
5493 * Release the memory allocated by a previous call to alloc_pages_exact.
5494 */
5495void free_pages_exact(void *virt, size_t size)
5496{
5497 unsigned long addr = (unsigned long)virt;
5498 unsigned long end = addr + PAGE_ALIGN(size);
5499
5500 while (addr < end) {
5501 free_page(addr);
5502 addr += PAGE_SIZE;
5503 }
5504}
5505EXPORT_SYMBOL(free_pages_exact);
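/*
 * Illustrative sketch (not part of this file): an exact-size allocation
 * paired with free_pages_exact(). The size is hypothetical.
 */
static int example_exact_alloc(size_t bytes)
{
        /* Usable area is PAGE_ALIGN(bytes); the tail of the 2^order block is freed. */
        void *buf = alloc_pages_exact(bytes, GFP_KERNEL | __GFP_ZERO);

        if (!buf)
                return -ENOMEM;
        /* ... use buf ... */
        free_pages_exact(buf, bytes);   /* must pass the same size back */
        return 0;
}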
5506
e0fb5815
ZY
5507/**
5508 * nr_free_zone_pages - count number of pages beyond high watermark
5509 * @offset: The zone index of the highest zone
5510 *
a862f68a 5511 * nr_free_zone_pages() counts the number of pages which are beyond the
e0fb5815
ZY
5512 * high watermark within all zones at or below a given zone index. For each
5513 * zone, the number of pages is calculated as:
0e056eb5
MCC
5514 *
5515 * nr_free_zone_pages = managed_pages - high_pages
a862f68a
MR
5516 *
5517 * Return: number of pages beyond high watermark.
e0fb5815 5518 */
ebec3862 5519static unsigned long nr_free_zone_pages(int offset)
1da177e4 5520{
dd1a239f 5521 struct zoneref *z;
54a6eb5c
MG
5522 struct zone *zone;
5523
e310fd43 5524 /* Just pick one node, since fallback list is circular */
ebec3862 5525 unsigned long sum = 0;
1da177e4 5526
0e88460d 5527 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
1da177e4 5528
54a6eb5c 5529 for_each_zone_zonelist(zone, z, zonelist, offset) {
9705bea5 5530 unsigned long size = zone_managed_pages(zone);
41858966 5531 unsigned long high = high_wmark_pages(zone);
e310fd43
MB
5532 if (size > high)
5533 sum += size - high;
1da177e4
LT
5534 }
5535
5536 return sum;
5537}
5538
e0fb5815
ZY
5539/**
5540 * nr_free_buffer_pages - count number of pages beyond high watermark
5541 *
5542 * nr_free_buffer_pages() counts the number of pages which are beyond the high
5543 * watermark within ZONE_DMA and ZONE_NORMAL.
a862f68a
MR
5544 *
5545 * Return: number of pages beyond high watermark within ZONE_DMA and
5546 * ZONE_NORMAL.
1da177e4 5547 */
ebec3862 5548unsigned long nr_free_buffer_pages(void)
1da177e4 5549{
af4ca457 5550 return nr_free_zone_pages(gfp_zone(GFP_USER));
1da177e4 5551}
c2f1a551 5552EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
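/*
 * Illustrative sketch (not part of this file): deriving a default budget
 * from the amount of DMA/NORMAL memory above the high watermarks, the way
 * some subsystems size their caches at init time. The divisor is hypothetical.
 */
static unsigned long example_cache_budget(void)
{
        /* Let the hypothetical cache use at most 1/32 of these pages. */
        return nr_free_buffer_pages() / 32;
}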
1da177e4 5553
08e0f6a9 5554static inline void show_node(struct zone *zone)
1da177e4 5555{
e5adfffc 5556 if (IS_ENABLED(CONFIG_NUMA))
25ba77c1 5557 printk("Node %d ", zone_to_nid(zone));
1da177e4 5558}
1da177e4 5559
d02bd27b
IR
5560long si_mem_available(void)
5561{
5562 long available;
5563 unsigned long pagecache;
5564 unsigned long wmark_low = 0;
5565 unsigned long pages[NR_LRU_LISTS];
b29940c1 5566 unsigned long reclaimable;
d02bd27b
IR
5567 struct zone *zone;
5568 int lru;
5569
5570 for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
2f95ff90 5571 pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
d02bd27b
IR
5572
5573 for_each_zone(zone)
a9214443 5574 wmark_low += low_wmark_pages(zone);
d02bd27b
IR
5575
5576 /*
5577 * Estimate the amount of memory available for userspace allocations,
5578 * without causing swapping.
5579 */
c41f012a 5580 available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
d02bd27b
IR
5581
5582 /*
5583 * Not all the page cache can be freed, otherwise the system will
5584 * start swapping. Assume at least half of the page cache, or the
5585 * low watermark worth of cache, needs to stay.
5586 */
5587 pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
5588 pagecache -= min(pagecache / 2, wmark_low);
5589 available += pagecache;
5590
5591 /*
b29940c1
VB
5592 * Part of the reclaimable slab and other kernel memory consists of
5593 * items that are in use, and cannot be freed. Cap this estimate at the
5594 * low watermark.
d02bd27b 5595 */
d42f3245
RG
5596 reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
5597 global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
b29940c1 5598 available += reclaimable - min(reclaimable / 2, wmark_low);
034ebf65 5599
d02bd27b
IR
5600 if (available < 0)
5601 available = 0;
5602 return available;
5603}
5604EXPORT_SYMBOL_GPL(si_mem_available);
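/*
 * Illustrative sketch (not part of this file): si_mem_available() is what
 * /proc/meminfo reports as MemAvailable; a caller that wants headroom before
 * starting an optional, memory-hungry task might check it like this. The
 * threshold handling is hypothetical.
 */
static bool example_have_headroom(unsigned long needed_pages)
{
        long avail = si_mem_available();

        return avail > 0 && (unsigned long)avail > needed_pages;
}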
5605
1da177e4
LT
5606void si_meminfo(struct sysinfo *val)
5607{
ca79b0c2 5608 val->totalram = totalram_pages();
11fb9989 5609 val->sharedram = global_node_page_state(NR_SHMEM);
c41f012a 5610 val->freeram = global_zone_page_state(NR_FREE_PAGES);
1da177e4 5611 val->bufferram = nr_blockdev_pages();
ca79b0c2 5612 val->totalhigh = totalhigh_pages();
1da177e4 5613 val->freehigh = nr_free_highpages();
1da177e4
LT
5614 val->mem_unit = PAGE_SIZE;
5615}
5616
5617EXPORT_SYMBOL(si_meminfo);
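/*
 * Illustrative sketch (not part of this file): filling a struct sysinfo the
 * way sys_sysinfo() and /proc/meminfo do, and converting to bytes with
 * mem_unit. The helper name is hypothetical.
 */
static u64 example_total_ram_bytes(void)
{
        struct sysinfo si;

        si_meminfo(&si);
        return (u64)si.totalram * si.mem_unit;
}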
5618
5619#ifdef CONFIG_NUMA
5620void si_meminfo_node(struct sysinfo *val, int nid)
5621{
cdd91a77
JL
5622 int zone_type; /* needs to be signed */
5623 unsigned long managed_pages = 0;
fc2bd799
JK
5624 unsigned long managed_highpages = 0;
5625 unsigned long free_highpages = 0;
1da177e4
LT
5626 pg_data_t *pgdat = NODE_DATA(nid);
5627
cdd91a77 5628 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
9705bea5 5629 managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
cdd91a77 5630 val->totalram = managed_pages;
11fb9989 5631 val->sharedram = node_page_state(pgdat, NR_SHMEM);
75ef7184 5632 val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
98d2b0eb 5633#ifdef CONFIG_HIGHMEM
fc2bd799
JK
5634 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
5635 struct zone *zone = &pgdat->node_zones[zone_type];
5636
5637 if (is_highmem(zone)) {
9705bea5 5638 managed_highpages += zone_managed_pages(zone);
fc2bd799
JK
5639 free_highpages += zone_page_state(zone, NR_FREE_PAGES);
5640 }
5641 }
5642 val->totalhigh = managed_highpages;
5643 val->freehigh = free_highpages;
98d2b0eb 5644#else
fc2bd799
JK
5645 val->totalhigh = managed_highpages;
5646 val->freehigh = free_highpages;
98d2b0eb 5647#endif
1da177e4
LT
5648 val->mem_unit = PAGE_SIZE;
5649}
5650#endif
5651
ddd588b5 5652/*
7bf02ea2
DR
5653 * Determine whether the node should be displayed or not, depending on whether
5654 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
ddd588b5 5655 */
9af744d7 5656static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
ddd588b5 5657{
ddd588b5 5658 if (!(flags & SHOW_MEM_FILTER_NODES))
9af744d7 5659 return false;
ddd588b5 5660
9af744d7
MH
5661 /*
5662 * no node mask - aka implicit memory numa policy. Do not bother with
5663 * the synchronization - read_mems_allowed_begin - because we do not
5664 * have to be precise here.
5665 */
5666 if (!nodemask)
5667 nodemask = &cpuset_current_mems_allowed;
5668
5669 return !node_isset(nid, *nodemask);
ddd588b5
DR
5670}
5671
1da177e4
LT
5672#define K(x) ((x) << (PAGE_SHIFT-10))
5673
377e4f16
RV
5674static void show_migration_types(unsigned char type)
5675{
5676 static const char types[MIGRATE_TYPES] = {
5677 [MIGRATE_UNMOVABLE] = 'U',
377e4f16 5678 [MIGRATE_MOVABLE] = 'M',
475a2f90
VB
5679 [MIGRATE_RECLAIMABLE] = 'E',
5680 [MIGRATE_HIGHATOMIC] = 'H',
377e4f16
RV
5681#ifdef CONFIG_CMA
5682 [MIGRATE_CMA] = 'C',
5683#endif
194159fb 5684#ifdef CONFIG_MEMORY_ISOLATION
377e4f16 5685 [MIGRATE_ISOLATE] = 'I',
194159fb 5686#endif
377e4f16
RV
5687 };
5688 char tmp[MIGRATE_TYPES + 1];
5689 char *p = tmp;
5690 int i;
5691
5692 for (i = 0; i < MIGRATE_TYPES; i++) {
5693 if (type & (1 << i))
5694 *p++ = types[i];
5695 }
5696
5697 *p = '\0';
1f84a18f 5698 printk(KERN_CONT "(%s) ", tmp);
377e4f16
RV
5699}
5700
1da177e4
LT
5701/*
5702 * Show free area list (used inside shift_scroll-lock stuff)
5703 * We also calculate the percentage fragmentation. We do this by counting the
5704 * memory on each free list with the exception of the first item on the list.
d1bfcdb8
KK
5705 *
5706 * Bits in @filter:
5707 * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
5708 * cpuset.
1da177e4 5709 */
9af744d7 5710void show_free_areas(unsigned int filter, nodemask_t *nodemask)
1da177e4 5711{
d1bfcdb8 5712 unsigned long free_pcp = 0;
c7241913 5713 int cpu;
1da177e4 5714 struct zone *zone;
599d0c95 5715 pg_data_t *pgdat;
1da177e4 5716
ee99c71c 5717 for_each_populated_zone(zone) {
9af744d7 5718 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
ddd588b5 5719 continue;
d1bfcdb8 5720
761b0677
KK
5721 for_each_online_cpu(cpu)
5722 free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
1da177e4
LT
5723 }
5724
a731286d
KM
5725 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
5726 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
8d92890b 5727 " unevictable:%lu dirty:%lu writeback:%lu\n"
d1bfcdb8 5728 " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
d1ce749a 5729 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
d1bfcdb8 5730 " free:%lu free_pcp:%lu free_cma:%lu\n",
599d0c95
MG
5731 global_node_page_state(NR_ACTIVE_ANON),
5732 global_node_page_state(NR_INACTIVE_ANON),
5733 global_node_page_state(NR_ISOLATED_ANON),
5734 global_node_page_state(NR_ACTIVE_FILE),
5735 global_node_page_state(NR_INACTIVE_FILE),
5736 global_node_page_state(NR_ISOLATED_FILE),
5737 global_node_page_state(NR_UNEVICTABLE),
11fb9989
MG
5738 global_node_page_state(NR_FILE_DIRTY),
5739 global_node_page_state(NR_WRITEBACK),
d42f3245
RG
5740 global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B),
5741 global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
50658e2e 5742 global_node_page_state(NR_FILE_MAPPED),
11fb9989 5743 global_node_page_state(NR_SHMEM),
f0c0c115 5744 global_node_page_state(NR_PAGETABLE),
c41f012a
MH
5745 global_zone_page_state(NR_BOUNCE),
5746 global_zone_page_state(NR_FREE_PAGES),
d1bfcdb8 5747 free_pcp,
c41f012a 5748 global_zone_page_state(NR_FREE_CMA_PAGES));
1da177e4 5749
599d0c95 5750 for_each_online_pgdat(pgdat) {
9af744d7 5751 if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
c02e50bb
MH
5752 continue;
5753
599d0c95
MG
5754 printk("Node %d"
5755 " active_anon:%lukB"
5756 " inactive_anon:%lukB"
5757 " active_file:%lukB"
5758 " inactive_file:%lukB"
5759 " unevictable:%lukB"
5760 " isolated(anon):%lukB"
5761 " isolated(file):%lukB"
50658e2e 5762 " mapped:%lukB"
11fb9989
MG
5763 " dirty:%lukB"
5764 " writeback:%lukB"
5765 " shmem:%lukB"
5766#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5767 " shmem_thp: %lukB"
5768 " shmem_pmdmapped: %lukB"
5769 " anon_thp: %lukB"
5770#endif
5771 " writeback_tmp:%lukB"
991e7673
SB
5772 " kernel_stack:%lukB"
5773#ifdef CONFIG_SHADOW_CALL_STACK
5774 " shadow_call_stack:%lukB"
5775#endif
f0c0c115 5776 " pagetables:%lukB"
599d0c95
MG
5777 " all_unreclaimable? %s"
5778 "\n",
5779 pgdat->node_id,
5780 K(node_page_state(pgdat, NR_ACTIVE_ANON)),
5781 K(node_page_state(pgdat, NR_INACTIVE_ANON)),
5782 K(node_page_state(pgdat, NR_ACTIVE_FILE)),
5783 K(node_page_state(pgdat, NR_INACTIVE_FILE)),
5784 K(node_page_state(pgdat, NR_UNEVICTABLE)),
5785 K(node_page_state(pgdat, NR_ISOLATED_ANON)),
5786 K(node_page_state(pgdat, NR_ISOLATED_FILE)),
50658e2e 5787 K(node_page_state(pgdat, NR_FILE_MAPPED)),
11fb9989
MG
5788 K(node_page_state(pgdat, NR_FILE_DIRTY)),
5789 K(node_page_state(pgdat, NR_WRITEBACK)),
1f06b81a 5790 K(node_page_state(pgdat, NR_SHMEM)),
11fb9989 5791#ifdef CONFIG_TRANSPARENT_HUGEPAGE
57b2847d 5792 K(node_page_state(pgdat, NR_SHMEM_THPS)),
a1528e21 5793 K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
69473e5d 5794 K(node_page_state(pgdat, NR_ANON_THPS)),
11fb9989 5795#endif
11fb9989 5796 K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
991e7673
SB
5797 node_page_state(pgdat, NR_KERNEL_STACK_KB),
5798#ifdef CONFIG_SHADOW_CALL_STACK
5799 node_page_state(pgdat, NR_KERNEL_SCS_KB),
5800#endif
f0c0c115 5801 K(node_page_state(pgdat, NR_PAGETABLE)),
c73322d0
JW
5802 pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
5803 "yes" : "no");
599d0c95
MG
5804 }
5805
ee99c71c 5806 for_each_populated_zone(zone) {
1da177e4
LT
5807 int i;
5808
9af744d7 5809 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
ddd588b5 5810 continue;
d1bfcdb8
KK
5811
5812 free_pcp = 0;
5813 for_each_online_cpu(cpu)
5814 free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
5815
1da177e4 5816 show_node(zone);
1f84a18f
JP
5817 printk(KERN_CONT
5818 "%s"
1da177e4
LT
5819 " free:%lukB"
5820 " min:%lukB"
5821 " low:%lukB"
5822 " high:%lukB"
e47b346a 5823 " reserved_highatomic:%luKB"
71c799f4
MK
5824 " active_anon:%lukB"
5825 " inactive_anon:%lukB"
5826 " active_file:%lukB"
5827 " inactive_file:%lukB"
5828 " unevictable:%lukB"
5a1c84b4 5829 " writepending:%lukB"
1da177e4 5830 " present:%lukB"
9feedc9d 5831 " managed:%lukB"
4a0aa73f 5832 " mlocked:%lukB"
4a0aa73f 5833 " bounce:%lukB"
d1bfcdb8
KK
5834 " free_pcp:%lukB"
5835 " local_pcp:%ukB"
d1ce749a 5836 " free_cma:%lukB"
1da177e4
LT
5837 "\n",
5838 zone->name,
88f5acf8 5839 K(zone_page_state(zone, NR_FREE_PAGES)),
41858966
MG
5840 K(min_wmark_pages(zone)),
5841 K(low_wmark_pages(zone)),
5842 K(high_wmark_pages(zone)),
e47b346a 5843 K(zone->nr_reserved_highatomic),
71c799f4
MK
5844 K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
5845 K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
5846 K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
5847 K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
5848 K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
5a1c84b4 5849 K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
1da177e4 5850 K(zone->present_pages),
9705bea5 5851 K(zone_managed_pages(zone)),
4a0aa73f 5852 K(zone_page_state(zone, NR_MLOCK)),
4a0aa73f 5853 K(zone_page_state(zone, NR_BOUNCE)),
d1bfcdb8
KK
5854 K(free_pcp),
5855 K(this_cpu_read(zone->pageset->pcp.count)),
33e077bd 5856 K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
1da177e4
LT
5857 printk("lowmem_reserve[]:");
5858 for (i = 0; i < MAX_NR_ZONES; i++)
1f84a18f
JP
5859 printk(KERN_CONT " %ld", zone->lowmem_reserve[i]);
5860 printk(KERN_CONT "\n");
1da177e4
LT
5861 }
5862
ee99c71c 5863 for_each_populated_zone(zone) {
d00181b9
KS
5864 unsigned int order;
5865 unsigned long nr[MAX_ORDER], flags, total = 0;
377e4f16 5866 unsigned char types[MAX_ORDER];
1da177e4 5867
9af744d7 5868 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
ddd588b5 5869 continue;
1da177e4 5870 show_node(zone);
1f84a18f 5871 printk(KERN_CONT "%s: ", zone->name);
1da177e4
LT
5872
5873 spin_lock_irqsave(&zone->lock, flags);
5874 for (order = 0; order < MAX_ORDER; order++) {
377e4f16
RV
5875 struct free_area *area = &zone->free_area[order];
5876 int type;
5877
5878 nr[order] = area->nr_free;
8f9de51a 5879 total += nr[order] << order;
377e4f16
RV
5880
5881 types[order] = 0;
5882 for (type = 0; type < MIGRATE_TYPES; type++) {
b03641af 5883 if (!free_area_empty(area, type))
377e4f16
RV
5884 types[order] |= 1 << type;
5885 }
1da177e4
LT
5886 }
5887 spin_unlock_irqrestore(&zone->lock, flags);
377e4f16 5888 for (order = 0; order < MAX_ORDER; order++) {
1f84a18f
JP
5889 printk(KERN_CONT "%lu*%lukB ",
5890 nr[order], K(1UL) << order);
377e4f16
RV
5891 if (nr[order])
5892 show_migration_types(types[order]);
5893 }
1f84a18f 5894 printk(KERN_CONT "= %lukB\n", K(total));
1da177e4
LT
5895 }
5896
949f7ec5
DR
5897 hugetlb_show_meminfo();
5898
11fb9989 5899 printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
e6f3602d 5900
1da177e4
LT
5901 show_swap_cache_info();
5902}
5903
19770b32
MG
5904static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
5905{
5906 zoneref->zone = zone;
5907 zoneref->zone_idx = zone_idx(zone);
5908}
5909
1da177e4
LT
5910/*
5911 * Builds allocation fallback zone lists.
1a93205b
CL
5912 *
5913 * Add all populated zones of a node to the zonelist.
1da177e4 5914 */
9d3be21b 5915static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
1da177e4 5916{
1a93205b 5917 struct zone *zone;
bc732f1d 5918 enum zone_type zone_type = MAX_NR_ZONES;
9d3be21b 5919 int nr_zones = 0;
02a68a5e
CL
5920
5921 do {
2f6726e5 5922 zone_type--;
070f8032 5923 zone = pgdat->node_zones + zone_type;
6aa303de 5924 if (managed_zone(zone)) {
9d3be21b 5925 zoneref_set_zone(zone, &zonerefs[nr_zones++]);
070f8032 5926 check_highest_zone(zone_type);
1da177e4 5927 }
2f6726e5 5928 } while (zone_type);
bc732f1d 5929
070f8032 5930 return nr_zones;
1da177e4
LT
5931}
5932
5933#ifdef CONFIG_NUMA
f0c0b2b8
KH
5934
5935static int __parse_numa_zonelist_order(char *s)
5936{
c9bff3ee 5937 /*
f0953a1b 5938 * We used to support different zonelist modes, but they turned
c9bff3ee 5939 * out to be just not useful. Let's keep the warning in place
5940 * if somebody still uses the cmd line parameter so that we do
5941 * not fail it silently.
5942 */
5943 if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
5944 pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s);
f0c0b2b8
KH
5945 return -EINVAL;
5946 }
5947 return 0;
5948}
5949
c9bff3ee
MH
5950char numa_zonelist_order[] = "Node";
5951
f0c0b2b8
KH
5952/*
5953 * sysctl handler for numa_zonelist_order
5954 */
cccad5b9 5955int numa_zonelist_order_handler(struct ctl_table *table, int write,
32927393 5956 void *buffer, size_t *length, loff_t *ppos)
f0c0b2b8 5957{
32927393
CH
5958 if (write)
5959 return __parse_numa_zonelist_order(buffer);
5960 return proc_dostring(table, write, buffer, length, ppos);
f0c0b2b8
KH
5961}
5962
5963
62bc62a8 5964#define MAX_NODE_LOAD (nr_online_nodes)
f0c0b2b8
KH
5965static int node_load[MAX_NUMNODES];
5966
1da177e4 5967/**
4dc3b16b 5968 * find_next_best_node - find the next node that should appear in a given node's fallback list
1da177e4
LT
5969 * @node: node whose fallback list we're appending
5970 * @used_node_mask: nodemask_t of already used nodes
5971 *
5972 * We use a number of factors to determine which is the next node that should
5973 * appear on a given node's fallback list. The node should not have appeared
5974 * already in @node's fallback list, and it should be the next closest node
5975 * according to the distance array (which contains arbitrary distance values
5976 * from each node to each node in the system), and should also prefer nodes
5977 * with no CPUs, since presumably they'll have very little allocation pressure
5978 * on them otherwise.
a862f68a
MR
5979 *
5980 * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
1da177e4 5981 */
f0c0b2b8 5982static int find_next_best_node(int node, nodemask_t *used_node_mask)
1da177e4 5983{
4cf808eb 5984 int n, val;
1da177e4 5985 int min_val = INT_MAX;
00ef2d2f 5986 int best_node = NUMA_NO_NODE;
1da177e4 5987
4cf808eb
LT
5988 /* Use the local node if we haven't already */
5989 if (!node_isset(node, *used_node_mask)) {
5990 node_set(node, *used_node_mask);
5991 return node;
5992 }
1da177e4 5993
4b0ef1fe 5994 for_each_node_state(n, N_MEMORY) {
1da177e4
LT
5995
5996 /* Don't want a node to appear more than once */
5997 if (node_isset(n, *used_node_mask))
5998 continue;
5999
1da177e4
LT
6000 /* Use the distance array to find the distance */
6001 val = node_distance(node, n);
6002
4cf808eb
LT
6003 /* Penalize nodes under us ("prefer the next node") */
6004 val += (n < node);
6005
1da177e4 6006 /* Give preference to headless and unused nodes */
b630749f 6007 if (!cpumask_empty(cpumask_of_node(n)))
1da177e4
LT
6008 val += PENALTY_FOR_NODE_WITH_CPUS;
6009
6010 /* Slight preference for less loaded node */
6011 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
6012 val += node_load[n];
6013
6014 if (val < min_val) {
6015 min_val = val;
6016 best_node = n;
6017 }
6018 }
6019
6020 if (best_node >= 0)
6021 node_set(best_node, *used_node_mask);
6022
6023 return best_node;
6024}
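/*
 * Worked example (illustrative only, with hypothetical distances): picking
 * the next fallback node for node 0 on a three-node system where node 2 has
 * memory but no CPUs, and node_load[] is still all zero:
 *
 *   node 1: val = (20 + 0 + PENALTY_FOR_NODE_WITH_CPUS) * MAX_NODE_LOAD * MAX_NUMNODES + 0
 *   node 2: val = (20 + 0 + 0)                          * MAX_NODE_LOAD * MAX_NUMNODES + 0
 *
 * The scaling by MAX_NODE_LOAD * MAX_NUMNODES keeps node_load[] as a mere
 * tie-breaker, so the CPU-less node 2 scores lower and is picked first,
 * matching the "prefer headless and unused nodes" rule above.
 */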
6025
f0c0b2b8
KH
6026
6027/*
6028 * Build zonelists ordered by node and zones within node.
6029 * This results in maximum locality--normal zone overflows into local
6030 * DMA zone, if any--but risks exhausting DMA zone.
6031 */
9d3be21b
MH
6032static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
6033 unsigned nr_nodes)
1da177e4 6034{
9d3be21b
MH
6035 struct zoneref *zonerefs;
6036 int i;
6037
6038 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
6039
6040 for (i = 0; i < nr_nodes; i++) {
6041 int nr_zones;
6042
6043 pg_data_t *node = NODE_DATA(node_order[i]);
f0c0b2b8 6044
9d3be21b
MH
6045 nr_zones = build_zonerefs_node(node, zonerefs);
6046 zonerefs += nr_zones;
6047 }
6048 zonerefs->zone = NULL;
6049 zonerefs->zone_idx = 0;
f0c0b2b8
KH
6050}
6051
523b9458
CL
6052/*
6053 * Build gfp_thisnode zonelists
6054 */
6055static void build_thisnode_zonelists(pg_data_t *pgdat)
6056{
9d3be21b
MH
6057 struct zoneref *zonerefs;
6058 int nr_zones;
523b9458 6059
9d3be21b
MH
6060 zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
6061 nr_zones = build_zonerefs_node(pgdat, zonerefs);
6062 zonerefs += nr_zones;
6063 zonerefs->zone = NULL;
6064 zonerefs->zone_idx = 0;
523b9458
CL
6065}
6066
f0c0b2b8
KH
6067/*
6068 * Build zonelists ordered by zone and nodes within zones.
6069 * This results in conserving DMA zone[s] until all Normal memory is
6070 * exhausted, but results in overflowing to remote node while memory
6071 * may still exist in local DMA zone.
6072 */
f0c0b2b8 6073
f0c0b2b8
KH
6074static void build_zonelists(pg_data_t *pgdat)
6075{
9d3be21b
MH
6076 static int node_order[MAX_NUMNODES];
6077 int node, load, nr_nodes = 0;
d0ddf49b 6078 nodemask_t used_mask = NODE_MASK_NONE;
f0c0b2b8 6079 int local_node, prev_node;
1da177e4
LT
6080
6081 /* NUMA-aware ordering of nodes */
6082 local_node = pgdat->node_id;
62bc62a8 6083 load = nr_online_nodes;
1da177e4 6084 prev_node = local_node;
f0c0b2b8 6085
f0c0b2b8 6086 memset(node_order, 0, sizeof(node_order));
1da177e4
LT
6087 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
6088 /*
6089 * We don't want to pressure a particular node.
6090 * So adding penalty to the first node in same
6091 * distance group to make it round-robin.
6092 */
957f822a
DR
6093 if (node_distance(local_node, node) !=
6094 node_distance(local_node, prev_node))
f0c0b2b8
KH
6095 node_load[node] = load;
6096
9d3be21b 6097 node_order[nr_nodes++] = node;
1da177e4
LT
6098 prev_node = node;
6099 load--;
1da177e4 6100 }
523b9458 6101
9d3be21b 6102 build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
523b9458 6103 build_thisnode_zonelists(pgdat);
1da177e4
LT
6104}
6105
7aac7898
LS
6106#ifdef CONFIG_HAVE_MEMORYLESS_NODES
6107/*
6108 * Return node id of node used for "local" allocations.
6109 * I.e., first node id of first zone in arg node's generic zonelist.
6110 * Used for initializing percpu 'numa_mem', which is used primarily
6111 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
6112 */
6113int local_memory_node(int node)
6114{
c33d6c06 6115 struct zoneref *z;
7aac7898 6116
c33d6c06 6117 z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
7aac7898 6118 gfp_zone(GFP_KERNEL),
c33d6c06 6119 NULL);
c1093b74 6120 return zone_to_nid(z->zone);
7aac7898
LS
6121}
6122#endif
f0c0b2b8 6123
6423aa81
JK
6124static void setup_min_unmapped_ratio(void);
6125static void setup_min_slab_ratio(void);
1da177e4
LT
6126#else /* CONFIG_NUMA */
6127
f0c0b2b8 6128static void build_zonelists(pg_data_t *pgdat)
1da177e4 6129{
19655d34 6130 int node, local_node;
9d3be21b
MH
6131 struct zoneref *zonerefs;
6132 int nr_zones;
1da177e4
LT
6133
6134 local_node = pgdat->node_id;
1da177e4 6135
9d3be21b
MH
6136 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
6137 nr_zones = build_zonerefs_node(pgdat, zonerefs);
6138 zonerefs += nr_zones;
1da177e4 6139
54a6eb5c
MG
6140 /*
6141 * Now we build the zonelist so that it contains the zones
6142 * of all the other nodes.
6143 * We don't want to pressure a particular node, so when
6144 * building the zones for node N, we make sure that the
6145 * zones coming right after the local ones are those from
6146 * node N+1 (modulo N)
6147 */
6148 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
6149 if (!node_online(node))
6150 continue;
9d3be21b
MH
6151 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
6152 zonerefs += nr_zones;
1da177e4 6153 }
54a6eb5c
MG
6154 for (node = 0; node < local_node; node++) {
6155 if (!node_online(node))
6156 continue;
9d3be21b
MH
6157 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
6158 zonerefs += nr_zones;
54a6eb5c
MG
6159 }
6160
9d3be21b
MH
6161 zonerefs->zone = NULL;
6162 zonerefs->zone_idx = 0;
1da177e4
LT
6163}
6164
6165#endif /* CONFIG_NUMA */
6166
99dcc3e5
CL
6167/*
6168 * Boot pageset table. One per cpu which is going to be used for all
6169 * zones and all nodes. The parameters will be set in such a way
6170 * that an item put on a list will immediately be handed over to
6171 * the buddy list. This is safe since pageset manipulation is done
6172 * with interrupts disabled.
6173 *
6174 * The boot_pagesets must be kept even after bootup is complete for
6175 * unused processors and/or zones. They do play a role for bootstrapping
6176 * hotplugged processors.
6177 *
6178 * zoneinfo_show() and maybe other functions do
6179 * not check if the processor is online before following the pageset pointer.
6180 * Other parts of the kernel may not check if the zone is available.
6181 */
69a8396a 6182static void pageset_init(struct per_cpu_pageset *p);
952eaf81
VB
6183/* These effectively disable the pcplists in the boot pageset completely */
6184#define BOOT_PAGESET_HIGH 0
6185#define BOOT_PAGESET_BATCH 1
99dcc3e5 6186static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
385386cf 6187static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
99dcc3e5 6188
11cd8638 6189static void __build_all_zonelists(void *data)
1da177e4 6190{
6811378e 6191 int nid;
afb6ebb3 6192 int __maybe_unused cpu;
9adb62a5 6193 pg_data_t *self = data;
b93e0f32
MH
6194 static DEFINE_SPINLOCK(lock);
6195
6196 spin_lock(&lock);
9276b1bc 6197
7f9cfb31
BL
6198#ifdef CONFIG_NUMA
6199 memset(node_load, 0, sizeof(node_load));
6200#endif
9adb62a5 6201
c1152583
WY
6202 /*
6203 * This node is hotadded and no memory is yet present. So just
6204 * building zonelists is fine - no need to touch other nodes.
6205 */
9adb62a5
JL
6206 if (self && !node_online(self->node_id)) {
6207 build_zonelists(self);
c1152583
WY
6208 } else {
6209 for_each_online_node(nid) {
6210 pg_data_t *pgdat = NODE_DATA(nid);
7ea1530a 6211
c1152583
WY
6212 build_zonelists(pgdat);
6213 }
99dcc3e5 6214
7aac7898
LS
6215#ifdef CONFIG_HAVE_MEMORYLESS_NODES
6216 /*
6217 * We now know the "local memory node" for each node--
6218 * i.e., the node of the first zone in the generic zonelist.
6219 * Set up numa_mem percpu variable for on-line cpus. During
6220 * boot, only the boot cpu should be on-line; we'll init the
6221 * secondary cpus' numa_mem as they come on-line. During
6222 * node/memory hotplug, we'll fixup all on-line cpus.
6223 */
d9c9a0b9 6224 for_each_online_cpu(cpu)
7aac7898 6225 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
afb6ebb3 6226#endif
d9c9a0b9 6227 }
b93e0f32
MH
6228
6229 spin_unlock(&lock);
6811378e
YG
6230}
6231
061f67bc
RV
6232static noinline void __init
6233build_all_zonelists_init(void)
6234{
afb6ebb3
MH
6235 int cpu;
6236
061f67bc 6237 __build_all_zonelists(NULL);
afb6ebb3
MH
6238
6239 /*
6240 * Initialize the boot_pagesets that are going to be used
6241 * for bootstrapping processors. The real pagesets for
6242 * each zone will be allocated later when the per cpu
6243 * allocator is available.
6244 *
6245 * boot_pagesets are also used for bootstrapping offline
6246 * cpus if the system is already booted, because the pagesets
6247 * are needed to initialize allocators on a specific cpu too.
6248 * F.e. the percpu allocator needs the page allocator which
6249 * needs the percpu allocator in order to allocate its pagesets
6250 * (a chicken-egg dilemma).
6251 */
6252 for_each_possible_cpu(cpu)
69a8396a 6253 pageset_init(&per_cpu(boot_pageset, cpu));
afb6ebb3 6254
061f67bc
RV
6255 mminit_verify_zonelist();
6256 cpuset_init_current_mems_allowed();
6257}
6258
4eaf3f64 6259/*
4eaf3f64 6260 * Build or rebuild all zonelists; called during early boot
061f67bc 6261 * (system_state == SYSTEM_BOOTING) and again on memory/node hotplug.
72675e13 6262 * __ref due to call of __init annotated helper build_all_zonelists_init
061f67bc 6263 * [protected by SYSTEM_BOOTING].
4eaf3f64 6264 */
72675e13 6265void __ref build_all_zonelists(pg_data_t *pgdat)
6811378e 6266{
0a18e607
DH
6267 unsigned long vm_total_pages;
6268
6811378e 6269 if (system_state == SYSTEM_BOOTING) {
061f67bc 6270 build_all_zonelists_init();
6811378e 6271 } else {
11cd8638 6272 __build_all_zonelists(pgdat);
6811378e
YG
6273 /* cpuset refresh routine should be here */
6274 }
56b9413b
DH
6275 /* Get the number of free pages beyond high watermark in all zones. */
6276 vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
9ef9acb0
MG
6277 /*
6278 * Disable grouping by mobility if the number of pages in the
6279 * system is too low to allow the mechanism to work. It would be
6280 * more accurate, but expensive to check per-zone. This check is
6281 * made on memory-hotadd so a system can start with mobility
6282 * disabled and enable it later
6283 */
d9c23400 6284 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
9ef9acb0
MG
6285 page_group_by_mobility_disabled = 1;
6286 else
6287 page_group_by_mobility_disabled = 0;
6288
ce0725f7 6289 pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n",
756a025f 6290 nr_online_nodes,
756a025f
JP
6291 page_group_by_mobility_disabled ? "off" : "on",
6292 vm_total_pages);
f0c0b2b8 6293#ifdef CONFIG_NUMA
f88dfff5 6294 pr_info("Policy zone: %s\n", zone_names[policy_zone]);
f0c0b2b8 6295#endif
1da177e4
LT
6296}
6297
a9a9e77f
PT
6298/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
6299static bool __meminit
6300overlap_memmap_init(unsigned long zone, unsigned long *pfn)
6301{
a9a9e77f
PT
6302 static struct memblock_region *r;
6303
6304 if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
6305 if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
cc6de168 6306 for_each_mem_region(r) {
a9a9e77f
PT
6307 if (*pfn < memblock_region_memory_end_pfn(r))
6308 break;
6309 }
6310 }
6311 if (*pfn >= memblock_region_memory_base_pfn(r) &&
6312 memblock_is_mirror(r)) {
6313 *pfn = memblock_region_memory_end_pfn(r);
6314 return true;
6315 }
6316 }
a9a9e77f
PT
6317 return false;
6318}
6319
1da177e4
LT
6320/*
6321 * Initially all pages are reserved - free ones are freed
c6ffc5ca 6322 * up by memblock_free_all() once the early boot process is
1da177e4 6323 * done. Non-atomic initialization, single-pass.
d882c006
DH
6324 *
6325 * All aligned pageblocks are initialized to the specified migratetype
6326 * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
6327 * zone stats (e.g., nr_isolate_pageblock) are touched.
1da177e4 6328 */
ab28cb6e 6329void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone,
dc2da7b4 6330 unsigned long start_pfn, unsigned long zone_end_pfn,
d882c006
DH
6331 enum meminit_context context,
6332 struct vmem_altmap *altmap, int migratetype)
1da177e4 6333{
a9a9e77f 6334 unsigned long pfn, end_pfn = start_pfn + size;
d0dc12e8 6335 struct page *page;
1da177e4 6336
22b31eec
HD
6337 if (highest_memmap_pfn < end_pfn - 1)
6338 highest_memmap_pfn = end_pfn - 1;
6339
966cf44f 6340#ifdef CONFIG_ZONE_DEVICE
4b94ffdc
DW
6341 /*
6342 * Honor reservation requested by the driver for this ZONE_DEVICE
966cf44f
AD
6343 * memory. We limit the total number of pages to initialize to just
6344 * those that might contain the memory mapping. We will defer the
6345 * ZONE_DEVICE page initialization until after we have released
6346 * the hotplug lock.
4b94ffdc 6347 */
966cf44f
AD
6348 if (zone == ZONE_DEVICE) {
6349 if (!altmap)
6350 return;
6351
6352 if (start_pfn == altmap->base_pfn)
6353 start_pfn += altmap->reserve;
6354 end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
6355 }
6356#endif
4b94ffdc 6357
948c436e 6358 for (pfn = start_pfn; pfn < end_pfn; ) {
a2f3aa02 6359 /*
b72d0ffb
AM
6360 * There can be holes in boot-time mem_map[]s handed to this
6361 * function. They do not exist on hotplugged memory.
a2f3aa02 6362 */
c1d0da83 6363 if (context == MEMINIT_EARLY) {
a9a9e77f
PT
6364 if (overlap_memmap_init(zone, &pfn))
6365 continue;
dc2da7b4 6366 if (defer_init(nid, pfn, zone_end_pfn))
a9a9e77f 6367 break;
a2f3aa02 6368 }
ac5d2539 6369
d0dc12e8
PT
6370 page = pfn_to_page(pfn);
6371 __init_single_page(page, pfn, zone, nid);
c1d0da83 6372 if (context == MEMINIT_HOTPLUG)
d483da5b 6373 __SetPageReserved(page);
d0dc12e8 6374
ac5d2539 6375 /*
d882c006
DH
6376 * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
6377 * such that unmovable allocations won't be scattered all
6378 * over the place during system boot.
ac5d2539 6379 */
4eb29bd9 6380 if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
d882c006 6381 set_pageblock_migratetype(page, migratetype);
9b6e63cb 6382 cond_resched();
ac5d2539 6383 }
948c436e 6384 pfn++;
1da177e4
LT
6385 }
6386}
6387
966cf44f
AD
6388#ifdef CONFIG_ZONE_DEVICE
6389void __ref memmap_init_zone_device(struct zone *zone,
6390 unsigned long start_pfn,
1f8d75c1 6391 unsigned long nr_pages,
966cf44f
AD
6392 struct dev_pagemap *pgmap)
6393{
1f8d75c1 6394 unsigned long pfn, end_pfn = start_pfn + nr_pages;
966cf44f 6395 struct pglist_data *pgdat = zone->zone_pgdat;
514caf23 6396 struct vmem_altmap *altmap = pgmap_altmap(pgmap);
966cf44f
AD
6397 unsigned long zone_idx = zone_idx(zone);
6398 unsigned long start = jiffies;
6399 int nid = pgdat->node_id;
6400
46d945ae 6401 if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE))
966cf44f
AD
6402 return;
6403
6404 /*
122e093c 6405 * The call to memmap_init should have already taken care
966cf44f
AD
6406 * of the pages reserved for the memmap, so we can just jump to
6407 * the end of that region and start processing the device pages.
6408 */
514caf23 6409 if (altmap) {
966cf44f 6410 start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
1f8d75c1 6411 nr_pages = end_pfn - start_pfn;
966cf44f
AD
6412 }
6413
6414 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
6415 struct page *page = pfn_to_page(pfn);
6416
6417 __init_single_page(page, pfn, zone_idx, nid);
6418
6419 /*
 6420 * Mark the page reserved as it will need to wait for the onlining
 6421 * phase before it is fully associated with a zone.
6422 *
6423 * We can use the non-atomic __set_bit operation for setting
6424 * the flag as we are still initializing the pages.
6425 */
6426 __SetPageReserved(page);
6427
6428 /*
8a164fef
CH
6429 * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
6430 * and zone_device_data. It is a bug if a ZONE_DEVICE page is
6431 * ever freed or placed on a driver-private list.
966cf44f
AD
6432 */
6433 page->pgmap = pgmap;
8a164fef 6434 page->zone_device_data = NULL;
966cf44f
AD
6435
6436 /*
6437 * Mark the block movable so that blocks are reserved for
6438 * movable at startup. This will force kernel allocations
6439 * to reserve their blocks rather than leaking throughout
6440 * the address space during boot when many long-lived
6441 * kernel allocations are made.
6442 *
c1d0da83 6443 * Please note that the MEMINIT_HOTPLUG path doesn't clear the memmap
ba72b4c8 6444 * because this is done early in section_activate().
966cf44f 6445 */
4eb29bd9 6446 if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
966cf44f
AD
6447 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
6448 cond_resched();
6449 }
6450 }
6451
fdc029b1 6452 pr_info("%s initialised %lu pages in %ums\n", __func__,
1f8d75c1 6453 nr_pages, jiffies_to_msecs(jiffies - start));
966cf44f
AD
6454}
6455
6456#endif
1e548deb 6457static void __meminit zone_init_free_lists(struct zone *zone)
1da177e4 6458{
7aeb09f9 6459 unsigned int order, t;
b2a0ac88
MG
6460 for_each_migratetype_order(order, t) {
6461 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
1da177e4
LT
6462 zone->free_area[order].nr_free = 0;
6463 }
6464}
6465
0740a50b
MR
6466#if !defined(CONFIG_FLAT_NODE_MEM_MAP)
6467/*
6468 * Only struct pages that correspond to ranges defined by memblock.memory
6469 * are zeroed and initialized by going through __init_single_page() during
122e093c 6470 * memmap_init_zone_range().
0740a50b
MR
6471 *
6472 * But, there could be struct pages that correspond to holes in
6473 * memblock.memory. This can happen because of the following reasons:
 6474 * - physical memory bank size is not necessarily an exact multiple of the
6475 * arbitrary section size
6476 * - early reserved memory may not be listed in memblock.memory
6477 * - memory layouts defined with memmap= kernel parameter may not align
6478 * nicely with memmap sections
6479 *
6480 * Explicitly initialize those struct pages so that:
 6481 * - PG_reserved is set
6482 * - zone and node links point to zone and node that span the page if the
6483 * hole is in the middle of a zone
6484 * - zone and node links point to adjacent zone/node if the hole falls on
6485 * the zone boundary; the pages in such holes will be prepended to the
6486 * zone/node above the hole except for the trailing pages in the last
6487 * section that will be appended to the zone/node below.
6488 */
122e093c
MR
6489static void __init init_unavailable_range(unsigned long spfn,
6490 unsigned long epfn,
6491 int zone, int node)
0740a50b
MR
6492{
6493 unsigned long pfn;
6494 u64 pgcnt = 0;
6495
6496 for (pfn = spfn; pfn < epfn; pfn++) {
6497 if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
6498 pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
6499 + pageblock_nr_pages - 1;
6500 continue;
6501 }
6502 __init_single_page(pfn_to_page(pfn), pfn, zone, node);
6503 __SetPageReserved(pfn_to_page(pfn));
6504 pgcnt++;
6505 }
6506
122e093c
MR
6507 if (pgcnt)
6508 pr_info("On node %d, zone %s: %lld pages in unavailable ranges",
6509 node, zone_names[zone], pgcnt);
0740a50b
MR
6510}
6511#else
122e093c
MR
6512static inline void init_unavailable_range(unsigned long spfn,
6513 unsigned long epfn,
6514 int zone, int node)
0740a50b 6515{
0740a50b
MR
6516}
6517#endif
6518
122e093c
MR
6519static void __init memmap_init_zone_range(struct zone *zone,
6520 unsigned long start_pfn,
6521 unsigned long end_pfn,
6522 unsigned long *hole_pfn)
dfb3ccd0 6523{
3256ff83
BH
6524 unsigned long zone_start_pfn = zone->zone_start_pfn;
6525 unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
122e093c
MR
6526 int nid = zone_to_nid(zone), zone_id = zone_idx(zone);
6527
6528 start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
6529 end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
6530
6531 if (start_pfn >= end_pfn)
6532 return;
6533
6534 memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn,
6535 zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
6536
6537 if (*hole_pfn < start_pfn)
6538 init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);
6539
6540 *hole_pfn = end_pfn;
6541}
6542
6543static void __init memmap_init(void)
6544{
73a6e474 6545 unsigned long start_pfn, end_pfn;
122e093c
MR
6546 unsigned long hole_pfn = 0;
6547 int i, j, zone_id, nid;
73a6e474 6548
122e093c
MR
6549 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
6550 struct pglist_data *node = NODE_DATA(nid);
6551
6552 for (j = 0; j < MAX_NR_ZONES; j++) {
6553 struct zone *zone = node->node_zones + j;
73a6e474 6554
122e093c
MR
6555 if (!populated_zone(zone))
6556 continue;
0740a50b 6557
122e093c
MR
6558 memmap_init_zone_range(zone, start_pfn, end_pfn,
6559 &hole_pfn);
6560 zone_id = j;
6561 }
73a6e474 6562 }
0740a50b
MR
6563
6564#ifdef CONFIG_SPARSEMEM
6565 /*
122e093c
MR
 6566 * Initialize the memory map for the hole in the range [memory_end,
 6567 * section_end].
 6568 * Append the pages in this hole to the highest zone in the last
 6569 * node.
 6570 * The call to init_unavailable_range() is outside the ifdef to
 6571 * silence the compiler warning about zone_id set but not used;
 6572 * for FLATMEM it is a nop anyway.
0740a50b 6573 */
122e093c 6574 end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
0740a50b 6575 if (hole_pfn < end_pfn)
0740a50b 6576#endif
122e093c 6577 init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
dfb3ccd0 6578}
1da177e4 6579
7cd2b0a3 6580static int zone_batchsize(struct zone *zone)
e7c8d5c9 6581{
3a6be87f 6582#ifdef CONFIG_MMU
e7c8d5c9
CL
6583 int batch;
6584
6585 /*
 6586 * The per-cpu-pages pools are set to around 1/1024th of the
d8a759b5 6587 * size of the zone.
e7c8d5c9 6588 */
9705bea5 6589 batch = zone_managed_pages(zone) / 1024;
d8a759b5
AL
6590 /* But no more than a meg. */
6591 if (batch * PAGE_SIZE > 1024 * 1024)
6592 batch = (1024 * 1024) / PAGE_SIZE;
e7c8d5c9
CL
6593 batch /= 4; /* We effectively *= 4 below */
6594 if (batch < 1)
6595 batch = 1;
6596
6597 /*
0ceaacc9
NP
6598 * Clamp the batch to a 2^n - 1 value. Having a power
6599 * of 2 value was found to be more likely to have
6600 * suboptimal cache aliasing properties in some cases.
e7c8d5c9 6601 *
0ceaacc9
NP
6602 * For example if 2 tasks are alternately allocating
6603 * batches of pages, one task can end up with a lot
6604 * of pages of one half of the possible page colors
6605 * and the other with pages of the other colors.
e7c8d5c9 6606 */
9155203a 6607 batch = rounddown_pow_of_two(batch + batch/2) - 1;
ba56e91c 6608
e7c8d5c9 6609 return batch;
3a6be87f
DH
6610
6611#else
6612 /* The deferral and batching of frees should be suppressed under NOMMU
6613 * conditions.
6614 *
6615 * The problem is that NOMMU needs to be able to allocate large chunks
6616 * of contiguous memory as there's no hardware page translation to
6617 * assemble apparent contiguous memory from discontiguous pages.
6618 *
6619 * Queueing large contiguous runs of pages for batching, however,
6620 * causes the pages to actually be freed in smaller chunks. As there
6621 * can be a significant delay between the individual batches being
6622 * recycled, this leads to the once large chunks of space being
6623 * fragmented and becoming unavailable for high-order allocations.
6624 */
6625 return 0;
6626#endif
e7c8d5c9
CL
6627}
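
/*
 * Editor's note: a worked example of the calculation above, assuming 4 KiB
 * pages and a zone with 4 GiB of managed memory (1048576 pages); both
 * figures are illustrative assumptions, not taken from this file.
 *
 *   batch = 1048576 / 1024                = 1024
 *   1024 * 4096 bytes > 1 MiB, so batch   = (1024 * 1024) / 4096 = 256
 *   batch /= 4                            = 64
 *   rounddown_pow_of_two(64 + 32) - 1     = 63
 *
 * With these assumptions any zone of roughly 1 GiB or more ends up with a
 * per-cpu batch of 63 pages, because the 1 MiB cap is reached before the
 * final division and clamping.
 */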
6628
8d7a8fa9 6629/*
5c3ad2eb
VB
6630 * pcp->high and pcp->batch values are related and generally batch is lower
6631 * than high. They are also related to pcp->count such that count is lower
6632 * than high, and as soon as it reaches high, the pcplist is flushed.
8d7a8fa9 6633 *
5c3ad2eb
VB
6634 * However, guaranteeing these relations at all times would require e.g. write
6635 * barriers here but also careful usage of read barriers at the read side, and
6636 * thus be prone to error and bad for performance. Thus the update only prevents
6637 * store tearing. Any new users of pcp->batch and pcp->high should ensure they
6638 * can cope with those fields changing asynchronously, and fully trust only the
6639 * pcp->count field on the local CPU with interrupts disabled.
8d7a8fa9
CS
6640 *
6641 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
6642 * outside of boot time (or some other assurance that no concurrent updaters
6643 * exist).
6644 */
6645static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
6646 unsigned long batch)
6647{
5c3ad2eb
VB
6648 WRITE_ONCE(pcp->batch, batch);
6649 WRITE_ONCE(pcp->high, high);
8d7a8fa9
CS
6650}
6651
88c90dbc 6652static void pageset_init(struct per_cpu_pageset *p)
2caaad41
CL
6653{
6654 struct per_cpu_pages *pcp;
5f8dcc21 6655 int migratetype;
2caaad41 6656
1c6fe946
MD
6657 memset(p, 0, sizeof(*p));
6658
3dfa5721 6659 pcp = &p->pcp;
5f8dcc21
MG
6660 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
6661 INIT_LIST_HEAD(&pcp->lists[migratetype]);
2caaad41 6662
69a8396a
VB
6663 /*
6664 * Set batch and high values safe for a boot pageset. A true percpu
6665 * pageset's initialization will update them subsequently. Here we don't
6666 * need to be as careful as pageset_update() as nobody can access the
6667 * pageset yet.
6668 */
952eaf81
VB
6669 pcp->high = BOOT_PAGESET_HIGH;
6670 pcp->batch = BOOT_PAGESET_BATCH;
88c90dbc
CS
6671}
6672
3b1f3658 6673static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high,
ec6e8c7e
VB
6674 unsigned long batch)
6675{
6676 struct per_cpu_pageset *p;
6677 int cpu;
6678
6679 for_each_possible_cpu(cpu) {
6680 p = per_cpu_ptr(zone->pageset, cpu);
6681 pageset_update(&p->pcp, high, batch);
6682 }
6683}
6684
8ad4b1fb 6685/*
0a8b4f1d 6686 * Calculate and set new high and batch values for all per-cpu pagesets of a
7115ac6e 6687 * zone, based on the zone's size and the percpu_pagelist_fraction sysctl.
8ad4b1fb 6688 */
0a8b4f1d 6689static void zone_set_pageset_high_and_batch(struct zone *zone)
56cef2b8 6690{
7115ac6e
VB
6691 unsigned long new_high, new_batch;
6692
6693 if (percpu_pagelist_fraction) {
6694 new_high = zone_managed_pages(zone) / percpu_pagelist_fraction;
6695 new_batch = max(1UL, new_high / 4);
6696 if ((new_high / 4) > (PAGE_SHIFT * 8))
6697 new_batch = PAGE_SHIFT * 8;
6698 } else {
6699 new_batch = zone_batchsize(zone);
6700 new_high = 6 * new_batch;
6701 new_batch = max(1UL, 1 * new_batch);
6702 }
169f6c19 6703
952eaf81
VB
6704 if (zone->pageset_high == new_high &&
6705 zone->pageset_batch == new_batch)
6706 return;
6707
6708 zone->pageset_high = new_high;
6709 zone->pageset_batch = new_batch;
6710
ec6e8c7e 6711 __zone_set_pageset_high_and_batch(zone, new_high, new_batch);
169f6c19
CS
6712}
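
/*
 * Editor's note: illustrative numbers for the two branches above, assuming
 * 4 KiB pages (PAGE_SHIFT == 12) and a 4 GiB zone (1048576 managed pages);
 * the sysctl value used here is an arbitrary example.
 *
 *   percpu_pagelist_fraction == 0 (default):
 *     new_batch = zone_batchsize() = 63,  new_high = 6 * 63 = 378 pages
 *
 *   percpu_pagelist_fraction == 8:
 *     new_high = 1048576 / 8 = 131072 pages per CPU
 *     new_high / 4 = 32768 > PAGE_SHIFT * 8 (96), so new_batch = 96
 */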
6713
72675e13 6714void __meminit setup_zone_pageset(struct zone *zone)
319774e2 6715{
0a8b4f1d 6716 struct per_cpu_pageset *p;
319774e2 6717 int cpu;
0a8b4f1d 6718
319774e2 6719 zone->pageset = alloc_percpu(struct per_cpu_pageset);
0a8b4f1d
VB
6720 for_each_possible_cpu(cpu) {
6721 p = per_cpu_ptr(zone->pageset, cpu);
6722 pageset_init(p);
6723 }
6724
6725 zone_set_pageset_high_and_batch(zone);
319774e2
WF
6726}
6727
2caaad41 6728/*
99dcc3e5
CL
6729 * Allocate per cpu pagesets and initialize them.
6730 * Before this call only boot pagesets were available.
e7c8d5c9 6731 */
99dcc3e5 6732void __init setup_per_cpu_pageset(void)
e7c8d5c9 6733{
b4911ea2 6734 struct pglist_data *pgdat;
99dcc3e5 6735 struct zone *zone;
b418a0f9 6736 int __maybe_unused cpu;
e7c8d5c9 6737
319774e2
WF
6738 for_each_populated_zone(zone)
6739 setup_zone_pageset(zone);
b4911ea2 6740
b418a0f9
SD
6741#ifdef CONFIG_NUMA
6742 /*
6743 * Unpopulated zones continue using the boot pagesets.
6744 * The numa stats for these pagesets need to be reset.
6745 * Otherwise, they will end up skewing the stats of
6746 * the nodes these zones are associated with.
6747 */
6748 for_each_possible_cpu(cpu) {
6749 struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu);
6750 memset(pcp->vm_numa_stat_diff, 0,
6751 sizeof(pcp->vm_numa_stat_diff));
6752 }
6753#endif
6754
b4911ea2
MG
6755 for_each_online_pgdat(pgdat)
6756 pgdat->per_cpu_nodestats =
6757 alloc_percpu(struct per_cpu_nodestat);
e7c8d5c9
CL
6758}
6759
c09b4240 6760static __meminit void zone_pcp_init(struct zone *zone)
ed8ece2e 6761{
99dcc3e5
CL
6762 /*
6763 * per cpu subsystem is not up at this point. The following code
6764 * relies on the ability of the linker to provide the
6765 * offset of a (static) per cpu variable into the per cpu area.
6766 */
6767 zone->pageset = &boot_pageset;
952eaf81
VB
6768 zone->pageset_high = BOOT_PAGESET_HIGH;
6769 zone->pageset_batch = BOOT_PAGESET_BATCH;
ed8ece2e 6770
b38a8725 6771 if (populated_zone(zone))
99dcc3e5
CL
6772 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
6773 zone->name, zone->present_pages,
6774 zone_batchsize(zone));
ed8ece2e
DH
6775}
6776
dc0bbf3b 6777void __meminit init_currently_empty_zone(struct zone *zone,
718127cc 6778 unsigned long zone_start_pfn,
b171e409 6779 unsigned long size)
ed8ece2e
DH
6780{
6781 struct pglist_data *pgdat = zone->zone_pgdat;
8f416836 6782 int zone_idx = zone_idx(zone) + 1;
9dcb8b68 6783
8f416836
WY
6784 if (zone_idx > pgdat->nr_zones)
6785 pgdat->nr_zones = zone_idx;
ed8ece2e 6786
ed8ece2e
DH
6787 zone->zone_start_pfn = zone_start_pfn;
6788
708614e6
MG
6789 mminit_dprintk(MMINIT_TRACE, "memmap_init",
6790 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
6791 pgdat->node_id,
6792 (unsigned long)zone_idx(zone),
6793 zone_start_pfn, (zone_start_pfn + size));
6794
1e548deb 6795 zone_init_free_lists(zone);
9dcb8b68 6796 zone->initialized = 1;
ed8ece2e
DH
6797}
6798
c713216d
MG
6799/**
6800 * get_pfn_range_for_nid - Return the start and end page frames for a node
88ca3b94
RD
6801 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
6802 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
6803 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
c713216d
MG
6804 *
6805 * It returns the start and end page frame of a node based on information
7d018176 6806 * provided by memblock_set_node(). If called for a node
c713216d 6807 * with no available memory, a warning is printed and the start and end
88ca3b94 6808 * PFNs will be 0.
c713216d 6809 */
bbe5d993 6810void __init get_pfn_range_for_nid(unsigned int nid,
c713216d
MG
6811 unsigned long *start_pfn, unsigned long *end_pfn)
6812{
c13291a5 6813 unsigned long this_start_pfn, this_end_pfn;
c713216d 6814 int i;
c13291a5 6815
c713216d
MG
6816 *start_pfn = -1UL;
6817 *end_pfn = 0;
6818
c13291a5
TH
6819 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
6820 *start_pfn = min(*start_pfn, this_start_pfn);
6821 *end_pfn = max(*end_pfn, this_end_pfn);
c713216d
MG
6822 }
6823
633c0666 6824 if (*start_pfn == -1UL)
c713216d 6825 *start_pfn = 0;
c713216d
MG
6826}
6827
2a1e274a
MG
6828/*
6829 * This finds a zone that can be used for ZONE_MOVABLE pages. The
 6830 * assumption is made that zones within a node are ordered in monotonically
 6831 * increasing memory addresses, so that the "highest" populated zone is used.
6832 */
b69a7288 6833static void __init find_usable_zone_for_movable(void)
2a1e274a
MG
6834{
6835 int zone_index;
6836 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
6837 if (zone_index == ZONE_MOVABLE)
6838 continue;
6839
6840 if (arch_zone_highest_possible_pfn[zone_index] >
6841 arch_zone_lowest_possible_pfn[zone_index])
6842 break;
6843 }
6844
6845 VM_BUG_ON(zone_index == -1);
6846 movable_zone = zone_index;
6847}
6848
6849/*
6850 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
25985edc 6851 * because it is sized independently of the architecture. Unlike the other zones,
2a1e274a
MG
6852 * the starting point for ZONE_MOVABLE is not fixed. It may be different
6853 * in each node depending on the size of each node and how evenly kernelcore
6854 * is distributed. This helper function adjusts the zone ranges
6855 * provided by the architecture for a given node by using the end of the
6856 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
 6857 * zones within a node are in order of monotonically increasing memory addresses.
6858 */
bbe5d993 6859static void __init adjust_zone_range_for_zone_movable(int nid,
2a1e274a
MG
6860 unsigned long zone_type,
6861 unsigned long node_start_pfn,
6862 unsigned long node_end_pfn,
6863 unsigned long *zone_start_pfn,
6864 unsigned long *zone_end_pfn)
6865{
6866 /* Only adjust if ZONE_MOVABLE is on this node */
6867 if (zone_movable_pfn[nid]) {
6868 /* Size ZONE_MOVABLE */
6869 if (zone_type == ZONE_MOVABLE) {
6870 *zone_start_pfn = zone_movable_pfn[nid];
6871 *zone_end_pfn = min(node_end_pfn,
6872 arch_zone_highest_possible_pfn[movable_zone]);
6873
e506b996
XQ
6874 /* Adjust for ZONE_MOVABLE starting within this range */
6875 } else if (!mirrored_kernelcore &&
6876 *zone_start_pfn < zone_movable_pfn[nid] &&
6877 *zone_end_pfn > zone_movable_pfn[nid]) {
6878 *zone_end_pfn = zone_movable_pfn[nid];
6879
2a1e274a
MG
6880 /* Check if this whole range is within ZONE_MOVABLE */
6881 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
6882 *zone_start_pfn = *zone_end_pfn;
6883 }
6884}
6885
c713216d
MG
6886/*
6887 * Return the number of pages a zone spans in a node, including holes
6888 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
6889 */
bbe5d993 6890static unsigned long __init zone_spanned_pages_in_node(int nid,
c713216d 6891 unsigned long zone_type,
7960aedd
ZY
6892 unsigned long node_start_pfn,
6893 unsigned long node_end_pfn,
d91749c1 6894 unsigned long *zone_start_pfn,
854e8848 6895 unsigned long *zone_end_pfn)
c713216d 6896{
299c83dc
LF
6897 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
6898 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
b5685e92 6899 /* When hot-adding a new node from cpu_up(), the node should be empty */
f9126ab9
XQ
6900 if (!node_start_pfn && !node_end_pfn)
6901 return 0;
6902
7960aedd 6903 /* Get the start and end of the zone */
299c83dc
LF
6904 *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
6905 *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
2a1e274a
MG
6906 adjust_zone_range_for_zone_movable(nid, zone_type,
6907 node_start_pfn, node_end_pfn,
d91749c1 6908 zone_start_pfn, zone_end_pfn);
c713216d
MG
6909
6910 /* Check that this node has pages within the zone's required range */
d91749c1 6911 if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
c713216d
MG
6912 return 0;
6913
6914 /* Move the zone boundaries inside the node if necessary */
d91749c1
TI
6915 *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
6916 *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
c713216d
MG
6917
6918 /* Return the spanned pages */
d91749c1 6919 return *zone_end_pfn - *zone_start_pfn;
c713216d
MG
6920}
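
/*
 * Editor's note: an illustrative clamping example; the node and zone
 * boundaries below are assumptions, not taken from this file. For a node
 * spanning PFNs [0x40000, 0x140000) (1 GiB..5 GiB with 4 KiB pages), a zone
 * whose architectural range is [0, 0x100000) (up to 4 GiB), and no
 * ZONE_MOVABLE adjustment:
 *
 *   *zone_start_pfn = clamp(0x40000, 0, 0x100000)  = 0x40000
 *   *zone_end_pfn   = clamp(0x140000, 0, 0x100000) = 0x100000
 *   spanned         = 0x100000 - 0x40000 = 0xC0000 pages (3 GiB)
 */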
6921
6922/*
6923 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
88ca3b94 6924 * then all holes in the requested range will be accounted for.
c713216d 6925 */
bbe5d993 6926unsigned long __init __absent_pages_in_range(int nid,
c713216d
MG
6927 unsigned long range_start_pfn,
6928 unsigned long range_end_pfn)
6929{
96e907d1
TH
6930 unsigned long nr_absent = range_end_pfn - range_start_pfn;
6931 unsigned long start_pfn, end_pfn;
6932 int i;
c713216d 6933
96e907d1
TH
6934 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
6935 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
6936 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
6937 nr_absent -= end_pfn - start_pfn;
c713216d 6938 }
96e907d1 6939 return nr_absent;
c713216d
MG
6940}
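
/*
 * Editor's note: a small worked example of the subtraction above, using
 * made-up PFNs. For the range [0, 1000) with memblock.memory regions
 * [0, 600) and [700, 1000):
 *
 *   nr_absent = 1000 - (600 - 0) - (1000 - 700) = 100
 *
 * i.e. only the hole [600, 700) is reported as absent.
 */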
6941
6942/**
6943 * absent_pages_in_range - Return number of page frames in holes within a range
6944 * @start_pfn: The start PFN to start searching for holes
6945 * @end_pfn: The end PFN to stop searching for holes
6946 *
a862f68a 6947 * Return: the number of page frames in memory holes within a range.
c713216d
MG
6948 */
6949unsigned long __init absent_pages_in_range(unsigned long start_pfn,
6950 unsigned long end_pfn)
6951{
6952 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
6953}
6954
6955/* Return the number of page frames in holes in a zone on a node */
bbe5d993 6956static unsigned long __init zone_absent_pages_in_node(int nid,
c713216d 6957 unsigned long zone_type,
7960aedd 6958 unsigned long node_start_pfn,
854e8848 6959 unsigned long node_end_pfn)
c713216d 6960{
96e907d1
TH
6961 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
6962 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
9c7cd687 6963 unsigned long zone_start_pfn, zone_end_pfn;
342332e6 6964 unsigned long nr_absent;
9c7cd687 6965
b5685e92 6966 /* When hot-adding a new node from cpu_up(), the node should be empty */
f9126ab9
XQ
6967 if (!node_start_pfn && !node_end_pfn)
6968 return 0;
6969
96e907d1
TH
6970 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
6971 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
9c7cd687 6972
2a1e274a
MG
6973 adjust_zone_range_for_zone_movable(nid, zone_type,
6974 node_start_pfn, node_end_pfn,
6975 &zone_start_pfn, &zone_end_pfn);
342332e6
TI
6976 nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
6977
6978 /*
6979 * ZONE_MOVABLE handling.
6980 * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
6981 * and vice versa.
6982 */
e506b996
XQ
6983 if (mirrored_kernelcore && zone_movable_pfn[nid]) {
6984 unsigned long start_pfn, end_pfn;
6985 struct memblock_region *r;
6986
cc6de168 6987 for_each_mem_region(r) {
e506b996
XQ
6988 start_pfn = clamp(memblock_region_memory_base_pfn(r),
6989 zone_start_pfn, zone_end_pfn);
6990 end_pfn = clamp(memblock_region_memory_end_pfn(r),
6991 zone_start_pfn, zone_end_pfn);
6992
6993 if (zone_type == ZONE_MOVABLE &&
6994 memblock_is_mirror(r))
6995 nr_absent += end_pfn - start_pfn;
6996
6997 if (zone_type == ZONE_NORMAL &&
6998 !memblock_is_mirror(r))
6999 nr_absent += end_pfn - start_pfn;
342332e6
TI
7000 }
7001 }
7002
7003 return nr_absent;
c713216d 7004}
0e0b864e 7005
bbe5d993 7006static void __init calculate_node_totalpages(struct pglist_data *pgdat,
7960aedd 7007 unsigned long node_start_pfn,
854e8848 7008 unsigned long node_end_pfn)
c713216d 7009{
febd5949 7010 unsigned long realtotalpages = 0, totalpages = 0;
c713216d
MG
7011 enum zone_type i;
7012
febd5949
GZ
7013 for (i = 0; i < MAX_NR_ZONES; i++) {
7014 struct zone *zone = pgdat->node_zones + i;
d91749c1 7015 unsigned long zone_start_pfn, zone_end_pfn;
3f08a302 7016 unsigned long spanned, absent;
febd5949 7017 unsigned long size, real_size;
c713216d 7018
854e8848
MR
7019 spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
7020 node_start_pfn,
7021 node_end_pfn,
7022 &zone_start_pfn,
7023 &zone_end_pfn);
7024 absent = zone_absent_pages_in_node(pgdat->node_id, i,
7025 node_start_pfn,
7026 node_end_pfn);
3f08a302
MR
7027
7028 size = spanned;
7029 real_size = size - absent;
7030
d91749c1
TI
7031 if (size)
7032 zone->zone_start_pfn = zone_start_pfn;
7033 else
7034 zone->zone_start_pfn = 0;
febd5949
GZ
7035 zone->spanned_pages = size;
7036 zone->present_pages = real_size;
7037
7038 totalpages += size;
7039 realtotalpages += real_size;
7040 }
7041
7042 pgdat->node_spanned_pages = totalpages;
c713216d
MG
7043 pgdat->node_present_pages = realtotalpages;
7044 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
7045 realtotalpages);
7046}
7047
835c134e
MG
7048#ifndef CONFIG_SPARSEMEM
7049/*
7050 * Calculate the size of the zone->blockflags rounded to an unsigned long
d9c23400
MG
7051 * Start by making sure zonesize is a multiple of pageblock_order by rounding
7052 * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
835c134e
MG
7053 * round what is now in bits to nearest long in bits, then return it in
7054 * bytes.
7055 */
7c45512d 7056static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
835c134e
MG
7057{
7058 unsigned long usemapsize;
7059
7c45512d 7060 zonesize += zone_start_pfn & (pageblock_nr_pages-1);
d9c23400
MG
7061 usemapsize = roundup(zonesize, pageblock_nr_pages);
7062 usemapsize = usemapsize >> pageblock_order;
835c134e
MG
7063 usemapsize *= NR_PAGEBLOCK_BITS;
7064 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
7065
7066 return usemapsize / 8;
7067}
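
/*
 * Editor's note: a worked example of the sizing above, assuming 4 KiB
 * pages, pageblock_order == 9 (512-page pageblocks), NR_PAGEBLOCK_BITS == 4
 * and 64-bit longs -- typical but configuration-dependent values. For a
 * 1 GiB zone (262144 pages) starting on a pageblock boundary:
 *
 *   roundup(262144, 512) >> 9  = 512 pageblocks
 *   512 * 4                    = 2048 bits
 *   roundup(2048, 64) / 8      = 256 bytes of pageblock flags
 */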
7068
7010a6ec 7069static void __ref setup_usemap(struct zone *zone)
835c134e 7070{
7010a6ec
BH
7071 unsigned long usemapsize = usemap_size(zone->zone_start_pfn,
7072 zone->spanned_pages);
835c134e 7073 zone->pageblock_flags = NULL;
23a7052a 7074 if (usemapsize) {
6782832e 7075 zone->pageblock_flags =
26fb3dae 7076 memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
7010a6ec 7077 zone_to_nid(zone));
23a7052a
MR
7078 if (!zone->pageblock_flags)
7079 panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
7010a6ec 7080 usemapsize, zone->name, zone_to_nid(zone));
23a7052a 7081 }
835c134e
MG
7082}
7083#else
7010a6ec 7084static inline void setup_usemap(struct zone *zone) {}
835c134e
MG
7085#endif /* CONFIG_SPARSEMEM */
7086
d9c23400 7087#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
ba72cb8c 7088
d9c23400 7089/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
03e85f9d 7090void __init set_pageblock_order(void)
d9c23400 7091{
955c1cd7
AM
7092 unsigned int order;
7093
d9c23400
MG
7094 /* Check that pageblock_nr_pages has not already been setup */
7095 if (pageblock_order)
7096 return;
7097
955c1cd7
AM
7098 if (HPAGE_SHIFT > PAGE_SHIFT)
7099 order = HUGETLB_PAGE_ORDER;
7100 else
7101 order = MAX_ORDER - 1;
7102
d9c23400
MG
7103 /*
7104 * Assume the largest contiguous order of interest is a huge page.
955c1cd7
AM
7105 * This value may be variable depending on boot parameters on IA64 and
7106 * powerpc.
d9c23400
MG
7107 */
7108 pageblock_order = order;
7109}
7110#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
7111
ba72cb8c
MG
7112/*
7113 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
955c1cd7
AM
7114 * is unused as pageblock_order is set at compile-time. See
7115 * include/linux/pageblock-flags.h for the values of pageblock_order based on
7116 * the kernel config
ba72cb8c 7117 */
03e85f9d 7118void __init set_pageblock_order(void)
ba72cb8c 7119{
ba72cb8c 7120}
d9c23400
MG
7121
7122#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
7123
03e85f9d 7124static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
7cc2a959 7125 unsigned long present_pages)
01cefaef
JL
7126{
7127 unsigned long pages = spanned_pages;
7128
7129 /*
7130 * Provide a more accurate estimation if there are holes within
7131 * the zone and SPARSEMEM is in use. If there are holes within the
7132 * zone, each populated memory region may cost us one or two extra
7133 * memmap pages due to alignment because memmap pages for each
89d790ab 7134 * populated region may not be naturally aligned on a page boundary.
01cefaef
JL
7135 * So the (present_pages >> 4) heuristic is a tradeoff for that.
7136 */
7137 if (spanned_pages > present_pages + (present_pages >> 4) &&
7138 IS_ENABLED(CONFIG_SPARSEMEM))
7139 pages = present_pages;
7140
7141 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
7142}
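
/*
 * Editor's note: rough numbers for the estimate above, assuming a 64-byte
 * struct page and 4 KiB pages (typical, not guaranteed). A zone spanning
 * 1048576 pages (4 GiB) needs about
 *
 *   PAGE_ALIGN(1048576 * 64) >> PAGE_SHIFT = 16384 pages (64 MiB)
 *
 * of memmap. The present_pages fallback only applies when the holes exceed
 * roughly 1/16th of the present pages and SPARSEMEM is enabled.
 */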
7143
ace1db39
OS
7144#ifdef CONFIG_TRANSPARENT_HUGEPAGE
7145static void pgdat_init_split_queue(struct pglist_data *pgdat)
7146{
364c1eeb
YS
7147 struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
7148
7149 spin_lock_init(&ds_queue->split_queue_lock);
7150 INIT_LIST_HEAD(&ds_queue->split_queue);
7151 ds_queue->split_queue_len = 0;
ace1db39
OS
7152}
7153#else
7154static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
7155#endif
7156
7157#ifdef CONFIG_COMPACTION
7158static void pgdat_init_kcompactd(struct pglist_data *pgdat)
7159{
7160 init_waitqueue_head(&pgdat->kcompactd_wait);
7161}
7162#else
7163static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
7164#endif
7165
03e85f9d 7166static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
1da177e4 7167{
208d54e5 7168 pgdat_resize_init(pgdat);
ace1db39 7169
ace1db39
OS
7170 pgdat_init_split_queue(pgdat);
7171 pgdat_init_kcompactd(pgdat);
7172
1da177e4 7173 init_waitqueue_head(&pgdat->kswapd_wait);
5515061d 7174 init_waitqueue_head(&pgdat->pfmemalloc_wait);
ace1db39 7175
eefa864b 7176 pgdat_page_ext_init(pgdat);
867e5e1d 7177 lruvec_init(&pgdat->__lruvec);
03e85f9d
OS
7178}
7179
7180static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
7181 unsigned long remaining_pages)
7182{
9705bea5 7183 atomic_long_set(&zone->managed_pages, remaining_pages);
03e85f9d
OS
7184 zone_set_nid(zone, nid);
7185 zone->name = zone_names[idx];
7186 zone->zone_pgdat = NODE_DATA(nid);
7187 spin_lock_init(&zone->lock);
7188 zone_seqlock_init(zone);
7189 zone_pcp_init(zone);
7190}
7191
7192/*
7193 * Set up the zone data structures
7194 * - init pgdat internals
7195 * - init all zones belonging to this node
7196 *
7197 * NOTE: this function is only called during memory hotplug
7198 */
7199#ifdef CONFIG_MEMORY_HOTPLUG
7200void __ref free_area_init_core_hotplug(int nid)
7201{
7202 enum zone_type z;
7203 pg_data_t *pgdat = NODE_DATA(nid);
7204
7205 pgdat_init_internals(pgdat);
7206 for (z = 0; z < MAX_NR_ZONES; z++)
7207 zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
7208}
7209#endif
7210
7211/*
7212 * Set up the zone data structures:
7213 * - mark all pages reserved
7214 * - mark all memory queues empty
7215 * - clear the memory bitmaps
7216 *
7217 * NOTE: pgdat should get zeroed by caller.
7218 * NOTE: this function is only called during early init.
7219 */
7220static void __init free_area_init_core(struct pglist_data *pgdat)
7221{
7222 enum zone_type j;
7223 int nid = pgdat->node_id;
5f63b720 7224
03e85f9d 7225 pgdat_init_internals(pgdat);
385386cf
JW
7226 pgdat->per_cpu_nodestats = &boot_nodestats;
7227
1da177e4
LT
7228 for (j = 0; j < MAX_NR_ZONES; j++) {
7229 struct zone *zone = pgdat->node_zones + j;
e6943859 7230 unsigned long size, freesize, memmap_pages;
1da177e4 7231
febd5949 7232 size = zone->spanned_pages;
e6943859 7233 freesize = zone->present_pages;
1da177e4 7234
0e0b864e 7235 /*
9feedc9d 7236 * Adjust freesize so that it accounts for how much memory
0e0b864e
MG
7237 * is used by this zone for memmap. This affects the watermark
7238 * and per-cpu initialisations
7239 */
e6943859 7240 memmap_pages = calc_memmap_size(size, freesize);
ba914f48
ZH
7241 if (!is_highmem_idx(j)) {
7242 if (freesize >= memmap_pages) {
7243 freesize -= memmap_pages;
7244 if (memmap_pages)
7245 printk(KERN_DEBUG
7246 " %s zone: %lu pages used for memmap\n",
7247 zone_names[j], memmap_pages);
7248 } else
1170532b 7249 pr_warn(" %s zone: %lu pages exceeds freesize %lu\n",
ba914f48
ZH
7250 zone_names[j], memmap_pages, freesize);
7251 }
0e0b864e 7252
6267276f 7253 /* Account for reserved pages */
9feedc9d
JL
7254 if (j == 0 && freesize > dma_reserve) {
7255 freesize -= dma_reserve;
d903ef9f 7256 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
6267276f 7257 zone_names[0], dma_reserve);
0e0b864e
MG
7258 }
7259
98d2b0eb 7260 if (!is_highmem_idx(j))
9feedc9d 7261 nr_kernel_pages += freesize;
01cefaef
JL
7262 /* Charge for highmem memmap if there are enough kernel pages */
7263 else if (nr_kernel_pages > memmap_pages * 2)
7264 nr_kernel_pages -= memmap_pages;
9feedc9d 7265 nr_all_pages += freesize;
1da177e4 7266
9feedc9d
JL
7267 /*
 7268 * Set an approximate value for lowmem here; it will be adjusted
7269 * when the bootmem allocator frees pages into the buddy system.
7270 * And all highmem pages will be managed by the buddy system.
7271 */
03e85f9d 7272 zone_init_internals(zone, j, nid, freesize);
81c0a2bb 7273
d883c6cf 7274 if (!size)
1da177e4
LT
7275 continue;
7276
955c1cd7 7277 set_pageblock_order();
7010a6ec 7278 setup_usemap(zone);
9699ee7b 7279 init_currently_empty_zone(zone, zone->zone_start_pfn, size);
1da177e4
LT
7280 }
7281}
7282
0cd842f9 7283#ifdef CONFIG_FLAT_NODE_MEM_MAP
bd721ea7 7284static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
1da177e4 7285{
b0aeba74 7286 unsigned long __maybe_unused start = 0;
a1c34a3b
LA
7287 unsigned long __maybe_unused offset = 0;
7288
1da177e4
LT
7289 /* Skip empty nodes */
7290 if (!pgdat->node_spanned_pages)
7291 return;
7292
b0aeba74
TL
7293 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
7294 offset = pgdat->node_start_pfn - start;
1da177e4
LT
7295 /* ia64 gets its own node_mem_map, before this, without bootmem */
7296 if (!pgdat->node_mem_map) {
b0aeba74 7297 unsigned long size, end;
d41dee36
AW
7298 struct page *map;
7299
e984bb43
BP
7300 /*
7301 * The zone's endpoints aren't required to be MAX_ORDER
7302 * aligned but the node_mem_map endpoints must be in order
7303 * for the buddy allocator to function correctly.
7304 */
108bcc96 7305 end = pgdat_end_pfn(pgdat);
e984bb43
BP
7306 end = ALIGN(end, MAX_ORDER_NR_PAGES);
7307 size = (end - start) * sizeof(struct page);
26fb3dae
MR
7308 map = memblock_alloc_node(size, SMP_CACHE_BYTES,
7309 pgdat->node_id);
23a7052a
MR
7310 if (!map)
7311 panic("Failed to allocate %ld bytes for node %d memory map\n",
7312 size, pgdat->node_id);
a1c34a3b 7313 pgdat->node_mem_map = map + offset;
1da177e4 7314 }
0cd842f9
OS
7315 pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
7316 __func__, pgdat->node_id, (unsigned long)pgdat,
7317 (unsigned long)pgdat->node_mem_map);
12d810c1 7318#ifndef CONFIG_NEED_MULTIPLE_NODES
1da177e4
LT
7319 /*
7320 * With no DISCONTIG, the global mem_map is just set as node 0's
7321 */
c713216d 7322 if (pgdat == NODE_DATA(0)) {
1da177e4 7323 mem_map = NODE_DATA(0)->node_mem_map;
c713216d 7324 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
a1c34a3b 7325 mem_map -= offset;
c713216d 7326 }
1da177e4
LT
7327#endif
7328}
0cd842f9
OS
7329#else
7330static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
7331#endif /* CONFIG_FLAT_NODE_MEM_MAP */
1da177e4 7332
0188dc98
OS
7333#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
7334static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
7335{
0188dc98
OS
7336 pgdat->first_deferred_pfn = ULONG_MAX;
7337}
7338#else
7339static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
7340#endif
7341
854e8848 7342static void __init free_area_init_node(int nid)
1da177e4 7343{
9109fb7b 7344 pg_data_t *pgdat = NODE_DATA(nid);
7960aedd
ZY
7345 unsigned long start_pfn = 0;
7346 unsigned long end_pfn = 0;
9109fb7b 7347
88fdf75d 7348 /* pg_data_t should be reset to zero when it's allocated */
97a225e6 7349 WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);
88fdf75d 7350
854e8848 7351 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
88fdf75d 7352
1da177e4 7353 pgdat->node_id = nid;
854e8848 7354 pgdat->node_start_pfn = start_pfn;
75ef7184 7355 pgdat->per_cpu_nodestats = NULL;
854e8848 7356
8d29e18a 7357 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
4ada0c5a
ZL
7358 (u64)start_pfn << PAGE_SHIFT,
7359 end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
854e8848 7360 calculate_node_totalpages(pgdat, start_pfn, end_pfn);
1da177e4
LT
7361
7362 alloc_node_mem_map(pgdat);
0188dc98 7363 pgdat_set_deferred_range(pgdat);
1da177e4 7364
7f3eb55b 7365 free_area_init_core(pgdat);
1da177e4
LT
7366}
7367
bc9331a1 7368void __init free_area_init_memoryless_node(int nid)
3f08a302 7369{
854e8848 7370 free_area_init_node(nid);
3f08a302
MR
7371}
7372
418508c1
MS
7373#if MAX_NUMNODES > 1
7374/*
7375 * Figure out the number of possible node ids.
7376 */
f9872caf 7377void __init setup_nr_node_ids(void)
418508c1 7378{
904a9553 7379 unsigned int highest;
418508c1 7380
904a9553 7381 highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
418508c1
MS
7382 nr_node_ids = highest + 1;
7383}
418508c1
MS
7384#endif
7385
1e01979c
TH
7386/**
7387 * node_map_pfn_alignment - determine the maximum internode alignment
7388 *
7389 * This function should be called after node map is populated and sorted.
7390 * It calculates the maximum power of two alignment which can distinguish
7391 * all the nodes.
7392 *
7393 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
 7394 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
 7395 * nodes are shifted by 256MiB, 256MiB alignment is returned. Note that if
 7396 * only the last node is shifted, 1GiB is enough and this function will indicate so.
7397 *
7398 * This is used to test whether pfn -> nid mapping of the chosen memory
7399 * model has fine enough granularity to avoid incorrect mapping for the
7400 * populated node map.
7401 *
a862f68a 7402 * Return: the determined alignment in pfn's. 0 if there is no alignment
1e01979c
TH
7403 * requirement (single node).
7404 */
7405unsigned long __init node_map_pfn_alignment(void)
7406{
7407 unsigned long accl_mask = 0, last_end = 0;
c13291a5 7408 unsigned long start, end, mask;
98fa15f3 7409 int last_nid = NUMA_NO_NODE;
c13291a5 7410 int i, nid;
1e01979c 7411
c13291a5 7412 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
1e01979c
TH
7413 if (!start || last_nid < 0 || last_nid == nid) {
7414 last_nid = nid;
7415 last_end = end;
7416 continue;
7417 }
7418
7419 /*
7420 * Start with a mask granular enough to pin-point to the
7421 * start pfn and tick off bits one-by-one until it becomes
7422 * too coarse to separate the current node from the last.
7423 */
7424 mask = ~((1 << __ffs(start)) - 1);
7425 while (mask && last_end <= (start & (mask << 1)))
7426 mask <<= 1;
7427
7428 /* accumulate all internode masks */
7429 accl_mask |= mask;
7430 }
7431
7432 /* convert mask to number of pages */
7433 return ~accl_mask + 1;
7434}
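
/*
 * Editor's note: tracing the loop for the example in the comment above
 * (1 GiB nodes, each shifted up by 256 MiB, 4 KiB pages). For the second
 * node, start = PFN 0x50000 (1 GiB + 256 MiB) and last_end = 0x50000:
 *
 *   __ffs(0x50000) = 16, so mask = ~(2^16 - 1)
 *   widening once would align start down to 0x40000, which is below
 *   last_end, so the loop stops and 2^16 PFNs (256 MiB) is returned.
 */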
7435
c713216d
MG
7436/**
7437 * find_min_pfn_with_active_regions - Find the minimum PFN registered
7438 *
a862f68a 7439 * Return: the minimum PFN based on information provided via
7d018176 7440 * memblock_set_node().
c713216d
MG
7441 */
7442unsigned long __init find_min_pfn_with_active_regions(void)
7443{
8a1b25fe 7444 return PHYS_PFN(memblock_start_of_DRAM());
c713216d
MG
7445}
7446
37b07e41
LS
7447/*
7448 * early_calculate_totalpages()
7449 * Sum pages in active regions for movable zone.
4b0ef1fe 7450 * Populate N_MEMORY for calculating usable_nodes.
37b07e41 7451 */
484f51f8 7452static unsigned long __init early_calculate_totalpages(void)
7e63efef 7453{
7e63efef 7454 unsigned long totalpages = 0;
c13291a5
TH
7455 unsigned long start_pfn, end_pfn;
7456 int i, nid;
7457
7458 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
7459 unsigned long pages = end_pfn - start_pfn;
7e63efef 7460
37b07e41
LS
7461 totalpages += pages;
7462 if (pages)
4b0ef1fe 7463 node_set_state(nid, N_MEMORY);
37b07e41 7464 }
b8af2941 7465 return totalpages;
7e63efef
MG
7466}
7467
2a1e274a
MG
7468/*
 7469 * Find the PFN at which the Movable zone begins in each node. Kernel memory
 7470 * is spread evenly between nodes as long as the nodes have enough
 7471 * memory. When they don't, some nodes will have more kernelcore than
 7472 * others.
7473 */
b224ef85 7474static void __init find_zone_movable_pfns_for_nodes(void)
2a1e274a
MG
7475{
7476 int i, nid;
7477 unsigned long usable_startpfn;
7478 unsigned long kernelcore_node, kernelcore_remaining;
66918dcd 7479 /* save the state before borrow the nodemask */
4b0ef1fe 7480 nodemask_t saved_node_state = node_states[N_MEMORY];
37b07e41 7481 unsigned long totalpages = early_calculate_totalpages();
4b0ef1fe 7482 int usable_nodes = nodes_weight(node_states[N_MEMORY]);
136199f0 7483 struct memblock_region *r;
b2f3eebe
TC
7484
7485 /* Need to find movable_zone earlier when movable_node is specified. */
7486 find_usable_zone_for_movable();
7487
7488 /*
7489 * If movable_node is specified, ignore kernelcore and movablecore
7490 * options.
7491 */
7492 if (movable_node_is_enabled()) {
cc6de168 7493 for_each_mem_region(r) {
136199f0 7494 if (!memblock_is_hotpluggable(r))
b2f3eebe
TC
7495 continue;
7496
d622abf7 7497 nid = memblock_get_region_node(r);
b2f3eebe 7498
136199f0 7499 usable_startpfn = PFN_DOWN(r->base);
b2f3eebe
TC
7500 zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
7501 min(usable_startpfn, zone_movable_pfn[nid]) :
7502 usable_startpfn;
7503 }
7504
7505 goto out2;
7506 }
2a1e274a 7507
342332e6
TI
7508 /*
7509 * If kernelcore=mirror is specified, ignore movablecore option
7510 */
7511 if (mirrored_kernelcore) {
7512 bool mem_below_4gb_not_mirrored = false;
7513
cc6de168 7514 for_each_mem_region(r) {
342332e6
TI
7515 if (memblock_is_mirror(r))
7516 continue;
7517
d622abf7 7518 nid = memblock_get_region_node(r);
342332e6
TI
7519
7520 usable_startpfn = memblock_region_memory_base_pfn(r);
7521
7522 if (usable_startpfn < 0x100000) {
7523 mem_below_4gb_not_mirrored = true;
7524 continue;
7525 }
7526
7527 zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
7528 min(usable_startpfn, zone_movable_pfn[nid]) :
7529 usable_startpfn;
7530 }
7531
7532 if (mem_below_4gb_not_mirrored)
633bf2fe 7533 pr_warn("This configuration results in unmirrored kernel memory.\n");
342332e6
TI
7534
7535 goto out2;
7536 }
7537
7e63efef 7538 /*
a5c6d650
DR
7539 * If kernelcore=nn% or movablecore=nn% was specified, calculate the
7540 * amount of necessary memory.
7541 */
7542 if (required_kernelcore_percent)
7543 required_kernelcore = (totalpages * 100 * required_kernelcore_percent) /
7544 10000UL;
7545 if (required_movablecore_percent)
7546 required_movablecore = (totalpages * 100 * required_movablecore_percent) /
7547 10000UL;
7548
7549 /*
7550 * If movablecore= was specified, calculate what size of
7e63efef
MG
 7551 * kernelcore it corresponds to, so that memory usable for
7552 * any allocation type is evenly spread. If both kernelcore
7553 * and movablecore are specified, then the value of kernelcore
7554 * will be used for required_kernelcore if it's greater than
7555 * what movablecore would have allowed.
7556 */
7557 if (required_movablecore) {
7e63efef
MG
7558 unsigned long corepages;
7559
7560 /*
7561 * Round-up so that ZONE_MOVABLE is at least as large as what
7562 * was requested by the user
7563 */
7564 required_movablecore =
7565 roundup(required_movablecore, MAX_ORDER_NR_PAGES);
9fd745d4 7566 required_movablecore = min(totalpages, required_movablecore);
7e63efef
MG
7567 corepages = totalpages - required_movablecore;
7568
7569 required_kernelcore = max(required_kernelcore, corepages);
7570 }
7571
bde304bd
XQ
7572 /*
7573 * If kernelcore was not specified or kernelcore size is larger
7574 * than totalpages, there is no ZONE_MOVABLE.
7575 */
7576 if (!required_kernelcore || required_kernelcore >= totalpages)
66918dcd 7577 goto out;
2a1e274a
MG
7578
7579 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
2a1e274a
MG
7580 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
7581
7582restart:
7583 /* Spread kernelcore memory as evenly as possible throughout nodes */
7584 kernelcore_node = required_kernelcore / usable_nodes;
4b0ef1fe 7585 for_each_node_state(nid, N_MEMORY) {
c13291a5
TH
7586 unsigned long start_pfn, end_pfn;
7587
2a1e274a
MG
7588 /*
7589 * Recalculate kernelcore_node if the division per node
7590 * now exceeds what is necessary to satisfy the requested
7591 * amount of memory for the kernel
7592 */
7593 if (required_kernelcore < kernelcore_node)
7594 kernelcore_node = required_kernelcore / usable_nodes;
7595
7596 /*
7597 * As the map is walked, we track how much memory is usable
7598 * by the kernel using kernelcore_remaining. When it is
7599 * 0, the rest of the node is usable by ZONE_MOVABLE
7600 */
7601 kernelcore_remaining = kernelcore_node;
7602
7603 /* Go through each range of PFNs within this node */
c13291a5 7604 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2a1e274a
MG
7605 unsigned long size_pages;
7606
c13291a5 7607 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
2a1e274a
MG
7608 if (start_pfn >= end_pfn)
7609 continue;
7610
7611 /* Account for what is only usable for kernelcore */
7612 if (start_pfn < usable_startpfn) {
7613 unsigned long kernel_pages;
7614 kernel_pages = min(end_pfn, usable_startpfn)
7615 - start_pfn;
7616
7617 kernelcore_remaining -= min(kernel_pages,
7618 kernelcore_remaining);
7619 required_kernelcore -= min(kernel_pages,
7620 required_kernelcore);
7621
7622 /* Continue if range is now fully accounted */
7623 if (end_pfn <= usable_startpfn) {
7624
7625 /*
7626 * Push zone_movable_pfn to the end so
7627 * that if we have to rebalance
7628 * kernelcore across nodes, we will
7629 * not double account here
7630 */
7631 zone_movable_pfn[nid] = end_pfn;
7632 continue;
7633 }
7634 start_pfn = usable_startpfn;
7635 }
7636
7637 /*
7638 * The usable PFN range for ZONE_MOVABLE is from
7639 * start_pfn->end_pfn. Calculate size_pages as the
7640 * number of pages used as kernelcore
7641 */
7642 size_pages = end_pfn - start_pfn;
7643 if (size_pages > kernelcore_remaining)
7644 size_pages = kernelcore_remaining;
7645 zone_movable_pfn[nid] = start_pfn + size_pages;
7646
7647 /*
7648 * Some kernelcore has been met, update counts and
7649 * break if the kernelcore for this node has been
b8af2941 7650 * satisfied
2a1e274a
MG
7651 */
7652 required_kernelcore -= min(required_kernelcore,
7653 size_pages);
7654 kernelcore_remaining -= size_pages;
7655 if (!kernelcore_remaining)
7656 break;
7657 }
7658 }
7659
7660 /*
7661 * If there is still required_kernelcore, we do another pass with one
7662 * less node in the count. This will push zone_movable_pfn[nid] further
7663 * along on the nodes that still have memory until kernelcore is
b8af2941 7664 * satisfied
2a1e274a
MG
7665 */
7666 usable_nodes--;
7667 if (usable_nodes && required_kernelcore > usable_nodes)
7668 goto restart;
7669
b2f3eebe 7670out2:
2a1e274a
MG
7671 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
7672 for (nid = 0; nid < MAX_NUMNODES; nid++)
7673 zone_movable_pfn[nid] =
7674 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
66918dcd 7675
20e6926d 7676out:
66918dcd 7677 /* restore the node_state */
4b0ef1fe 7678 node_states[N_MEMORY] = saved_node_state;
2a1e274a
MG
7679}
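
/*
 * Editor's note: an illustrative run of the spreading above; the sizes are
 * assumptions, and each node is assumed to be a single memblock range lying
 * at or above usable_startpfn. With two 4 GiB nodes and kernelcore=2G:
 *
 *   required_kernelcore = 524288 pages, usable_nodes = 2
 *   kernelcore_node     = 262144 pages (1 GiB) per node
 *
 * Each node keeps its first 1 GiB for the kernel zones, zone_movable_pfn[]
 * lands 1 GiB into each node (then rounded up to MAX_ORDER_NR_PAGES), and
 * the remaining 3 GiB per node becomes ZONE_MOVABLE.
 */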
7680
4b0ef1fe
LJ
7681/* Any regular or high memory on that node ? */
7682static void check_for_memory(pg_data_t *pgdat, int nid)
37b07e41 7683{
37b07e41
LS
7684 enum zone_type zone_type;
7685
4b0ef1fe 7686 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
37b07e41 7687 struct zone *zone = &pgdat->node_zones[zone_type];
b38a8725 7688 if (populated_zone(zone)) {
7b0e0c0e
OS
7689 if (IS_ENABLED(CONFIG_HIGHMEM))
7690 node_set_state(nid, N_HIGH_MEMORY);
7691 if (zone_type <= ZONE_NORMAL)
4b0ef1fe 7692 node_set_state(nid, N_NORMAL_MEMORY);
d0048b0e
BL
7693 break;
7694 }
37b07e41 7695 }
37b07e41
LS
7696}
7697
51930df5 7698/*
f0953a1b 7699 * Some architectures, e.g. ARC, may have ZONE_HIGHMEM below ZONE_NORMAL. For
51930df5
MR
7700 * such cases we allow max_zone_pfn sorted in the descending order
7701 */
7702bool __weak arch_has_descending_max_zone_pfns(void)
7703{
7704 return false;
7705}
7706
c713216d 7707/**
9691a071 7708 * free_area_init - Initialise all pg_data_t and zone data
88ca3b94 7709 * @max_zone_pfn: an array of max PFNs for each zone
c713216d
MG
7710 *
7711 * This will call free_area_init_node() for each active node in the system.
7d018176 7712 * Using the page ranges provided by memblock_set_node(), the size of each
c713216d
MG
7713 * zone in each node and their holes is calculated. If the maximum PFN
7714 * between two adjacent zones match, it is assumed that the zone is empty.
7715 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
7716 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
7717 * starts where the previous one ended. For example, ZONE_DMA32 starts
7718 * at arch_max_dma_pfn.
7719 */
9691a071 7720void __init free_area_init(unsigned long *max_zone_pfn)
c713216d 7721{
c13291a5 7722 unsigned long start_pfn, end_pfn;
51930df5
MR
7723 int i, nid, zone;
7724 bool descending;
a6af2bc3 7725
c713216d
MG
7726 /* Record where the zone boundaries are */
7727 memset(arch_zone_lowest_possible_pfn, 0,
7728 sizeof(arch_zone_lowest_possible_pfn));
7729 memset(arch_zone_highest_possible_pfn, 0,
7730 sizeof(arch_zone_highest_possible_pfn));
90cae1fe
OH
7731
7732 start_pfn = find_min_pfn_with_active_regions();
51930df5 7733 descending = arch_has_descending_max_zone_pfns();
90cae1fe
OH
7734
7735 for (i = 0; i < MAX_NR_ZONES; i++) {
51930df5
MR
7736 if (descending)
7737 zone = MAX_NR_ZONES - i - 1;
7738 else
7739 zone = i;
7740
7741 if (zone == ZONE_MOVABLE)
2a1e274a 7742 continue;
90cae1fe 7743
51930df5
MR
7744 end_pfn = max(max_zone_pfn[zone], start_pfn);
7745 arch_zone_lowest_possible_pfn[zone] = start_pfn;
7746 arch_zone_highest_possible_pfn[zone] = end_pfn;
90cae1fe
OH
7747
7748 start_pfn = end_pfn;
c713216d 7749 }
2a1e274a
MG
7750
7751 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
7752 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
b224ef85 7753 find_zone_movable_pfns_for_nodes();
c713216d 7754
c713216d 7755 /* Print out the zone ranges */
f88dfff5 7756 pr_info("Zone ranges:\n");
2a1e274a
MG
7757 for (i = 0; i < MAX_NR_ZONES; i++) {
7758 if (i == ZONE_MOVABLE)
7759 continue;
f88dfff5 7760 pr_info(" %-8s ", zone_names[i]);
72f0ba02
DR
7761 if (arch_zone_lowest_possible_pfn[i] ==
7762 arch_zone_highest_possible_pfn[i])
f88dfff5 7763 pr_cont("empty\n");
72f0ba02 7764 else
8d29e18a
JG
7765 pr_cont("[mem %#018Lx-%#018Lx]\n",
7766 (u64)arch_zone_lowest_possible_pfn[i]
7767 << PAGE_SHIFT,
7768 ((u64)arch_zone_highest_possible_pfn[i]
a62e2f4f 7769 << PAGE_SHIFT) - 1);
2a1e274a
MG
7770 }
7771
7772 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
f88dfff5 7773 pr_info("Movable zone start for each node\n");
2a1e274a
MG
7774 for (i = 0; i < MAX_NUMNODES; i++) {
7775 if (zone_movable_pfn[i])
8d29e18a
JG
7776 pr_info(" Node %d: %#018Lx\n", i,
7777 (u64)zone_movable_pfn[i] << PAGE_SHIFT);
2a1e274a 7778 }
c713216d 7779
f46edbd1
DW
7780 /*
7781 * Print out the early node map, and initialize the
7782 * subsection-map relative to active online memory ranges to
7783 * enable future "sub-section" extensions of the memory map.
7784 */
f88dfff5 7785 pr_info("Early memory node ranges\n");
f46edbd1 7786 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
8d29e18a
JG
7787 pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
7788 (u64)start_pfn << PAGE_SHIFT,
7789 ((u64)end_pfn << PAGE_SHIFT) - 1);
f46edbd1
DW
7790 subsection_map_init(start_pfn, end_pfn - start_pfn);
7791 }
c713216d
MG
7792
7793 /* Initialise every node */
708614e6 7794 mminit_verify_pageflags_layout();
8ef82866 7795 setup_nr_node_ids();
c713216d
MG
7796 for_each_online_node(nid) {
7797 pg_data_t *pgdat = NODE_DATA(nid);
854e8848 7798 free_area_init_node(nid);
37b07e41
LS
7799
7800 /* Any memory on that node */
7801 if (pgdat->node_present_pages)
4b0ef1fe
LJ
7802 node_set_state(nid, N_MEMORY);
7803 check_for_memory(pgdat, nid);
c713216d 7804 }
122e093c
MR
7805
7806 memmap_init();
c713216d 7807}
2a1e274a 7808
a5c6d650
DR
7809static int __init cmdline_parse_core(char *p, unsigned long *core,
7810 unsigned long *percent)
2a1e274a
MG
7811{
7812 unsigned long long coremem;
a5c6d650
DR
7813 char *endptr;
7814
2a1e274a
MG
7815 if (!p)
7816 return -EINVAL;
7817
a5c6d650
DR
7818 /* Value may be a percentage of total memory, otherwise bytes */
7819 coremem = simple_strtoull(p, &endptr, 0);
7820 if (*endptr == '%') {
7821 /* Paranoid check for percent values greater than 100 */
7822 WARN_ON(coremem > 100);
2a1e274a 7823
a5c6d650
DR
7824 *percent = coremem;
7825 } else {
7826 coremem = memparse(p, &p);
7827 /* Paranoid check that UL is enough for the coremem value */
7828 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
2a1e274a 7829
a5c6d650
DR
7830 *core = coremem >> PAGE_SHIFT;
7831 *percent = 0UL;
7832 }
2a1e274a
MG
7833 return 0;
7834}
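
/*
 * Editor's note: the two accepted forms, with made-up values. A plain size
 * such as "kernelcore=512M" is converted to pages directly
 * (512 MiB >> PAGE_SHIFT = 131072 pages with 4 KiB pages), while a
 * percentage such as "kernelcore=25%" stores 25 in *percent and the caller
 * later computes
 *
 *   required_kernelcore = totalpages * 100 * 25 / 10000 = totalpages / 4
 */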
ed7ed365 7835
7e63efef
MG
7836/*
 7837 * kernelcore=size sets the amount of memory to use for allocations that
7838 * cannot be reclaimed or migrated.
7839 */
7840static int __init cmdline_parse_kernelcore(char *p)
7841{
342332e6
TI
7842 /* parse kernelcore=mirror */
7843 if (parse_option_str(p, "mirror")) {
7844 mirrored_kernelcore = true;
7845 return 0;
7846 }
7847
a5c6d650
DR
7848 return cmdline_parse_core(p, &required_kernelcore,
7849 &required_kernelcore_percent);
7e63efef
MG
7850}
7851
7852/*
 7853 * movablecore=size sets the amount of memory to use for allocations that
7854 * can be reclaimed or migrated.
7855 */
7856static int __init cmdline_parse_movablecore(char *p)
7857{
a5c6d650
DR
7858 return cmdline_parse_core(p, &required_movablecore,
7859 &required_movablecore_percent);
7e63efef
MG
7860}
7861
ed7ed365 7862early_param("kernelcore", cmdline_parse_kernelcore);
7e63efef 7863early_param("movablecore", cmdline_parse_movablecore);
ed7ed365 7864
c3d5f5f0
JL
7865void adjust_managed_page_count(struct page *page, long count)
7866{
9705bea5 7867 atomic_long_add(count, &page_zone(page)->managed_pages);
ca79b0c2 7868 totalram_pages_add(count);
3dcc0571
JL
7869#ifdef CONFIG_HIGHMEM
7870 if (PageHighMem(page))
ca79b0c2 7871 totalhigh_pages_add(count);
3dcc0571 7872#endif
c3d5f5f0 7873}
3dcc0571 7874EXPORT_SYMBOL(adjust_managed_page_count);
c3d5f5f0 7875
e5cb113f 7876unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
69afade7 7877{
11199692
JL
7878 void *pos;
7879 unsigned long pages = 0;
69afade7 7880
11199692
JL
7881 start = (void *)PAGE_ALIGN((unsigned long)start);
7882 end = (void *)((unsigned long)end & PAGE_MASK);
7883 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
0d834328
DH
7884 struct page *page = virt_to_page(pos);
7885 void *direct_map_addr;
7886
7887 /*
7888 * 'direct_map_addr' might be different from 'pos'
 7889 * because on some architectures virt_to_page()
 7890 * works with aliases. Getting the direct map
7891 * address ensures that we get a _writeable_
7892 * alias for the memset().
7893 */
7894 direct_map_addr = page_address(page);
c746170d
VF
7895 /*
7896 * Perform a kasan-unchecked memset() since this memory
7897 * has not been initialized.
7898 */
7899 direct_map_addr = kasan_reset_tag(direct_map_addr);
dbe67df4 7900 if ((unsigned int)poison <= 0xFF)
0d834328
DH
7901 memset(direct_map_addr, poison, PAGE_SIZE);
7902
7903 free_reserved_page(page);
69afade7
JL
7904 }
7905
7906 if (pages && s)
adb1fe9a
JP
7907 pr_info("Freeing %s memory: %ldK\n",
7908 s, pages << (PAGE_SHIFT - 10));
69afade7
JL
7909
7910 return pages;
7911}
7912
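/*
 * Illustrative sketch (userspace, not built here) of the boundary handling in
 * free_reserved_area() above: the start address is rounded up to a page
 * boundary and the end address is rounded down, so only pages that lie
 * entirely inside [start, end) are freed.  The 4K page size and the sample
 * addresses are assumptions for the example.
 */
#if 0	/* example only, not built */
#include <stdio.h>

#define EX_PAGE_SIZE	4096UL
#define EX_PAGE_MASK	(~(EX_PAGE_SIZE - 1))
#define EX_PAGE_ALIGN(x) (((x) + EX_PAGE_SIZE - 1) & EX_PAGE_MASK)

int main(void)
{
	unsigned long start = 0x100123;			/* not page aligned */
	unsigned long end   = 0x108f00;			/* not page aligned */
	unsigned long first = EX_PAGE_ALIGN(start);	/* 0x101000 */
	unsigned long last  = end & EX_PAGE_MASK;	/* 0x108000 */

	/* seven whole pages fit between the rounded boundaries */
	printf("pages freed: %lu\n", (last - first) / EX_PAGE_SIZE);
	return 0;
}
#endif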
1f9d03c5 7913void __init mem_init_print_info(void)
7ee3d4e8
JL
7914{
7915 unsigned long physpages, codesize, datasize, rosize, bss_size;
7916 unsigned long init_code_size, init_data_size;
7917
7918 physpages = get_num_physpages();
7919 codesize = _etext - _stext;
7920 datasize = _edata - _sdata;
7921 rosize = __end_rodata - __start_rodata;
7922 bss_size = __bss_stop - __bss_start;
7923 init_data_size = __init_end - __init_begin;
7924 init_code_size = _einittext - _sinittext;
7925
7926 /*
7927 * Detect special cases and adjust section sizes accordingly:
7928 * 1) .init.* may be embedded into .data sections
7929 * 2) .init.text.* may be out of [__init_begin, __init_end],
7930 * please refer to arch/tile/kernel/vmlinux.lds.S.
7931 * 3) .rodata.* may be embedded into .text or .data sections.
7932 */
7933#define adj_init_size(start, end, size, pos, adj) \
b8af2941
PK
7934 do { \
7935 if (start <= pos && pos < end && size > adj) \
7936 size -= adj; \
7937 } while (0)
7ee3d4e8
JL
7938
7939 adj_init_size(__init_begin, __init_end, init_data_size,
7940 _sinittext, init_code_size);
7941 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
7942 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
7943 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
7944 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
7945
7946#undef adj_init_size
7947
756a025f 7948 pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
7ee3d4e8 7949#ifdef CONFIG_HIGHMEM
756a025f 7950 ", %luK highmem"
7ee3d4e8 7951#endif
1f9d03c5 7952 ")\n",
756a025f
JP
7953 nr_free_pages() << (PAGE_SHIFT - 10),
7954 physpages << (PAGE_SHIFT - 10),
7955 codesize >> 10, datasize >> 10, rosize >> 10,
7956 (init_data_size + init_code_size) >> 10, bss_size >> 10,
ca79b0c2 7957 (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10),
1f9d03c5 7958 totalcma_pages << (PAGE_SHIFT - 10)
7ee3d4e8 7959#ifdef CONFIG_HIGHMEM
1f9d03c5 7960 , totalhigh_pages() << (PAGE_SHIFT - 10)
7ee3d4e8 7961#endif
1f9d03c5 7962 );
7ee3d4e8
JL
7963}
7964
0e0b864e 7965/**
88ca3b94
RD
7966 * set_dma_reserve - set the specified number of pages reserved in the first zone
7967 * @new_dma_reserve: The number of pages to mark reserved
0e0b864e 7968 *
013110a7 7969 * The per-cpu batchsize and zone watermarks are determined by managed_pages.
0e0b864e
MG
7970 * In the DMA zone, a significant percentage may be consumed by kernel image
7971 * and other unfreeable allocations which can skew the watermarks badly. This
88ca3b94
RD
7972 * function may optionally be used to account for unfreeable pages in the
7973 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
7974 * smaller per-cpu batchsize.
0e0b864e
MG
7975 */
7976void __init set_dma_reserve(unsigned long new_dma_reserve)
7977{
7978 dma_reserve = new_dma_reserve;
7979}
7980
005fd4bb 7981static int page_alloc_cpu_dead(unsigned int cpu)
1da177e4 7982{
1da177e4 7983
005fd4bb
SAS
7984 lru_add_drain_cpu(cpu);
7985 drain_pages(cpu);
9f8f2172 7986
005fd4bb
SAS
7987 /*
7988 * Spill the event counters of the dead processor
7989 * into the current processors event counters.
7990 * This artificially elevates the count of the current
7991 * processor.
7992 */
7993 vm_events_fold_cpu(cpu);
9f8f2172 7994
005fd4bb
SAS
7995 /*
7996 * Zero the differential counters of the dead processor
7997 * so that the vm statistics are consistent.
7998 *
7999 * This is only okay since the processor is dead and cannot
8000 * race with what we are doing.
8001 */
8002 cpu_vm_stats_fold(cpu);
8003 return 0;
1da177e4 8004}
1da177e4 8005
e03a5125
NP
8006#ifdef CONFIG_NUMA
8007int hashdist = HASHDIST_DEFAULT;
8008
8009static int __init set_hashdist(char *str)
8010{
8011 if (!str)
8012 return 0;
8013 hashdist = simple_strtoul(str, &str, 0);
8014 return 1;
8015}
8016__setup("hashdist=", set_hashdist);
8017#endif
8018
1da177e4
LT
8019void __init page_alloc_init(void)
8020{
005fd4bb
SAS
8021 int ret;
8022
e03a5125
NP
8023#ifdef CONFIG_NUMA
8024 if (num_node_state(N_MEMORY) == 1)
8025 hashdist = 0;
8026#endif
8027
005fd4bb
SAS
8028 ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD,
8029 "mm/page_alloc:dead", NULL,
8030 page_alloc_cpu_dead);
8031 WARN_ON(ret < 0);
1da177e4
LT
8032}
8033
cb45b0e9 8034/*
34b10060 8035 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
cb45b0e9
HA
8036 * or min_free_kbytes changes.
8037 */
8038static void calculate_totalreserve_pages(void)
8039{
8040 struct pglist_data *pgdat;
8041 unsigned long reserve_pages = 0;
2f6726e5 8042 enum zone_type i, j;
cb45b0e9
HA
8043
8044 for_each_online_pgdat(pgdat) {
281e3726
MG
8045
8046 pgdat->totalreserve_pages = 0;
8047
cb45b0e9
HA
8048 for (i = 0; i < MAX_NR_ZONES; i++) {
8049 struct zone *zone = pgdat->node_zones + i;
3484b2de 8050 long max = 0;
9705bea5 8051 unsigned long managed_pages = zone_managed_pages(zone);
cb45b0e9
HA
8052
8053 /* Find valid and maximum lowmem_reserve in the zone */
8054 for (j = i; j < MAX_NR_ZONES; j++) {
8055 if (zone->lowmem_reserve[j] > max)
8056 max = zone->lowmem_reserve[j];
8057 }
8058
41858966
MG
8059 /* we treat the high watermark as reserved pages. */
8060 max += high_wmark_pages(zone);
cb45b0e9 8061
3d6357de
AK
8062 if (max > managed_pages)
8063 max = managed_pages;
a8d01437 8064
281e3726 8065 pgdat->totalreserve_pages += max;
a8d01437 8066
cb45b0e9
HA
8067 reserve_pages += max;
8068 }
8069 }
8070 totalreserve_pages = reserve_pages;
8071}
8072
1da177e4
LT
8073/*
8074 * setup_per_zone_lowmem_reserve - called whenever
34b10060 8075 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone
1da177e4
LT
8076 * has a correct pages reserved value, so an adequate number of
8077 * pages are left in the zone after a successful __alloc_pages().
8078 */
8079static void setup_per_zone_lowmem_reserve(void)
8080{
8081 struct pglist_data *pgdat;
470c61d7 8082 enum zone_type i, j;
1da177e4 8083
ec936fc5 8084 for_each_online_pgdat(pgdat) {
470c61d7
LS
8085 for (i = 0; i < MAX_NR_ZONES - 1; i++) {
8086 struct zone *zone = &pgdat->node_zones[i];
8087 int ratio = sysctl_lowmem_reserve_ratio[i];
8088 bool clear = !ratio || !zone_managed_pages(zone);
8089 unsigned long managed_pages = 0;
8090
8091 for (j = i + 1; j < MAX_NR_ZONES; j++) {
8092 if (clear) {
8093 zone->lowmem_reserve[j] = 0;
d3cda233 8094 } else {
470c61d7
LS
8095 struct zone *upper_zone = &pgdat->node_zones[j];
8096
8097 managed_pages += zone_managed_pages(upper_zone);
8098 zone->lowmem_reserve[j] = managed_pages / ratio;
d3cda233 8099 }
1da177e4
LT
8100 }
8101 }
8102 }
cb45b0e9
HA
8103
8104 /* update totalreserve_pages */
8105 calculate_totalreserve_pages();
1da177e4
LT
8106}
8107
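/*
 * Worked example of the calculation in setup_per_zone_lowmem_reserve() above,
 * as a self-contained sketch (userspace, not built here).  The zone sizes and
 * the ratios (256 for DMA/DMA32, 32 for Normal, as in the default
 * sysctl_lowmem_reserve_ratio) are illustrative assumptions.  The reserve a
 * lower zone holds against a higher zone is the running sum of the managed
 * pages of the zones above it divided by the lower zone's ratio;
 * calculate_totalreserve_pages() then adds, per zone, the largest of these
 * reserves plus the high watermark.
 */
#if 0	/* example only, not built */
#include <stdio.h>

int main(void)
{
	/* managed pages per zone: DMA, DMA32, Normal (example values) */
	unsigned long managed[3] = { 3976, 453494, 3624442 };
	int ratio[3] = { 256, 256, 32 };
	int i, j;

	for (i = 0; i < 3; i++) {
		unsigned long upper = 0;

		for (j = i + 1; j < 3; j++) {
			upper += managed[j];
			printf("zone %d reserve against zone %d: %lu pages\n",
			       i, j, upper / ratio[i]);
		}
	}
	return 0;
}
#endif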
cfd3da1e 8108static void __setup_per_zone_wmarks(void)
1da177e4
LT
8109{
8110 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
8111 unsigned long lowmem_pages = 0;
8112 struct zone *zone;
8113 unsigned long flags;
8114
8115 /* Calculate total number of !ZONE_HIGHMEM pages */
8116 for_each_zone(zone) {
8117 if (!is_highmem(zone))
9705bea5 8118 lowmem_pages += zone_managed_pages(zone);
1da177e4
LT
8119 }
8120
8121 for_each_zone(zone) {
ac924c60
AM
8122 u64 tmp;
8123
1125b4e3 8124 spin_lock_irqsave(&zone->lock, flags);
9705bea5 8125 tmp = (u64)pages_min * zone_managed_pages(zone);
ac924c60 8126 do_div(tmp, lowmem_pages);
1da177e4
LT
8127 if (is_highmem(zone)) {
8128 /*
669ed175
NP
8129 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
8130 * need highmem pages, so cap pages_min to a small
8131 * value here.
8132 *
41858966 8133 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
8bb4e7a2 8134 * deltas control async page reclaim, and so should
669ed175 8135 * not be capped for highmem.
1da177e4 8136 */
90ae8d67 8137 unsigned long min_pages;
1da177e4 8138
9705bea5 8139 min_pages = zone_managed_pages(zone) / 1024;
90ae8d67 8140 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
a9214443 8141 zone->_watermark[WMARK_MIN] = min_pages;
1da177e4 8142 } else {
669ed175
NP
8143 /*
8144 * If it's a lowmem zone, reserve a number of pages
1da177e4
LT
8145 * proportionate to the zone's size.
8146 */
a9214443 8147 zone->_watermark[WMARK_MIN] = tmp;
1da177e4
LT
8148 }
8149
795ae7a0
JW
8150 /*
8151 * Set the kswapd watermarks distance according to the
8152 * scale factor in proportion to available memory, but
8153 * ensure a minimum size on small systems.
8154 */
8155 tmp = max_t(u64, tmp >> 2,
9705bea5 8156 mult_frac(zone_managed_pages(zone),
795ae7a0
JW
8157 watermark_scale_factor, 10000));
8158
aa092591 8159 zone->watermark_boost = 0;
a9214443
MG
8160 zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
8161 zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
49f223a9 8162
1125b4e3 8163 spin_unlock_irqrestore(&zone->lock, flags);
1da177e4 8164 }
cb45b0e9
HA
8165
8166 /* update totalreserve_pages */
8167 calculate_totalreserve_pages();
1da177e4
LT
8168}
8169
cfd3da1e
MG
8170/**
8171 * setup_per_zone_wmarks - called when min_free_kbytes changes
8172 * or when memory is hot-{added|removed}
8173 *
8174 * Ensures that the watermark[min,low,high] values for each zone are set
8175 * correctly with respect to min_free_kbytes.
8176 */
8177void setup_per_zone_wmarks(void)
8178{
b93e0f32
MH
8179 static DEFINE_SPINLOCK(lock);
8180
8181 spin_lock(&lock);
cfd3da1e 8182 __setup_per_zone_wmarks();
b93e0f32 8183 spin_unlock(&lock);
cfd3da1e
MG
8184}
8185
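/*
 * Worked example of __setup_per_zone_wmarks() above, as a self-contained
 * sketch (userspace, not built here).  It assumes a single 4 GiB
 * (1048576 page, 4K pages) !highmem zone, min_free_kbytes = 8192 and the
 * default watermark_scale_factor of 10; all numbers are illustrative.
 */
#if 0	/* example only, not built */
#include <stdio.h>

int main(void)
{
	unsigned long managed = 1048576;	/* pages in the zone */
	unsigned long lowmem  = 1048576;	/* all !highmem pages */
	unsigned long pages_min = 8192 >> 2;	/* min_free_kbytes >> (PAGE_SHIFT - 10) */
	unsigned long min, gap;

	min = pages_min * managed / lowmem;	/* this zone's share of pages_min */
	gap = min / 4;				/* tmp >> 2 in the code above */
	if (gap < managed * 10 / 10000)		/* watermark_scale_factor term */
		gap = managed * 10 / 10000;

	printf("WMARK_MIN  = %lu pages\n", min);	   /* 2048 */
	printf("WMARK_LOW  = %lu pages\n", min + gap);	   /* 3096 */
	printf("WMARK_HIGH = %lu pages\n", min + 2 * gap); /* 4144 */
	return 0;
}
#endif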
1da177e4
LT
8186/*
8187 * Initialise min_free_kbytes.
8188 *
8189 * For small machines we want it small (128k min). For large machines
8beeae86 8190 * we want it large (256MB max). But it is not linear, because network
1da177e4
LT
8191 * bandwidth does not increase linearly with machine size. We use
8192 *
b8af2941 8193 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
1da177e4
LT
8194 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
8195 *
8196 * which yields
8197 *
8198 * 16MB: 512k
8199 * 32MB: 724k
8200 * 64MB: 1024k
8201 * 128MB: 1448k
8202 * 256MB: 2048k
8203 * 512MB: 2896k
8204 * 1024MB: 4096k
8205 * 2048MB: 5792k
8206 * 4096MB: 8192k
8207 * 8192MB: 11584k
8208 * 16384MB: 16384k
8209 */
1b79acc9 8210int __meminit init_per_zone_wmark_min(void)
1da177e4
LT
8211{
8212 unsigned long lowmem_kbytes;
5f12733e 8213 int new_min_free_kbytes;
1da177e4
LT
8214
8215 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
5f12733e
MH
8216 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
8217
8218 if (new_min_free_kbytes > user_min_free_kbytes) {
8219 min_free_kbytes = new_min_free_kbytes;
8220 if (min_free_kbytes < 128)
8221 min_free_kbytes = 128;
ee8eb9a5
JS
8222 if (min_free_kbytes > 262144)
8223 min_free_kbytes = 262144;
5f12733e
MH
8224 } else {
8225 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
8226 new_min_free_kbytes, user_min_free_kbytes);
8227 }
bc75d33f 8228 setup_per_zone_wmarks();
a6cccdc3 8229 refresh_zone_stat_thresholds();
1da177e4 8230 setup_per_zone_lowmem_reserve();
6423aa81
JK
8231
8232#ifdef CONFIG_NUMA
8233 setup_min_unmapped_ratio();
8234 setup_min_slab_ratio();
8235#endif
8236
4aab2be0
VB
8237 khugepaged_min_free_kbytes_update();
8238
1da177e4
LT
8239 return 0;
8240}
e08d3fdf 8241postcore_initcall(init_per_zone_wmark_min)
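/*
 * Self-contained check (userspace, not built here) of the heuristic
 * documented above: min_free_kbytes = sqrt(lowmem_kbytes * 16), clamped to
 * [128, 262144].  The integer square root below is a plain stand-in for the
 * kernel's int_sqrt(); the 4 GiB lowmem figure is just an example and
 * reproduces the 8192k entry in the table.
 */
#if 0	/* example only, not built */
#include <stdio.h>

static unsigned long isqrt(unsigned long long x)
{
	unsigned long long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return (unsigned long)r;
}

int main(void)
{
	unsigned long long lowmem_kbytes = 4ULL << 20;	/* 4 GiB of lowmem */
	unsigned long min_free = isqrt(lowmem_kbytes * 16);

	if (min_free < 128)
		min_free = 128;
	if (min_free > 262144)
		min_free = 262144;
	printf("min_free_kbytes = %luk\n", min_free);	/* 8192k */
	return 0;
}
#endif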
1da177e4
LT
8242
8243/*
b8af2941 8244 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
1da177e4
LT
8245 * that we can call two helper functions whenever min_free_kbytes
8246 * changes.
8247 */
cccad5b9 8248int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
32927393 8249 void *buffer, size_t *length, loff_t *ppos)
1da177e4 8250{
da8c757b
HP
8251 int rc;
8252
8253 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
8254 if (rc)
8255 return rc;
8256
5f12733e
MH
8257 if (write) {
8258 user_min_free_kbytes = min_free_kbytes;
bc75d33f 8259 setup_per_zone_wmarks();
5f12733e 8260 }
1da177e4
LT
8261 return 0;
8262}
8263
795ae7a0 8264int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
32927393 8265 void *buffer, size_t *length, loff_t *ppos)
795ae7a0
JW
8266{
8267 int rc;
8268
8269 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
8270 if (rc)
8271 return rc;
8272
8273 if (write)
8274 setup_per_zone_wmarks();
8275
8276 return 0;
8277}
8278
9614634f 8279#ifdef CONFIG_NUMA
6423aa81 8280static void setup_min_unmapped_ratio(void)
9614634f 8281{
6423aa81 8282 pg_data_t *pgdat;
9614634f 8283 struct zone *zone;
9614634f 8284
a5f5f91d 8285 for_each_online_pgdat(pgdat)
81cbcbc2 8286 pgdat->min_unmapped_pages = 0;
a5f5f91d 8287
9614634f 8288 for_each_zone(zone)
9705bea5
AK
8289 zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) *
8290 sysctl_min_unmapped_ratio) / 100;
9614634f 8291}
0ff38490 8292
6423aa81
JK
8293
8294int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
32927393 8295 void *buffer, size_t *length, loff_t *ppos)
0ff38490 8296{
0ff38490
CL
8297 int rc;
8298
8d65af78 8299 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
0ff38490
CL
8300 if (rc)
8301 return rc;
8302
6423aa81
JK
8303 setup_min_unmapped_ratio();
8304
8305 return 0;
8306}
8307
8308static void setup_min_slab_ratio(void)
8309{
8310 pg_data_t *pgdat;
8311 struct zone *zone;
8312
a5f5f91d
MG
8313 for_each_online_pgdat(pgdat)
8314 pgdat->min_slab_pages = 0;
8315
0ff38490 8316 for_each_zone(zone)
9705bea5
AK
8317 zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) *
8318 sysctl_min_slab_ratio) / 100;
6423aa81
JK
8319}
8320
8321int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
32927393 8322 void *buffer, size_t *length, loff_t *ppos)
6423aa81
JK
8323{
8324 int rc;
8325
8326 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
8327 if (rc)
8328 return rc;
8329
8330 setup_min_slab_ratio();
8331
0ff38490
CL
8332 return 0;
8333}
9614634f
CL
8334#endif
8335
1da177e4
LT
8336/*
8337 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
8338 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
8339 * whenever sysctl_lowmem_reserve_ratio changes.
8340 *
 8341 * The reserve ratio obviously has absolutely no relation to the
41858966 8342 * minimum watermarks. The lowmem reserve ratio can only make sense
1da177e4
LT
 8343 * when interpreted together with the boot time zone sizes.
8344 */
cccad5b9 8345int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
32927393 8346 void *buffer, size_t *length, loff_t *ppos)
1da177e4 8347{
86aaf255
BH
8348 int i;
8349
8d65af78 8350 proc_dointvec_minmax(table, write, buffer, length, ppos);
86aaf255
BH
8351
8352 for (i = 0; i < MAX_NR_ZONES; i++) {
8353 if (sysctl_lowmem_reserve_ratio[i] < 1)
8354 sysctl_lowmem_reserve_ratio[i] = 0;
8355 }
8356
1da177e4
LT
8357 setup_per_zone_lowmem_reserve();
8358 return 0;
8359}
8360
8ad4b1fb
RS
8361/*
8362 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
b8af2941
PK
8363 * cpu. It is the fraction of total pages in each zone that a hot per cpu
 8364 * pagelist can hold before it gets flushed back to the buddy allocator.
8ad4b1fb 8365 */
cccad5b9 8366int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
32927393 8367 void *buffer, size_t *length, loff_t *ppos)
8ad4b1fb
RS
8368{
8369 struct zone *zone;
7cd2b0a3 8370 int old_percpu_pagelist_fraction;
8ad4b1fb
RS
8371 int ret;
8372
7cd2b0a3
DR
8373 mutex_lock(&pcp_batch_high_lock);
8374 old_percpu_pagelist_fraction = percpu_pagelist_fraction;
8375
8d65af78 8376 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
7cd2b0a3
DR
8377 if (!write || ret < 0)
8378 goto out;
8379
8380 /* Sanity checking to avoid pcp imbalance */
8381 if (percpu_pagelist_fraction &&
8382 percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
8383 percpu_pagelist_fraction = old_percpu_pagelist_fraction;
8384 ret = -EINVAL;
8385 goto out;
8386 }
8387
8388 /* No change? */
8389 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
8390 goto out;
c8e251fa 8391
cb1ef534 8392 for_each_populated_zone(zone)
0a8b4f1d 8393 zone_set_pageset_high_and_batch(zone);
7cd2b0a3 8394out:
c8e251fa 8395 mutex_unlock(&pcp_batch_high_lock);
7cd2b0a3 8396 return ret;
8ad4b1fb
RS
8397}
8398
f6f34b43
SD
8399#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
8400/*
 8401 * Returns the number of pages that the arch has reserved but
 8402 * that are not known to alloc_large_system_hash().
8403 */
8404static unsigned long __init arch_reserved_kernel_pages(void)
8405{
8406 return 0;
8407}
8408#endif
8409
9017217b
PT
8410/*
8411 * Adaptive scale is meant to reduce sizes of hash tables on large memory
8412 * machines. As memory size is increased the scale is also increased but at
8413 * slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory
8414 * quadruples the scale is increased by one, which means the size of hash table
8415 * only doubles, instead of quadrupling as well.
 8416 * Because 32-bit systems cannot have the large physical memory where this
 8417 * scaling makes sense, it is disabled on such platforms.
8418 */
8419#if __BITS_PER_LONG > 32
8420#define ADAPT_SCALE_BASE (64ul << 30)
8421#define ADAPT_SCALE_SHIFT 2
8422#define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT)
8423#endif
8424
1da177e4
LT
8425/*
8426 * allocate a large system hash table from bootmem
8427 * - it is assumed that the hash table must contain an exact power-of-2
8428 * quantity of entries
8429 * - limit is the number of hash buckets, not the total allocation size
8430 */
8431void *__init alloc_large_system_hash(const char *tablename,
8432 unsigned long bucketsize,
8433 unsigned long numentries,
8434 int scale,
8435 int flags,
8436 unsigned int *_hash_shift,
8437 unsigned int *_hash_mask,
31fe62b9
TB
8438 unsigned long low_limit,
8439 unsigned long high_limit)
1da177e4 8440{
31fe62b9 8441 unsigned long long max = high_limit;
1da177e4
LT
8442 unsigned long log2qty, size;
8443 void *table = NULL;
3749a8f0 8444 gfp_t gfp_flags;
ec11408a 8445 bool virt;
121e6f32 8446 bool huge;
1da177e4
LT
8447
8448 /* allow the kernel cmdline to have a say */
8449 if (!numentries) {
8450 /* round applicable memory size up to nearest megabyte */
04903664 8451 numentries = nr_kernel_pages;
f6f34b43 8452 numentries -= arch_reserved_kernel_pages();
a7e83318
JZ
8453
8454 /* It isn't necessary when PAGE_SIZE >= 1MB */
8455 if (PAGE_SHIFT < 20)
8456 numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
1da177e4 8457
9017217b
PT
8458#if __BITS_PER_LONG > 32
8459 if (!high_limit) {
8460 unsigned long adapt;
8461
8462 for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
8463 adapt <<= ADAPT_SCALE_SHIFT)
8464 scale++;
8465 }
8466#endif
8467
1da177e4
LT
8468 /* limit to 1 bucket per 2^scale bytes of low memory */
8469 if (scale > PAGE_SHIFT)
8470 numentries >>= (scale - PAGE_SHIFT);
8471 else
8472 numentries <<= (PAGE_SHIFT - scale);
9ab37b8f
PM
8473
8474 /* Make sure we've got at least a 0-order allocation.. */
2c85f51d
JB
8475 if (unlikely(flags & HASH_SMALL)) {
8476 /* Makes no sense without HASH_EARLY */
8477 WARN_ON(!(flags & HASH_EARLY));
8478 if (!(numentries >> *_hash_shift)) {
8479 numentries = 1UL << *_hash_shift;
8480 BUG_ON(!numentries);
8481 }
8482 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
9ab37b8f 8483 numentries = PAGE_SIZE / bucketsize;
1da177e4 8484 }
6e692ed3 8485 numentries = roundup_pow_of_two(numentries);
1da177e4
LT
8486
8487 /* limit allocation size to 1/16 total memory by default */
8488 if (max == 0) {
8489 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
8490 do_div(max, bucketsize);
8491 }
074b8517 8492 max = min(max, 0x80000000ULL);
1da177e4 8493
31fe62b9
TB
8494 if (numentries < low_limit)
8495 numentries = low_limit;
1da177e4
LT
8496 if (numentries > max)
8497 numentries = max;
8498
f0d1b0b3 8499 log2qty = ilog2(numentries);
1da177e4 8500
3749a8f0 8501 gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
1da177e4 8502 do {
ec11408a 8503 virt = false;
1da177e4 8504 size = bucketsize << log2qty;
ea1f5f37
PT
8505 if (flags & HASH_EARLY) {
8506 if (flags & HASH_ZERO)
26fb3dae 8507 table = memblock_alloc(size, SMP_CACHE_BYTES);
ea1f5f37 8508 else
7e1c4e27
MR
8509 table = memblock_alloc_raw(size,
8510 SMP_CACHE_BYTES);
ec11408a 8511 } else if (get_order(size) >= MAX_ORDER || hashdist) {
88dca4ca 8512 table = __vmalloc(size, gfp_flags);
ec11408a 8513 virt = true;
121e6f32 8514 huge = is_vm_area_hugepages(table);
ea1f5f37 8515 } else {
1037b83b
ED
8516 /*
8517 * If bucketsize is not a power-of-two, we may free
a1dd268c
MG
 8518 * some pages at the end of the hash table, which
 8519 * alloc_pages_exact() does automatically.
1037b83b 8520 */
ec11408a
NP
8521 table = alloc_pages_exact(size, gfp_flags);
8522 kmemleak_alloc(table, size, 1, gfp_flags);
1da177e4
LT
8523 }
8524 } while (!table && size > PAGE_SIZE && --log2qty);
8525
8526 if (!table)
8527 panic("Failed to allocate %s hash table\n", tablename);
8528
ec11408a
NP
8529 pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
8530 tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
121e6f32 8531 virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");
1da177e4
LT
8532
8533 if (_hash_shift)
8534 *_hash_shift = log2qty;
8535 if (_hash_mask)
8536 *_hash_mask = (1 << log2qty) - 1;
8537
8538 return table;
8539}
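/*
 * Self-contained sketch (userspace, not built here) of how
 * alloc_large_system_hash() above sizes a table: one bucket per 2^scale bytes
 * of low memory, with the adaptive shift applied on machines above 64 GiB,
 * rounded up to a power of two.  The memory size, scale and 4K page size are
 * illustrative assumptions, not values used by any particular caller, and the
 * rounding/limit steps of the real routine are omitted.
 */
#if 0	/* example only, not built */
#include <stdio.h>

int main(void)
{
	unsigned long long mem_bytes = 256ULL << 30;		/* 256 GiB machine */
	unsigned long long numentries = mem_bytes >> 12;	/* pages, 4K */
	int page_shift = 12, scale = 14;
	unsigned long long adapt, n;
	int log2qty = 0;

	/* adaptive scale: +1 per quadrupling of memory beyond 64 GiB */
	for (adapt = (64ULL << 30) >> 12; adapt < numentries; adapt <<= 2)
		scale++;

	/* one bucket per 2^scale bytes of low memory */
	numentries >>= (scale - page_shift);

	/* round up to a power of two and take its log2 */
	for (n = 1; n < numentries; n <<= 1)
		log2qty++;

	printf("buckets: %llu (2^%d)\n", n, log2qty);	/* 8388608 (2^23) */
	return 0;
}
#endif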
a117e66e 8540
a5d76b54 8541/*
80934513 8542 * This function checks whether pageblock includes unmovable pages or not.
80934513 8543 *
b8af2941 8544 * PageLRU check without isolation or lru_lock could race so that
0efadf48
YX
8545 * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
8546 * check without lock_page also may miss some movable non-lru pages at
8547 * race condition. So you can't expect this function should be exact.
4a55c047
QC
8548 *
8549 * Returns a page without holding a reference. If the caller wants to
047b9967 8550 * dereference that page (e.g., dumping), it has to make sure that it
4a55c047
QC
8551 * cannot get removed (e.g., via memory unplug) concurrently.
8552 *
a5d76b54 8553 */
4a55c047
QC
8554struct page *has_unmovable_pages(struct zone *zone, struct page *page,
8555 int migratetype, int flags)
49ac8255 8556{
1a9f2191
QC
8557 unsigned long iter = 0;
8558 unsigned long pfn = page_to_pfn(page);
6a654e36 8559 unsigned long offset = pfn % pageblock_nr_pages;
47118af0 8560
1a9f2191
QC
8561 if (is_migrate_cma_page(page)) {
8562 /*
8563 * CMA allocations (alloc_contig_range) really need to mark
8564 * isolate CMA pageblocks even when they are not movable in fact
8565 * so consider them movable here.
8566 */
8567 if (is_migrate_cma(migratetype))
4a55c047 8568 return NULL;
1a9f2191 8569
3d680bdf 8570 return page;
1a9f2191 8571 }
4da2ce25 8572
6a654e36 8573 for (; iter < pageblock_nr_pages - offset; iter++) {
fe4c86c9 8574 if (!pfn_valid_within(pfn + iter))
49ac8255 8575 continue;
29723fcc 8576
fe4c86c9 8577 page = pfn_to_page(pfn + iter);
c8721bbb 8578
c9c510dc
DH
8579 /*
 8580 * Both bootmem allocations and memory holes are marked
8581 * PG_reserved and are unmovable. We can even have unmovable
8582 * allocations inside ZONE_MOVABLE, for example when
8583 * specifying "movablecore".
8584 */
d7ab3672 8585 if (PageReserved(page))
3d680bdf 8586 return page;
d7ab3672 8587
9d789999
MH
8588 /*
8589 * If the zone is movable and we have ruled out all reserved
8590 * pages then it should be reasonably safe to assume the rest
8591 * is movable.
8592 */
8593 if (zone_idx(zone) == ZONE_MOVABLE)
8594 continue;
8595
c8721bbb
NH
8596 /*
8597 * Hugepages are not in LRU lists, but they're movable.
1da2f328 8598 * THPs are on the LRU, but need to be counted as #small pages.
8bb4e7a2 8599 * We need not scan over tail pages because we don't
c8721bbb
NH
8600 * handle each tail page individually in migration.
8601 */
1da2f328 8602 if (PageHuge(page) || PageTransCompound(page)) {
17e2e7d7
OS
8603 struct page *head = compound_head(page);
8604 unsigned int skip_pages;
464c7ffb 8605
1da2f328
RR
8606 if (PageHuge(page)) {
8607 if (!hugepage_migration_supported(page_hstate(head)))
8608 return page;
8609 } else if (!PageLRU(head) && !__PageMovable(head)) {
3d680bdf 8610 return page;
1da2f328 8611 }
464c7ffb 8612
d8c6546b 8613 skip_pages = compound_nr(head) - (page - head);
17e2e7d7 8614 iter += skip_pages - 1;
c8721bbb
NH
8615 continue;
8616 }
8617
97d255c8
MK
8618 /*
 8619 * We can't use page_count without pinning the page,
 8620 * because another CPU could free a compound page.
 8621 * This check already skips compound tails of THP
0139aa7b 8622 * because their page->_refcount is zero at all times.
97d255c8 8623 */
fe896d18 8624 if (!page_ref_count(page)) {
49ac8255 8625 if (PageBuddy(page))
ab130f91 8626 iter += (1 << buddy_order(page)) - 1;
49ac8255
KH
8627 continue;
8628 }
97d255c8 8629
b023f468
WC
8630 /*
8631 * The HWPoisoned page may be not in buddy system, and
8632 * page_count() is not 0.
8633 */
756d25be 8634 if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
b023f468
WC
8635 continue;
8636
aa218795
DH
8637 /*
8638 * We treat all PageOffline() pages as movable when offlining
8639 * to give drivers a chance to decrement their reference count
8640 * in MEM_GOING_OFFLINE in order to indicate that these pages
8641 * can be offlined as there are no direct references anymore.
8642 * For actually unmovable PageOffline() where the driver does
8643 * not support this, we will fail later when trying to actually
8644 * move these pages that still have a reference count > 0.
8645 * (false negatives in this function only)
8646 */
8647 if ((flags & MEMORY_OFFLINE) && PageOffline(page))
8648 continue;
8649
fe4c86c9 8650 if (__PageMovable(page) || PageLRU(page))
0efadf48
YX
8651 continue;
8652
49ac8255 8653 /*
6b4f7799
JW
 8654 * If there are RECLAIMABLE pages, we need to check
 8655 * them. But for now, memory offline itself does not call
 8656 * shrink_node_slabs(), and this still needs to be fixed.
49ac8255 8657 */
3d680bdf 8658 return page;
49ac8255 8659 }
4a55c047 8660 return NULL;
49ac8255
KH
8661}
8662
8df995f6 8663#ifdef CONFIG_CONTIG_ALLOC
041d3a8c
MN
8664static unsigned long pfn_max_align_down(unsigned long pfn)
8665{
8666 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
8667 pageblock_nr_pages) - 1);
8668}
8669
8670static unsigned long pfn_max_align_up(unsigned long pfn)
8671{
8672 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
8673 pageblock_nr_pages));
8674}
8675
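/*
 * Illustrative sketch (userspace, not built here) of the alignment helpers
 * above: the isolation range handed to start_isolate_page_range() is widened
 * to the larger of MAX_ORDER_NR_PAGES and pageblock_nr_pages.  The constants
 * below (1024 and 512, as on a typical x86-64 configuration) and the sample
 * PFNs are assumptions for the example.
 */
#if 0	/* example only, not built */
#include <stdio.h>

int main(void)
{
	unsigned long align = 1024;	/* max(MAX_ORDER_NR_PAGES, pageblock_nr_pages) */
	unsigned long start = 1000000, end = 1000300;
	unsigned long down = start & ~(align - 1);		/* 999424  */
	unsigned long up   = (end + align - 1) & ~(align - 1);	/* 1000448 */

	printf("isolate [%lu, %lu) to satisfy [%lu, %lu)\n", down, up, start, end);
	return 0;
}
#endif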
a1394bdd
MK
8676#if defined(CONFIG_DYNAMIC_DEBUG) || \
8677 (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
8678/* Usage: See admin-guide/dynamic-debug-howto.rst */
8679static void alloc_contig_dump_pages(struct list_head *page_list)
8680{
8681 DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure");
8682
8683 if (DYNAMIC_DEBUG_BRANCH(descriptor)) {
8684 struct page *page;
8685
8686 dump_stack();
8687 list_for_each_entry(page, page_list, lru)
8688 dump_page(page, "migration failure");
8689 }
8690}
8691#else
8692static inline void alloc_contig_dump_pages(struct list_head *page_list)
8693{
8694}
8695#endif
8696
041d3a8c 8697/* [start, end) must belong to a single zone. */
bb13ffeb
MG
8698static int __alloc_contig_migrate_range(struct compact_control *cc,
8699 unsigned long start, unsigned long end)
041d3a8c
MN
8700{
8701 /* This function is based on compact_zone() from compaction.c. */
730ec8c0 8702 unsigned int nr_reclaimed;
041d3a8c
MN
8703 unsigned long pfn = start;
8704 unsigned int tries = 0;
8705 int ret = 0;
8b94e0b8
JK
8706 struct migration_target_control mtc = {
8707 .nid = zone_to_nid(cc->zone),
8708 .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
8709 };
041d3a8c 8710
361a2a22 8711 lru_cache_disable();
041d3a8c 8712
bb13ffeb 8713 while (pfn < end || !list_empty(&cc->migratepages)) {
041d3a8c
MN
8714 if (fatal_signal_pending(current)) {
8715 ret = -EINTR;
8716 break;
8717 }
8718
bb13ffeb
MG
8719 if (list_empty(&cc->migratepages)) {
8720 cc->nr_migratepages = 0;
c2ad7a1f
OS
8721 ret = isolate_migratepages_range(cc, pfn, end);
8722 if (ret && ret != -EAGAIN)
041d3a8c 8723 break;
c2ad7a1f 8724 pfn = cc->migrate_pfn;
041d3a8c
MN
8725 tries = 0;
8726 } else if (++tries == 5) {
c8e28b47 8727 ret = -EBUSY;
041d3a8c
MN
8728 break;
8729 }
8730
beb51eaa
MK
8731 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
8732 &cc->migratepages);
8733 cc->nr_migratepages -= nr_reclaimed;
02c6de8d 8734
8b94e0b8
JK
8735 ret = migrate_pages(&cc->migratepages, alloc_migration_target,
8736 NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE);
c8e28b47
OS
8737
8738 /*
8739 * On -ENOMEM, migrate_pages() bails out right away. It is pointless
 8740 * to retry on this error, so do the same here.
8741 */
8742 if (ret == -ENOMEM)
8743 break;
041d3a8c 8744 }
d479960e 8745
361a2a22 8746 lru_cache_enable();
2a6f5124 8747 if (ret < 0) {
a1394bdd 8748 alloc_contig_dump_pages(&cc->migratepages);
2a6f5124
SP
8749 putback_movable_pages(&cc->migratepages);
8750 return ret;
8751 }
8752 return 0;
041d3a8c
MN
8753}
8754
8755/**
8756 * alloc_contig_range() -- tries to allocate given range of pages
8757 * @start: start PFN to allocate
8758 * @end: one-past-the-last PFN to allocate
f0953a1b 8759 * @migratetype: migratetype of the underlying pageblocks (either
0815f3d8
MN
8760 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
8761 * in range must have the same migratetype and it must
8762 * be either of the two.
ca96b625 8763 * @gfp_mask: GFP mask to use during compaction
041d3a8c
MN
8764 *
8765 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
2c7452a0 8766 * aligned. The PFN range must belong to a single zone.
041d3a8c 8767 *
2c7452a0
MK
8768 * The first thing this routine does is attempt to MIGRATE_ISOLATE all
8769 * pageblocks in the range. Once isolated, the pageblocks should not
8770 * be modified by others.
041d3a8c 8771 *
a862f68a 8772 * Return: zero on success or negative error code. On success all
041d3a8c
MN
8773 * pages which PFN is in [start, end) are allocated for the caller and
8774 * need to be freed with free_contig_range().
8775 */
0815f3d8 8776int alloc_contig_range(unsigned long start, unsigned long end,
ca96b625 8777 unsigned migratetype, gfp_t gfp_mask)
041d3a8c 8778{
041d3a8c 8779 unsigned long outer_start, outer_end;
d00181b9
KS
8780 unsigned int order;
8781 int ret = 0;
041d3a8c 8782
bb13ffeb
MG
8783 struct compact_control cc = {
8784 .nr_migratepages = 0,
8785 .order = -1,
8786 .zone = page_zone(pfn_to_page(start)),
e0b9daeb 8787 .mode = MIGRATE_SYNC,
bb13ffeb 8788 .ignore_skip_hint = true,
2583d671 8789 .no_set_skip_hint = true,
7dea19f9 8790 .gfp_mask = current_gfp_context(gfp_mask),
b06eda09 8791 .alloc_contig = true,
bb13ffeb
MG
8792 };
8793 INIT_LIST_HEAD(&cc.migratepages);
8794
041d3a8c
MN
8795 /*
8796 * What we do here is we mark all pageblocks in range as
8797 * MIGRATE_ISOLATE. Because pageblock and max order pages may
 8798 * have different sizes, and due to the way the page allocator
 8799 * works, we align the range to the bigger of the two so
 8800 * that the page allocator won't try to merge buddies from
8801 * different pageblocks and change MIGRATE_ISOLATE to some
8802 * other migration type.
8803 *
8804 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
8805 * migrate the pages from an unaligned range (ie. pages that
8806 * we are interested in). This will put all the pages in
8807 * range back to page allocator as MIGRATE_ISOLATE.
8808 *
8809 * When this is done, we take the pages in range from page
8810 * allocator removing them from the buddy system. This way
8811 * page allocator will never consider using them.
8812 *
8813 * This lets us mark the pageblocks back as
8814 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
8815 * aligned range but not in the unaligned, original range are
8816 * put back to page allocator so that buddy can use them.
8817 */
8818
8819 ret = start_isolate_page_range(pfn_max_align_down(start),
d381c547 8820 pfn_max_align_up(end), migratetype, 0);
3fa0c7c7 8821 if (ret)
86a595f9 8822 return ret;
041d3a8c 8823
7612921f
VB
8824 drain_all_pages(cc.zone);
8825
8ef5849f
JK
8826 /*
8827 * In case of -EBUSY, we'd like to know which page causes problem.
63cd4489
MK
8828 * So, just fall through. test_pages_isolated() has a tracepoint
8829 * which will report the busy page.
8830 *
8831 * It is possible that busy pages could become available before
8832 * the call to test_pages_isolated, and the range will actually be
8833 * allocated. So, if we fall through be sure to clear ret so that
8834 * -EBUSY is not accidentally used or returned to caller.
8ef5849f 8835 */
bb13ffeb 8836 ret = __alloc_contig_migrate_range(&cc, start, end);
8ef5849f 8837 if (ret && ret != -EBUSY)
041d3a8c 8838 goto done;
68d68ff6 8839 ret = 0;
041d3a8c
MN
8840
8841 /*
8842 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
8843 * aligned blocks that are marked as MIGRATE_ISOLATE. What's
8844 * more, all pages in [start, end) are free in page allocator.
8845 * What we are going to do is to allocate all pages from
8846 * [start, end) (that is remove them from page allocator).
8847 *
8848 * The only problem is that pages at the beginning and at the
8849 * end of interesting range may be not aligned with pages that
8850 * page allocator holds, ie. they can be part of higher order
8851 * pages. Because of this, we reserve the bigger range and
8852 * once this is done free the pages we are not interested in.
8853 *
8854 * We don't have to hold zone->lock here because the pages are
8855 * isolated thus they won't get removed from buddy.
8856 */
8857
041d3a8c
MN
8858 order = 0;
8859 outer_start = start;
8860 while (!PageBuddy(pfn_to_page(outer_start))) {
8861 if (++order >= MAX_ORDER) {
8ef5849f
JK
8862 outer_start = start;
8863 break;
041d3a8c
MN
8864 }
8865 outer_start &= ~0UL << order;
8866 }
8867
8ef5849f 8868 if (outer_start != start) {
ab130f91 8869 order = buddy_order(pfn_to_page(outer_start));
8ef5849f
JK
8870
8871 /*
8872 * outer_start page could be small order buddy page and
8873 * it doesn't include start page. Adjust outer_start
8874 * in this case to report failed page properly
8875 * on tracepoint in test_pages_isolated()
8876 */
8877 if (outer_start + (1UL << order) <= start)
8878 outer_start = start;
8879 }
8880
041d3a8c 8881 /* Make sure the range is really isolated. */
756d25be 8882 if (test_pages_isolated(outer_start, end, 0)) {
041d3a8c
MN
8883 ret = -EBUSY;
8884 goto done;
8885 }
8886
49f223a9 8887 /* Grab isolated pages from freelists. */
bb13ffeb 8888 outer_end = isolate_freepages_range(&cc, outer_start, end);
041d3a8c
MN
8889 if (!outer_end) {
8890 ret = -EBUSY;
8891 goto done;
8892 }
8893
8894 /* Free head and tail (if any) */
8895 if (start != outer_start)
8896 free_contig_range(outer_start, start - outer_start);
8897 if (end != outer_end)
8898 free_contig_range(end, outer_end - end);
8899
8900done:
8901 undo_isolate_page_range(pfn_max_align_down(start),
0815f3d8 8902 pfn_max_align_up(end), migratetype);
041d3a8c
MN
8903 return ret;
8904}
255f5985 8905EXPORT_SYMBOL(alloc_contig_range);
5e27a2df
AK
8906
8907static int __alloc_contig_pages(unsigned long start_pfn,
8908 unsigned long nr_pages, gfp_t gfp_mask)
8909{
8910 unsigned long end_pfn = start_pfn + nr_pages;
8911
8912 return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
8913 gfp_mask);
8914}
8915
8916static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
8917 unsigned long nr_pages)
8918{
8919 unsigned long i, end_pfn = start_pfn + nr_pages;
8920 struct page *page;
8921
8922 for (i = start_pfn; i < end_pfn; i++) {
8923 page = pfn_to_online_page(i);
8924 if (!page)
8925 return false;
8926
8927 if (page_zone(page) != z)
8928 return false;
8929
8930 if (PageReserved(page))
8931 return false;
5e27a2df
AK
8932 }
8933 return true;
8934}
8935
8936static bool zone_spans_last_pfn(const struct zone *zone,
8937 unsigned long start_pfn, unsigned long nr_pages)
8938{
8939 unsigned long last_pfn = start_pfn + nr_pages - 1;
8940
8941 return zone_spans_pfn(zone, last_pfn);
8942}
8943
8944/**
8945 * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
8946 * @nr_pages: Number of contiguous pages to allocate
8947 * @gfp_mask: GFP mask to limit search and used during compaction
8948 * @nid: Target node
8949 * @nodemask: Mask for other possible nodes
8950 *
8951 * This routine is a wrapper around alloc_contig_range(). It scans over zones
8952 * on an applicable zonelist to find a contiguous pfn range which can then be
8953 * tried for allocation with alloc_contig_range(). This routine is intended
8954 * for allocation requests which can not be fulfilled with the buddy allocator.
8955 *
8956 * The allocated memory is always aligned to a page boundary. If nr_pages is a
8957 * power of two then the alignment is guaranteed to be to the given nr_pages
8958 * (e.g. 1GB request would be aligned to 1GB).
8959 *
8960 * Allocated pages can be freed with free_contig_range() or by manually calling
8961 * __free_page() on each allocated page.
8962 *
8963 * Return: pointer to contiguous pages on success, or NULL if not successful.
8964 */
8965struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
8966 int nid, nodemask_t *nodemask)
8967{
8968 unsigned long ret, pfn, flags;
8969 struct zonelist *zonelist;
8970 struct zone *zone;
8971 struct zoneref *z;
8972
8973 zonelist = node_zonelist(nid, gfp_mask);
8974 for_each_zone_zonelist_nodemask(zone, z, zonelist,
8975 gfp_zone(gfp_mask), nodemask) {
8976 spin_lock_irqsave(&zone->lock, flags);
8977
8978 pfn = ALIGN(zone->zone_start_pfn, nr_pages);
8979 while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
8980 if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
8981 /*
8982 * We release the zone lock here because
8983 * alloc_contig_range() will also lock the zone
8984 * at some point. If there's an allocation
8985 * spinning on this lock, it may win the race
8986 * and cause alloc_contig_range() to fail...
8987 */
8988 spin_unlock_irqrestore(&zone->lock, flags);
8989 ret = __alloc_contig_pages(pfn, nr_pages,
8990 gfp_mask);
8991 if (!ret)
8992 return pfn_to_page(pfn);
8993 spin_lock_irqsave(&zone->lock, flags);
8994 }
8995 pfn += nr_pages;
8996 }
8997 spin_unlock_irqrestore(&zone->lock, flags);
8998 }
8999 return NULL;
9000}
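/*
 * Minimal usage sketch for the interface above (hypothetical caller, not part
 * of any subsystem): grab 1024 physically contiguous pages (4 MiB with 4K
 * pages) somewhere on the current node and give them back.  The gfp mask
 * choice is illustrative only.
 */
#if 0	/* example only, not built */
static int contig_alloc_example(void)
{
	struct page *page;

	page = alloc_contig_pages(1024, GFP_KERNEL | __GFP_MOVABLE,
				  numa_node_id(), NULL);
	if (!page)
		return -ENOMEM;

	/* ... use the physically contiguous region ... */

	free_contig_range(page_to_pfn(page), 1024);
	return 0;
}
#endif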
4eb0716e 9001#endif /* CONFIG_CONTIG_ALLOC */
041d3a8c 9002
78fa5150 9003void free_contig_range(unsigned long pfn, unsigned long nr_pages)
041d3a8c 9004{
78fa5150 9005 unsigned long count = 0;
bcc2b02f
MS
9006
9007 for (; nr_pages--; pfn++) {
9008 struct page *page = pfn_to_page(pfn);
9009
9010 count += page_count(page) != 1;
9011 __free_page(page);
9012 }
78fa5150 9013 WARN(count != 0, "%lu pages are still in use!\n", count);
041d3a8c 9014}
255f5985 9015EXPORT_SYMBOL(free_contig_range);
041d3a8c 9016
0a647f38
CS
9017/*
9018 * The zone indicated has a new number of managed_pages; batch sizes and percpu
f0953a1b 9019 * page high values need to be recalculated.
0a647f38 9020 */
4ed7e022
JL
9021void __meminit zone_pcp_update(struct zone *zone)
9022{
c8e251fa 9023 mutex_lock(&pcp_batch_high_lock);
0a8b4f1d 9024 zone_set_pageset_high_and_batch(zone);
c8e251fa 9025 mutex_unlock(&pcp_batch_high_lock);
4ed7e022 9026}
4ed7e022 9027
ec6e8c7e
VB
9028/*
9029 * Effectively disable pcplists for the zone by setting the high limit to 0
9030 * and draining all cpus. A concurrent page freeing on another CPU that's about
9031 * to put the page on pcplist will either finish before the drain and the page
9032 * will be drained, or observe the new high limit and skip the pcplist.
9033 *
9034 * Must be paired with a call to zone_pcp_enable().
9035 */
9036void zone_pcp_disable(struct zone *zone)
9037{
9038 mutex_lock(&pcp_batch_high_lock);
9039 __zone_set_pageset_high_and_batch(zone, 0, 1);
9040 __drain_all_pages(zone, true);
9041}
9042
9043void zone_pcp_enable(struct zone *zone)
9044{
9045 __zone_set_pageset_high_and_batch(zone, zone->pageset_high, zone->pageset_batch);
9046 mutex_unlock(&pcp_batch_high_lock);
9047}
9048
340175b7
JL
9049void zone_pcp_reset(struct zone *zone)
9050{
5a883813
MK
9051 int cpu;
9052 struct per_cpu_pageset *pset;
340175b7 9053
340175b7 9054 if (zone->pageset != &boot_pageset) {
5a883813
MK
9055 for_each_online_cpu(cpu) {
9056 pset = per_cpu_ptr(zone->pageset, cpu);
9057 drain_zonestat(zone, pset);
9058 }
340175b7
JL
9059 free_percpu(zone->pageset);
9060 zone->pageset = &boot_pageset;
9061 }
340175b7
JL
9062}
9063
6dcd73d7 9064#ifdef CONFIG_MEMORY_HOTREMOVE
0c0e6195 9065/*
257bea71
DH
 9066 * The range must lie within a single zone, must not contain holes, must span
 9067 * full sections, and all pages in it must be isolated before calling this function.
0c0e6195 9068 */
257bea71 9069void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
0c0e6195 9070{
257bea71 9071 unsigned long pfn = start_pfn;
0c0e6195
KH
9072 struct page *page;
9073 struct zone *zone;
0ee5f4f3 9074 unsigned int order;
0c0e6195 9075 unsigned long flags;
5557c766 9076
2d070eab 9077 offline_mem_sections(pfn, end_pfn);
0c0e6195
KH
9078 zone = page_zone(pfn_to_page(pfn));
9079 spin_lock_irqsave(&zone->lock, flags);
0c0e6195 9080 while (pfn < end_pfn) {
0c0e6195 9081 page = pfn_to_page(pfn);
b023f468
WC
9082 /*
9083 * The HWPoisoned page may be not in buddy system, and
9084 * page_count() is not 0.
9085 */
9086 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
9087 pfn++;
b023f468
WC
9088 continue;
9089 }
aa218795
DH
9090 /*
9091 * At this point all remaining PageOffline() pages have a
9092 * reference count of 0 and can simply be skipped.
9093 */
9094 if (PageOffline(page)) {
9095 BUG_ON(page_count(page));
9096 BUG_ON(PageBuddy(page));
9097 pfn++;
aa218795
DH
9098 continue;
9099 }
b023f468 9100
0c0e6195
KH
9101 BUG_ON(page_count(page));
9102 BUG_ON(!PageBuddy(page));
ab130f91 9103 order = buddy_order(page);
6ab01363 9104 del_page_from_free_list(page, zone, order);
0c0e6195
KH
9105 pfn += (1 << order);
9106 }
9107 spin_unlock_irqrestore(&zone->lock, flags);
9108}
9109#endif
8d22ba1b 9110
8d22ba1b
WF
9111bool is_free_buddy_page(struct page *page)
9112{
9113 struct zone *zone = page_zone(page);
9114 unsigned long pfn = page_to_pfn(page);
9115 unsigned long flags;
7aeb09f9 9116 unsigned int order;
8d22ba1b
WF
9117
9118 spin_lock_irqsave(&zone->lock, flags);
9119 for (order = 0; order < MAX_ORDER; order++) {
9120 struct page *page_head = page - (pfn & ((1 << order) - 1));
9121
ab130f91 9122 if (PageBuddy(page_head) && buddy_order(page_head) >= order)
8d22ba1b
WF
9123 break;
9124 }
9125 spin_unlock_irqrestore(&zone->lock, flags);
9126
9127 return order < MAX_ORDER;
9128}
d4ae9916
NH
9129
9130#ifdef CONFIG_MEMORY_FAILURE
9131/*
06be6ff3
OS
 9132 * Break down a higher-order page into sub-pages, and keep our target out of
 9133 * the buddy allocator.
d4ae9916 9134 */
06be6ff3
OS
9135static void break_down_buddy_pages(struct zone *zone, struct page *page,
9136 struct page *target, int low, int high,
9137 int migratetype)
9138{
9139 unsigned long size = 1 << high;
9140 struct page *current_buddy, *next_page;
9141
9142 while (high > low) {
9143 high--;
9144 size >>= 1;
9145
9146 if (target >= &page[size]) {
9147 next_page = page + size;
9148 current_buddy = page;
9149 } else {
9150 next_page = page;
9151 current_buddy = page + size;
9152 }
9153
9154 if (set_page_guard(zone, current_buddy, high, migratetype))
9155 continue;
9156
9157 if (current_buddy != target) {
9158 add_to_free_list(current_buddy, zone, high, migratetype);
ab130f91 9159 set_buddy_order(current_buddy, high);
06be6ff3
OS
9160 page = next_page;
9161 }
9162 }
9163}
9164
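/*
 * Self-contained simulation (userspace, not built here) of
 * break_down_buddy_pages() above: split an order-3 (8 page) buddy block so
 * that only the target page stays off the free lists.  Offsets are relative
 * to the start of the block; the order-3 block and target offset 5 are
 * example values.
 */
#if 0	/* example only, not built */
#include <stdio.h>

int main(void)
{
	unsigned long target = 5;		/* page offset within the block */
	int low = 0, high = 3;
	unsigned long size = 1UL << high;
	unsigned long page = 0, buddy, next;

	while (high > low) {
		high--;
		size >>= 1;

		if (target >= page + size) {	/* target is in the upper half */
			next = page + size;
			buddy = page;
		} else {			/* target is in the lower half */
			next = page;
			buddy = page + size;
		}
		if (buddy != target) {
			/* prints [0,4), [6,8), [4,5) for this example */
			printf("free order-%d chunk at [%lu, %lu)\n",
			       high, buddy, buddy + size);
			page = next;
		}
	}
	printf("page %lu kept off the free lists\n", target);
	return 0;
}
#endif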
9165/*
9166 * Take a page that will be marked as poisoned off the buddy allocator.
9167 */
9168bool take_page_off_buddy(struct page *page)
d4ae9916
NH
9169{
9170 struct zone *zone = page_zone(page);
9171 unsigned long pfn = page_to_pfn(page);
9172 unsigned long flags;
9173 unsigned int order;
06be6ff3 9174 bool ret = false;
d4ae9916
NH
9175
9176 spin_lock_irqsave(&zone->lock, flags);
9177 for (order = 0; order < MAX_ORDER; order++) {
9178 struct page *page_head = page - (pfn & ((1 << order) - 1));
ab130f91 9179 int page_order = buddy_order(page_head);
d4ae9916 9180
ab130f91 9181 if (PageBuddy(page_head) && page_order >= order) {
06be6ff3
OS
9182 unsigned long pfn_head = page_to_pfn(page_head);
9183 int migratetype = get_pfnblock_migratetype(page_head,
9184 pfn_head);
9185
ab130f91 9186 del_page_from_free_list(page_head, zone, page_order);
06be6ff3 9187 break_down_buddy_pages(zone, page_head, page, 0,
ab130f91 9188 page_order, migratetype);
bac9c6fa
DH
9189 if (!is_migrate_isolate(migratetype))
9190 __mod_zone_freepage_state(zone, -1, migratetype);
06be6ff3 9191 ret = true;
d4ae9916
NH
9192 break;
9193 }
06be6ff3
OS
9194 if (page_count(page_head) > 0)
9195 break;
d4ae9916
NH
9196 }
9197 spin_unlock_irqrestore(&zone->lock, flags);
06be6ff3 9198 return ret;
d4ae9916
NH
9199}
9200#endif