1 // SPDX-License-Identifier: GPL-2.0-only
3 * linux/mm/page_alloc.c
5 * Manages the free list, the system allocates free pages here.
6 * Note that kmalloc() lives in slab.c
8 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
9 * Swap reorganised 29.12.95, Stephen Tweedie
10 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
11 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
12 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
13 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
14 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
15 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
18 #include <linux/stddef.h>
20 #include <linux/highmem.h>
21 #include <linux/interrupt.h>
22 #include <linux/jiffies.h>
23 #include <linux/compiler.h>
24 #include <linux/kernel.h>
25 #include <linux/kasan.h>
26 #include <linux/kmsan.h>
27 #include <linux/module.h>
28 #include <linux/suspend.h>
29 #include <linux/ratelimit.h>
30 #include <linux/oom.h>
31 #include <linux/topology.h>
32 #include <linux/sysctl.h>
33 #include <linux/cpu.h>
34 #include <linux/cpuset.h>
35 #include <linux/pagevec.h>
36 #include <linux/memory_hotplug.h>
37 #include <linux/nodemask.h>
38 #include <linux/vmstat.h>
39 #include <linux/fault-inject.h>
40 #include <linux/compaction.h>
41 #include <trace/events/kmem.h>
42 #include <trace/events/oom.h>
43 #include <linux/prefetch.h>
44 #include <linux/mm_inline.h>
45 #include <linux/mmu_notifier.h>
46 #include <linux/migrate.h>
47 #include <linux/sched/mm.h>
48 #include <linux/page_owner.h>
49 #include <linux/page_table_check.h>
50 #include <linux/memcontrol.h>
51 #include <linux/ftrace.h>
52 #include <linux/lockdep.h>
53 #include <linux/psi.h>
54 #include <linux/khugepaged.h>
55 #include <linux/delayacct.h>
56 #include <linux/cacheinfo.h>
57 #include <linux/pgalloc_tag.h>
58 #include <asm/div64.h>
61 #include "page_reporting.h"
63 /* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
64 typedef int __bitwise fpi_t
;
66 /* No special request */
67 #define FPI_NONE ((__force fpi_t)0)
70 * Skip free page reporting notification for the (possibly merged) page.
71 * This does not hinder free page reporting from grabbing the page,
72 * reporting it and marking it "reported" - it only skips notifying
73 * the free page reporting infrastructure about a newly freed page. For
74 * example, used when temporarily pulling a page from a freelist and
75 * putting it back unmodified.
77 #define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0))
80 * Place the (possibly merged) page to the tail of the freelist. Will ignore
81 * page shuffling (relevant code - e.g., memory onlining - is expected to
82 * shuffle the whole zone).
84 * Note: No code should rely on this flag for correctness - it's purely
85 * to allow for optimizations when handing back either fresh pages
86 * (memory onlining) or untouched pages (page isolation, free page
89 #define FPI_TO_TAIL ((__force fpi_t)BIT(1))
91 /* Free the page without taking locks. Rely on trylock only. */
92 #define FPI_TRYLOCK ((__force fpi_t)BIT(2))
94 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
95 static DEFINE_MUTEX(pcp_batch_high_lock
);
96 #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
98 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
100 * On SMP, spin_trylock is sufficient protection.
101 * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
103 #define pcp_trylock_prepare(flags) do { } while (0)
104 #define pcp_trylock_finish(flag) do { } while (0)
107 /* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
108 #define pcp_trylock_prepare(flags) local_irq_save(flags)
109 #define pcp_trylock_finish(flags) local_irq_restore(flags)
113 * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid
114 * a migration causing the wrong PCP to be locked and remote memory being
115 * potentially allocated, pin the task to the CPU for the lookup+lock.
116 * preempt_disable is used on !RT because it is faster than migrate_disable.
117 * migrate_disable is used on RT because otherwise RT spinlock usage is
118 * interfered with and a high priority task cannot preempt the allocator.
120 #ifndef CONFIG_PREEMPT_RT
121 #define pcpu_task_pin() preempt_disable()
122 #define pcpu_task_unpin() preempt_enable()
124 #define pcpu_task_pin() migrate_disable()
125 #define pcpu_task_unpin() migrate_enable()
129 * Generic helper to lookup and a per-cpu variable with an embedded spinlock.
130 * Return value should be used with equivalent unlock helper.
132 #define pcpu_spin_lock(type, member, ptr) \
136 _ret = this_cpu_ptr(ptr); \
137 spin_lock(&_ret->member); \
141 #define pcpu_spin_trylock(type, member, ptr) \
145 _ret = this_cpu_ptr(ptr); \
146 if (!spin_trylock(&_ret->member)) { \
153 #define pcpu_spin_unlock(member, ptr) \
155 spin_unlock(&ptr->member); \
159 /* struct per_cpu_pages specific helpers. */
160 #define pcp_spin_lock(ptr) \
161 pcpu_spin_lock(struct per_cpu_pages, lock, ptr)
163 #define pcp_spin_trylock(ptr) \
164 pcpu_spin_trylock(struct per_cpu_pages, lock, ptr)
166 #define pcp_spin_unlock(ptr) \
167 pcpu_spin_unlock(lock, ptr)
169 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
170 DEFINE_PER_CPU(int, numa_node
);
171 EXPORT_PER_CPU_SYMBOL(numa_node
);
174 DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key
);
176 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
178 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
179 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
180 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
181 * defined in <linux/topology.h>.
183 DEFINE_PER_CPU(int, _numa_mem_
); /* Kernel "local memory" node */
184 EXPORT_PER_CPU_SYMBOL(_numa_mem_
);
187 static DEFINE_MUTEX(pcpu_drain_mutex
);
189 #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
190 volatile unsigned long latent_entropy __latent_entropy
;
191 EXPORT_SYMBOL(latent_entropy
);
195 * Array of node states.
197 nodemask_t node_states
[NR_NODE_STATES
] __read_mostly
= {
198 [N_POSSIBLE
] = NODE_MASK_ALL
,
199 [N_ONLINE
] = { { [0] = 1UL } },
201 [N_NORMAL_MEMORY
] = { { [0] = 1UL } },
202 #ifdef CONFIG_HIGHMEM
203 [N_HIGH_MEMORY
] = { { [0] = 1UL } },
205 [N_MEMORY
] = { { [0] = 1UL } },
206 [N_CPU
] = { { [0] = 1UL } },
209 EXPORT_SYMBOL(node_states
);
211 gfp_t gfp_allowed_mask __read_mostly
= GFP_BOOT_MASK
;
213 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
214 unsigned int pageblock_order __read_mostly
;
217 static void __free_pages_ok(struct page
*page
, unsigned int order
,
221 * results with 256, 32 in the lowmem_reserve sysctl:
222 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
223 * 1G machine -> (16M dma, 784M normal, 224M high)
224 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
225 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
226 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
228 * TBD: should special case ZONE_DMA32 machines here - in those we normally
229 * don't need any ZONE_NORMAL reservation
231 static int sysctl_lowmem_reserve_ratio
[MAX_NR_ZONES
] = {
232 #ifdef CONFIG_ZONE_DMA
235 #ifdef CONFIG_ZONE_DMA32
239 #ifdef CONFIG_HIGHMEM
245 char * const zone_names
[MAX_NR_ZONES
] = {
246 #ifdef CONFIG_ZONE_DMA
249 #ifdef CONFIG_ZONE_DMA32
253 #ifdef CONFIG_HIGHMEM
257 #ifdef CONFIG_ZONE_DEVICE
262 const char * const migratetype_names
[MIGRATE_TYPES
] = {
270 #ifdef CONFIG_MEMORY_ISOLATION
275 int min_free_kbytes
= 1024;
276 int user_min_free_kbytes
= -1;
277 static int watermark_boost_factor __read_mostly
= 15000;
278 static int watermark_scale_factor
= 10;
281 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
283 EXPORT_SYMBOL(movable_zone
);
286 unsigned int nr_node_ids __read_mostly
= MAX_NUMNODES
;
287 unsigned int nr_online_nodes __read_mostly
= 1;
288 EXPORT_SYMBOL(nr_node_ids
);
289 EXPORT_SYMBOL(nr_online_nodes
);
292 static bool page_contains_unaccepted(struct page
*page
, unsigned int order
);
293 static bool cond_accept_memory(struct zone
*zone
, unsigned int order
,
295 static bool __free_unaccepted(struct page
*page
);
297 int page_group_by_mobility_disabled __read_mostly
;
299 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
301 * During boot we initialize deferred pages on-demand, as needed, but once
302 * page_alloc_init_late() has finished, the deferred pages are all initialized,
303 * and we can permanently disable that path.
305 DEFINE_STATIC_KEY_TRUE(deferred_pages
);
307 static inline bool deferred_pages_enabled(void)
309 return static_branch_unlikely(&deferred_pages
);
313 * deferred_grow_zone() is __init, but it is called from
314 * get_page_from_freelist() during early boot until deferred_pages permanently
315 * disables this call. This is why we have refdata wrapper to avoid warning,
316 * and to ensure that the function body gets unloaded.
319 _deferred_grow_zone(struct zone
*zone
, unsigned int order
)
321 return deferred_grow_zone(zone
, order
);
324 static inline bool deferred_pages_enabled(void)
329 static inline bool _deferred_grow_zone(struct zone
*zone
, unsigned int order
)
333 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
335 /* Return a pointer to the bitmap storing bits affecting a block of pages */
336 static inline unsigned long *get_pageblock_bitmap(const struct page
*page
,
339 #ifdef CONFIG_SPARSEMEM
340 return section_to_usemap(__pfn_to_section(pfn
));
342 return page_zone(page
)->pageblock_flags
;
343 #endif /* CONFIG_SPARSEMEM */
346 static inline int pfn_to_bitidx(const struct page
*page
, unsigned long pfn
)
348 #ifdef CONFIG_SPARSEMEM
349 pfn
&= (PAGES_PER_SECTION
-1);
351 pfn
= pfn
- pageblock_start_pfn(page_zone(page
)->zone_start_pfn
);
352 #endif /* CONFIG_SPARSEMEM */
353 return (pfn
>> pageblock_order
) * NR_PAGEBLOCK_BITS
;
357 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
358 * @page: The page within the block of interest
359 * @pfn: The target page frame number
360 * @mask: mask of bits that the caller is interested in
362 * Return: pageblock_bits flags
364 unsigned long get_pfnblock_flags_mask(const struct page
*page
,
365 unsigned long pfn
, unsigned long mask
)
367 unsigned long *bitmap
;
368 unsigned long bitidx
, word_bitidx
;
371 bitmap
= get_pageblock_bitmap(page
, pfn
);
372 bitidx
= pfn_to_bitidx(page
, pfn
);
373 word_bitidx
= bitidx
/ BITS_PER_LONG
;
374 bitidx
&= (BITS_PER_LONG
-1);
376 * This races, without locks, with set_pfnblock_flags_mask(). Ensure
377 * a consistent read of the memory array, so that results, even though
378 * racy, are not corrupted.
380 word
= READ_ONCE(bitmap
[word_bitidx
]);
381 return (word
>> bitidx
) & mask
;
384 static __always_inline
int get_pfnblock_migratetype(const struct page
*page
,
387 return get_pfnblock_flags_mask(page
, pfn
, MIGRATETYPE_MASK
);
391 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
392 * @page: The page within the block of interest
393 * @flags: The flags to set
394 * @pfn: The target page frame number
395 * @mask: mask of bits that the caller is interested in
397 void set_pfnblock_flags_mask(struct page
*page
, unsigned long flags
,
401 unsigned long *bitmap
;
402 unsigned long bitidx
, word_bitidx
;
405 BUILD_BUG_ON(NR_PAGEBLOCK_BITS
!= 4);
406 BUILD_BUG_ON(MIGRATE_TYPES
> (1 << PB_migratetype_bits
));
408 bitmap
= get_pageblock_bitmap(page
, pfn
);
409 bitidx
= pfn_to_bitidx(page
, pfn
);
410 word_bitidx
= bitidx
/ BITS_PER_LONG
;
411 bitidx
&= (BITS_PER_LONG
-1);
413 VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page
), pfn
), page
);
418 word
= READ_ONCE(bitmap
[word_bitidx
]);
420 } while (!try_cmpxchg(&bitmap
[word_bitidx
], &word
, (word
& ~mask
) | flags
));
423 void set_pageblock_migratetype(struct page
*page
, int migratetype
)
425 if (unlikely(page_group_by_mobility_disabled
&&
426 migratetype
< MIGRATE_PCPTYPES
))
427 migratetype
= MIGRATE_UNMOVABLE
;
429 set_pfnblock_flags_mask(page
, (unsigned long)migratetype
,
430 page_to_pfn(page
), MIGRATETYPE_MASK
);
433 #ifdef CONFIG_DEBUG_VM
434 static int page_outside_zone_boundaries(struct zone
*zone
, struct page
*page
)
438 unsigned long pfn
= page_to_pfn(page
);
439 unsigned long sp
, start_pfn
;
442 seq
= zone_span_seqbegin(zone
);
443 start_pfn
= zone
->zone_start_pfn
;
444 sp
= zone
->spanned_pages
;
445 ret
= !zone_spans_pfn(zone
, pfn
);
446 } while (zone_span_seqretry(zone
, seq
));
449 pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
450 pfn
, zone_to_nid(zone
), zone
->name
,
451 start_pfn
, start_pfn
+ sp
);
457 * Temporary debugging check for pages not lying within a given zone.
459 static bool __maybe_unused
bad_range(struct zone
*zone
, struct page
*page
)
461 if (page_outside_zone_boundaries(zone
, page
))
463 if (zone
!= page_zone(page
))
469 static inline bool __maybe_unused
bad_range(struct zone
*zone
, struct page
*page
)
475 static void bad_page(struct page
*page
, const char *reason
)
477 static unsigned long resume
;
478 static unsigned long nr_shown
;
479 static unsigned long nr_unshown
;
482 * Allow a burst of 60 reports, then keep quiet for that minute;
483 * or allow a steady drip of one report per second.
485 if (nr_shown
== 60) {
486 if (time_before(jiffies
, resume
)) {
492 "BUG: Bad page state: %lu messages suppressed\n",
499 resume
= jiffies
+ 60 * HZ
;
501 pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
502 current
->comm
, page_to_pfn(page
));
503 dump_page(page
, reason
);
508 /* Leave bad fields for debug, except PageBuddy could make trouble */
510 __ClearPageBuddy(page
);
511 add_taint(TAINT_BAD_PAGE
, LOCKDEP_NOW_UNRELIABLE
);
514 static inline unsigned int order_to_pindex(int migratetype
, int order
)
517 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
519 if (order
> PAGE_ALLOC_COSTLY_ORDER
) {
520 VM_BUG_ON(order
!= HPAGE_PMD_ORDER
);
522 movable
= migratetype
== MIGRATE_MOVABLE
;
524 return NR_LOWORDER_PCP_LISTS
+ movable
;
527 VM_BUG_ON(order
> PAGE_ALLOC_COSTLY_ORDER
);
530 return (MIGRATE_PCPTYPES
* order
) + migratetype
;
533 static inline int pindex_to_order(unsigned int pindex
)
535 int order
= pindex
/ MIGRATE_PCPTYPES
;
537 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
538 if (pindex
>= NR_LOWORDER_PCP_LISTS
)
539 order
= HPAGE_PMD_ORDER
;
541 VM_BUG_ON(order
> PAGE_ALLOC_COSTLY_ORDER
);
547 static inline bool pcp_allowed_order(unsigned int order
)
549 if (order
<= PAGE_ALLOC_COSTLY_ORDER
)
551 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
552 if (order
== HPAGE_PMD_ORDER
)
559 * Higher-order pages are called "compound pages". They are structured thusly:
561 * The first PAGE_SIZE page is called the "head page" and have PG_head set.
563 * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
564 * in bit 0 of page->compound_head. The rest of bits is pointer to head page.
566 * The first tail page's ->compound_order holds the order of allocation.
567 * This usage means that zero-order pages may not be compound.
570 void prep_compound_page(struct page
*page
, unsigned int order
)
573 int nr_pages
= 1 << order
;
576 for (i
= 1; i
< nr_pages
; i
++)
577 prep_compound_tail(page
, i
);
579 prep_compound_head(page
, order
);
582 static inline void set_buddy_order(struct page
*page
, unsigned int order
)
584 set_page_private(page
, order
);
585 __SetPageBuddy(page
);
588 #ifdef CONFIG_COMPACTION
589 static inline struct capture_control
*task_capc(struct zone
*zone
)
591 struct capture_control
*capc
= current
->capture_control
;
593 return unlikely(capc
) &&
594 !(current
->flags
& PF_KTHREAD
) &&
596 capc
->cc
->zone
== zone
? capc
: NULL
;
600 compaction_capture(struct capture_control
*capc
, struct page
*page
,
601 int order
, int migratetype
)
603 if (!capc
|| order
!= capc
->cc
->order
)
606 /* Do not accidentally pollute CMA or isolated regions*/
607 if (is_migrate_cma(migratetype
) ||
608 is_migrate_isolate(migratetype
))
612 * Do not let lower order allocations pollute a movable pageblock
613 * unless compaction is also requesting movable pages.
614 * This might let an unmovable request use a reclaimable pageblock
615 * and vice-versa but no more than normal fallback logic which can
616 * have trouble finding a high-order free page.
618 if (order
< pageblock_order
&& migratetype
== MIGRATE_MOVABLE
&&
619 capc
->cc
->migratetype
!= MIGRATE_MOVABLE
)
622 if (migratetype
!= capc
->cc
->migratetype
)
623 trace_mm_page_alloc_extfrag(page
, capc
->cc
->order
, order
,
624 capc
->cc
->migratetype
, migratetype
);
631 static inline struct capture_control
*task_capc(struct zone
*zone
)
637 compaction_capture(struct capture_control
*capc
, struct page
*page
,
638 int order
, int migratetype
)
642 #endif /* CONFIG_COMPACTION */
644 static inline void account_freepages(struct zone
*zone
, int nr_pages
,
647 lockdep_assert_held(&zone
->lock
);
649 if (is_migrate_isolate(migratetype
))
652 __mod_zone_page_state(zone
, NR_FREE_PAGES
, nr_pages
);
654 if (is_migrate_cma(migratetype
))
655 __mod_zone_page_state(zone
, NR_FREE_CMA_PAGES
, nr_pages
);
656 else if (is_migrate_highatomic(migratetype
))
657 WRITE_ONCE(zone
->nr_free_highatomic
,
658 zone
->nr_free_highatomic
+ nr_pages
);
661 /* Used for pages not on another list */
662 static inline void __add_to_free_list(struct page
*page
, struct zone
*zone
,
663 unsigned int order
, int migratetype
,
666 struct free_area
*area
= &zone
->free_area
[order
];
667 int nr_pages
= 1 << order
;
669 VM_WARN_ONCE(get_pageblock_migratetype(page
) != migratetype
,
670 "page type is %lu, passed migratetype is %d (nr=%d)\n",
671 get_pageblock_migratetype(page
), migratetype
, nr_pages
);
674 list_add_tail(&page
->buddy_list
, &area
->free_list
[migratetype
]);
676 list_add(&page
->buddy_list
, &area
->free_list
[migratetype
]);
679 if (order
>= pageblock_order
&& !is_migrate_isolate(migratetype
))
680 __mod_zone_page_state(zone
, NR_FREE_PAGES_BLOCKS
, nr_pages
);
684 * Used for pages which are on another list. Move the pages to the tail
685 * of the list - so the moved pages won't immediately be considered for
686 * allocation again (e.g., optimization for memory onlining).
688 static inline void move_to_free_list(struct page
*page
, struct zone
*zone
,
689 unsigned int order
, int old_mt
, int new_mt
)
691 struct free_area
*area
= &zone
->free_area
[order
];
692 int nr_pages
= 1 << order
;
694 /* Free page moving can fail, so it happens before the type update */
695 VM_WARN_ONCE(get_pageblock_migratetype(page
) != old_mt
,
696 "page type is %lu, passed migratetype is %d (nr=%d)\n",
697 get_pageblock_migratetype(page
), old_mt
, nr_pages
);
699 list_move_tail(&page
->buddy_list
, &area
->free_list
[new_mt
]);
701 account_freepages(zone
, -nr_pages
, old_mt
);
702 account_freepages(zone
, nr_pages
, new_mt
);
704 if (order
>= pageblock_order
&&
705 is_migrate_isolate(old_mt
) != is_migrate_isolate(new_mt
)) {
706 if (!is_migrate_isolate(old_mt
))
707 nr_pages
= -nr_pages
;
708 __mod_zone_page_state(zone
, NR_FREE_PAGES_BLOCKS
, nr_pages
);
712 static inline void __del_page_from_free_list(struct page
*page
, struct zone
*zone
,
713 unsigned int order
, int migratetype
)
715 int nr_pages
= 1 << order
;
717 VM_WARN_ONCE(get_pageblock_migratetype(page
) != migratetype
,
718 "page type is %lu, passed migratetype is %d (nr=%d)\n",
719 get_pageblock_migratetype(page
), migratetype
, nr_pages
);
721 /* clear reported state and update reported page count */
722 if (page_reported(page
))
723 __ClearPageReported(page
);
725 list_del(&page
->buddy_list
);
726 __ClearPageBuddy(page
);
727 set_page_private(page
, 0);
728 zone
->free_area
[order
].nr_free
--;
730 if (order
>= pageblock_order
&& !is_migrate_isolate(migratetype
))
731 __mod_zone_page_state(zone
, NR_FREE_PAGES_BLOCKS
, -nr_pages
);
734 static inline void del_page_from_free_list(struct page
*page
, struct zone
*zone
,
735 unsigned int order
, int migratetype
)
737 __del_page_from_free_list(page
, zone
, order
, migratetype
);
738 account_freepages(zone
, -(1 << order
), migratetype
);
741 static inline struct page
*get_page_from_free_area(struct free_area
*area
,
744 return list_first_entry_or_null(&area
->free_list
[migratetype
],
745 struct page
, buddy_list
);
749 * If this is less than the 2nd largest possible page, check if the buddy
750 * of the next-higher order is free. If it is, it's possible
751 * that pages are being freed that will coalesce soon. In case,
752 * that is happening, add the free page to the tail of the list
753 * so it's less likely to be used soon and more likely to be merged
754 * as a 2-level higher order page
757 buddy_merge_likely(unsigned long pfn
, unsigned long buddy_pfn
,
758 struct page
*page
, unsigned int order
)
760 unsigned long higher_page_pfn
;
761 struct page
*higher_page
;
763 if (order
>= MAX_PAGE_ORDER
- 1)
766 higher_page_pfn
= buddy_pfn
& pfn
;
767 higher_page
= page
+ (higher_page_pfn
- pfn
);
769 return find_buddy_page_pfn(higher_page
, higher_page_pfn
, order
+ 1,
774 * Freeing function for a buddy system allocator.
776 * The concept of a buddy system is to maintain direct-mapped table
777 * (containing bit values) for memory blocks of various "orders".
778 * The bottom level table contains the map for the smallest allocatable
779 * units of memory (here, pages), and each level above it describes
780 * pairs of units from the levels below, hence, "buddies".
781 * At a high level, all that happens here is marking the table entry
782 * at the bottom level available, and propagating the changes upward
783 * as necessary, plus some accounting needed to play nicely with other
784 * parts of the VM system.
785 * At each level, we keep a list of pages, which are heads of continuous
786 * free pages of length of (1 << order) and marked with PageBuddy.
787 * Page's order is recorded in page_private(page) field.
788 * So when we are allocating or freeing one, we can derive the state of the
789 * other. That is, if we allocate a small block, and both were
790 * free, the remainder of the region must be split into blocks.
791 * If a block is freed, and its buddy is also free, then this
792 * triggers coalescing into a block of larger size.
797 static inline void __free_one_page(struct page
*page
,
799 struct zone
*zone
, unsigned int order
,
800 int migratetype
, fpi_t fpi_flags
)
802 struct capture_control
*capc
= task_capc(zone
);
803 unsigned long buddy_pfn
= 0;
804 unsigned long combined_pfn
;
808 VM_BUG_ON(!zone_is_initialized(zone
));
809 VM_BUG_ON_PAGE(page
->flags
& PAGE_FLAGS_CHECK_AT_PREP
, page
);
811 VM_BUG_ON(migratetype
== -1);
812 VM_BUG_ON_PAGE(pfn
& ((1 << order
) - 1), page
);
813 VM_BUG_ON_PAGE(bad_range(zone
, page
), page
);
815 account_freepages(zone
, 1 << order
, migratetype
);
817 while (order
< MAX_PAGE_ORDER
) {
818 int buddy_mt
= migratetype
;
820 if (compaction_capture(capc
, page
, order
, migratetype
)) {
821 account_freepages(zone
, -(1 << order
), migratetype
);
825 buddy
= find_buddy_page_pfn(page
, pfn
, order
, &buddy_pfn
);
829 if (unlikely(order
>= pageblock_order
)) {
831 * We want to prevent merge between freepages on pageblock
832 * without fallbacks and normal pageblock. Without this,
833 * pageblock isolation could cause incorrect freepage or CMA
834 * accounting or HIGHATOMIC accounting.
836 buddy_mt
= get_pfnblock_migratetype(buddy
, buddy_pfn
);
838 if (migratetype
!= buddy_mt
&&
839 (!migratetype_is_mergeable(migratetype
) ||
840 !migratetype_is_mergeable(buddy_mt
)))
845 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
846 * merge with it and move up one order.
848 if (page_is_guard(buddy
))
849 clear_page_guard(zone
, buddy
, order
);
851 __del_page_from_free_list(buddy
, zone
, order
, buddy_mt
);
853 if (unlikely(buddy_mt
!= migratetype
)) {
855 * Match buddy type. This ensures that an
856 * expand() down the line puts the sub-blocks
857 * on the right freelists.
859 set_pageblock_migratetype(buddy
, migratetype
);
862 combined_pfn
= buddy_pfn
& pfn
;
863 page
= page
+ (combined_pfn
- pfn
);
869 set_buddy_order(page
, order
);
871 if (fpi_flags
& FPI_TO_TAIL
)
873 else if (is_shuffle_order(order
))
874 to_tail
= shuffle_pick_tail();
876 to_tail
= buddy_merge_likely(pfn
, buddy_pfn
, page
, order
);
878 __add_to_free_list(page
, zone
, order
, migratetype
, to_tail
);
880 /* Notify page reporting subsystem of freed page */
881 if (!(fpi_flags
& FPI_SKIP_REPORT_NOTIFY
))
882 page_reporting_notify_free(order
);
886 * A bad page could be due to a number of fields. Instead of multiple branches,
887 * try and check multiple fields with one check. The caller must do a detailed
888 * check if necessary.
890 static inline bool page_expected_state(struct page
*page
,
891 unsigned long check_flags
)
893 if (unlikely(atomic_read(&page
->_mapcount
) != -1))
896 if (unlikely((unsigned long)page
->mapping
|
897 page_ref_count(page
) |
901 page_pool_page_is_pp(page
) |
902 (page
->flags
& check_flags
)))
908 static const char *page_bad_reason(struct page
*page
, unsigned long flags
)
910 const char *bad_reason
= NULL
;
912 if (unlikely(atomic_read(&page
->_mapcount
) != -1))
913 bad_reason
= "nonzero mapcount";
914 if (unlikely(page
->mapping
!= NULL
))
915 bad_reason
= "non-NULL mapping";
916 if (unlikely(page_ref_count(page
) != 0))
917 bad_reason
= "nonzero _refcount";
918 if (unlikely(page
->flags
& flags
)) {
919 if (flags
== PAGE_FLAGS_CHECK_AT_PREP
)
920 bad_reason
= "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
922 bad_reason
= "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
925 if (unlikely(page
->memcg_data
))
926 bad_reason
= "page still charged to cgroup";
928 if (unlikely(page_pool_page_is_pp(page
)))
929 bad_reason
= "page_pool leak";
933 static inline bool free_page_is_bad(struct page
*page
)
935 if (likely(page_expected_state(page
, PAGE_FLAGS_CHECK_AT_FREE
)))
938 /* Something has gone sideways, find it */
939 bad_page(page
, page_bad_reason(page
, PAGE_FLAGS_CHECK_AT_FREE
));
943 static inline bool is_check_pages_enabled(void)
945 return static_branch_unlikely(&check_pages_enabled
);
948 static int free_tail_page_prepare(struct page
*head_page
, struct page
*page
)
950 struct folio
*folio
= (struct folio
*)head_page
;
954 * We rely page->lru.next never has bit 0 set, unless the page
955 * is PageTail(). Let's make sure that's true even for poisoned ->lru.
957 BUILD_BUG_ON((unsigned long)LIST_POISON1
& 1);
959 if (!is_check_pages_enabled()) {
963 switch (page
- head_page
) {
965 /* the first tail page: these may be in place of ->mapping */
966 if (unlikely(folio_large_mapcount(folio
))) {
967 bad_page(page
, "nonzero large_mapcount");
970 if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT
) &&
971 unlikely(atomic_read(&folio
->_nr_pages_mapped
))) {
972 bad_page(page
, "nonzero nr_pages_mapped");
975 if (IS_ENABLED(CONFIG_MM_ID
)) {
976 if (unlikely(folio
->_mm_id_mapcount
[0] != -1)) {
977 bad_page(page
, "nonzero mm mapcount 0");
980 if (unlikely(folio
->_mm_id_mapcount
[1] != -1)) {
981 bad_page(page
, "nonzero mm mapcount 1");
985 if (IS_ENABLED(CONFIG_64BIT
)) {
986 if (unlikely(atomic_read(&folio
->_entire_mapcount
) + 1)) {
987 bad_page(page
, "nonzero entire_mapcount");
990 if (unlikely(atomic_read(&folio
->_pincount
))) {
991 bad_page(page
, "nonzero pincount");
997 /* the second tail page: deferred_list overlaps ->mapping */
998 if (unlikely(!list_empty(&folio
->_deferred_list
))) {
999 bad_page(page
, "on deferred list");
1002 if (!IS_ENABLED(CONFIG_64BIT
)) {
1003 if (unlikely(atomic_read(&folio
->_entire_mapcount
) + 1)) {
1004 bad_page(page
, "nonzero entire_mapcount");
1007 if (unlikely(atomic_read(&folio
->_pincount
))) {
1008 bad_page(page
, "nonzero pincount");
1014 /* the third tail page: hugetlb specifics overlap ->mappings */
1015 if (IS_ENABLED(CONFIG_HUGETLB_PAGE
))
1019 if (page
->mapping
!= TAIL_MAPPING
) {
1020 bad_page(page
, "corrupted mapping in tail page");
1025 if (unlikely(!PageTail(page
))) {
1026 bad_page(page
, "PageTail not set");
1029 if (unlikely(compound_head(page
) != head_page
)) {
1030 bad_page(page
, "compound_head not consistent");
1035 page
->mapping
= NULL
;
1036 clear_compound_head(page
);
1041 * Skip KASAN memory poisoning when either:
1043 * 1. For generic KASAN: deferred memory initialization has not yet completed.
1044 * Tag-based KASAN modes skip pages freed via deferred memory initialization
1045 * using page tags instead (see below).
1046 * 2. For tag-based KASAN modes: the page has a match-all KASAN tag, indicating
1047 * that error detection is disabled for accesses via the page address.
1049 * Pages will have match-all tags in the following circumstances:
1051 * 1. Pages are being initialized for the first time, including during deferred
1052 * memory init; see the call to page_kasan_tag_reset in __init_single_page.
1053 * 2. The allocation was not unpoisoned due to __GFP_SKIP_KASAN, with the
1054 * exception of pages unpoisoned by kasan_unpoison_vmalloc.
1055 * 3. The allocation was excluded from being checked due to sampling,
1056 * see the call to kasan_unpoison_pages.
1058 * Poisoning pages during deferred memory init will greatly lengthen the
1059 * process and cause problem in large memory systems as the deferred pages
1060 * initialization is done with interrupt disabled.
1062 * Assuming that there will be no reference to those newly initialized
1063 * pages before they are ever allocated, this should have no effect on
1064 * KASAN memory tracking as the poison will be properly inserted at page
1065 * allocation time. The only corner case is when pages are allocated by
1066 * on-demand allocation and then freed again before the deferred pages
1067 * initialization is done, but this is not likely to happen.
1069 static inline bool should_skip_kasan_poison(struct page
*page
)
1071 if (IS_ENABLED(CONFIG_KASAN_GENERIC
))
1072 return deferred_pages_enabled();
1074 return page_kasan_tag(page
) == KASAN_TAG_KERNEL
;
1077 static void kernel_init_pages(struct page
*page
, int numpages
)
1081 /* s390's use of memset() could override KASAN redzones. */
1082 kasan_disable_current();
1083 for (i
= 0; i
< numpages
; i
++)
1084 clear_highpage_kasan_tagged(page
+ i
);
1085 kasan_enable_current();
1088 #ifdef CONFIG_MEM_ALLOC_PROFILING
1090 /* Should be called only if mem_alloc_profiling_enabled() */
1091 void __clear_page_tag_ref(struct page
*page
)
1093 union pgtag_ref_handle handle
;
1094 union codetag_ref ref
;
1096 if (get_page_tag_ref(page
, &ref
, &handle
)) {
1097 set_codetag_empty(&ref
);
1098 update_page_tag_ref(handle
, &ref
);
1099 put_page_tag_ref(handle
);
1103 /* Should be called only if mem_alloc_profiling_enabled() */
1105 void __pgalloc_tag_add(struct page
*page
, struct task_struct
*task
,
1108 union pgtag_ref_handle handle
;
1109 union codetag_ref ref
;
1111 if (get_page_tag_ref(page
, &ref
, &handle
)) {
1112 alloc_tag_add(&ref
, task
->alloc_tag
, PAGE_SIZE
* nr
);
1113 update_page_tag_ref(handle
, &ref
);
1114 put_page_tag_ref(handle
);
1118 static inline void pgalloc_tag_add(struct page
*page
, struct task_struct
*task
,
1121 if (mem_alloc_profiling_enabled())
1122 __pgalloc_tag_add(page
, task
, nr
);
1125 /* Should be called only if mem_alloc_profiling_enabled() */
1127 void __pgalloc_tag_sub(struct page
*page
, unsigned int nr
)
1129 union pgtag_ref_handle handle
;
1130 union codetag_ref ref
;
1132 if (get_page_tag_ref(page
, &ref
, &handle
)) {
1133 alloc_tag_sub(&ref
, PAGE_SIZE
* nr
);
1134 update_page_tag_ref(handle
, &ref
);
1135 put_page_tag_ref(handle
);
1139 static inline void pgalloc_tag_sub(struct page
*page
, unsigned int nr
)
1141 if (mem_alloc_profiling_enabled())
1142 __pgalloc_tag_sub(page
, nr
);
1145 /* When tag is not NULL, assuming mem_alloc_profiling_enabled */
1146 static inline void pgalloc_tag_sub_pages(struct alloc_tag
*tag
, unsigned int nr
)
1149 this_cpu_sub(tag
->counters
->bytes
, PAGE_SIZE
* nr
);
1152 #else /* CONFIG_MEM_ALLOC_PROFILING */
1154 static inline void pgalloc_tag_add(struct page
*page
, struct task_struct
*task
,
1156 static inline void pgalloc_tag_sub(struct page
*page
, unsigned int nr
) {}
1157 static inline void pgalloc_tag_sub_pages(struct alloc_tag
*tag
, unsigned int nr
) {}
1159 #endif /* CONFIG_MEM_ALLOC_PROFILING */
1161 __always_inline
bool free_pages_prepare(struct page
*page
,
1165 bool skip_kasan_poison
= should_skip_kasan_poison(page
);
1166 bool init
= want_init_on_free();
1167 bool compound
= PageCompound(page
);
1168 struct folio
*folio
= page_folio(page
);
1170 VM_BUG_ON_PAGE(PageTail(page
), page
);
1172 trace_mm_page_free(page
, order
);
1173 kmsan_free_page(page
, order
);
1175 if (memcg_kmem_online() && PageMemcgKmem(page
))
1176 __memcg_kmem_uncharge_page(page
, order
);
1179 * In rare cases, when truncation or holepunching raced with
1180 * munlock after VM_LOCKED was cleared, Mlocked may still be
1181 * found set here. This does not indicate a problem, unless
1182 * "unevictable_pgs_cleared" appears worryingly large.
1184 if (unlikely(folio_test_mlocked(folio
))) {
1185 long nr_pages
= folio_nr_pages(folio
);
1187 __folio_clear_mlocked(folio
);
1188 zone_stat_mod_folio(folio
, NR_MLOCK
, -nr_pages
);
1189 count_vm_events(UNEVICTABLE_PGCLEARED
, nr_pages
);
1192 if (unlikely(PageHWPoison(page
)) && !order
) {
1193 /* Do not let hwpoison pages hit pcplists/buddy */
1194 reset_page_owner(page
, order
);
1195 page_table_check_free(page
, order
);
1196 pgalloc_tag_sub(page
, 1 << order
);
1199 * The page is isolated and accounted for.
1200 * Mark the codetag as empty to avoid accounting error
1201 * when the page is freed by unpoison_memory().
1203 clear_page_tag_ref(page
);
1207 VM_BUG_ON_PAGE(compound
&& compound_order(page
) != order
, page
);
1210 * Check tail pages before head page information is cleared to
1211 * avoid checking PageCompound for order-0 pages.
1213 if (unlikely(order
)) {
1217 page
[1].flags
&= ~PAGE_FLAGS_SECOND
;
1218 #ifdef NR_PAGES_IN_LARGE_FOLIO
1219 folio
->_nr_pages
= 0;
1222 for (i
= 1; i
< (1 << order
); i
++) {
1224 bad
+= free_tail_page_prepare(page
, page
+ i
);
1225 if (is_check_pages_enabled()) {
1226 if (free_page_is_bad(page
+ i
)) {
1231 (page
+ i
)->flags
&= ~PAGE_FLAGS_CHECK_AT_PREP
;
1234 if (PageMappingFlags(page
)) {
1236 mod_mthp_stat(order
, MTHP_STAT_NR_ANON
, -1);
1237 page
->mapping
= NULL
;
1239 if (is_check_pages_enabled()) {
1240 if (free_page_is_bad(page
))
1246 page_cpupid_reset_last(page
);
1247 page
->flags
&= ~PAGE_FLAGS_CHECK_AT_PREP
;
1248 reset_page_owner(page
, order
);
1249 page_table_check_free(page
, order
);
1250 pgalloc_tag_sub(page
, 1 << order
);
1252 if (!PageHighMem(page
)) {
1253 debug_check_no_locks_freed(page_address(page
),
1254 PAGE_SIZE
<< order
);
1255 debug_check_no_obj_freed(page_address(page
),
1256 PAGE_SIZE
<< order
);
1259 kernel_poison_pages(page
, 1 << order
);
1262 * As memory initialization might be integrated into KASAN,
1263 * KASAN poisoning and memory initialization code must be
1264 * kept together to avoid discrepancies in behavior.
1266 * With hardware tag-based KASAN, memory tags must be set before the
1267 * page becomes unavailable via debug_pagealloc or arch_free_page.
1269 if (!skip_kasan_poison
) {
1270 kasan_poison_pages(page
, order
, init
);
1272 /* Memory is already initialized if KASAN did it internally. */
1273 if (kasan_has_integrated_init())
1277 kernel_init_pages(page
, 1 << order
);
1280 * arch_free_page() can make the page's contents inaccessible. s390
1281 * does this. So nothing which can access the page's contents should
1282 * happen after this.
1284 arch_free_page(page
, order
);
1286 debug_pagealloc_unmap_pages(page
, 1 << order
);
1292 * Frees a number of pages from the PCP lists
1293 * Assumes all pages on list are in same zone.
1294 * count is the number of pages to free.
1296 static void free_pcppages_bulk(struct zone
*zone
, int count
,
1297 struct per_cpu_pages
*pcp
,
1300 unsigned long flags
;
1305 * Ensure proper count is passed which otherwise would stuck in the
1306 * below while (list_empty(list)) loop.
1308 count
= min(pcp
->count
, count
);
1310 /* Ensure requested pindex is drained first. */
1311 pindex
= pindex
- 1;
1313 spin_lock_irqsave(&zone
->lock
, flags
);
1316 struct list_head
*list
;
1319 /* Remove pages from lists in a round-robin fashion. */
1321 if (++pindex
> NR_PCP_LISTS
- 1)
1323 list
= &pcp
->lists
[pindex
];
1324 } while (list_empty(list
));
1326 order
= pindex_to_order(pindex
);
1327 nr_pages
= 1 << order
;
1332 page
= list_last_entry(list
, struct page
, pcp_list
);
1333 pfn
= page_to_pfn(page
);
1334 mt
= get_pfnblock_migratetype(page
, pfn
);
1336 /* must delete to avoid corrupting pcp list */
1337 list_del(&page
->pcp_list
);
1339 pcp
->count
-= nr_pages
;
1341 __free_one_page(page
, pfn
, zone
, order
, mt
, FPI_NONE
);
1342 trace_mm_page_pcpu_drain(page
, order
, mt
);
1343 } while (count
> 0 && !list_empty(list
));
1346 spin_unlock_irqrestore(&zone
->lock
, flags
);
1349 /* Split a multi-block free page into its individual pageblocks. */
1350 static void split_large_buddy(struct zone
*zone
, struct page
*page
,
1351 unsigned long pfn
, int order
, fpi_t fpi
)
1353 unsigned long end
= pfn
+ (1 << order
);
1355 VM_WARN_ON_ONCE(!IS_ALIGNED(pfn
, 1 << order
));
1356 /* Caller removed page from freelist, buddy info cleared! */
1357 VM_WARN_ON_ONCE(PageBuddy(page
));
1359 if (order
> pageblock_order
)
1360 order
= pageblock_order
;
1363 int mt
= get_pfnblock_migratetype(page
, pfn
);
1365 __free_one_page(page
, pfn
, zone
, order
, mt
, fpi
);
1369 page
= pfn_to_page(pfn
);
1373 static void add_page_to_zone_llist(struct zone
*zone
, struct page
*page
,
1376 /* Remember the order */
1377 page
->order
= order
;
1378 /* Add the page to the free list */
1379 llist_add(&page
->pcp_llist
, &zone
->trylock_free_pages
);
1382 static void free_one_page(struct zone
*zone
, struct page
*page
,
1383 unsigned long pfn
, unsigned int order
,
1386 struct llist_head
*llhead
;
1387 unsigned long flags
;
1389 if (unlikely(fpi_flags
& FPI_TRYLOCK
)) {
1390 if (!spin_trylock_irqsave(&zone
->lock
, flags
)) {
1391 add_page_to_zone_llist(zone
, page
, order
);
1395 spin_lock_irqsave(&zone
->lock
, flags
);
1398 /* The lock succeeded. Process deferred pages. */
1399 llhead
= &zone
->trylock_free_pages
;
1400 if (unlikely(!llist_empty(llhead
) && !(fpi_flags
& FPI_TRYLOCK
))) {
1401 struct llist_node
*llnode
;
1402 struct page
*p
, *tmp
;
1404 llnode
= llist_del_all(llhead
);
1405 llist_for_each_entry_safe(p
, tmp
, llnode
, pcp_llist
) {
1406 unsigned int p_order
= p
->order
;
1408 split_large_buddy(zone
, p
, page_to_pfn(p
), p_order
, fpi_flags
);
1409 __count_vm_events(PGFREE
, 1 << p_order
);
1412 split_large_buddy(zone
, page
, pfn
, order
, fpi_flags
);
1413 spin_unlock_irqrestore(&zone
->lock
, flags
);
1415 __count_vm_events(PGFREE
, 1 << order
);
1418 static void __free_pages_ok(struct page
*page
, unsigned int order
,
1421 unsigned long pfn
= page_to_pfn(page
);
1422 struct zone
*zone
= page_zone(page
);
1424 if (free_pages_prepare(page
, order
))
1425 free_one_page(zone
, page
, pfn
, order
, fpi_flags
);
1428 void __meminit
__free_pages_core(struct page
*page
, unsigned int order
,
1429 enum meminit_context context
)
1431 unsigned int nr_pages
= 1 << order
;
1432 struct page
*p
= page
;
1436 * When initializing the memmap, __init_single_page() sets the refcount
1437 * of all pages to 1 ("allocated"/"not free"). We have to set the
1438 * refcount of all involved pages to 0.
1440 * Note that hotplugged memory pages are initialized to PageOffline().
1441 * Pages freed from memblock might be marked as reserved.
1443 if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG
) &&
1444 unlikely(context
== MEMINIT_HOTPLUG
)) {
1445 for (loop
= 0; loop
< nr_pages
; loop
++, p
++) {
1446 VM_WARN_ON_ONCE(PageReserved(p
));
1447 __ClearPageOffline(p
);
1448 set_page_count(p
, 0);
1451 adjust_managed_page_count(page
, nr_pages
);
1453 for (loop
= 0; loop
< nr_pages
; loop
++, p
++) {
1454 __ClearPageReserved(p
);
1455 set_page_count(p
, 0);
1458 /* memblock adjusts totalram_pages() manually. */
1459 atomic_long_add(nr_pages
, &page_zone(page
)->managed_pages
);
1462 if (page_contains_unaccepted(page
, order
)) {
1463 if (order
== MAX_PAGE_ORDER
&& __free_unaccepted(page
))
1466 accept_memory(page_to_phys(page
), PAGE_SIZE
<< order
);
1470 * Bypass PCP and place fresh pages right to the tail, primarily
1471 * relevant for memory onlining.
1473 __free_pages_ok(page
, order
, FPI_TO_TAIL
);
1477 * Check that the whole (or subset of) a pageblock given by the interval of
1478 * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
1479 * with the migration of free compaction scanner.
1481 * Return struct page pointer of start_pfn, or NULL if checks were not passed.
1483 * It's possible on some configurations to have a setup like node0 node1 node0
1484 * i.e. it's possible that all pages within a zones range of pages do not
1485 * belong to a single zone. We assume that a border between node0 and node1
1486 * can occur within a single pageblock, but not a node0 node1 node0
1487 * interleaving within a single pageblock. It is therefore sufficient to check
1488 * the first and last page of a pageblock and avoid checking each individual
1489 * page in a pageblock.
1491 * Note: the function may return non-NULL struct page even for a page block
1492 * which contains a memory hole (i.e. there is no physical memory for a subset
1493 * of the pfn range). For example, if the pageblock order is MAX_PAGE_ORDER, which
1494 * will fall into 2 sub-sections, and the end pfn of the pageblock may be hole
1495 * even though the start pfn is online and valid. This should be safe most of
1496 * the time because struct pages are still initialized via init_unavailable_range()
1497 * and pfn walkers shouldn't touch any physical memory range for which they do
1498 * not recognize any specific metadata in struct pages.
1500 struct page
*__pageblock_pfn_to_page(unsigned long start_pfn
,
1501 unsigned long end_pfn
, struct zone
*zone
)
1503 struct page
*start_page
;
1504 struct page
*end_page
;
1506 /* end_pfn is one past the range we are checking */
1509 if (!pfn_valid(end_pfn
))
1512 start_page
= pfn_to_online_page(start_pfn
);
1516 if (page_zone(start_page
) != zone
)
1519 end_page
= pfn_to_page(end_pfn
);
1521 /* This gives a shorter code than deriving page_zone(end_page) */
1522 if (page_zone_id(start_page
) != page_zone_id(end_page
))
1529 * The order of subdivision here is critical for the IO subsystem.
1530 * Please do not alter this order without good reasons and regression
1531 * testing. Specifically, as large blocks of memory are subdivided,
1532 * the order in which smaller blocks are delivered depends on the order
1533 * they're subdivided in this function. This is the primary factor
1534 * influencing the order in which pages are delivered to the IO
1535 * subsystem according to empirical testing, and this is also justified
1536 * by considering the behavior of a buddy system containing a single
1537 * large block of memory acted on by a series of small allocations.
1538 * This behavior is a critical factor in sglist merging's success.
1542 static inline unsigned int expand(struct zone
*zone
, struct page
*page
, int low
,
1543 int high
, int migratetype
)
1545 unsigned int size
= 1 << high
;
1546 unsigned int nr_added
= 0;
1548 while (high
> low
) {
1551 VM_BUG_ON_PAGE(bad_range(zone
, &page
[size
]), &page
[size
]);
1554 * Mark as guard pages (or page), that will allow to
1555 * merge back to allocator when buddy will be freed.
1556 * Corresponding page table entries will not be touched,
1557 * pages will stay not present in virtual address space
1559 if (set_page_guard(zone
, &page
[size
], high
))
1562 __add_to_free_list(&page
[size
], zone
, high
, migratetype
, false);
1563 set_buddy_order(&page
[size
], high
);
1570 static __always_inline
void page_del_and_expand(struct zone
*zone
,
1571 struct page
*page
, int low
,
1572 int high
, int migratetype
)
1574 int nr_pages
= 1 << high
;
1576 __del_page_from_free_list(page
, zone
, high
, migratetype
);
1577 nr_pages
-= expand(zone
, page
, low
, high
, migratetype
);
1578 account_freepages(zone
, -nr_pages
, migratetype
);
1581 static void check_new_page_bad(struct page
*page
)
1583 if (unlikely(PageHWPoison(page
))) {
1584 /* Don't complain about hwpoisoned pages */
1585 if (PageBuddy(page
))
1586 __ClearPageBuddy(page
);
1591 page_bad_reason(page
, PAGE_FLAGS_CHECK_AT_PREP
));
1595 * This page is about to be returned from the page allocator
1597 static bool check_new_page(struct page
*page
)
1599 if (likely(page_expected_state(page
,
1600 PAGE_FLAGS_CHECK_AT_PREP
|__PG_HWPOISON
)))
1603 check_new_page_bad(page
);
1607 static inline bool check_new_pages(struct page
*page
, unsigned int order
)
1609 if (is_check_pages_enabled()) {
1610 for (int i
= 0; i
< (1 << order
); i
++) {
1611 struct page
*p
= page
+ i
;
1613 if (check_new_page(p
))
1621 static inline bool should_skip_kasan_unpoison(gfp_t flags
)
1623 /* Don't skip if a software KASAN mode is enabled. */
1624 if (IS_ENABLED(CONFIG_KASAN_GENERIC
) ||
1625 IS_ENABLED(CONFIG_KASAN_SW_TAGS
))
1628 /* Skip, if hardware tag-based KASAN is not enabled. */
1629 if (!kasan_hw_tags_enabled())
1633 * With hardware tag-based KASAN enabled, skip if this has been
1634 * requested via __GFP_SKIP_KASAN.
1636 return flags
& __GFP_SKIP_KASAN
;
1639 static inline bool should_skip_init(gfp_t flags
)
1641 /* Don't skip, if hardware tag-based KASAN is not enabled. */
1642 if (!kasan_hw_tags_enabled())
1645 /* For hardware tag-based KASAN, skip if requested. */
1646 return (flags
& __GFP_SKIP_ZERO
);
1649 inline void post_alloc_hook(struct page
*page
, unsigned int order
,
1652 bool init
= !want_init_on_free() && want_init_on_alloc(gfp_flags
) &&
1653 !should_skip_init(gfp_flags
);
1654 bool zero_tags
= init
&& (gfp_flags
& __GFP_ZEROTAGS
);
1657 set_page_private(page
, 0);
1659 arch_alloc_page(page
, order
);
1660 debug_pagealloc_map_pages(page
, 1 << order
);
1663 * Page unpoisoning must happen before memory initialization.
1664 * Otherwise, the poison pattern will be overwritten for __GFP_ZERO
1665 * allocations and the page unpoisoning code will complain.
1667 kernel_unpoison_pages(page
, 1 << order
);
1670 * As memory initialization might be integrated into KASAN,
1671 * KASAN unpoisoning and memory initializion code must be
1672 * kept together to avoid discrepancies in behavior.
1676 * If memory tags should be zeroed
1677 * (which happens only when memory should be initialized as well).
1680 /* Initialize both memory and memory tags. */
1681 for (i
= 0; i
!= 1 << order
; ++i
)
1682 tag_clear_highpage(page
+ i
);
1684 /* Take note that memory was initialized by the loop above. */
1687 if (!should_skip_kasan_unpoison(gfp_flags
) &&
1688 kasan_unpoison_pages(page
, order
, init
)) {
1689 /* Take note that memory was initialized by KASAN. */
1690 if (kasan_has_integrated_init())
1694 * If memory tags have not been set by KASAN, reset the page
1695 * tags to ensure page_address() dereferencing does not fault.
1697 for (i
= 0; i
!= 1 << order
; ++i
)
1698 page_kasan_tag_reset(page
+ i
);
1700 /* If memory is still not initialized, initialize it now. */
1702 kernel_init_pages(page
, 1 << order
);
1704 set_page_owner(page
, order
, gfp_flags
);
1705 page_table_check_alloc(page
, order
);
1706 pgalloc_tag_add(page
, current
, 1 << order
);
1709 static void prep_new_page(struct page
*page
, unsigned int order
, gfp_t gfp_flags
,
1710 unsigned int alloc_flags
)
1712 post_alloc_hook(page
, order
, gfp_flags
);
1714 if (order
&& (gfp_flags
& __GFP_COMP
))
1715 prep_compound_page(page
, order
);
1718 * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
1719 * allocate the page. The expectation is that the caller is taking
1720 * steps that will free more memory. The caller should avoid the page
1721 * being used for !PFMEMALLOC purposes.
1723 if (alloc_flags
& ALLOC_NO_WATERMARKS
)
1724 set_page_pfmemalloc(page
);
1726 clear_page_pfmemalloc(page
);
1730 * Go through the free lists for the given migratetype and remove
1731 * the smallest available page from the freelists
1733 static __always_inline
1734 struct page
*__rmqueue_smallest(struct zone
*zone
, unsigned int order
,
1737 unsigned int current_order
;
1738 struct free_area
*area
;
1741 /* Find a page of the appropriate size in the preferred list */
1742 for (current_order
= order
; current_order
< NR_PAGE_ORDERS
; ++current_order
) {
1743 area
= &(zone
->free_area
[current_order
]);
1744 page
= get_page_from_free_area(area
, migratetype
);
1748 page_del_and_expand(zone
, page
, order
, current_order
,
1750 trace_mm_page_alloc_zone_locked(page
, order
, migratetype
,
1751 pcp_allowed_order(order
) &&
1752 migratetype
< MIGRATE_PCPTYPES
);
1761 * This array describes the order lists are fallen back to when
1762 * the free lists for the desirable migrate type are depleted
1764 * The other migratetypes do not have fallbacks.
1766 static int fallbacks
[MIGRATE_PCPTYPES
][MIGRATE_PCPTYPES
- 1] = {
1767 [MIGRATE_UNMOVABLE
] = { MIGRATE_RECLAIMABLE
, MIGRATE_MOVABLE
},
1768 [MIGRATE_MOVABLE
] = { MIGRATE_RECLAIMABLE
, MIGRATE_UNMOVABLE
},
1769 [MIGRATE_RECLAIMABLE
] = { MIGRATE_UNMOVABLE
, MIGRATE_MOVABLE
},
1773 static __always_inline
struct page
*__rmqueue_cma_fallback(struct zone
*zone
,
1776 return __rmqueue_smallest(zone
, order
, MIGRATE_CMA
);
1779 static inline struct page
*__rmqueue_cma_fallback(struct zone
*zone
,
1780 unsigned int order
) { return NULL
; }
1784 * Change the type of a block and move all its free pages to that
1787 static int __move_freepages_block(struct zone
*zone
, unsigned long start_pfn
,
1788 int old_mt
, int new_mt
)
1791 unsigned long pfn
, end_pfn
;
1793 int pages_moved
= 0;
1795 VM_WARN_ON(start_pfn
& (pageblock_nr_pages
- 1));
1796 end_pfn
= pageblock_end_pfn(start_pfn
);
1798 for (pfn
= start_pfn
; pfn
< end_pfn
;) {
1799 page
= pfn_to_page(pfn
);
1800 if (!PageBuddy(page
)) {
1805 /* Make sure we are not inadvertently changing nodes */
1806 VM_BUG_ON_PAGE(page_to_nid(page
) != zone_to_nid(zone
), page
);
1807 VM_BUG_ON_PAGE(page_zone(page
) != zone
, page
);
1809 order
= buddy_order(page
);
1811 move_to_free_list(page
, zone
, order
, old_mt
, new_mt
);
1814 pages_moved
+= 1 << order
;
1817 set_pageblock_migratetype(pfn_to_page(start_pfn
), new_mt
);
1822 static bool prep_move_freepages_block(struct zone
*zone
, struct page
*page
,
1823 unsigned long *start_pfn
,
1824 int *num_free
, int *num_movable
)
1826 unsigned long pfn
, start
, end
;
1828 pfn
= page_to_pfn(page
);
1829 start
= pageblock_start_pfn(pfn
);
1830 end
= pageblock_end_pfn(pfn
);
1833 * The caller only has the lock for @zone, don't touch ranges
1834 * that straddle into other zones. While we could move part of
1835 * the range that's inside the zone, this call is usually
1836 * accompanied by other operations such as migratetype updates
1837 * which also should be locked.
1839 if (!zone_spans_pfn(zone
, start
))
1841 if (!zone_spans_pfn(zone
, end
- 1))
1849 for (pfn
= start
; pfn
< end
;) {
1850 page
= pfn_to_page(pfn
);
1851 if (PageBuddy(page
)) {
1852 int nr
= 1 << buddy_order(page
);
1859 * We assume that pages that could be isolated for
1860 * migration are movable. But we don't actually try
1861 * isolating, as that would be expensive.
1863 if (PageLRU(page
) || __PageMovable(page
))
1872 static int move_freepages_block(struct zone
*zone
, struct page
*page
,
1873 int old_mt
, int new_mt
)
1875 unsigned long start_pfn
;
1877 if (!prep_move_freepages_block(zone
, page
, &start_pfn
, NULL
, NULL
))
1880 return __move_freepages_block(zone
, start_pfn
, old_mt
, new_mt
);
1883 #ifdef CONFIG_MEMORY_ISOLATION
1884 /* Look for a buddy that straddles start_pfn */
1885 static unsigned long find_large_buddy(unsigned long start_pfn
)
1889 unsigned long pfn
= start_pfn
;
1891 while (!PageBuddy(page
= pfn_to_page(pfn
))) {
1893 if (++order
> MAX_PAGE_ORDER
)
1895 pfn
&= ~0UL << order
;
1899 * Found a preceding buddy, but does it straddle?
1901 if (pfn
+ (1 << buddy_order(page
)) > start_pfn
)
1909 * move_freepages_block_isolate - move free pages in block for page isolation
1911 * @page: the pageblock page
1912 * @migratetype: migratetype to set on the pageblock
1914 * This is similar to move_freepages_block(), but handles the special
1915 * case encountered in page isolation, where the block of interest
1916 * might be part of a larger buddy spanning multiple pageblocks.
1918 * Unlike the regular page allocator path, which moves pages while
1919 * stealing buddies off the freelist, page isolation is interested in
1920 * arbitrary pfn ranges that may have overlapping buddies on both ends.
1922 * This function handles that. Straddling buddies are split into
1923 * individual pageblocks. Only the block of interest is moved.
1925 * Returns %true if pages could be moved, %false otherwise.
1927 bool move_freepages_block_isolate(struct zone
*zone
, struct page
*page
,
1930 unsigned long start_pfn
, pfn
;
1932 if (!prep_move_freepages_block(zone
, page
, &start_pfn
, NULL
, NULL
))
1935 /* No splits needed if buddies can't span multiple blocks */
1936 if (pageblock_order
== MAX_PAGE_ORDER
)
1939 /* We're a tail block in a larger buddy */
1940 pfn
= find_large_buddy(start_pfn
);
1941 if (pfn
!= start_pfn
) {
1942 struct page
*buddy
= pfn_to_page(pfn
);
1943 int order
= buddy_order(buddy
);
1945 del_page_from_free_list(buddy
, zone
, order
,
1946 get_pfnblock_migratetype(buddy
, pfn
));
1947 set_pageblock_migratetype(page
, migratetype
);
1948 split_large_buddy(zone
, buddy
, pfn
, order
, FPI_NONE
);
1952 /* We're the starting block of a larger buddy */
1953 if (PageBuddy(page
) && buddy_order(page
) > pageblock_order
) {
1954 int order
= buddy_order(page
);
1956 del_page_from_free_list(page
, zone
, order
,
1957 get_pfnblock_migratetype(page
, pfn
));
1958 set_pageblock_migratetype(page
, migratetype
);
1959 split_large_buddy(zone
, page
, pfn
, order
, FPI_NONE
);
1963 __move_freepages_block(zone
, start_pfn
,
1964 get_pfnblock_migratetype(page
, start_pfn
),
1968 #endif /* CONFIG_MEMORY_ISOLATION */
1970 static void change_pageblock_range(struct page
*pageblock_page
,
1971 int start_order
, int migratetype
)
1973 int nr_pageblocks
= 1 << (start_order
- pageblock_order
);
1975 while (nr_pageblocks
--) {
1976 set_pageblock_migratetype(pageblock_page
, migratetype
);
1977 pageblock_page
+= pageblock_nr_pages
;
1981 static inline bool boost_watermark(struct zone
*zone
)
1983 unsigned long max_boost
;
1985 if (!watermark_boost_factor
)
1988 * Don't bother in zones that are unlikely to produce results.
1989 * On small machines, including kdump capture kernels running
1990 * in a small area, boosting the watermark can cause an out of
1991 * memory situation immediately.
1993 if ((pageblock_nr_pages
* 4) > zone_managed_pages(zone
))
1996 max_boost
= mult_frac(zone
->_watermark
[WMARK_HIGH
],
1997 watermark_boost_factor
, 10000);
2000 * high watermark may be uninitialised if fragmentation occurs
2001 * very early in boot so do not boost. We do not fall
2002 * through and boost by pageblock_nr_pages as failing
2003 * allocations that early means that reclaim is not going
2004 * to help and it may even be impossible to reclaim the
2005 * boosted watermark resulting in a hang.
2010 max_boost
= max(pageblock_nr_pages
, max_boost
);
2012 zone
->watermark_boost
= min(zone
->watermark_boost
+ pageblock_nr_pages
,
2019 * When we are falling back to another migratetype during allocation, should we
2020 * try to claim an entire block to satisfy further allocations, instead of
2021 * polluting multiple pageblocks?
2023 static bool should_try_claim_block(unsigned int order
, int start_mt
)
2026 * Leaving this order check is intended, although there is
2027 * relaxed order check in next check. The reason is that
2028 * we can actually claim the whole pageblock if this condition met,
2029 * but, below check doesn't guarantee it and that is just heuristic
2030 * so could be changed anytime.
2032 if (order
>= pageblock_order
)
2036 * Above a certain threshold, always try to claim, as it's likely there
2037 * will be more free pages in the pageblock.
2039 if (order
>= pageblock_order
/ 2)
2043 * Unmovable/reclaimable allocations would cause permanent
2044 * fragmentations if they fell back to allocating from a movable block
2045 * (polluting it), so we try to claim the whole block regardless of the
2046 * allocation size. Later movable allocations can always steal from this
2047 * block, which is less problematic.
2049 if (start_mt
== MIGRATE_RECLAIMABLE
|| start_mt
== MIGRATE_UNMOVABLE
)
2052 if (page_group_by_mobility_disabled
)
2056 * Movable pages won't cause permanent fragmentation, so when you alloc
2057 * small pages, we just need to temporarily steal unmovable or
2058 * reclaimable pages that are closest to the request size. After a
2059 * while, memory compaction may occur to form large contiguous pages,
2060 * and the next movable allocation may not need to steal.
2066 * Check whether there is a suitable fallback freepage with requested order.
2067 * If claimable is true, this function returns fallback_mt only if
2068 * we would do this whole-block claiming. This would help to reduce
2069 * fragmentation due to mixed migratetype pages in one pageblock.
2071 int find_suitable_fallback(struct free_area
*area
, unsigned int order
,
2072 int migratetype
, bool claimable
)
2076 if (claimable
&& !should_try_claim_block(order
, migratetype
))
2079 if (area
->nr_free
== 0)
2082 for (i
= 0; i
< MIGRATE_PCPTYPES
- 1 ; i
++) {
2083 int fallback_mt
= fallbacks
[migratetype
][i
];
2085 if (!free_area_empty(area
, fallback_mt
))
2093 * This function implements actual block claiming behaviour. If order is large
2094 * enough, we can claim the whole pageblock for the requested migratetype. If
2095 * not, we check the pageblock for constituent pages; if at least half of the
2096 * pages are free or compatible, we can still claim the whole block, so pages
2097 * freed in the future will be put on the correct free list.
2099 static struct page
*
2100 try_to_claim_block(struct zone
*zone
, struct page
*page
,
2101 int current_order
, int order
, int start_type
,
2102 int block_type
, unsigned int alloc_flags
)
2104 int free_pages
, movable_pages
, alike_pages
;
2105 unsigned long start_pfn
;
2107 /* Take ownership for orders >= pageblock_order */
2108 if (current_order
>= pageblock_order
) {
2109 unsigned int nr_added
;
2111 del_page_from_free_list(page
, zone
, current_order
, block_type
);
2112 change_pageblock_range(page
, current_order
, start_type
);
2113 nr_added
= expand(zone
, page
, order
, current_order
, start_type
);
2114 account_freepages(zone
, nr_added
, start_type
);
2119 * Boost watermarks to increase reclaim pressure to reduce the
2120 * likelihood of future fallbacks. Wake kswapd now as the node
2121 * may be balanced overall and kswapd will not wake naturally.
2123 if (boost_watermark(zone
) && (alloc_flags
& ALLOC_KSWAPD
))
2124 set_bit(ZONE_BOOSTED_WATERMARK
, &zone
->flags
);
2126 /* moving whole block can fail due to zone boundary conditions */
2127 if (!prep_move_freepages_block(zone
, page
, &start_pfn
, &free_pages
,
2132 * Determine how many pages are compatible with our allocation.
2133 * For movable allocation, it's the number of movable pages which
2134 * we just obtained. For other types it's a bit more tricky.
2136 if (start_type
== MIGRATE_MOVABLE
) {
2137 alike_pages
= movable_pages
;
2140 * If we are falling back a RECLAIMABLE or UNMOVABLE allocation
2141 * to MOVABLE pageblock, consider all non-movable pages as
2142 * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
2143 * vice versa, be conservative since we can't distinguish the
2144 * exact migratetype of non-movable pages.
2146 if (block_type
== MIGRATE_MOVABLE
)
2147 alike_pages
= pageblock_nr_pages
2148 - (free_pages
+ movable_pages
);
2153 * If a sufficient number of pages in the block are either free or of
2154 * compatible migratability as our allocation, claim the whole block.
2156 if (free_pages
+ alike_pages
>= (1 << (pageblock_order
-1)) ||
2157 page_group_by_mobility_disabled
) {
2158 __move_freepages_block(zone
, start_pfn
, block_type
, start_type
);
2159 return __rmqueue_smallest(zone
, order
, start_type
);
2166 * Try to allocate from some fallback migratetype by claiming the entire block,
2167 * i.e. converting it to the allocation's start migratetype.
2169 * The use of signed ints for order and current_order is a deliberate
2170 * deviation from the rest of this file, to make the for loop
2171 * condition simpler.
2173 static __always_inline
struct page
*
2174 __rmqueue_claim(struct zone
*zone
, int order
, int start_migratetype
,
2175 unsigned int alloc_flags
)
2177 struct free_area
*area
;
2179 int min_order
= order
;
2184 * Do not steal pages from freelists belonging to other pageblocks
2185 * i.e. orders < pageblock_order. If there are no local zones free,
2186 * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
2188 if (order
< pageblock_order
&& alloc_flags
& ALLOC_NOFRAGMENT
)
2189 min_order
= pageblock_order
;
2192 * Find the largest available free page in the other list. This roughly
2193 * approximates finding the pageblock with the most free pages, which
2194 * would be too costly to do exactly.
2196 for (current_order
= MAX_PAGE_ORDER
; current_order
>= min_order
;
2198 area
= &(zone
->free_area
[current_order
]);
2199 fallback_mt
= find_suitable_fallback(area
, current_order
,
2200 start_migratetype
, true);
2202 /* No block in that order */
2203 if (fallback_mt
== -1)
2206 /* Advanced into orders too low to claim, abort */
2207 if (fallback_mt
== -2)
2210 page
= get_page_from_free_area(area
, fallback_mt
);
2211 page
= try_to_claim_block(zone
, page
, current_order
, order
,
2212 start_migratetype
, fallback_mt
,
2215 trace_mm_page_alloc_extfrag(page
, order
, current_order
,
2216 start_migratetype
, fallback_mt
);
2225 * Try to steal a single page from some fallback migratetype. Leave the rest of
2226 * the block as its current migratetype, potentially causing fragmentation.
2228 static __always_inline
struct page
*
2229 __rmqueue_steal(struct zone
*zone
, int order
, int start_migratetype
)
2231 struct free_area
*area
;
2236 for (current_order
= order
; current_order
< NR_PAGE_ORDERS
; current_order
++) {
2237 area
= &(zone
->free_area
[current_order
]);
2238 fallback_mt
= find_suitable_fallback(area
, current_order
,
2239 start_migratetype
, false);
2240 if (fallback_mt
== -1)
2243 page
= get_page_from_free_area(area
, fallback_mt
);
2244 page_del_and_expand(zone
, page
, order
, current_order
, fallback_mt
);
2245 trace_mm_page_alloc_extfrag(page
, order
, current_order
,
2246 start_migratetype
, fallback_mt
);
2261 * Do the hard work of removing an element from the buddy allocator.
2262 * Call me with the zone->lock already held.
2264 static __always_inline
struct page
*
2265 __rmqueue(struct zone
*zone
, unsigned int order
, int migratetype
,
2266 unsigned int alloc_flags
, enum rmqueue_mode
*mode
)
2270 if (IS_ENABLED(CONFIG_CMA
)) {
2272 * Balance movable allocations between regular and CMA areas by
2273 * allocating from CMA when over half of the zone's free memory
2274 * is in the CMA area.
2276 if (alloc_flags
& ALLOC_CMA
&&
2277 zone_page_state(zone
, NR_FREE_CMA_PAGES
) >
2278 zone_page_state(zone
, NR_FREE_PAGES
) / 2) {
2279 page
= __rmqueue_cma_fallback(zone
, order
);
2286 * First try the freelists of the requested migratetype, then try
2287 * fallbacks modes with increasing levels of fragmentation risk.
2289 * The fallback logic is expensive and rmqueue_bulk() calls in
2290 * a loop with the zone->lock held, meaning the freelists are
2291 * not subject to any outside changes. Remember in *mode where
2292 * we found pay dirt, to save us the search on the next call.
2295 case RMQUEUE_NORMAL
:
2296 page
= __rmqueue_smallest(zone
, order
, migratetype
);
2301 if (alloc_flags
& ALLOC_CMA
) {
2302 page
= __rmqueue_cma_fallback(zone
, order
);
2304 *mode
= RMQUEUE_CMA
;
2310 page
= __rmqueue_claim(zone
, order
, migratetype
, alloc_flags
);
2312 /* Replenished preferred freelist, back to normal mode. */
2313 *mode
= RMQUEUE_NORMAL
;
2318 if (!(alloc_flags
& ALLOC_NOFRAGMENT
)) {
2319 page
= __rmqueue_steal(zone
, order
, migratetype
);
2321 *mode
= RMQUEUE_STEAL
;
2330 * Obtain a specified number of elements from the buddy allocator, all under
2331 * a single hold of the lock, for efficiency. Add them to the supplied list.
2332 * Returns the number of new pages which were placed at *list.
2334 static int rmqueue_bulk(struct zone
*zone
, unsigned int order
,
2335 unsigned long count
, struct list_head
*list
,
2336 int migratetype
, unsigned int alloc_flags
)
2338 enum rmqueue_mode rmqm
= RMQUEUE_NORMAL
;
2339 unsigned long flags
;
2342 if (unlikely(alloc_flags
& ALLOC_TRYLOCK
)) {
2343 if (!spin_trylock_irqsave(&zone
->lock
, flags
))
2346 spin_lock_irqsave(&zone
->lock
, flags
);
2348 for (i
= 0; i
< count
; ++i
) {
2349 struct page
*page
= __rmqueue(zone
, order
, migratetype
,
2350 alloc_flags
, &rmqm
);
2351 if (unlikely(page
== NULL
))
2355 * Split buddy pages returned by expand() are received here in
2356 * physical page order. The page is added to the tail of
2357 * caller's list. From the callers perspective, the linked list
2358 * is ordered by page number under some conditions. This is
2359 * useful for IO devices that can forward direction from the
2360 * head, thus also in the physical page order. This is useful
2361 * for IO devices that can merge IO requests if the physical
2362 * pages are ordered properly.
2364 list_add_tail(&page
->pcp_list
, list
);
2366 spin_unlock_irqrestore(&zone
->lock
, flags
);
2372 * Called from the vmstat counter updater to decay the PCP high.
2373 * Return whether there are addition works to do.
2375 int decay_pcp_high(struct zone
*zone
, struct per_cpu_pages
*pcp
)
2377 int high_min
, to_drain
, batch
;
2380 high_min
= READ_ONCE(pcp
->high_min
);
2381 batch
= READ_ONCE(pcp
->batch
);
2383 * Decrease pcp->high periodically to try to free possible
2384 * idle PCP pages. And, avoid to free too many pages to
2385 * control latency. This caps pcp->high decrement too.
2387 if (pcp
->high
> high_min
) {
2388 pcp
->high
= max3(pcp
->count
- (batch
<< CONFIG_PCP_BATCH_SCALE_MAX
),
2389 pcp
->high
- (pcp
->high
>> 3), high_min
);
2390 if (pcp
->high
> high_min
)
2394 to_drain
= pcp
->count
- pcp
->high
;
2396 spin_lock(&pcp
->lock
);
2397 free_pcppages_bulk(zone
, to_drain
, pcp
, 0);
2398 spin_unlock(&pcp
->lock
);
2407 * Called from the vmstat counter updater to drain pagesets of this
2408 * currently executing processor on remote nodes after they have
2411 void drain_zone_pages(struct zone
*zone
, struct per_cpu_pages
*pcp
)
2413 int to_drain
, batch
;
2415 batch
= READ_ONCE(pcp
->batch
);
2416 to_drain
= min(pcp
->count
, batch
);
2418 spin_lock(&pcp
->lock
);
2419 free_pcppages_bulk(zone
, to_drain
, pcp
, 0);
2420 spin_unlock(&pcp
->lock
);
2426 * Drain pcplists of the indicated processor and zone.
2428 static void drain_pages_zone(unsigned int cpu
, struct zone
*zone
)
2430 struct per_cpu_pages
*pcp
= per_cpu_ptr(zone
->per_cpu_pageset
, cpu
);
2434 spin_lock(&pcp
->lock
);
2437 int to_drain
= min(count
,
2438 pcp
->batch
<< CONFIG_PCP_BATCH_SCALE_MAX
);
2440 free_pcppages_bulk(zone
, to_drain
, pcp
, 0);
2443 spin_unlock(&pcp
->lock
);
2448 * Drain pcplists of all zones on the indicated processor.
2450 static void drain_pages(unsigned int cpu
)
2454 for_each_populated_zone(zone
) {
2455 drain_pages_zone(cpu
, zone
);
2460 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
2462 void drain_local_pages(struct zone
*zone
)
2464 int cpu
= smp_processor_id();
2467 drain_pages_zone(cpu
, zone
);
2473 * The implementation of drain_all_pages(), exposing an extra parameter to
2474 * drain on all cpus.
2476 * drain_all_pages() is optimized to only execute on cpus where pcplists are
2477 * not empty. The check for non-emptiness can however race with a free to
2478 * pcplist that has not yet increased the pcp->count from 0 to 1. Callers
2479 * that need the guarantee that every CPU has drained can disable the
2480 * optimizing racy check.
2482 static void __drain_all_pages(struct zone
*zone
, bool force_all_cpus
)
2487 * Allocate in the BSS so we won't require allocation in
2488 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
2490 static cpumask_t cpus_with_pcps
;
2493 * Do not drain if one is already in progress unless it's specific to
2494 * a zone. Such callers are primarily CMA and memory hotplug and need
2495 * the drain to be complete when the call returns.
2497 if (unlikely(!mutex_trylock(&pcpu_drain_mutex
))) {
2500 mutex_lock(&pcpu_drain_mutex
);
2504 * We don't care about racing with CPU hotplug event
2505 * as offline notification will cause the notified
2506 * cpu to drain that CPU pcps and on_each_cpu_mask
2507 * disables preemption as part of its processing
2509 for_each_online_cpu(cpu
) {
2510 struct per_cpu_pages
*pcp
;
2512 bool has_pcps
= false;
2514 if (force_all_cpus
) {
2516 * The pcp.count check is racy, some callers need a
2517 * guarantee that no cpu is missed.
2521 pcp
= per_cpu_ptr(zone
->per_cpu_pageset
, cpu
);
2525 for_each_populated_zone(z
) {
2526 pcp
= per_cpu_ptr(z
->per_cpu_pageset
, cpu
);
2535 cpumask_set_cpu(cpu
, &cpus_with_pcps
);
2537 cpumask_clear_cpu(cpu
, &cpus_with_pcps
);
2540 for_each_cpu(cpu
, &cpus_with_pcps
) {
2542 drain_pages_zone(cpu
, zone
);
2547 mutex_unlock(&pcpu_drain_mutex
);
2551 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
2553 * When zone parameter is non-NULL, spill just the single zone's pages.
2555 void drain_all_pages(struct zone
*zone
)
2557 __drain_all_pages(zone
, false);
2560 static int nr_pcp_free(struct per_cpu_pages
*pcp
, int batch
, int high
, bool free_high
)
2562 int min_nr_free
, max_nr_free
;
2564 /* Free as much as possible if batch freeing high-order pages. */
2565 if (unlikely(free_high
))
2566 return min(pcp
->count
, batch
<< CONFIG_PCP_BATCH_SCALE_MAX
);
2568 /* Check for PCP disabled or boot pageset */
2569 if (unlikely(high
< batch
))
2572 /* Leave at least pcp->batch pages on the list */
2573 min_nr_free
= batch
;
2574 max_nr_free
= high
- batch
;
2577 * Increase the batch number to the number of the consecutive
2578 * freed pages to reduce zone lock contention.
2580 batch
= clamp_t(int, pcp
->free_count
, min_nr_free
, max_nr_free
);
2585 static int nr_pcp_high(struct per_cpu_pages
*pcp
, struct zone
*zone
,
2586 int batch
, bool free_high
)
2588 int high
, high_min
, high_max
;
2590 high_min
= READ_ONCE(pcp
->high_min
);
2591 high_max
= READ_ONCE(pcp
->high_max
);
2592 high
= pcp
->high
= clamp(pcp
->high
, high_min
, high_max
);
2594 if (unlikely(!high
))
2597 if (unlikely(free_high
)) {
2598 pcp
->high
= max(high
- (batch
<< CONFIG_PCP_BATCH_SCALE_MAX
),
2604 * If reclaim is active, limit the number of pages that can be
2605 * stored on pcp lists
2607 if (test_bit(ZONE_RECLAIM_ACTIVE
, &zone
->flags
)) {
2608 int free_count
= max_t(int, pcp
->free_count
, batch
);
2610 pcp
->high
= max(high
- free_count
, high_min
);
2611 return min(batch
<< 2, pcp
->high
);
2614 if (high_min
== high_max
)
2617 if (test_bit(ZONE_BELOW_HIGH
, &zone
->flags
)) {
2618 int free_count
= max_t(int, pcp
->free_count
, batch
);
2620 pcp
->high
= max(high
- free_count
, high_min
);
2621 high
= max(pcp
->count
, high_min
);
2622 } else if (pcp
->count
>= high
) {
2623 int need_high
= pcp
->free_count
+ batch
;
2625 /* pcp->high should be large enough to hold batch freed pages */
2626 if (pcp
->high
< need_high
)
2627 pcp
->high
= clamp(need_high
, high_min
, high_max
);
2633 static void free_frozen_page_commit(struct zone
*zone
,
2634 struct per_cpu_pages
*pcp
, struct page
*page
, int migratetype
,
2635 unsigned int order
, fpi_t fpi_flags
)
2639 bool free_high
= false;
2642 * On freeing, reduce the number of pages that are batch allocated.
2643 * See nr_pcp_alloc() where alloc_factor is increased for subsequent
2646 pcp
->alloc_factor
>>= 1;
2647 __count_vm_events(PGFREE
, 1 << order
);
2648 pindex
= order_to_pindex(migratetype
, order
);
2649 list_add(&page
->pcp_list
, &pcp
->lists
[pindex
]);
2650 pcp
->count
+= 1 << order
;
2652 batch
= READ_ONCE(pcp
->batch
);
2654 * As high-order pages other than THP's stored on PCP can contribute
2655 * to fragmentation, limit the number stored when PCP is heavily
2656 * freeing without allocation. The remainder after bulk freeing
2657 * stops will be drained from vmstat refresh context.
2659 if (order
&& order
<= PAGE_ALLOC_COSTLY_ORDER
) {
2660 free_high
= (pcp
->free_count
>= (batch
+ pcp
->high_min
/ 2) &&
2661 (pcp
->flags
& PCPF_PREV_FREE_HIGH_ORDER
) &&
2662 (!(pcp
->flags
& PCPF_FREE_HIGH_BATCH
) ||
2663 pcp
->count
>= batch
));
2664 pcp
->flags
|= PCPF_PREV_FREE_HIGH_ORDER
;
2665 } else if (pcp
->flags
& PCPF_PREV_FREE_HIGH_ORDER
) {
2666 pcp
->flags
&= ~PCPF_PREV_FREE_HIGH_ORDER
;
2668 if (pcp
->free_count
< (batch
<< CONFIG_PCP_BATCH_SCALE_MAX
))
2669 pcp
->free_count
+= (1 << order
);
2671 if (unlikely(fpi_flags
& FPI_TRYLOCK
)) {
2673 * Do not attempt to take a zone lock. Let pcp->count get
2674 * over high mark temporarily.
2678 high
= nr_pcp_high(pcp
, zone
, batch
, free_high
);
2679 if (pcp
->count
>= high
) {
2680 free_pcppages_bulk(zone
, nr_pcp_free(pcp
, batch
, high
, free_high
),
2682 if (test_bit(ZONE_BELOW_HIGH
, &zone
->flags
) &&
2683 zone_watermark_ok(zone
, 0, high_wmark_pages(zone
),
2685 clear_bit(ZONE_BELOW_HIGH
, &zone
->flags
);
2692 static void __free_frozen_pages(struct page
*page
, unsigned int order
,
2695 unsigned long __maybe_unused UP_flags
;
2696 struct per_cpu_pages
*pcp
;
2698 unsigned long pfn
= page_to_pfn(page
);
2701 if (!pcp_allowed_order(order
)) {
2702 __free_pages_ok(page
, order
, fpi_flags
);
2706 if (!free_pages_prepare(page
, order
))
2710 * We only track unmovable, reclaimable and movable on pcp lists.
2711 * Place ISOLATE pages on the isolated list because they are being
2712 * offlined but treat HIGHATOMIC and CMA as movable pages so we can
2713 * get those areas back if necessary. Otherwise, we may have to free
2714 * excessively into the page allocator
2716 zone
= page_zone(page
);
2717 migratetype
= get_pfnblock_migratetype(page
, pfn
);
2718 if (unlikely(migratetype
>= MIGRATE_PCPTYPES
)) {
2719 if (unlikely(is_migrate_isolate(migratetype
))) {
2720 free_one_page(zone
, page
, pfn
, order
, fpi_flags
);
2723 migratetype
= MIGRATE_MOVABLE
;
2726 if (unlikely((fpi_flags
& FPI_TRYLOCK
) && IS_ENABLED(CONFIG_PREEMPT_RT
)
2727 && (in_nmi() || in_hardirq()))) {
2728 add_page_to_zone_llist(zone
, page
, order
);
2731 pcp_trylock_prepare(UP_flags
);
2732 pcp
= pcp_spin_trylock(zone
->per_cpu_pageset
);
2734 free_frozen_page_commit(zone
, pcp
, page
, migratetype
, order
, fpi_flags
);
2735 pcp_spin_unlock(pcp
);
2737 free_one_page(zone
, page
, pfn
, order
, fpi_flags
);
2739 pcp_trylock_finish(UP_flags
);
2742 void free_frozen_pages(struct page
*page
, unsigned int order
)
2744 __free_frozen_pages(page
, order
, FPI_NONE
);
2748 * Free a batch of folios
2750 void free_unref_folios(struct folio_batch
*folios
)
2752 unsigned long __maybe_unused UP_flags
;
2753 struct per_cpu_pages
*pcp
= NULL
;
2754 struct zone
*locked_zone
= NULL
;
2757 /* Prepare folios for freeing */
2758 for (i
= 0, j
= 0; i
< folios
->nr
; i
++) {
2759 struct folio
*folio
= folios
->folios
[i
];
2760 unsigned long pfn
= folio_pfn(folio
);
2761 unsigned int order
= folio_order(folio
);
2763 if (!free_pages_prepare(&folio
->page
, order
))
2766 * Free orders not handled on the PCP directly to the
2769 if (!pcp_allowed_order(order
)) {
2770 free_one_page(folio_zone(folio
), &folio
->page
,
2771 pfn
, order
, FPI_NONE
);
2774 folio
->private = (void *)(unsigned long)order
;
2776 folios
->folios
[j
] = folio
;
2781 for (i
= 0; i
< folios
->nr
; i
++) {
2782 struct folio
*folio
= folios
->folios
[i
];
2783 struct zone
*zone
= folio_zone(folio
);
2784 unsigned long pfn
= folio_pfn(folio
);
2785 unsigned int order
= (unsigned long)folio
->private;
2788 folio
->private = NULL
;
2789 migratetype
= get_pfnblock_migratetype(&folio
->page
, pfn
);
2791 /* Different zone requires a different pcp lock */
2792 if (zone
!= locked_zone
||
2793 is_migrate_isolate(migratetype
)) {
2795 pcp_spin_unlock(pcp
);
2796 pcp_trylock_finish(UP_flags
);
2802 * Free isolated pages directly to the
2803 * allocator, see comment in free_frozen_pages.
2805 if (is_migrate_isolate(migratetype
)) {
2806 free_one_page(zone
, &folio
->page
, pfn
,
2812 * trylock is necessary as folios may be getting freed
2813 * from IRQ or SoftIRQ context after an IO completion.
2815 pcp_trylock_prepare(UP_flags
);
2816 pcp
= pcp_spin_trylock(zone
->per_cpu_pageset
);
2817 if (unlikely(!pcp
)) {
2818 pcp_trylock_finish(UP_flags
);
2819 free_one_page(zone
, &folio
->page
, pfn
,
2827 * Non-isolated types over MIGRATE_PCPTYPES get added
2828 * to the MIGRATE_MOVABLE pcp list.
2830 if (unlikely(migratetype
>= MIGRATE_PCPTYPES
))
2831 migratetype
= MIGRATE_MOVABLE
;
2833 trace_mm_page_free_batched(&folio
->page
);
2834 free_frozen_page_commit(zone
, pcp
, &folio
->page
, migratetype
,
2839 pcp_spin_unlock(pcp
);
2840 pcp_trylock_finish(UP_flags
);
2842 folio_batch_reinit(folios
);
2846 * split_page takes a non-compound higher-order page, and splits it into
2847 * n (1<<order) sub-pages: page[0..n]
2848 * Each sub-page must be freed individually.
2850 * Note: this is probably too low level an operation for use in drivers.
2851 * Please consult with lkml before using this in your driver.
2853 void split_page(struct page
*page
, unsigned int order
)
2857 VM_BUG_ON_PAGE(PageCompound(page
), page
);
2858 VM_BUG_ON_PAGE(!page_count(page
), page
);
2860 for (i
= 1; i
< (1 << order
); i
++)
2861 set_page_refcounted(page
+ i
);
2862 split_page_owner(page
, order
, 0);
2863 pgalloc_tag_split(page_folio(page
), order
, 0);
2864 split_page_memcg(page
, order
);
2866 EXPORT_SYMBOL_GPL(split_page
);
2868 int __isolate_free_page(struct page
*page
, unsigned int order
)
2870 struct zone
*zone
= page_zone(page
);
2871 int mt
= get_pageblock_migratetype(page
);
2873 if (!is_migrate_isolate(mt
)) {
2874 unsigned long watermark
;
2876 * Obey watermarks as if the page was being allocated. We can
2877 * emulate a high-order watermark check with a raised order-0
2878 * watermark, because we already know our high-order page
2881 watermark
= zone
->_watermark
[WMARK_MIN
] + (1UL << order
);
2882 if (!zone_watermark_ok(zone
, 0, watermark
, 0, ALLOC_CMA
))
2886 del_page_from_free_list(page
, zone
, order
, mt
);
2889 * Set the pageblock if the isolated page is at least half of a
2892 if (order
>= pageblock_order
- 1) {
2893 struct page
*endpage
= page
+ (1 << order
) - 1;
2894 for (; page
< endpage
; page
+= pageblock_nr_pages
) {
2895 int mt
= get_pageblock_migratetype(page
);
2897 * Only change normal pageblocks (i.e., they can merge
2900 if (migratetype_is_mergeable(mt
))
2901 move_freepages_block(zone
, page
, mt
,
2906 return 1UL << order
;
2910 * __putback_isolated_page - Return a now-isolated page back where we got it
2911 * @page: Page that was isolated
2912 * @order: Order of the isolated page
2913 * @mt: The page's pageblock's migratetype
2915 * This function is meant to return a page pulled from the free lists via
2916 * __isolate_free_page back to the free lists they were pulled from.
2918 void __putback_isolated_page(struct page
*page
, unsigned int order
, int mt
)
2920 struct zone
*zone
= page_zone(page
);
2922 /* zone lock should be held when this function is called */
2923 lockdep_assert_held(&zone
->lock
);
2925 /* Return isolated page to tail of freelist. */
2926 __free_one_page(page
, page_to_pfn(page
), zone
, order
, mt
,
2927 FPI_SKIP_REPORT_NOTIFY
| FPI_TO_TAIL
);
2931 * Update NUMA hit/miss statistics
2933 static inline void zone_statistics(struct zone
*preferred_zone
, struct zone
*z
,
2937 enum numa_stat_item local_stat
= NUMA_LOCAL
;
2939 /* skip numa counters update if numa stats is disabled */
2940 if (!static_branch_likely(&vm_numa_stat_key
))
2943 if (zone_to_nid(z
) != numa_node_id())
2944 local_stat
= NUMA_OTHER
;
2946 if (zone_to_nid(z
) == zone_to_nid(preferred_zone
))
2947 __count_numa_events(z
, NUMA_HIT
, nr_account
);
2949 __count_numa_events(z
, NUMA_MISS
, nr_account
);
2950 __count_numa_events(preferred_zone
, NUMA_FOREIGN
, nr_account
);
2952 __count_numa_events(z
, local_stat
, nr_account
);
2956 static __always_inline
2957 struct page
*rmqueue_buddy(struct zone
*preferred_zone
, struct zone
*zone
,
2958 unsigned int order
, unsigned int alloc_flags
,
2962 unsigned long flags
;
2966 if (unlikely(alloc_flags
& ALLOC_TRYLOCK
)) {
2967 if (!spin_trylock_irqsave(&zone
->lock
, flags
))
2970 spin_lock_irqsave(&zone
->lock
, flags
);
2972 if (alloc_flags
& ALLOC_HIGHATOMIC
)
2973 page
= __rmqueue_smallest(zone
, order
, MIGRATE_HIGHATOMIC
);
2975 enum rmqueue_mode rmqm
= RMQUEUE_NORMAL
;
2977 page
= __rmqueue(zone
, order
, migratetype
, alloc_flags
, &rmqm
);
2980 * If the allocation fails, allow OOM handling and
2981 * order-0 (atomic) allocs access to HIGHATOMIC
2982 * reserves as failing now is worse than failing a
2983 * high-order atomic allocation in the future.
2985 if (!page
&& (alloc_flags
& (ALLOC_OOM
|ALLOC_NON_BLOCK
)))
2986 page
= __rmqueue_smallest(zone
, order
, MIGRATE_HIGHATOMIC
);
2989 spin_unlock_irqrestore(&zone
->lock
, flags
);
2993 spin_unlock_irqrestore(&zone
->lock
, flags
);
2994 } while (check_new_pages(page
, order
));
2996 __count_zid_vm_events(PGALLOC
, page_zonenum(page
), 1 << order
);
2997 zone_statistics(preferred_zone
, zone
, 1);
3002 static int nr_pcp_alloc(struct per_cpu_pages
*pcp
, struct zone
*zone
, int order
)
3004 int high
, base_batch
, batch
, max_nr_alloc
;
3005 int high_max
, high_min
;
3007 base_batch
= READ_ONCE(pcp
->batch
);
3008 high_min
= READ_ONCE(pcp
->high_min
);
3009 high_max
= READ_ONCE(pcp
->high_max
);
3010 high
= pcp
->high
= clamp(pcp
->high
, high_min
, high_max
);
3012 /* Check for PCP disabled or boot pageset */
3013 if (unlikely(high
< base_batch
))
3019 batch
= (base_batch
<< pcp
->alloc_factor
);
3022 * If we had larger pcp->high, we could avoid to allocate from
3025 if (high_min
!= high_max
&& !test_bit(ZONE_BELOW_HIGH
, &zone
->flags
))
3026 high
= pcp
->high
= min(high
+ batch
, high_max
);
3029 max_nr_alloc
= max(high
- pcp
->count
- base_batch
, base_batch
);
3031 * Double the number of pages allocated each time there is
3032 * subsequent allocation of order-0 pages without any freeing.
3034 if (batch
<= max_nr_alloc
&&
3035 pcp
->alloc_factor
< CONFIG_PCP_BATCH_SCALE_MAX
)
3036 pcp
->alloc_factor
++;
3037 batch
= min(batch
, max_nr_alloc
);
3041 * Scale batch relative to order if batch implies free pages
3042 * can be stored on the PCP. Batch can be 1 for small zones or
3043 * for boot pagesets which should never store free pages as
3044 * the pages may belong to arbitrary zones.
3047 batch
= max(batch
>> order
, 2);
3052 /* Remove page from the per-cpu list, caller must protect the list */
3054 struct page
*__rmqueue_pcplist(struct zone
*zone
, unsigned int order
,
3056 unsigned int alloc_flags
,
3057 struct per_cpu_pages
*pcp
,
3058 struct list_head
*list
)
3063 if (list_empty(list
)) {
3064 int batch
= nr_pcp_alloc(pcp
, zone
, order
);
3067 alloced
= rmqueue_bulk(zone
, order
,
3069 migratetype
, alloc_flags
);
3071 pcp
->count
+= alloced
<< order
;
3072 if (unlikely(list_empty(list
)))
3076 page
= list_first_entry(list
, struct page
, pcp_list
);
3077 list_del(&page
->pcp_list
);
3078 pcp
->count
-= 1 << order
;
3079 } while (check_new_pages(page
, order
));
3084 /* Lock and remove page from the per-cpu list */
3085 static struct page
*rmqueue_pcplist(struct zone
*preferred_zone
,
3086 struct zone
*zone
, unsigned int order
,
3087 int migratetype
, unsigned int alloc_flags
)
3089 struct per_cpu_pages
*pcp
;
3090 struct list_head
*list
;
3092 unsigned long __maybe_unused UP_flags
;
3094 /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
3095 pcp_trylock_prepare(UP_flags
);
3096 pcp
= pcp_spin_trylock(zone
->per_cpu_pageset
);
3098 pcp_trylock_finish(UP_flags
);
3103 * On allocation, reduce the number of pages that are batch freed.
3104 * See nr_pcp_free() where free_factor is increased for subsequent
3107 pcp
->free_count
>>= 1;
3108 list
= &pcp
->lists
[order_to_pindex(migratetype
, order
)];
3109 page
= __rmqueue_pcplist(zone
, order
, migratetype
, alloc_flags
, pcp
, list
);
3110 pcp_spin_unlock(pcp
);
3111 pcp_trylock_finish(UP_flags
);
3113 __count_zid_vm_events(PGALLOC
, page_zonenum(page
), 1 << order
);
3114 zone_statistics(preferred_zone
, zone
, 1);
3120 * Allocate a page from the given zone.
3121 * Use pcplists for THP or "cheap" high-order allocations.
3125 * Do not instrument rmqueue() with KMSAN. This function may call
3126 * __msan_poison_alloca() through a call to set_pfnblock_flags_mask().
3127 * If __msan_poison_alloca() attempts to allocate pages for the stack depot, it
3128 * may call rmqueue() again, which will result in a deadlock.
3130 __no_sanitize_memory
3132 struct page
*rmqueue(struct zone
*preferred_zone
,
3133 struct zone
*zone
, unsigned int order
,
3134 gfp_t gfp_flags
, unsigned int alloc_flags
,
3139 if (likely(pcp_allowed_order(order
))) {
3140 page
= rmqueue_pcplist(preferred_zone
, zone
, order
,
3141 migratetype
, alloc_flags
);
3146 page
= rmqueue_buddy(preferred_zone
, zone
, order
, alloc_flags
,
3150 /* Separate test+clear to avoid unnecessary atomics */
3151 if ((alloc_flags
& ALLOC_KSWAPD
) &&
3152 unlikely(test_bit(ZONE_BOOSTED_WATERMARK
, &zone
->flags
))) {
3153 clear_bit(ZONE_BOOSTED_WATERMARK
, &zone
->flags
);
3154 wakeup_kswapd(zone
, 0, 0, zone_idx(zone
));
3157 VM_BUG_ON_PAGE(page
&& bad_range(zone
, page
), page
);
3162 * Reserve the pageblock(s) surrounding an allocation request for
3163 * exclusive use of high-order atomic allocations if there are no
3164 * empty page blocks that contain a page with a suitable order
3166 static void reserve_highatomic_pageblock(struct page
*page
, int order
,
3170 unsigned long max_managed
, flags
;
3173 * The number reserved as: minimum is 1 pageblock, maximum is
3174 * roughly 1% of a zone. But if 1% of a zone falls below a
3175 * pageblock size, then don't reserve any pageblocks.
3176 * Check is race-prone but harmless.
3178 if ((zone_managed_pages(zone
) / 100) < pageblock_nr_pages
)
3180 max_managed
= ALIGN((zone_managed_pages(zone
) / 100), pageblock_nr_pages
);
3181 if (zone
->nr_reserved_highatomic
>= max_managed
)
3184 spin_lock_irqsave(&zone
->lock
, flags
);
3186 /* Recheck the nr_reserved_highatomic limit under the lock */
3187 if (zone
->nr_reserved_highatomic
>= max_managed
)
3191 mt
= get_pageblock_migratetype(page
);
3192 /* Only reserve normal pageblocks (i.e., they can merge with others) */
3193 if (!migratetype_is_mergeable(mt
))
3196 if (order
< pageblock_order
) {
3197 if (move_freepages_block(zone
, page
, mt
, MIGRATE_HIGHATOMIC
) == -1)
3199 zone
->nr_reserved_highatomic
+= pageblock_nr_pages
;
3201 change_pageblock_range(page
, order
, MIGRATE_HIGHATOMIC
);
3202 zone
->nr_reserved_highatomic
+= 1 << order
;
3206 spin_unlock_irqrestore(&zone
->lock
, flags
);
3210 * Used when an allocation is about to fail under memory pressure. This
3211 * potentially hurts the reliability of high-order allocations when under
3212 * intense memory pressure but failed atomic allocations should be easier
3213 * to recover from than an OOM.
3215 * If @force is true, try to unreserve pageblocks even though highatomic
3216 * pageblock is exhausted.
3218 static bool unreserve_highatomic_pageblock(const struct alloc_context
*ac
,
3221 struct zonelist
*zonelist
= ac
->zonelist
;
3222 unsigned long flags
;
3229 for_each_zone_zonelist_nodemask(zone
, z
, zonelist
, ac
->highest_zoneidx
,
3232 * Preserve at least one pageblock unless memory pressure
3235 if (!force
&& zone
->nr_reserved_highatomic
<=
3239 spin_lock_irqsave(&zone
->lock
, flags
);
3240 for (order
= 0; order
< NR_PAGE_ORDERS
; order
++) {
3241 struct free_area
*area
= &(zone
->free_area
[order
]);
3244 page
= get_page_from_free_area(area
, MIGRATE_HIGHATOMIC
);
3248 size
= max(pageblock_nr_pages
, 1UL << order
);
3250 * It should never happen but changes to
3251 * locking could inadvertently allow a per-cpu
3252 * drain to add pages to MIGRATE_HIGHATOMIC
3253 * while unreserving so be safe and watch for
3256 if (WARN_ON_ONCE(size
> zone
->nr_reserved_highatomic
))
3257 size
= zone
->nr_reserved_highatomic
;
3258 zone
->nr_reserved_highatomic
-= size
;
3261 * Convert to ac->migratetype and avoid the normal
3262 * pageblock stealing heuristics. Minimally, the caller
3263 * is doing the work and needs the pages. More
3264 * importantly, if the block was always converted to
3265 * MIGRATE_UNMOVABLE or another type then the number
3266 * of pageblocks that cannot be completely freed
3269 if (order
< pageblock_order
)
3270 ret
= move_freepages_block(zone
, page
,
3274 move_to_free_list(page
, zone
, order
,
3277 change_pageblock_range(page
, order
,
3282 * Reserving the block(s) already succeeded,
3283 * so this should not fail on zone boundaries.
3285 WARN_ON_ONCE(ret
== -1);
3287 spin_unlock_irqrestore(&zone
->lock
, flags
);
3291 spin_unlock_irqrestore(&zone
->lock
, flags
);
3297 static inline long __zone_watermark_unusable_free(struct zone
*z
,
3298 unsigned int order
, unsigned int alloc_flags
)
3300 long unusable_free
= (1 << order
) - 1;
3303 * If the caller does not have rights to reserves below the min
3304 * watermark then subtract the free pages reserved for highatomic.
3306 if (likely(!(alloc_flags
& ALLOC_RESERVES
)))
3307 unusable_free
+= READ_ONCE(z
->nr_free_highatomic
);
3310 /* If allocation can't use CMA areas don't use free CMA pages */
3311 if (!(alloc_flags
& ALLOC_CMA
))
3312 unusable_free
+= zone_page_state(z
, NR_FREE_CMA_PAGES
);
3315 return unusable_free
;
3319 * Return true if free base pages are above 'mark'. For high-order checks it
3320 * will return true of the order-0 watermark is reached and there is at least
3321 * one free page of a suitable size. Checking now avoids taking the zone lock
3322 * to check in the allocation paths if no pages are free.
3324 bool __zone_watermark_ok(struct zone
*z
, unsigned int order
, unsigned long mark
,
3325 int highest_zoneidx
, unsigned int alloc_flags
,
3331 /* free_pages may go negative - that's OK */
3332 free_pages
-= __zone_watermark_unusable_free(z
, order
, alloc_flags
);
3334 if (unlikely(alloc_flags
& ALLOC_RESERVES
)) {
3336 * __GFP_HIGH allows access to 50% of the min reserve as well
3339 if (alloc_flags
& ALLOC_MIN_RESERVE
) {
3343 * Non-blocking allocations (e.g. GFP_ATOMIC) can
3344 * access more reserves than just __GFP_HIGH. Other
3345 * non-blocking allocations requests such as GFP_NOWAIT
3346 * or (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) do not get
3347 * access to the min reserve.
3349 if (alloc_flags
& ALLOC_NON_BLOCK
)
3354 * OOM victims can try even harder than the normal reserve
3355 * users on the grounds that it's definitely going to be in
3356 * the exit path shortly and free memory. Any allocation it
3357 * makes during the free path will be small and short-lived.
3359 if (alloc_flags
& ALLOC_OOM
)
3364 * Check watermarks for an order-0 allocation request. If these
3365 * are not met, then a high-order request also cannot go ahead
3366 * even if a suitable page happened to be free.
3368 if (free_pages
<= min
+ z
->lowmem_reserve
[highest_zoneidx
])
3371 /* If this is an order-0 request then the watermark is fine */
3375 /* For a high-order request, check at least one suitable page is free */
3376 for (o
= order
; o
< NR_PAGE_ORDERS
; o
++) {
3377 struct free_area
*area
= &z
->free_area
[o
];
3383 for (mt
= 0; mt
< MIGRATE_PCPTYPES
; mt
++) {
3384 if (!free_area_empty(area
, mt
))
3389 if ((alloc_flags
& ALLOC_CMA
) &&
3390 !free_area_empty(area
, MIGRATE_CMA
)) {
3394 if ((alloc_flags
& (ALLOC_HIGHATOMIC
|ALLOC_OOM
)) &&
3395 !free_area_empty(area
, MIGRATE_HIGHATOMIC
)) {
3402 bool zone_watermark_ok(struct zone
*z
, unsigned int order
, unsigned long mark
,
3403 int highest_zoneidx
, unsigned int alloc_flags
)
3405 return __zone_watermark_ok(z
, order
, mark
, highest_zoneidx
, alloc_flags
,
3406 zone_page_state(z
, NR_FREE_PAGES
));
3409 static inline bool zone_watermark_fast(struct zone
*z
, unsigned int order
,
3410 unsigned long mark
, int highest_zoneidx
,
3411 unsigned int alloc_flags
, gfp_t gfp_mask
)
3415 free_pages
= zone_page_state(z
, NR_FREE_PAGES
);
3418 * Fast check for order-0 only. If this fails then the reserves
3419 * need to be calculated.
3425 usable_free
= free_pages
;
3426 reserved
= __zone_watermark_unusable_free(z
, 0, alloc_flags
);
3428 /* reserved may over estimate high-atomic reserves. */
3429 usable_free
-= min(usable_free
, reserved
);
3430 if (usable_free
> mark
+ z
->lowmem_reserve
[highest_zoneidx
])
3434 if (__zone_watermark_ok(z
, order
, mark
, highest_zoneidx
, alloc_flags
,
3439 * Ignore watermark boosting for __GFP_HIGH order-0 allocations
3440 * when checking the min watermark. The min watermark is the
3441 * point where boosting is ignored so that kswapd is woken up
3442 * when below the low watermark.
3444 if (unlikely(!order
&& (alloc_flags
& ALLOC_MIN_RESERVE
) && z
->watermark_boost
3445 && ((alloc_flags
& ALLOC_WMARK_MASK
) == WMARK_MIN
))) {
3446 mark
= z
->_watermark
[WMARK_MIN
];
3447 return __zone_watermark_ok(z
, order
, mark
, highest_zoneidx
,
3448 alloc_flags
, free_pages
);
3455 int __read_mostly node_reclaim_distance
= RECLAIM_DISTANCE
;
3457 static bool zone_allows_reclaim(struct zone
*local_zone
, struct zone
*zone
)
3459 return node_distance(zone_to_nid(local_zone
), zone_to_nid(zone
)) <=
3460 node_reclaim_distance
;
3462 #else /* CONFIG_NUMA */
3463 static bool zone_allows_reclaim(struct zone
*local_zone
, struct zone
*zone
)
3467 #endif /* CONFIG_NUMA */
3470 * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
3471 * fragmentation is subtle. If the preferred zone was HIGHMEM then
3472 * premature use of a lower zone may cause lowmem pressure problems that
3473 * are worse than fragmentation. If the next zone is ZONE_DMA then it is
3474 * probably too small. It only makes sense to spread allocations to avoid
3475 * fragmentation between the Normal and DMA32 zones.
3477 static inline unsigned int
3478 alloc_flags_nofragment(struct zone
*zone
, gfp_t gfp_mask
)
3480 unsigned int alloc_flags
;
3483 * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
3486 alloc_flags
= (__force
int) (gfp_mask
& __GFP_KSWAPD_RECLAIM
);
3489 alloc_flags
|= ALLOC_NOFRAGMENT
;
3493 #ifdef CONFIG_ZONE_DMA32
3497 if (zone_idx(zone
) != ZONE_NORMAL
)
3501 * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
3502 * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
3503 * on UMA that if Normal is populated then so is DMA32.
3505 BUILD_BUG_ON(ZONE_NORMAL
- ZONE_DMA32
!= 1);
3506 if (nr_online_nodes
> 1 && !populated_zone(--zone
))
3509 alloc_flags
|= ALLOC_NOFRAGMENT
;
3510 #endif /* CONFIG_ZONE_DMA32 */
3514 /* Must be called after current_gfp_context() which can change gfp_mask */
3515 static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask
,
3516 unsigned int alloc_flags
)
3519 if (gfp_migratetype(gfp_mask
) == MIGRATE_MOVABLE
)
3520 alloc_flags
|= ALLOC_CMA
;
3526 * get_page_from_freelist goes through the zonelist trying to allocate
3529 static struct page
*
3530 get_page_from_freelist(gfp_t gfp_mask
, unsigned int order
, int alloc_flags
,
3531 const struct alloc_context
*ac
)
3535 struct pglist_data
*last_pgdat
= NULL
;
3536 bool last_pgdat_dirty_ok
= false;
3541 * Scan zonelist, looking for a zone with enough free.
3542 * See also cpuset_current_node_allowed() comment in kernel/cgroup/cpuset.c.
3544 no_fallback
= alloc_flags
& ALLOC_NOFRAGMENT
;
3545 z
= ac
->preferred_zoneref
;
3546 for_next_zone_zonelist_nodemask(zone
, z
, ac
->highest_zoneidx
,
3551 if (cpusets_enabled() &&
3552 (alloc_flags
& ALLOC_CPUSET
) &&
3553 !__cpuset_zone_allowed(zone
, gfp_mask
))
3556 * When allocating a page cache page for writing, we
3557 * want to get it from a node that is within its dirty
3558 * limit, such that no single node holds more than its
3559 * proportional share of globally allowed dirty pages.
3560 * The dirty limits take into account the node's
3561 * lowmem reserves and high watermark so that kswapd
3562 * should be able to balance it without having to
3563 * write pages from its LRU list.
3565 * XXX: For now, allow allocations to potentially
3566 * exceed the per-node dirty limit in the slowpath
3567 * (spread_dirty_pages unset) before going into reclaim,
3568 * which is important when on a NUMA setup the allowed
3569 * nodes are together not big enough to reach the
3570 * global limit. The proper fix for these situations
3571 * will require awareness of nodes in the
3572 * dirty-throttling and the flusher threads.
3574 if (ac
->spread_dirty_pages
) {
3575 if (last_pgdat
!= zone
->zone_pgdat
) {
3576 last_pgdat
= zone
->zone_pgdat
;
3577 last_pgdat_dirty_ok
= node_dirty_ok(zone
->zone_pgdat
);
3580 if (!last_pgdat_dirty_ok
)
3584 if (no_fallback
&& !defrag_mode
&& nr_online_nodes
> 1 &&
3585 zone
!= zonelist_zone(ac
->preferred_zoneref
)) {
3589 * If moving to a remote node, retry but allow
3590 * fragmenting fallbacks. Locality is more important
3591 * than fragmentation avoidance.
3593 local_nid
= zonelist_node_idx(ac
->preferred_zoneref
);
3594 if (zone_to_nid(zone
) != local_nid
) {
3595 alloc_flags
&= ~ALLOC_NOFRAGMENT
;
3600 cond_accept_memory(zone
, order
, alloc_flags
);
3603 * Detect whether the number of free pages is below high
3604 * watermark. If so, we will decrease pcp->high and free
3605 * PCP pages in free path to reduce the possibility of
3606 * premature page reclaiming. Detection is done here to
3607 * avoid to do that in hotter free path.
3609 if (test_bit(ZONE_BELOW_HIGH
, &zone
->flags
))
3610 goto check_alloc_wmark
;
3612 mark
= high_wmark_pages(zone
);
3613 if (zone_watermark_fast(zone
, order
, mark
,
3614 ac
->highest_zoneidx
, alloc_flags
,
3618 set_bit(ZONE_BELOW_HIGH
, &zone
->flags
);
3621 mark
= wmark_pages(zone
, alloc_flags
& ALLOC_WMARK_MASK
);
3622 if (!zone_watermark_fast(zone
, order
, mark
,
3623 ac
->highest_zoneidx
, alloc_flags
,
3627 if (cond_accept_memory(zone
, order
, alloc_flags
))
3631 * Watermark failed for this zone, but see if we can
3632 * grow this zone if it contains deferred pages.
3634 if (deferred_pages_enabled()) {
3635 if (_deferred_grow_zone(zone
, order
))
3638 /* Checked here to keep the fast path fast */
3639 BUILD_BUG_ON(ALLOC_NO_WATERMARKS
< NR_WMARK
);
3640 if (alloc_flags
& ALLOC_NO_WATERMARKS
)
3643 if (!node_reclaim_enabled() ||
3644 !zone_allows_reclaim(zonelist_zone(ac
->preferred_zoneref
), zone
))
3647 ret
= node_reclaim(zone
->zone_pgdat
, gfp_mask
, order
);
3649 case NODE_RECLAIM_NOSCAN
:
3652 case NODE_RECLAIM_FULL
:
3653 /* scanned but unreclaimable */
3656 /* did we reclaim enough */
3657 if (zone_watermark_ok(zone
, order
, mark
,
3658 ac
->highest_zoneidx
, alloc_flags
))
3666 page
= rmqueue(zonelist_zone(ac
->preferred_zoneref
), zone
, order
,
3667 gfp_mask
, alloc_flags
, ac
->migratetype
);
3669 prep_new_page(page
, order
, gfp_mask
, alloc_flags
);
3672 * If this is a high-order atomic allocation then check
3673 * if the pageblock should be reserved for the future
3675 if (unlikely(alloc_flags
& ALLOC_HIGHATOMIC
))
3676 reserve_highatomic_pageblock(page
, order
, zone
);
3680 if (cond_accept_memory(zone
, order
, alloc_flags
))
3683 /* Try again if zone has deferred pages */
3684 if (deferred_pages_enabled()) {
3685 if (_deferred_grow_zone(zone
, order
))
3692 * It's possible on a UMA machine to get through all zones that are
3693 * fragmented. If avoiding fragmentation, reset and try again.
3695 if (no_fallback
&& !defrag_mode
) {
3696 alloc_flags
&= ~ALLOC_NOFRAGMENT
;
3703 static void warn_alloc_show_mem(gfp_t gfp_mask
, nodemask_t
*nodemask
)
3705 unsigned int filter
= SHOW_MEM_FILTER_NODES
;
3708 * This documents exceptions given to allocations in certain
3709 * contexts that are allowed to allocate outside current's set
3712 if (!(gfp_mask
& __GFP_NOMEMALLOC
))
3713 if (tsk_is_oom_victim(current
) ||
3714 (current
->flags
& (PF_MEMALLOC
| PF_EXITING
)))
3715 filter
&= ~SHOW_MEM_FILTER_NODES
;
3716 if (!in_task() || !(gfp_mask
& __GFP_DIRECT_RECLAIM
))
3717 filter
&= ~SHOW_MEM_FILTER_NODES
;
3719 __show_mem(filter
, nodemask
, gfp_zone(gfp_mask
));
3722 void warn_alloc(gfp_t gfp_mask
, nodemask_t
*nodemask
, const char *fmt
, ...)
3724 struct va_format vaf
;
3726 static DEFINE_RATELIMIT_STATE(nopage_rs
, 10*HZ
, 1);
3728 if ((gfp_mask
& __GFP_NOWARN
) ||
3729 !__ratelimit(&nopage_rs
) ||
3730 ((gfp_mask
& __GFP_DMA
) && !has_managed_dma()))
3733 va_start(args
, fmt
);
3736 pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
3737 current
->comm
, &vaf
, gfp_mask
, &gfp_mask
,
3738 nodemask_pr_args(nodemask
));
3741 cpuset_print_current_mems_allowed();
3744 warn_alloc_show_mem(gfp_mask
, nodemask
);
3747 static inline struct page
*
3748 __alloc_pages_cpuset_fallback(gfp_t gfp_mask
, unsigned int order
,
3749 unsigned int alloc_flags
,
3750 const struct alloc_context
*ac
)
3754 page
= get_page_from_freelist(gfp_mask
, order
,
3755 alloc_flags
|ALLOC_CPUSET
, ac
);
3757 * fallback to ignore cpuset restriction if our nodes
3761 page
= get_page_from_freelist(gfp_mask
, order
,
3766 static inline struct page
*
3767 __alloc_pages_may_oom(gfp_t gfp_mask
, unsigned int order
,
3768 const struct alloc_context
*ac
, unsigned long *did_some_progress
)
3770 struct oom_control oc
= {
3771 .zonelist
= ac
->zonelist
,
3772 .nodemask
= ac
->nodemask
,
3774 .gfp_mask
= gfp_mask
,
3779 *did_some_progress
= 0;
3782 * Acquire the oom lock. If that fails, somebody else is
3783 * making progress for us.
3785 if (!mutex_trylock(&oom_lock
)) {
3786 *did_some_progress
= 1;
3787 schedule_timeout_uninterruptible(1);
3792 * Go through the zonelist yet one more time, keep very high watermark
3793 * here, this is only to catch a parallel oom killing, we must fail if
3794 * we're still under heavy pressure. But make sure that this reclaim
3795 * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
3796 * allocation which will never fail due to oom_lock already held.
3798 page
= get_page_from_freelist((gfp_mask
| __GFP_HARDWALL
) &
3799 ~__GFP_DIRECT_RECLAIM
, order
,
3800 ALLOC_WMARK_HIGH
|ALLOC_CPUSET
, ac
);
3804 /* Coredumps can quickly deplete all memory reserves */
3805 if (current
->flags
& PF_DUMPCORE
)
3807 /* The OOM killer will not help higher order allocs */
3808 if (order
> PAGE_ALLOC_COSTLY_ORDER
)
3811 * We have already exhausted all our reclaim opportunities without any
3812 * success so it is time to admit defeat. We will skip the OOM killer
3813 * because it is very likely that the caller has a more reasonable
3814 * fallback than shooting a random task.
3816 * The OOM killer may not free memory on a specific node.
3818 if (gfp_mask
& (__GFP_RETRY_MAYFAIL
| __GFP_THISNODE
))
3820 /* The OOM killer does not needlessly kill tasks for lowmem */
3821 if (ac
->highest_zoneidx
< ZONE_NORMAL
)
3823 if (pm_suspended_storage())
3826 * XXX: GFP_NOFS allocations should rather fail than rely on
3827 * other request to make a forward progress.
3828 * We are in an unfortunate situation where out_of_memory cannot
3829 * do much for this context but let's try it to at least get
3830 * access to memory reserved if the current task is killed (see
3831 * out_of_memory). Once filesystems are ready to handle allocation
3832 * failures more gracefully we should just bail out here.
3835 /* Exhausted what can be done so it's blame time */
3836 if (out_of_memory(&oc
) ||
3837 WARN_ON_ONCE_GFP(gfp_mask
& __GFP_NOFAIL
, gfp_mask
)) {
3838 *did_some_progress
= 1;
3841 * Help non-failing allocations by giving them access to memory
3844 if (gfp_mask
& __GFP_NOFAIL
)
3845 page
= __alloc_pages_cpuset_fallback(gfp_mask
, order
,
3846 ALLOC_NO_WATERMARKS
, ac
);
3849 mutex_unlock(&oom_lock
);
3854 * Maximum number of compaction retries with a progress before OOM
3855 * killer is consider as the only way to move forward.
3857 #define MAX_COMPACT_RETRIES 16
3859 #ifdef CONFIG_COMPACTION
3860 /* Try memory compaction for high-order allocations before reclaim */
3861 static struct page
*
3862 __alloc_pages_direct_compact(gfp_t gfp_mask
, unsigned int order
,
3863 unsigned int alloc_flags
, const struct alloc_context
*ac
,
3864 enum compact_priority prio
, enum compact_result
*compact_result
)
3866 struct page
*page
= NULL
;
3867 unsigned long pflags
;
3868 unsigned int noreclaim_flag
;
3873 psi_memstall_enter(&pflags
);
3874 delayacct_compact_start();
3875 noreclaim_flag
= memalloc_noreclaim_save();
3877 *compact_result
= try_to_compact_pages(gfp_mask
, order
, alloc_flags
, ac
,
3880 memalloc_noreclaim_restore(noreclaim_flag
);
3881 psi_memstall_leave(&pflags
);
3882 delayacct_compact_end();
3884 if (*compact_result
== COMPACT_SKIPPED
)
3887 * At least in one zone compaction wasn't deferred or skipped, so let's
3888 * count a compaction stall
3890 count_vm_event(COMPACTSTALL
);
3892 /* Prep a captured page if available */
3894 prep_new_page(page
, order
, gfp_mask
, alloc_flags
);
3896 /* Try get a page from the freelist if available */
3898 page
= get_page_from_freelist(gfp_mask
, order
, alloc_flags
, ac
);
3901 struct zone
*zone
= page_zone(page
);
3903 zone
->compact_blockskip_flush
= false;
3904 compaction_defer_reset(zone
, order
, true);
3905 count_vm_event(COMPACTSUCCESS
);
3910 * It's bad if compaction run occurs and fails. The most likely reason
3911 * is that pages exist, but not enough to satisfy watermarks.
3913 count_vm_event(COMPACTFAIL
);
3921 should_compact_retry(struct alloc_context
*ac
, int order
, int alloc_flags
,
3922 enum compact_result compact_result
,
3923 enum compact_priority
*compact_priority
,
3924 int *compaction_retries
)
3926 int max_retries
= MAX_COMPACT_RETRIES
;
3929 int retries
= *compaction_retries
;
3930 enum compact_priority priority
= *compact_priority
;
3935 if (fatal_signal_pending(current
))
3939 * Compaction was skipped due to a lack of free order-0
3940 * migration targets. Continue if reclaim can help.
3942 if (compact_result
== COMPACT_SKIPPED
) {
3943 ret
= compaction_zonelist_suitable(ac
, order
, alloc_flags
);
3948 * Compaction managed to coalesce some page blocks, but the
3949 * allocation failed presumably due to a race. Retry some.
3951 if (compact_result
== COMPACT_SUCCESS
) {
3953 * !costly requests are much more important than
3954 * __GFP_RETRY_MAYFAIL costly ones because they are de
3955 * facto nofail and invoke OOM killer to move on while
3956 * costly can fail and users are ready to cope with
3957 * that. 1/4 retries is rather arbitrary but we would
3958 * need much more detailed feedback from compaction to
3959 * make a better decision.
3961 if (order
> PAGE_ALLOC_COSTLY_ORDER
)
3964 if (++(*compaction_retries
) <= max_retries
) {
3971 * Compaction failed. Retry with increasing priority.
3973 min_priority
= (order
> PAGE_ALLOC_COSTLY_ORDER
) ?
3974 MIN_COMPACT_COSTLY_PRIORITY
: MIN_COMPACT_PRIORITY
;
3976 if (*compact_priority
> min_priority
) {
3977 (*compact_priority
)--;
3978 *compaction_retries
= 0;
3982 trace_compact_retry(order
, priority
, compact_result
, retries
, max_retries
, ret
);
3986 static inline struct page
*
3987 __alloc_pages_direct_compact(gfp_t gfp_mask
, unsigned int order
,
3988 unsigned int alloc_flags
, const struct alloc_context
*ac
,
3989 enum compact_priority prio
, enum compact_result
*compact_result
)
3991 *compact_result
= COMPACT_SKIPPED
;
3996 should_compact_retry(struct alloc_context
*ac
, unsigned int order
, int alloc_flags
,
3997 enum compact_result compact_result
,
3998 enum compact_priority
*compact_priority
,
3999 int *compaction_retries
)
4004 if (!order
|| order
> PAGE_ALLOC_COSTLY_ORDER
)
4008 * There are setups with compaction disabled which would prefer to loop
4009 * inside the allocator rather than hit the oom killer prematurely.
4010 * Let's give them a good hope and keep retrying while the order-0
4011 * watermarks are OK.
4013 for_each_zone_zonelist_nodemask(zone
, z
, ac
->zonelist
,
4014 ac
->highest_zoneidx
, ac
->nodemask
) {
4015 if (zone_watermark_ok(zone
, 0, min_wmark_pages(zone
),
4016 ac
->highest_zoneidx
, alloc_flags
))
4021 #endif /* CONFIG_COMPACTION */
4023 #ifdef CONFIG_LOCKDEP
4024 static struct lockdep_map __fs_reclaim_map
=
4025 STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map
);
4027 static bool __need_reclaim(gfp_t gfp_mask
)
4029 /* no reclaim without waiting on it */
4030 if (!(gfp_mask
& __GFP_DIRECT_RECLAIM
))
4033 /* this guy won't enter reclaim */
4034 if (current
->flags
& PF_MEMALLOC
)
4037 if (gfp_mask
& __GFP_NOLOCKDEP
)
4043 void __fs_reclaim_acquire(unsigned long ip
)
4045 lock_acquire_exclusive(&__fs_reclaim_map
, 0, 0, NULL
, ip
);
4048 void __fs_reclaim_release(unsigned long ip
)
4050 lock_release(&__fs_reclaim_map
, ip
);
4053 void fs_reclaim_acquire(gfp_t gfp_mask
)
4055 gfp_mask
= current_gfp_context(gfp_mask
);
4057 if (__need_reclaim(gfp_mask
)) {
4058 if (gfp_mask
& __GFP_FS
)
4059 __fs_reclaim_acquire(_RET_IP_
);
4061 #ifdef CONFIG_MMU_NOTIFIER
4062 lock_map_acquire(&__mmu_notifier_invalidate_range_start_map
);
4063 lock_map_release(&__mmu_notifier_invalidate_range_start_map
);
4068 EXPORT_SYMBOL_GPL(fs_reclaim_acquire
);
4070 void fs_reclaim_release(gfp_t gfp_mask
)
4072 gfp_mask
= current_gfp_context(gfp_mask
);
4074 if (__need_reclaim(gfp_mask
)) {
4075 if (gfp_mask
& __GFP_FS
)
4076 __fs_reclaim_release(_RET_IP_
);
4079 EXPORT_SYMBOL_GPL(fs_reclaim_release
);
4083 * Zonelists may change due to hotplug during allocation. Detect when zonelists
4084 * have been rebuilt so allocation retries. Reader side does not lock and
4085 * retries the allocation if zonelist changes. Writer side is protected by the
4086 * embedded spin_lock.
4088 static DEFINE_SEQLOCK(zonelist_update_seq
);
4090 static unsigned int zonelist_iter_begin(void)
4092 if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE
))
4093 return read_seqbegin(&zonelist_update_seq
);
4098 static unsigned int check_retry_zonelist(unsigned int seq
)
4100 if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE
))
4101 return read_seqretry(&zonelist_update_seq
, seq
);
4106 /* Perform direct synchronous page reclaim */
4107 static unsigned long
4108 __perform_reclaim(gfp_t gfp_mask
, unsigned int order
,
4109 const struct alloc_context
*ac
)
4111 unsigned int noreclaim_flag
;
4112 unsigned long progress
;
4116 /* We now go into synchronous reclaim */
4117 cpuset_memory_pressure_bump();
4118 fs_reclaim_acquire(gfp_mask
);
4119 noreclaim_flag
= memalloc_noreclaim_save();
4121 progress
= try_to_free_pages(ac
->zonelist
, order
, gfp_mask
,
4124 memalloc_noreclaim_restore(noreclaim_flag
);
4125 fs_reclaim_release(gfp_mask
);
4132 /* The really slow allocator path where we enter direct reclaim */
4133 static inline struct page
*
4134 __alloc_pages_direct_reclaim(gfp_t gfp_mask
, unsigned int order
,
4135 unsigned int alloc_flags
, const struct alloc_context
*ac
,
4136 unsigned long *did_some_progress
)
4138 struct page
*page
= NULL
;
4139 unsigned long pflags
;
4140 bool drained
= false;
4142 psi_memstall_enter(&pflags
);
4143 *did_some_progress
= __perform_reclaim(gfp_mask
, order
, ac
);
4144 if (unlikely(!(*did_some_progress
)))
4148 page
= get_page_from_freelist(gfp_mask
, order
, alloc_flags
, ac
);
4151 * If an allocation failed after direct reclaim, it could be because
4152 * pages are pinned on the per-cpu lists or in high alloc reserves.
4153 * Shrink them and try again
4155 if (!page
&& !drained
) {
4156 unreserve_highatomic_pageblock(ac
, false);
4157 drain_all_pages(NULL
);
4162 psi_memstall_leave(&pflags
);
4167 static void wake_all_kswapds(unsigned int order
, gfp_t gfp_mask
,
4168 const struct alloc_context
*ac
)
4172 pg_data_t
*last_pgdat
= NULL
;
4173 enum zone_type highest_zoneidx
= ac
->highest_zoneidx
;
4174 unsigned int reclaim_order
;
4177 reclaim_order
= max(order
, pageblock_order
);
4179 reclaim_order
= order
;
4181 for_each_zone_zonelist_nodemask(zone
, z
, ac
->zonelist
, highest_zoneidx
,
4183 if (!managed_zone(zone
))
4185 if (last_pgdat
== zone
->zone_pgdat
)
4187 wakeup_kswapd(zone
, gfp_mask
, reclaim_order
, highest_zoneidx
);
4188 last_pgdat
= zone
->zone_pgdat
;
static inline unsigned int
gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
{
	unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;

	/*
	 * __GFP_HIGH is assumed to be the same as ALLOC_MIN_RESERVE
	 * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
	 * to save two branches.
	 */
	BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_MIN_RESERVE);
	BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD);

	/*
	 * The caller may dip into page reserves a bit more if the caller
	 * cannot run direct reclaim, or if the caller has realtime scheduling
	 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
	 * set both ALLOC_NON_BLOCK and ALLOC_MIN_RESERVE(__GFP_HIGH).
	 */
	alloc_flags |= (__force int)
		(gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));

	if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
		/*
		 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
		 * if it can't schedule.
		 */
		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
			alloc_flags |= ALLOC_NON_BLOCK;

			if (order > 0)
				alloc_flags |= ALLOC_HIGHATOMIC;
		}

		/*
		 * Ignore cpuset mems for non-blocking __GFP_HIGH (probably
		 * GFP_ATOMIC) rather than fail, see the comment for
		 * cpuset_current_node_allowed().
		 */
		if (alloc_flags & ALLOC_MIN_RESERVE)
			alloc_flags &= ~ALLOC_CPUSET;
	} else if (unlikely(rt_or_dl_task(current)) && in_task())
		alloc_flags |= ALLOC_MIN_RESERVE;

	alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);

	if (defrag_mode)
		alloc_flags |= ALLOC_NOFRAGMENT;

	return alloc_flags;
}
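/*
 * Example: GFP_ATOMIC is __GFP_HIGH | __GFP_KSWAPD_RECLAIM, so the mask copy
 * above yields ALLOC_MIN_RESERVE | ALLOC_KSWAPD, and the !__GFP_DIRECT_RECLAIM
 * branch then adds ALLOC_NON_BLOCK (plus ALLOC_HIGHATOMIC for order > 0).
 */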
static bool oom_reserves_allowed(struct task_struct *tsk)
{
	if (!tsk_is_oom_victim(tsk))
		return false;

	/*
	 * !MMU doesn't have oom reaper so give access to memory reserves
	 * only to the thread with TIF_MEMDIE set
	 */
	if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
		return false;

	return true;
}
/*
 * Distinguish requests which really need access to full memory
 * reserves from oom victims which can live with a portion of it
 */
static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
{
	if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
		return 0;
	if (gfp_mask & __GFP_MEMALLOC)
		return ALLOC_NO_WATERMARKS;
	if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
		return ALLOC_NO_WATERMARKS;
	if (!in_interrupt()) {
		if (current->flags & PF_MEMALLOC)
			return ALLOC_NO_WATERMARKS;
		else if (oom_reserves_allowed(current))
			return ALLOC_OOM;
	}

	return 0;
}

bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
{
	return !!__gfp_pfmemalloc_flags(gfp_mask);
}
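/*
 * The returned value is used directly as reserve_flags in the slow path: 0
 * means no access to reserves, ALLOC_OOM grants an OOM victim a portion of
 * them, and ALLOC_NO_WATERMARKS ignores the watermarks entirely.
 */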
/*
 * Checks whether it makes sense to retry the reclaim to make a forward progress
 * for the given allocation request.
 *
 * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
 * without success, or when we couldn't even meet the watermark if we
 * reclaimed all remaining pages on the LRU lists.
 *
 * Returns true if a retry is viable or false to enter the oom path.
 */
static inline bool
should_reclaim_retry(gfp_t gfp_mask, unsigned order,
		     struct alloc_context *ac, int alloc_flags,
		     bool did_some_progress, int *no_progress_loops)
{
	struct zone *zone;
	struct zoneref *z;
	bool ret = false;

	/*
	 * Costly allocations might have made a progress but this doesn't mean
	 * their order will become available due to high fragmentation so
	 * always increment the no progress counter for them
	 */
	if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
		*no_progress_loops = 0;
	else
		(*no_progress_loops)++;

	if (*no_progress_loops > MAX_RECLAIM_RETRIES)
		goto out;

	/*
	 * Keep reclaiming pages while there is a chance this will lead
	 * somewhere. If none of the target zones can satisfy our allocation
	 * request even if all reclaimable pages are considered then we are
	 * screwed and have to go OOM.
	 */
	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
				ac->highest_zoneidx, ac->nodemask) {
		unsigned long available;
		unsigned long reclaimable;
		unsigned long min_wmark = min_wmark_pages(zone);
		bool wmark;

		if (cpusets_enabled() &&
			(alloc_flags & ALLOC_CPUSET) &&
			!__cpuset_zone_allowed(zone, gfp_mask))
			continue;

		available = reclaimable = zone_reclaimable_pages(zone);
		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);

		/*
		 * Would the allocation succeed if we reclaimed all
		 * reclaimable pages?
		 */
		wmark = __zone_watermark_ok(zone, order, min_wmark,
				ac->highest_zoneidx, alloc_flags, available);
		trace_reclaim_retry_zone(z, order, reclaimable,
				available, min_wmark, *no_progress_loops, wmark);
		if (wmark) {
			ret = true;
			break;
		}
	}

	/*
	 * Memory allocation/reclaim might be called from a WQ context and the
	 * current implementation of the WQ concurrency control doesn't
	 * recognize that a particular WQ is congested if the worker thread is
	 * looping without ever sleeping. Therefore we have to do a short sleep
	 * here rather than calling cond_resched().
	 */
	if (current->flags & PF_WQ_WORKER)
		schedule_timeout_uninterruptible(1);
	else
		cond_resched();
out:
	/* Before OOM, exhaust highatomic_reserve */
	if (!ret)
		return unreserve_highatomic_pageblock(ac, true);

	return ret;
}
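/*
 * In other words: retries are bounded by MAX_RECLAIM_RETRIES rounds without
 * progress, but a costly (order > PAGE_ALLOC_COSTLY_ORDER) request burns a
 * round even when reclaim reported progress, since freed base pages do not
 * guarantee a free high-order block.
 */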
static inline bool
check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
{
	/*
	 * It's possible that cpuset's mems_allowed and the nodemask from
	 * mempolicy don't intersect. This should be normally dealt with by
	 * policy_nodemask(), but it's possible to race with cpuset update in
	 * such a way the check therein was true, and then it became false
	 * before we got our cpuset_mems_cookie here.
	 * This assumes that for all allocations, ac->nodemask can come only
	 * from MPOL_BIND mempolicy (whose documented semantics is to be ignored
	 * when it does not intersect with the cpuset restrictions) or the
	 * caller can deal with a violated nodemask.
	 */
	if (cpusets_enabled() && ac->nodemask &&
			!cpuset_nodemask_valid_mems_allowed(ac->nodemask)) {
		ac->nodemask = NULL;
		return true;
	}

	/*
	 * When updating a task's mems_allowed or mempolicy nodemask, it is
	 * possible to race with parallel threads in such a way that our
	 * allocation can fail while the mask is being updated. If we are about
	 * to fail, check if the cpuset changed during allocation and if so,
	 * retry.
	 */
	if (read_mems_allowed_retry(cpuset_mems_cookie))
		return true;

	return false;
}
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
						struct alloc_context *ac)
{
	bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
	bool can_compact = gfp_compaction_allowed(gfp_mask);
	bool nofail = gfp_mask & __GFP_NOFAIL;
	const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
	struct page *page = NULL;
	unsigned int alloc_flags;
	unsigned long did_some_progress;
	enum compact_priority compact_priority;
	enum compact_result compact_result;
	int compaction_retries;
	int no_progress_loops;
	unsigned int cpuset_mems_cookie;
	unsigned int zonelist_iter_cookie;
	int reserve_flags;

	if (unlikely(nofail)) {
		/*
		 * We most definitely don't want callers attempting to
		 * allocate greater than order-1 page units with __GFP_NOFAIL.
		 */
		WARN_ON_ONCE(order > 1);
		/*
		 * Also we don't support __GFP_NOFAIL without __GFP_DIRECT_RECLAIM,
		 * otherwise, we may result in lockup.
		 */
		WARN_ON_ONCE(!can_direct_reclaim);
		/*
		 * PF_MEMALLOC request from this context is rather bizarre
		 * because we cannot reclaim anything and only can loop waiting
		 * for somebody to do a work for us.
		 */
		WARN_ON_ONCE(current->flags & PF_MEMALLOC);
	}

restart:
	compaction_retries = 0;
	no_progress_loops = 0;
	compact_result = COMPACT_SKIPPED;
	compact_priority = DEF_COMPACT_PRIORITY;
	cpuset_mems_cookie = read_mems_allowed_begin();
	zonelist_iter_cookie = zonelist_iter_begin();

	/*
	 * The fast path uses conservative alloc_flags to succeed only until
	 * kswapd needs to be woken up, and to avoid the cost of setting up
	 * alloc_flags precisely. So we do that now.
	 */
	alloc_flags = gfp_to_alloc_flags(gfp_mask, order);

	/*
	 * We need to recalculate the starting point for the zonelist iterator
	 * because we might have used different nodemask in the fast path, or
	 * there was a cpuset modification and we are retrying - otherwise we
	 * could end up iterating over non-eligible zones endlessly.
	 */
	ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
					ac->highest_zoneidx, ac->nodemask);
	if (!zonelist_zone(ac->preferred_zoneref))
		goto nopage;

	/*
	 * Check for insane configurations where the cpuset doesn't contain
	 * any suitable zone to satisfy the request - e.g. non-movable
	 * GFP_HIGHUSER allocations from MOVABLE nodes only.
	 */
	if (cpusets_insane_config() && (gfp_mask & __GFP_HARDWALL)) {
		struct zoneref *z = first_zones_zonelist(ac->zonelist,
					ac->highest_zoneidx,
					&cpuset_current_mems_allowed);
		if (!zonelist_zone(z))
			goto nopage;
	}

	if (alloc_flags & ALLOC_KSWAPD)
		wake_all_kswapds(order, gfp_mask, ac);

	/*
	 * The adjusted alloc_flags might result in immediate success, so try
	 * that first
	 */
	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
	if (page)
		goto got_pg;

	/*
	 * For costly allocations, try direct compaction first, as it's likely
	 * that we have enough base pages and don't need to reclaim. For non-
	 * movable high-order allocations, do that as well, as compaction will
	 * try prevent permanent fragmentation by migrating from blocks of the
	 * same migratetype.
	 * Don't try this for allocations that are allowed to ignore
	 * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
	 */
	if (can_direct_reclaim && can_compact &&
			(costly_order ||
			   (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
			&& !gfp_pfmemalloc_allowed(gfp_mask)) {
		page = __alloc_pages_direct_compact(gfp_mask, order,
						alloc_flags, ac,
						INIT_COMPACT_PRIORITY,
						&compact_result);
		if (page)
			goto got_pg;

		/*
		 * Checks for costly allocations with __GFP_NORETRY, which
		 * includes some THP page fault allocations
		 */
		if (costly_order && (gfp_mask & __GFP_NORETRY)) {
			/*
			 * If allocating entire pageblock(s) and compaction
			 * failed because all zones are below low watermarks
			 * or is prohibited because it recently failed at this
			 * order, fail immediately unless the allocator has
			 * requested compaction and reclaim retry.
			 *
			 * Reclaim is
			 *  - potentially very expensive because zones are far
			 *    below their low watermarks or this is part of very
			 *    bursty high order allocations,
			 *  - not guaranteed to help because isolate_freepages()
			 *    may not iterate over freed pages as part of its
			 *    linear scan, and
			 *  - unlikely to make entire pageblocks free on its
			 *    own.
			 */
			if (compact_result == COMPACT_SKIPPED ||
			    compact_result == COMPACT_DEFERRED)
				goto nopage;

			/*
			 * Looks like reclaim/compaction is worth trying, but
			 * sync compaction could be very expensive, so keep
			 * using async compaction.
			 */
			compact_priority = INIT_COMPACT_PRIORITY;
		}
	}

retry:
	/*
	 * Deal with possible cpuset update races or zonelist updates to avoid
	 * infinite retries.
	 */
	if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
	    check_retry_zonelist(zonelist_iter_cookie))
		goto restart;

	/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
	if (alloc_flags & ALLOC_KSWAPD)
		wake_all_kswapds(order, gfp_mask, ac);

	reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
	if (reserve_flags)
		alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags) |
					  (alloc_flags & ALLOC_KSWAPD);

	/*
	 * Reset the nodemask and zonelist iterators if memory policies can be
	 * ignored. These allocations are high priority and system rather than
	 * user oriented.
	 */
	if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
		ac->nodemask = NULL;
		ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
					ac->highest_zoneidx, ac->nodemask);
	}

	/* Attempt with potentially adjusted zonelist and alloc_flags */
	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
	if (page)
		goto got_pg;

	/* Caller is not willing to reclaim, we can't balance anything */
	if (!can_direct_reclaim)
		goto nopage;

	/* Avoid recursion of direct reclaim */
	if (current->flags & PF_MEMALLOC)
		goto nopage;

	/* Try direct reclaim and then allocating */
	page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
							&did_some_progress);
	if (page)
		goto got_pg;

	/* Try direct compaction and then allocating */
	page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
					compact_priority, &compact_result);
	if (page)
		goto got_pg;

	/* Do not loop if specifically requested */
	if (gfp_mask & __GFP_NORETRY)
		goto nopage;

	/*
	 * Do not retry costly high order allocations unless they are
	 * __GFP_RETRY_MAYFAIL and we can compact
	 */
	if (costly_order && (!can_compact ||
			     !(gfp_mask & __GFP_RETRY_MAYFAIL)))
		goto nopage;

	if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
				 did_some_progress > 0, &no_progress_loops))
		goto retry;

	/*
	 * It doesn't make any sense to retry for the compaction if the order-0
	 * reclaim is not able to make any progress because the current
	 * implementation of the compaction depends on the sufficient amount
	 * of free memory (see __compaction_suitable)
	 */
	if (did_some_progress > 0 && can_compact &&
			should_compact_retry(ac, order, alloc_flags,
				compact_result, &compact_priority,
				&compaction_retries))
		goto retry;

	/* Reclaim/compaction failed to prevent the fallback */
	if (defrag_mode && (alloc_flags & ALLOC_NOFRAGMENT)) {
		alloc_flags &= ~ALLOC_NOFRAGMENT;
		goto retry;
	}

	/*
	 * Deal with possible cpuset update races or zonelist updates to avoid
	 * a unnecessary OOM kill.
	 */
	if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
	    check_retry_zonelist(zonelist_iter_cookie))
		goto restart;

	/* Reclaim has failed us, start killing things */
	page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
	if (page)
		goto got_pg;

	/* Avoid allocations with no watermarks from looping endlessly */
	if (tsk_is_oom_victim(current) &&
	    (alloc_flags & ALLOC_OOM ||
	     (gfp_mask & __GFP_NOMEMALLOC)))
		goto nopage;

	/* Retry as long as the OOM killer is making progress */
	if (did_some_progress) {
		no_progress_loops = 0;
		goto retry;
	}

nopage:
	/*
	 * Deal with possible cpuset update races or zonelist updates to avoid
	 * a unnecessary OOM kill.
	 */
	if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
	    check_retry_zonelist(zonelist_iter_cookie))
		goto restart;

	/*
	 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
	 * we always retry
	 */
	if (unlikely(nofail)) {
		/*
		 * Lacking direct_reclaim we can't do anything to reclaim memory,
		 * we disregard these unreasonable nofail requests and still
		 * return NULL
		 */
		if (!can_direct_reclaim)
			goto fail;

		/*
		 * Help non-failing allocations by giving some access to memory
		 * reserves normally used for high priority non-blocking
		 * allocations but do not use ALLOC_NO_WATERMARKS because this
		 * could deplete whole memory reserves which would just make
		 * the situation worse.
		 */
		page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_MIN_RESERVE, ac);
		if (page)
			goto got_pg;

		cond_resched();
		goto retry;
	}
fail:
	warn_alloc(gfp_mask, ac->nodemask,
			"page allocation failure: order:%u", order);
got_pg:
	return page;
}
static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
		int preferred_nid, nodemask_t *nodemask,
		struct alloc_context *ac, gfp_t *alloc_gfp,
		unsigned int *alloc_flags)
{
	ac->highest_zoneidx = gfp_zone(gfp_mask);
	ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
	ac->nodemask = nodemask;
	ac->migratetype = gfp_migratetype(gfp_mask);

	if (cpusets_enabled()) {
		*alloc_gfp |= __GFP_HARDWALL;
		/*
		 * When we are in the interrupt context, it is irrelevant
		 * to the current task context. It means that any node ok.
		 */
		if (in_task() && !ac->nodemask)
			ac->nodemask = &cpuset_current_mems_allowed;
		else
			*alloc_flags |= ALLOC_CPUSET;
	}

	might_alloc(gfp_mask);

	/*
	 * Don't invoke should_fail logic, since it may call
	 * get_random_u32() and printk() which need to spin_lock.
	 */
	if (!(*alloc_flags & ALLOC_TRYLOCK) &&
	    should_fail_alloc_page(gfp_mask, order))
		return false;

	*alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags);

	/* Dirty zone balancing only done in the fast path */
	ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);

	/*
	 * The preferred zone is used for statistics but crucially it is
	 * also used as the starting point for the zonelist iterator. It
	 * may get reset for allocations that ignore memory policies.
	 */
	ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
					ac->highest_zoneidx, ac->nodemask);

	return true;
}
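/*
 * prepare_alloc_pages() only seeds the alloc_context and flags for the fast
 * path; the slow path later re-derives alloc_flags and may reset ac->nodemask
 * and the preferred zoneref, see __alloc_pages_slowpath().
 */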
/**
 * __alloc_pages_bulk - Allocate a number of order-0 pages to an array
 * @gfp: GFP flags for the allocation
 * @preferred_nid: The preferred NUMA node ID to allocate from
 * @nodemask: Set of nodes to allocate from, may be NULL
 * @nr_pages: The number of pages desired in the array
 * @page_array: Array to store the pages
 *
 * This is a batched version of the page allocator that attempts to
 * allocate nr_pages quickly. Pages are added to the page_array.
 *
 * Note that only NULL elements are populated with pages and nr_pages
 * is the maximum number of pages that will be stored in the array.
 *
 * Returns the number of pages in the array.
 */
unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
			nodemask_t *nodemask, int nr_pages,
			struct page **page_array)
{
	struct page *page;
	unsigned long __maybe_unused UP_flags;
	struct zone *zone;
	struct zoneref *z;
	struct per_cpu_pages *pcp;
	struct list_head *pcp_list;
	struct alloc_context ac;
	gfp_t alloc_gfp;
	unsigned int alloc_flags = ALLOC_WMARK_LOW;
	int nr_populated = 0, nr_account = 0;

	/*
	 * Skip populated array elements to determine if any pages need
	 * to be allocated before disabling IRQs.
	 */
	while (nr_populated < nr_pages && page_array[nr_populated])
		nr_populated++;

	/* No pages requested? */
	if (unlikely(nr_pages <= 0))
		goto out;

	/* Already populated array? */
	if (unlikely(nr_pages - nr_populated == 0))
		goto out;

	/* Bulk allocator does not support memcg accounting. */
	if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT))
		goto failed;

	/* Use the single page allocator for one page. */
	if (nr_pages - nr_populated == 1)
		goto failed;

#ifdef CONFIG_PAGE_OWNER
	/*
	 * PAGE_OWNER may recurse into the allocator to allocate space to
	 * save the stack with pagesets.lock held. Releasing/reacquiring
	 * removes much of the performance benefit of bulk allocation so
	 * force the caller to allocate one page at a time as it'll have
	 * similar performance to added complexity to the bulk allocator.
	 */
	if (static_branch_unlikely(&page_owner_inited))
		goto failed;
#endif

	/* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */
	gfp &= gfp_allowed_mask;
	alloc_gfp = gfp;
	if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags))
		goto out;
	gfp = alloc_gfp;

	/* Find an allowed local zone that meets the low watermark. */
	z = ac.preferred_zoneref;
	for_next_zone_zonelist_nodemask(zone, z, ac.highest_zoneidx, ac.nodemask) {
		unsigned long mark;

		if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) &&
		    !__cpuset_zone_allowed(zone, gfp)) {
			continue;
		}

		if (nr_online_nodes > 1 && zone != zonelist_zone(ac.preferred_zoneref) &&
		    zone_to_nid(zone) != zonelist_node_idx(ac.preferred_zoneref)) {
			goto failed;
		}

		cond_accept_memory(zone, 0, alloc_flags);
retry_this_zone:
		mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
		if (zone_watermark_fast(zone, 0, mark,
				zonelist_zone_idx(ac.preferred_zoneref),
				alloc_flags, gfp)) {
			break;
		}

		if (cond_accept_memory(zone, 0, alloc_flags))
			goto retry_this_zone;

		/* Try again if zone has deferred pages */
		if (deferred_pages_enabled()) {
			if (_deferred_grow_zone(zone, 0))
				goto retry_this_zone;
		}
	}

	/*
	 * If there are no allowed local zones that meets the watermarks then
	 * try to allocate a single page and reclaim if necessary.
	 */
	if (unlikely(!zone))
		goto failed;

	/* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
	pcp_trylock_prepare(UP_flags);
	pcp = pcp_spin_trylock(zone->per_cpu_pageset);
	if (!pcp)
		goto failed_irq;

	/* Attempt the batch allocation */
	pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
	while (nr_populated < nr_pages) {

		/* Skip existing pages */
		if (page_array[nr_populated]) {
			nr_populated++;
			continue;
		}

		page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags,
								pcp, pcp_list);
		if (unlikely(!page)) {
			/* Try and allocate at least one page */
			if (!nr_account) {
				pcp_spin_unlock(pcp);
				goto failed_irq;
			}
			break;
		}
		nr_account++;

		prep_new_page(page, 0, gfp, 0);
		set_page_refcounted(page);
		page_array[nr_populated++] = page;
	}

	pcp_spin_unlock(pcp);
	pcp_trylock_finish(UP_flags);

	__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
	zone_statistics(zonelist_zone(ac.preferred_zoneref), zone, nr_account);

out:
	return nr_populated;

failed_irq:
	pcp_trylock_finish(UP_flags);

failed:
	page = __alloc_pages_noprof(gfp, 0, preferred_nid, nodemask);
	if (page)
		page_array[nr_populated++] = page;
	goto out;
}
EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof);
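/*
 * Illustrative use of the bulk interface (callers normally go through the
 * alloc_pages_bulk*() wrappers rather than the _noprof variant directly):
 *
 *	struct page *pages[16] = { NULL };
 *	unsigned long filled;
 *
 *	filled = alloc_pages_bulk_noprof(GFP_KERNEL, NUMA_NO_NODE, NULL,
 *					 ARRAY_SIZE(pages), pages);
 *
 * Only the NULL slots are filled, and the return value is the number of
 * populated entries, which may be less than requested.
 */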
/*
 * This is the 'heart' of the zoned buddy allocator.
 */
struct page *__alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order,
		int preferred_nid, nodemask_t *nodemask)
{
	struct page *page;
	unsigned int alloc_flags = ALLOC_WMARK_LOW;
	gfp_t alloc_gfp; /* The gfp_t that was actually used for allocation */
	struct alloc_context ac = { };

	/*
	 * There are several places where we assume that the order value is sane
	 * so bail out early if the request is out of bound.
	 */
	if (WARN_ON_ONCE_GFP(order > MAX_PAGE_ORDER, gfp))
		return NULL;

	gfp &= gfp_allowed_mask;
	/*
	 * Apply scoped allocation constraints. This is mainly about GFP_NOFS
	 * resp. GFP_NOIO which has to be inherited for all allocation requests
	 * from a particular context which has been marked by
	 * memalloc_no{fs,io}_{save,restore}. And PF_MEMALLOC_PIN which ensures
	 * movable zones are not used during allocation.
	 */
	gfp = current_gfp_context(gfp);
	alloc_gfp = gfp;
	if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac,
			&alloc_gfp, &alloc_flags))
		return NULL;

	/*
	 * Forbid the first pass from falling back to types that fragment
	 * memory until all local zones are considered.
	 */
	alloc_flags |= alloc_flags_nofragment(zonelist_zone(ac.preferred_zoneref), gfp);

	/* First allocation attempt */
	page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
	if (likely(page))
		goto out;

	alloc_gfp = gfp;
	ac.spread_dirty_pages = false;

	/*
	 * Restore the original nodemask if it was potentially replaced with
	 * &cpuset_current_mems_allowed to optimize the fast-path attempt.
	 */
	ac.nodemask = nodemask;

	page = __alloc_pages_slowpath(alloc_gfp, order, &ac);

out:
	if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page &&
	    unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) {
		free_frozen_pages(page, order);
		page = NULL;
	}

	trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
	kmsan_alloc_page(page, order, alloc_gfp);

	return page;
}
EXPORT_SYMBOL(__alloc_frozen_pages_noprof);
struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order,
		int preferred_nid, nodemask_t *nodemask)
{
	struct page *page;

	page = __alloc_frozen_pages_noprof(gfp, order, preferred_nid, nodemask);
	if (page)
		set_page_refcounted(page);
	return page;
}
EXPORT_SYMBOL(__alloc_pages_noprof);
struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid,
		nodemask_t *nodemask)
{
	struct page *page = __alloc_pages_noprof(gfp | __GFP_COMP, order,
					preferred_nid, nodemask);
	return page_rmappable_folio(page);
}
EXPORT_SYMBOL(__folio_alloc_noprof);
/*
 * Common helper functions. Never use with __GFP_HIGHMEM because the returned
 * address cannot represent highmem pages. Use alloc_pages and then kmap if
 * you need to access high mem.
 */
unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order)
{
	struct page *page;

	page = alloc_pages_noprof(gfp_mask & ~__GFP_HIGHMEM, order);
	if (!page)
		return 0;
	return (unsigned long) page_address(page);
}
EXPORT_SYMBOL(get_free_pages_noprof);

unsigned long get_zeroed_page_noprof(gfp_t gfp_mask)
{
	return get_free_pages_noprof(gfp_mask | __GFP_ZERO, 0);
}
EXPORT_SYMBOL(get_zeroed_page_noprof);
/**
 * ___free_pages - Free pages allocated with alloc_pages().
 * @page: The page pointer returned from alloc_pages().
 * @order: The order of the allocation.
 * @fpi_flags: Free Page Internal flags.
 *
 * This function can free multi-page allocations that are not compound
 * pages. It does not check that the @order passed in matches that of
 * the allocation, so it is easy to leak memory. Freeing more memory
 * than was allocated will probably emit a warning.
 *
 * If the last reference to this page is speculative, it will be released
 * by put_page() which only frees the first page of a non-compound
 * allocation. To prevent the remaining pages from being leaked, we free
 * the subsequent pages here. If you want to use the page's reference
 * count to decide when to free the allocation, you should allocate a
 * compound page, and use put_page() instead of __free_pages().
 *
 * Context: May be called in interrupt context or while holding a normal
 * spinlock, but not in NMI context or while holding a raw spinlock.
 */
static void ___free_pages(struct page *page, unsigned int order,
			  fpi_t fpi_flags)
{
	/* get PageHead before we drop reference */
	int head = PageHead(page);
	/* get alloc tag in case the page is released by others */
	struct alloc_tag *tag = pgalloc_tag_get(page);

	if (put_page_testzero(page))
		__free_frozen_pages(page, order, fpi_flags);
	else if (!head) {
		pgalloc_tag_sub_pages(tag, (1 << order) - 1);
		while (order-- > 0)
			__free_frozen_pages(page + (1 << order), order,
					    fpi_flags);
	}
}

void __free_pages(struct page *page, unsigned int order)
{
	___free_pages(page, order, FPI_NONE);
}
EXPORT_SYMBOL(__free_pages);
/*
 * Can be called while holding raw_spin_lock or from IRQ and NMI for any
 * page type (not only those that came from alloc_pages_nolock)
 */
void free_pages_nolock(struct page *page, unsigned int order)
{
	___free_pages(page, order, FPI_TRYLOCK);
}

void free_pages(unsigned long addr, unsigned int order)
{
	if (addr != 0) {
		VM_BUG_ON(!virt_addr_valid((void *)addr));
		__free_pages(virt_to_page((void *)addr), order);
	}
}
EXPORT_SYMBOL(free_pages);
static void *make_alloc_exact(unsigned long addr, unsigned int order,
		size_t size)
{
	if (addr) {
		unsigned long nr = DIV_ROUND_UP(size, PAGE_SIZE);
		struct page *page = virt_to_page((void *)addr);
		struct page *last = page + nr;

		split_page_owner(page, order, 0);
		pgalloc_tag_split(page_folio(page), order, 0);
		split_page_memcg(page, order);
		while (page < --last)
			set_page_refcounted(last);

		last = page + (1UL << order);
		for (page += nr; page < last; page++)
			__free_pages_ok(page, 0, FPI_TO_TAIL);
	}
	return (void *)addr;
}
/**
 * alloc_pages_exact - allocate an exact number physically-contiguous pages.
 * @size: the number of bytes to allocate
 * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
 *
 * This function is similar to alloc_pages(), except that it allocates the
 * minimum number of pages to satisfy the request. alloc_pages() can only
 * allocate memory in power-of-two pages.
 *
 * This function is also limited by MAX_PAGE_ORDER.
 *
 * Memory allocated by this function must be released by free_pages_exact().
 *
 * Return: pointer to the allocated area or %NULL in case of error.
 */
void *alloc_pages_exact_noprof(size_t size, gfp_t gfp_mask)
{
	unsigned int order = get_order(size);
	unsigned long addr;

	if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
		gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);

	addr = get_free_pages_noprof(gfp_mask, order);
	return make_alloc_exact(addr, order, size);
}
EXPORT_SYMBOL(alloc_pages_exact_noprof);
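/*
 * Example: alloc_pages_exact(5 * PAGE_SIZE, GFP_KERNEL) allocates an order-3
 * block (8 pages) and make_alloc_exact() then returns the trailing 3 pages
 * to the buddy allocator, so only the 5 requested pages stay allocated.
 */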
/**
 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
 *			   pages on a node.
 * @nid: the preferred node ID where memory should be allocated
 * @size: the number of bytes to allocate
 * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
 *
 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
 * back.
 *
 * Return: pointer to the allocated area or %NULL in case of error.
 */
void * __meminit alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mask)
{
	unsigned int order = get_order(size);
	struct page *p;

	if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
		gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);

	p = alloc_pages_node_noprof(nid, gfp_mask, order);
	if (!p)
		return NULL;
	return make_alloc_exact((unsigned long)page_address(p), order, size);
}
/**
 * free_pages_exact - release memory allocated via alloc_pages_exact()
 * @virt: the value returned by alloc_pages_exact.
 * @size: size of allocation, same value as passed to alloc_pages_exact().
 *
 * Release the memory allocated by a previous call to alloc_pages_exact.
 */
void free_pages_exact(void *virt, size_t size)
{
	unsigned long addr = (unsigned long)virt;
	unsigned long end = addr + PAGE_ALIGN(size);

	while (addr < end) {
		free_page(addr);
		addr += PAGE_SIZE;
	}
}
EXPORT_SYMBOL(free_pages_exact);
/**
 * nr_free_zone_pages - count number of pages beyond high watermark
 * @offset: The zone index of the highest zone
 *
 * nr_free_zone_pages() counts the number of pages which are beyond the
 * high watermark within all zones at or below a given zone index. For each
 * zone, the number of pages is calculated as:
 *
 *     nr_free_zone_pages = managed_pages - high_pages
 *
 * Return: number of pages beyond high watermark.
 */
static unsigned long nr_free_zone_pages(int offset)
{
	struct zoneref *z;
	struct zone *zone;

	/* Just pick one node, since fallback list is circular */
	unsigned long sum = 0;

	struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);

	for_each_zone_zonelist(zone, z, zonelist, offset) {
		unsigned long size = zone_managed_pages(zone);
		unsigned long high = high_wmark_pages(zone);

		if (size > high)
			sum += size - high;
	}

	return sum;
}

/**
 * nr_free_buffer_pages - count number of pages beyond high watermark
 *
 * nr_free_buffer_pages() counts the number of pages which are beyond the high
 * watermark within ZONE_DMA and ZONE_NORMAL.
 *
 * Return: number of pages beyond high watermark within ZONE_DMA and
 * ZONE_NORMAL.
 */
unsigned long nr_free_buffer_pages(void)
{
	return nr_free_zone_pages(gfp_zone(GFP_USER));
}
EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
{
	zoneref->zone = zone;
	zoneref->zone_idx = zone_idx(zone);
}

/*
 * Builds allocation fallback zone lists.
 *
 * Add all populated zones of a node to the zonelist.
 */
static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
{
	struct zone *zone;
	enum zone_type zone_type = MAX_NR_ZONES;
	int nr_zones = 0;

	do {
		zone_type--;
		zone = pgdat->node_zones + zone_type;
		if (populated_zone(zone)) {
			zoneref_set_zone(zone, &zonerefs[nr_zones++]);
			check_highest_zone(zone_type);
		}
	} while (zone_type);

	return nr_zones;
}
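/*
 * Note the reverse iteration: zones are appended from the highest populated
 * zone index down to zone 0, so a zonelist prefers e.g. ZONE_NORMAL before
 * falling back to ZONE_DMA32/ZONE_DMA on the same node.
 */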
static int __parse_numa_zonelist_order(char *s)
{
	/*
	 * We used to support different zonelists modes but they turned
	 * out to be just not useful. Let's keep the warning in place
	 * if somebody still use the cmd line parameter so that we do
	 * not fail it silently
	 */
	if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
		pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s);
		return -EINVAL;
	}
	return 0;
}

static char numa_zonelist_order[] = "Node";
#define NUMA_ZONELIST_ORDER_LEN	16
/*
 * sysctl handler for numa_zonelist_order
 */
static int numa_zonelist_order_handler(const struct ctl_table *table, int write,
		void *buffer, size_t *length, loff_t *ppos)
{
	if (write)
		return __parse_numa_zonelist_order(buffer);
	return proc_dostring(table, write, buffer, length, ppos);
}

static int node_load[MAX_NUMNODES];
/**
 * find_next_best_node - find the next node that should appear in a given node's fallback list
 * @node: node whose fallback list we're appending
 * @used_node_mask: nodemask_t of already used nodes
 *
 * We use a number of factors to determine which is the next node that should
 * appear on a given node's fallback list. The node should not have appeared
 * already in @node's fallback list, and it should be the next closest node
 * according to the distance array (which contains arbitrary distance values
 * from each node to each node in the system), and should also prefer nodes
 * with no CPUs, since presumably they'll have very little allocation pressure
 * on them otherwise.
 *
 * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
 */
int find_next_best_node(int node, nodemask_t *used_node_mask)
{
	int n, val;
	int min_val = INT_MAX;
	int best_node = NUMA_NO_NODE;

	/*
	 * Use the local node if we haven't already, but for memoryless local
	 * node, we should skip it and fall back to other nodes.
	 */
	if (!node_isset(node, *used_node_mask) && node_state(node, N_MEMORY)) {
		node_set(node, *used_node_mask);
		return node;
	}

	for_each_node_state(n, N_MEMORY) {

		/* Don't want a node to appear more than once */
		if (node_isset(n, *used_node_mask))
			continue;

		/* Use the distance array to find the distance */
		val = node_distance(node, n);

		/* Penalize nodes under us ("prefer the next node") */
		val += (n < node);

		/* Give preference to headless and unused nodes */
		if (!cpumask_empty(cpumask_of_node(n)))
			val += PENALTY_FOR_NODE_WITH_CPUS;

		/* Slight preference for less loaded node */
		val *= MAX_NUMNODES;
		val += node_load[n];

		if (val < min_val) {
			min_val = val;
			best_node = n;
		}
	}

	if (best_node >= 0)
		node_set(best_node, *used_node_mask);

	return best_node;
}
/*
 * Build zonelists ordered by node and zones within node.
 * This results in maximum locality--normal zone overflows into local
 * DMA zone, if any--but risks exhausting DMA zone.
 */
static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
		unsigned nr_nodes)
{
	struct zoneref *zonerefs;
	int i;

	zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;

	for (i = 0; i < nr_nodes; i++) {
		int nr_zones;

		pg_data_t *node = NODE_DATA(node_order[i]);

		nr_zones = build_zonerefs_node(node, zonerefs);
		zonerefs += nr_zones;
	}
	zonerefs->zone = NULL;
	zonerefs->zone_idx = 0;
}

/*
 * Build __GFP_THISNODE zonelists
 */
static void build_thisnode_zonelists(pg_data_t *pgdat)
{
	struct zoneref *zonerefs;
	int nr_zones;

	zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
	nr_zones = build_zonerefs_node(pgdat, zonerefs);
	zonerefs += nr_zones;
	zonerefs->zone = NULL;
	zonerefs->zone_idx = 0;
}
static void build_zonelists(pg_data_t *pgdat)
{
	static int node_order[MAX_NUMNODES];
	int node, nr_nodes = 0;
	nodemask_t used_mask = NODE_MASK_NONE;
	int local_node, prev_node;

	/* NUMA-aware ordering of nodes */
	local_node = pgdat->node_id;
	prev_node = local_node;

	memset(node_order, 0, sizeof(node_order));
	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
		/*
		 * We don't want to pressure a particular node.
		 * So adding penalty to the first node in same
		 * distance group to make it round-robin.
		 */
		if (node_distance(local_node, node) !=
		    node_distance(local_node, prev_node))
			node_load[node] += 1;

		node_order[nr_nodes++] = node;
		prev_node = node;
	}

	build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
	build_thisnode_zonelists(pgdat);
	pr_info("Fallback order for Node %d: ", local_node);
	for (node = 0; node < nr_nodes; node++)
		pr_cont("%d ", node_order[node]);
	pr_cont("\n");
}
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
 * Return node id of node used for "local" allocations.
 * I.e., first node id of first zone in arg node's generic zonelist.
 * Used for initializing percpu 'numa_mem', which is used primarily
 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
 */
int local_memory_node(int node)
{
	struct zoneref *z;

	z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
				 gfp_zone(GFP_KERNEL),
				 NULL);
	return zonelist_node_idx(z);
}
#endif

static void setup_min_unmapped_ratio(void);
static void setup_min_slab_ratio(void);
#else	/* CONFIG_NUMA */

static void build_zonelists(pg_data_t *pgdat)
{
	struct zoneref *zonerefs;
	int nr_zones;

	zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
	nr_zones = build_zonerefs_node(pgdat, zonerefs);
	zonerefs += nr_zones;

	zonerefs->zone = NULL;
	zonerefs->zone_idx = 0;
}

#endif	/* CONFIG_NUMA */
/*
 * Boot pageset table. One per cpu which is going to be used for all
 * zones and all nodes. The parameters will be set in such a way
 * that an item put on a list will immediately be handed over to
 * the buddy list. This is safe since pageset manipulation is done
 * with interrupts disabled.
 *
 * The boot_pagesets must be kept even after bootup is complete for
 * unused processors and/or zones. They do play a role for bootstrapping
 * hotplugged processors.
 *
 * zoneinfo_show() and maybe other functions do
 * not check if the processor is online before following the pageset pointer.
 * Other parts of the kernel may not check if the zone is available.
 */
static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats);
/* These effectively disable the pcplists in the boot pageset completely */
#define BOOT_PAGESET_HIGH	0
#define BOOT_PAGESET_BATCH	1
static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset);
static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats);
static void __build_all_zonelists(void *data)
{
	int nid;
	int __maybe_unused cpu;
	pg_data_t *self = data;
	unsigned long flags;

	/*
	 * The zonelist_update_seq must be acquired with irqsave because the
	 * reader can be invoked from IRQ with GFP_ATOMIC.
	 */
	write_seqlock_irqsave(&zonelist_update_seq, flags);
	/*
	 * Also disable synchronous printk() to prevent any printk() from
	 * trying to hold port->lock, for
	 * tty_insert_flip_string_and_push_buffer() on other CPU might be
	 * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held.
	 */
	printk_deferred_enter();

#ifdef CONFIG_NUMA
	memset(node_load, 0, sizeof(node_load));
#endif

	/*
	 * This node is hotadded and no memory is yet present. So just
	 * building zonelists is fine - no need to touch other nodes.
	 */
	if (self && !node_online(self->node_id)) {
		build_zonelists(self);
	} else {
		/*
		 * All possible nodes have pgdat preallocated
		 * in free_area_init
		 */
		for_each_node(nid) {
			pg_data_t *pgdat = NODE_DATA(nid);

			build_zonelists(pgdat);
		}

#ifdef CONFIG_HAVE_MEMORYLESS_NODES
		/*
		 * We now know the "local memory node" for each node--
		 * i.e., the node of the first zone in the generic zonelist.
		 * Set up numa_mem percpu variable for on-line cpus. During
		 * boot, only the boot cpu should be on-line; we'll init the
		 * secondary cpus' numa_mem as they come on-line. During
		 * node/memory hotplug, we'll fixup all on-line cpus.
		 */
		for_each_online_cpu(cpu)
			set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
#endif
	}

	printk_deferred_exit();
	write_sequnlock_irqrestore(&zonelist_update_seq, flags);
}
static noinline void __init
build_all_zonelists_init(void)
{
	int cpu;

	__build_all_zonelists(NULL);

	/*
	 * Initialize the boot_pagesets that are going to be used
	 * for bootstrapping processors. The real pagesets for
	 * each zone will be allocated later when the per cpu
	 * allocator is available.
	 *
	 * boot_pagesets are used also for bootstrapping offline
	 * cpus if the system is already booted because the pagesets
	 * are needed to initialize allocators on a specific cpu too.
	 * F.e. the percpu allocator needs the page allocator which
	 * needs the percpu allocator in order to allocate its pagesets
	 * (a chicken-egg dilemma).
	 */
	for_each_possible_cpu(cpu)
		per_cpu_pages_init(&per_cpu(boot_pageset, cpu), &per_cpu(boot_zonestats, cpu));

	mminit_verify_zonelist();
	cpuset_init_current_mems_allowed();
}
/*
 * unless system_state == SYSTEM_BOOTING.
 *
 * __ref due to call of __init annotated helper build_all_zonelists_init
 * [protected by SYSTEM_BOOTING].
 */
void __ref build_all_zonelists(pg_data_t *pgdat)
{
	unsigned long vm_total_pages;

	if (system_state == SYSTEM_BOOTING) {
		build_all_zonelists_init();
	} else {
		__build_all_zonelists(pgdat);
		/* cpuset refresh routine should be here */
	}
	/* Get the number of free pages beyond high watermark in all zones. */
	vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
	/*
	 * Disable grouping by mobility if the number of pages in the
	 * system is too low to allow the mechanism to work. It would be
	 * more accurate, but expensive to check per-zone. This check is
	 * made on memory-hotadd so a system can start with mobility
	 * disabled and enable it later
	 */
	if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
		page_group_by_mobility_disabled = 1;
	else
		page_group_by_mobility_disabled = 0;

	pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n",
		nr_online_nodes,
		str_off_on(page_group_by_mobility_disabled),
		vm_total_pages);
#ifdef CONFIG_NUMA
	pr_info("Policy zone: %s\n", zone_names[policy_zone]);
#endif
}
static int zone_batchsize(struct zone *zone)
{
#ifdef CONFIG_MMU
	int batch;

	/*
	 * The number of pages to batch allocate is either ~0.1%
	 * of the zone or 1MB, whichever is smaller. The batch
	 * size is striking a balance between allocation latency
	 * and zone lock contention.
	 */
	batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE);
	batch /= 4;		/* We effectively *= 4 below */
	if (batch < 1)
		batch = 1;

	/*
	 * Clamp the batch to a 2^n - 1 value. Having a power
	 * of 2 value was found to be more likely to have
	 * suboptimal cache aliasing properties in some cases.
	 *
	 * For example if 2 tasks are alternately allocating
	 * batches of pages, one task can end up with a lot
	 * of pages of one half of the possible page colors
	 * and the other with pages of the other colors.
	 */
	batch = rounddown_pow_of_two(batch + batch/2) - 1;

	return batch;

#else
	/* The deferral and batching of frees should be suppressed under NOMMU
	 * conditions.
	 *
	 * The problem is that NOMMU needs to be able to allocate large chunks
	 * of contiguous memory as there's no hardware page translation to
	 * assemble apparent contiguous memory from discontiguous pages.
	 *
	 * Queueing large contiguous runs of pages for batching, however,
	 * causes the pages to actually be freed in smaller chunks. As there
	 * can be a significant delay between the individual batches being
	 * recycled, this leads to the once large chunks of space being
	 * fragmented and becoming unavailable for high-order allocations.
	 */
	return 0;
#endif
}
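/*
 * Worked example (assuming 4K pages): a zone with 4GB of managed memory has
 * ~1048576 pages, so min(1048576 >> 10, SZ_1M / PAGE_SIZE) = min(1024, 256)
 * = 256, divided by 4 gives 64, and rounddown_pow_of_two(64 + 32) - 1 = 63,
 * i.e. the pcp batch size saturates at 63 pages for any zone of 1GB or more.
 */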
static int percpu_pagelist_high_fraction;
static int zone_highsize(struct zone *zone, int batch, int cpu_online,
			 int high_fraction)
{
#ifdef CONFIG_MMU
	int high;
	int nr_split_cpus;
	unsigned long total_pages;

	if (!high_fraction) {
		/*
		 * By default, the high value of the pcp is based on the zone
		 * low watermark so that if they are full then background
		 * reclaim will not be started prematurely.
		 */
		total_pages = low_wmark_pages(zone);
	} else {
		/*
		 * If percpu_pagelist_high_fraction is configured, the high
		 * value is based on a fraction of the managed pages in the
		 * zone.
		 */
		total_pages = zone_managed_pages(zone) / high_fraction;
	}

	/*
	 * Split the high value across all online CPUs local to the zone. Note
	 * that early in boot that CPUs may not be online yet and that during
	 * CPU hotplug that the cpumask is not yet updated when a CPU is being
	 * onlined. For memory nodes that have no CPUs, split the high value
	 * across all online CPUs to mitigate the risk that reclaim is triggered
	 * prematurely due to pages stored on pcp lists.
	 */
	nr_split_cpus = cpumask_weight(cpumask_of_node(zone_to_nid(zone))) + cpu_online;
	if (!nr_split_cpus)
		nr_split_cpus = num_online_cpus();
	high = total_pages / nr_split_cpus;

	/*
	 * Ensure high is at least batch*4. The multiple is based on the
	 * historical relationship between high and batch.
	 */
	high = max(high, batch << 2);

	return high;
#else
	return 0;
#endif
}
/*
 * pcp->high and pcp->batch values are related and generally batch is lower
 * than high. They are also related to pcp->count such that count is lower
 * than high, and as soon as it reaches high, the pcplist is flushed.
 *
 * However, guaranteeing these relations at all times would require e.g. write
 * barriers here but also careful usage of read barriers at the read side, and
 * thus be prone to error and bad for performance. Thus the update only prevents
 * store tearing. Any new users of pcp->batch, pcp->high_min and pcp->high_max
 * should ensure they can cope with those fields changing asynchronously, and
 * fully trust only the pcp->count field on the local CPU with interrupts
 * disabled.
 *
 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
 * outside of boot time (or some other assurance that no concurrent updaters
 * exist).
 */
static void pageset_update(struct per_cpu_pages *pcp, unsigned long high_min,
			   unsigned long high_max, unsigned long batch)
{
	WRITE_ONCE(pcp->batch, batch);
	WRITE_ONCE(pcp->high_min, high_min);
	WRITE_ONCE(pcp->high_max, high_max);
}
static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats)
{
	int pindex;

	memset(pcp, 0, sizeof(*pcp));
	memset(pzstats, 0, sizeof(*pzstats));

	spin_lock_init(&pcp->lock);
	for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)
		INIT_LIST_HEAD(&pcp->lists[pindex]);

	/*
	 * Set batch and high values safe for a boot pageset. A true percpu
	 * pageset's initialization will update them subsequently. Here we don't
	 * need to be as careful as pageset_update() as nobody can access the
	 * pageset yet.
	 */
	pcp->high_min = BOOT_PAGESET_HIGH;
	pcp->high_max = BOOT_PAGESET_HIGH;
	pcp->batch = BOOT_PAGESET_BATCH;
	pcp->free_count = 0;
}
static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high_min,
					      unsigned long high_max, unsigned long batch)
{
	struct per_cpu_pages *pcp;
	int cpu;

	for_each_possible_cpu(cpu) {
		pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
		pageset_update(pcp, high_min, high_max, batch);
	}
}
/*
 * Calculate and set new high and batch values for all per-cpu pagesets of a
 * zone based on the zone's size.
 */
static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online)
{
	int new_high_min, new_high_max, new_batch;

	new_batch = max(1, zone_batchsize(zone));
	if (percpu_pagelist_high_fraction) {
		new_high_min = zone_highsize(zone, new_batch, cpu_online,
					     percpu_pagelist_high_fraction);
		/*
		 * PCP high is tuned manually, disable auto-tuning via
		 * setting high_min and high_max to the manual value.
		 */
		new_high_max = new_high_min;
	} else {
		new_high_min = zone_highsize(zone, new_batch, cpu_online, 0);
		new_high_max = zone_highsize(zone, new_batch, cpu_online,
					     MIN_PERCPU_PAGELIST_HIGH_FRACTION);
	}

	if (zone->pageset_high_min == new_high_min &&
	    zone->pageset_high_max == new_high_max &&
	    zone->pageset_batch == new_batch)
		return;

	zone->pageset_high_min = new_high_min;
	zone->pageset_high_max = new_high_max;
	zone->pageset_batch = new_batch;

	__zone_set_pageset_high_and_batch(zone, new_high_min, new_high_max,
					  new_batch);
}
setup_zone_pageset(struct zone
*zone
)
5815 /* Size may be 0 on !SMP && !NUMA */
5816 if (sizeof(struct per_cpu_zonestat
) > 0)
5817 zone
->per_cpu_zonestats
= alloc_percpu(struct per_cpu_zonestat
);
5819 zone
->per_cpu_pageset
= alloc_percpu(struct per_cpu_pages
);
5820 for_each_possible_cpu(cpu
) {
5821 struct per_cpu_pages
*pcp
;
5822 struct per_cpu_zonestat
*pzstats
;
5824 pcp
= per_cpu_ptr(zone
->per_cpu_pageset
, cpu
);
5825 pzstats
= per_cpu_ptr(zone
->per_cpu_zonestats
, cpu
);
5826 per_cpu_pages_init(pcp
, pzstats
);
5829 zone_set_pageset_high_and_batch(zone
, 0);
/*
 * The zone indicated has a new number of managed_pages; batch sizes and percpu
 * page high values need to be recalculated.
 */
static void zone_pcp_update(struct zone *zone, int cpu_online)
{
	mutex_lock(&pcp_batch_high_lock);
	zone_set_pageset_high_and_batch(zone, cpu_online);
	mutex_unlock(&pcp_batch_high_lock);
}
static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu)
{
	struct per_cpu_pages *pcp;
	struct cpu_cacheinfo *cci;

	pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
	cci = get_cpu_cacheinfo(cpu);
	/*
	 * If data cache slice of CPU is large enough, "pcp->batch"
	 * pages can be preserved in PCP before draining PCP for
	 * consecutive high-order pages freeing without allocation.
	 * This can reduce zone lock contention without hurting
	 * cache-hot pages sharing.
	 */
	spin_lock(&pcp->lock);
	if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch)
		pcp->flags |= PCPF_FREE_HIGH_BATCH;
	else
		pcp->flags &= ~PCPF_FREE_HIGH_BATCH;
	spin_unlock(&pcp->lock);
}

void setup_pcp_cacheinfo(unsigned int cpu)
{
	struct zone *zone;

	for_each_populated_zone(zone)
		zone_pcp_update_cacheinfo(zone, cpu);
}
/*
 * Allocate per cpu pagesets and initialize them.
 * Before this call only boot pagesets were available.
 */
void __init setup_per_cpu_pageset(void)
{
	struct pglist_data *pgdat;
	struct zone *zone;
	int __maybe_unused cpu;

	for_each_populated_zone(zone)
		setup_zone_pageset(zone);

#ifdef CONFIG_NUMA
	/*
	 * Unpopulated zones continue using the boot pagesets.
	 * The numa stats for these pagesets need to be reset.
	 * Otherwise, they will end up skewing the stats of
	 * the nodes these zones are associated with.
	 */
	for_each_possible_cpu(cpu) {
		struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu);

		memset(pzstats->vm_numa_event, 0,
		       sizeof(pzstats->vm_numa_event));
	}
#endif

	for_each_online_pgdat(pgdat)
		pgdat->per_cpu_nodestats =
			alloc_percpu(struct per_cpu_nodestat);
}
__meminit void zone_pcp_init(struct zone *zone)
{
	/*
	 * per cpu subsystem is not up at this point. The following code
	 * relies on the ability of the linker to provide the
	 * offset of a (static) per cpu variable into the per cpu area.
	 */
	zone->per_cpu_pageset = &boot_pageset;
	zone->per_cpu_zonestats = &boot_zonestats;
	zone->pageset_high_min = BOOT_PAGESET_HIGH;
	zone->pageset_high_max = BOOT_PAGESET_HIGH;
	zone->pageset_batch = BOOT_PAGESET_BATCH;

	if (populated_zone(zone))
		pr_debug("  %s zone: %lu pages, LIFO batch:%u\n", zone->name,
			 zone->present_pages, zone_batchsize(zone));
}
static void setup_per_zone_lowmem_reserve(void);

void adjust_managed_page_count(struct page *page, long count)
{
	atomic_long_add(count, &page_zone(page)->managed_pages);
	totalram_pages_add(count);
	setup_per_zone_lowmem_reserve();
}
EXPORT_SYMBOL(adjust_managed_page_count);
unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
{
	void *pos;
	unsigned long pages = 0;

	start = (void *)PAGE_ALIGN((unsigned long)start);
	end = (void *)((unsigned long)end & PAGE_MASK);
	for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
		struct page *page = virt_to_page(pos);
		void *direct_map_addr;

		/*
		 * 'direct_map_addr' might be different from 'pos'
		 * because some architectures' virt_to_page()
		 * work with aliases. Getting the direct map
		 * address ensures that we get a _writeable_
		 * alias for the memset().
		 */
		direct_map_addr = page_address(page);
		/*
		 * Perform a kasan-unchecked memset() since this memory
		 * has not been initialized.
		 */
		direct_map_addr = kasan_reset_tag(direct_map_addr);
		if ((unsigned int)poison <= 0xFF)
			memset(direct_map_addr, poison, PAGE_SIZE);

		free_reserved_page(page);
	}

	if (pages && s)
		pr_info("Freeing %s memory: %ldK\n", s, K(pages));

	return pages;
}
void free_reserved_page(struct page *page)
{
	clear_page_tag_ref(page);
	ClearPageReserved(page);
	init_page_count(page);
	__free_page(page);
	adjust_managed_page_count(page, 1);
}
EXPORT_SYMBOL(free_reserved_page);
static int page_alloc_cpu_dead(unsigned int cpu)
{
	struct zone *zone;

	lru_add_drain_cpu(cpu);
	mlock_drain_remote(cpu);
	drain_pages(cpu);

	/*
	 * Spill the event counters of the dead processor
	 * into the current processors event counters.
	 * This artificially elevates the count of the current
	 * processor.
	 */
	vm_events_fold_cpu(cpu);

	/*
	 * Zero the differential counters of the dead processor
	 * so that the vm statistics are consistent.
	 *
	 * This is only okay since the processor is dead and cannot
	 * race with what we are doing.
	 */
	cpu_vm_stats_fold(cpu);

	for_each_populated_zone(zone)
		zone_pcp_update(zone, 0);

	return 0;
}

static int page_alloc_cpu_online(unsigned int cpu)
{
	struct zone *zone;

	for_each_populated_zone(zone)
		zone_pcp_update(zone, 1);
	return 0;
}
void __init page_alloc_init_cpuhp(void)
{
	int ret;

	ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC,
					"mm/page_alloc:pcp",
					page_alloc_cpu_online,
					page_alloc_cpu_dead);
	WARN_ON(ret < 0);
}
/*
 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
 *	or min_free_kbytes changes.
 */
static void calculate_totalreserve_pages(void)
{
	struct pglist_data *pgdat;
	unsigned long reserve_pages = 0;
	enum zone_type i, j;

	for_each_online_pgdat(pgdat) {

		pgdat->totalreserve_pages = 0;

		for (i = 0; i < MAX_NR_ZONES; i++) {
			struct zone *zone = pgdat->node_zones + i;
			long max = 0;
			unsigned long managed_pages = zone_managed_pages(zone);

			/* Find valid and maximum lowmem_reserve in the zone */
			for (j = i; j < MAX_NR_ZONES; j++) {
				if (zone->lowmem_reserve[j] > max)
					max = zone->lowmem_reserve[j];
			}

			/* we treat the high watermark as reserved pages. */
			max += high_wmark_pages(zone);

			if (max > managed_pages)
				max = managed_pages;

			pgdat->totalreserve_pages += max;

			reserve_pages += max;
		}
	}
	totalreserve_pages = reserve_pages;
	trace_mm_calculate_totalreserve_pages(totalreserve_pages);
}
/*
 * setup_per_zone_lowmem_reserve - called whenever
 *	sysctl_lowmem_reserve_ratio changes. Ensures that each zone
 *	has a correct pages reserved value, so an adequate number of
 *	pages are left in the zone after a successful __alloc_pages().
 */
static void setup_per_zone_lowmem_reserve(void)
{
	struct pglist_data *pgdat;
	enum zone_type i, j;

	for_each_online_pgdat(pgdat) {
		for (i = 0; i < MAX_NR_ZONES - 1; i++) {
			struct zone *zone = &pgdat->node_zones[i];
			int ratio = sysctl_lowmem_reserve_ratio[i];
			bool clear = !ratio || !zone_managed_pages(zone);
			unsigned long managed_pages = 0;

			for (j = i + 1; j < MAX_NR_ZONES; j++) {
				struct zone *upper_zone = &pgdat->node_zones[j];

				managed_pages += zone_managed_pages(upper_zone);

				if (clear)
					zone->lowmem_reserve[j] = 0;
				else
					zone->lowmem_reserve[j] = managed_pages / ratio;
				trace_mm_setup_per_zone_lowmem_reserve(zone, upper_zone,
								       zone->lowmem_reserve[j]);
			}
		}
	}

	/* update totalreserve_pages */
	calculate_totalreserve_pages();
}
static void __setup_per_zone_wmarks(void)
{
	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
	unsigned long lowmem_pages = 0;
	struct zone *zone;
	unsigned long flags;

	/* Calculate total number of !ZONE_HIGHMEM and !ZONE_MOVABLE pages */
	for_each_zone(zone) {
		if (!is_highmem(zone) && zone_idx(zone) != ZONE_MOVABLE)
			lowmem_pages += zone_managed_pages(zone);
	}

	for_each_zone(zone) {
		u64 tmp;

		spin_lock_irqsave(&zone->lock, flags);
		tmp = (u64)pages_min * zone_managed_pages(zone);
		tmp = div64_ul(tmp, lowmem_pages);
		if (is_highmem(zone) || zone_idx(zone) == ZONE_MOVABLE) {
			/*
			 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
			 * need highmem and movable zones pages, so cap pages_min
			 * to a small value here.
			 *
			 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
			 * deltas control async page reclaim, and so should
			 * not be capped for highmem and movable zones.
			 */
			unsigned long min_pages;

			min_pages = zone_managed_pages(zone) / 1024;
			min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
			zone->_watermark[WMARK_MIN] = min_pages;
		} else {
			/*
			 * If it's a lowmem zone, reserve a number of pages
			 * proportionate to the zone's size.
			 */
			zone->_watermark[WMARK_MIN] = tmp;
		}

		/*
		 * Set the kswapd watermarks distance according to the
		 * scale factor in proportion to available memory, but
		 * ensure a minimum size on small systems.
		 */
		tmp = max_t(u64, tmp >> 2,
			    mult_frac(zone_managed_pages(zone),
				      watermark_scale_factor, 10000));

		zone->watermark_boost = 0;
		zone->_watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
		zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp;
		zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp;
		trace_mm_setup_per_zone_wmarks(zone);

		spin_unlock_irqrestore(&zone->lock, flags);
	}

	/* update totalreserve_pages */
	calculate_totalreserve_pages();
}
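/*
 * Worked example (illustrative): with the default watermark_scale_factor of
 * 10, the step added above is max(min_wmark / 4, 0.1% of managed pages), so
 * a 4GB zone with a ~16MB WMARK_MIN ends up with roughly 4MB of spacing
 * between WMARK_MIN, WMARK_LOW and WMARK_HIGH.
 */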
/**
 * setup_per_zone_wmarks - called when min_free_kbytes changes
 * or when memory is hot-{added|removed}
 *
 * Ensures that the watermark[min,low,high] values for each zone are set
 * correctly with respect to min_free_kbytes.
 */
void setup_per_zone_wmarks(void)
{
	struct zone *zone;
	static DEFINE_SPINLOCK(lock);

	spin_lock(&lock);
	__setup_per_zone_wmarks();
	spin_unlock(&lock);

	/*
	 * The watermark size have changed so update the pcpu batch
	 * and high limits or the limits may be inappropriate.
	 */
	for_each_zone(zone)
		zone_pcp_update(zone, 0);
}
/*
 * Initialise min_free_kbytes.
 *
 * For small machines we want it small (128k min). For large machines
 * we want it large (256MB max). But it is not linear, because network
 * bandwidth does not increase linearly with machine size. We use
 *
 *	min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
 *	min_free_kbytes = sqrt(lowmem_kbytes * 16)
 */
void calculate_min_free_kbytes(void)
{
	unsigned long lowmem_kbytes;
	int new_min_free_kbytes;

	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
	new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);

	if (new_min_free_kbytes > user_min_free_kbytes)
		min_free_kbytes = clamp(new_min_free_kbytes, 128, 262144);
	else
		pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
			new_min_free_kbytes, user_min_free_kbytes);
}
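
/*
 * Worked example (illustrative only, not part of the original file): with
 * 4 GiB of lowmem, lowmem_kbytes is 4194304, so int_sqrt(4194304 * 16) =
 * int_sqrt(67108864) = 8192, and min_free_kbytes defaults to 8192 kB (8 MiB)
 * unless the user has already asked for a larger value.
 */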
int __meminit init_per_zone_wmark_min(void)
{
	calculate_min_free_kbytes();
	setup_per_zone_wmarks();
	refresh_zone_stat_thresholds();
	setup_per_zone_lowmem_reserve();

#ifdef CONFIG_NUMA
	setup_min_unmapped_ratio();
	setup_min_slab_ratio();
#endif

	khugepaged_min_free_kbytes_update();

	return 0;
}
postcore_initcall(init_per_zone_wmark_min)
/*
 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
 * that we can call two helper functions whenever min_free_kbytes
 * changes.
 */
static int min_free_kbytes_sysctl_handler(const struct ctl_table *table, int write,
		void *buffer, size_t *length, loff_t *ppos)
{
	int rc;

	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
	if (rc)
		return rc;

	if (write) {
		user_min_free_kbytes = min_free_kbytes;
		setup_per_zone_wmarks();
	}
	return 0;
}

static int watermark_scale_factor_sysctl_handler(const struct ctl_table *table, int write,
		void *buffer, size_t *length, loff_t *ppos)
{
	int rc;

	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
	if (rc)
		return rc;

	if (write)
		setup_per_zone_wmarks();

	return 0;
}
#ifdef CONFIG_NUMA
static void setup_min_unmapped_ratio(void)
{
	struct pglist_data *pgdat;
	struct zone *zone;

	for_each_online_pgdat(pgdat)
		pgdat->min_unmapped_pages = 0;

	for_each_zone(zone)
		zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) *
							 sysctl_min_unmapped_ratio) / 100;
}

static int sysctl_min_unmapped_ratio_sysctl_handler(const struct ctl_table *table, int write,
		void *buffer, size_t *length, loff_t *ppos)
{
	int rc;

	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
	if (rc)
		return rc;

	setup_min_unmapped_ratio();

	return 0;
}

static void setup_min_slab_ratio(void)
{
	struct pglist_data *pgdat;
	struct zone *zone;

	for_each_online_pgdat(pgdat)
		pgdat->min_slab_pages = 0;

	for_each_zone(zone)
		zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) *
						     sysctl_min_slab_ratio) / 100;
}

static int sysctl_min_slab_ratio_sysctl_handler(const struct ctl_table *table, int write,
		void *buffer, size_t *length, loff_t *ppos)
{
	int rc;

	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
	if (rc)
		return rc;

	setup_min_slab_ratio();

	return 0;
}
#endif
/*
 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
 *	proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
 *	whenever sysctl_lowmem_reserve_ratio changes.
 *
 * The reserve ratio obviously has absolutely no relation with the
 * minimum watermarks. The lowmem reserve ratio only makes sense as a
 * function of the boot-time zone sizes.
 */
static int lowmem_reserve_ratio_sysctl_handler(const struct ctl_table *table,
		int write, void *buffer, size_t *length, loff_t *ppos)
{
	int i;

	proc_dointvec_minmax(table, write, buffer, length, ppos);

	for (i = 0; i < MAX_NR_ZONES; i++) {
		if (sysctl_lowmem_reserve_ratio[i] < 1)
			sysctl_lowmem_reserve_ratio[i] = 0;
	}

	setup_per_zone_lowmem_reserve();
	return 0;
}
/*
 * percpu_pagelist_high_fraction - changes the pcp->high for each zone on each
 * cpu. It is the fraction of total pages in each zone that a hot per cpu
 * pagelist can have before it gets flushed back to buddy allocator.
 */
static int percpu_pagelist_high_fraction_sysctl_handler(const struct ctl_table *table,
		int write, void *buffer, size_t *length, loff_t *ppos)
{
	struct zone *zone;
	int old_percpu_pagelist_high_fraction;
	int ret;

	mutex_lock(&pcp_batch_high_lock);
	old_percpu_pagelist_high_fraction = percpu_pagelist_high_fraction;

	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
	if (!write || ret < 0)
		goto out;

	/* Sanity checking to avoid pcp imbalance */
	if (percpu_pagelist_high_fraction &&
	    percpu_pagelist_high_fraction < MIN_PERCPU_PAGELIST_HIGH_FRACTION) {
		percpu_pagelist_high_fraction = old_percpu_pagelist_high_fraction;
		ret = -EINVAL;
		goto out;
	}

	/* No change? */
	if (percpu_pagelist_high_fraction == old_percpu_pagelist_high_fraction)
		goto out;

	for_each_populated_zone(zone)
		zone_set_pageset_high_and_batch(zone, 0);
out:
	mutex_unlock(&pcp_batch_high_lock);
	return ret;
}
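
/*
 * Illustrative note (assumption, not from the original source): writing e.g.
 * 8 to /proc/sys/vm/percpu_pagelist_high_fraction asks
 * zone_set_pageset_high_and_batch() to size each zone's pcp->high so that,
 * roughly, 1/8th of the zone's managed pages may sit on per-cpu lists (split
 * across the zone's local CPUs) before frees spill back to the buddy lists;
 * writing 0 restores the default, watermark-based sizing.
 */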
static const struct ctl_table page_alloc_sysctl_table[] = {
	{
		.procname	= "min_free_kbytes",
		.data		= &min_free_kbytes,
		.maxlen		= sizeof(min_free_kbytes),
		.mode		= 0644,
		.proc_handler	= min_free_kbytes_sysctl_handler,
		.extra1		= SYSCTL_ZERO,
	},
	{
		.procname	= "watermark_boost_factor",
		.data		= &watermark_boost_factor,
		.maxlen		= sizeof(watermark_boost_factor),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
	},
	{
		.procname	= "watermark_scale_factor",
		.data		= &watermark_scale_factor,
		.maxlen		= sizeof(watermark_scale_factor),
		.mode		= 0644,
		.proc_handler	= watermark_scale_factor_sysctl_handler,
		.extra1		= SYSCTL_ONE,
		.extra2		= SYSCTL_THREE_THOUSAND,
	},
	{
		.procname	= "defrag_mode",
		.data		= &defrag_mode,
		.maxlen		= sizeof(defrag_mode),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{
		.procname	= "percpu_pagelist_high_fraction",
		.data		= &percpu_pagelist_high_fraction,
		.maxlen		= sizeof(percpu_pagelist_high_fraction),
		.mode		= 0644,
		.proc_handler	= percpu_pagelist_high_fraction_sysctl_handler,
		.extra1		= SYSCTL_ZERO,
	},
	{
		.procname	= "lowmem_reserve_ratio",
		.data		= &sysctl_lowmem_reserve_ratio,
		.maxlen		= sizeof(sysctl_lowmem_reserve_ratio),
		.mode		= 0644,
		.proc_handler	= lowmem_reserve_ratio_sysctl_handler,
	},
#ifdef CONFIG_NUMA
	{
		.procname	= "numa_zonelist_order",
		.data		= &numa_zonelist_order,
		.maxlen		= NUMA_ZONELIST_ORDER_LEN,
		.mode		= 0644,
		.proc_handler	= numa_zonelist_order_handler,
	},
	{
		.procname	= "min_unmapped_ratio",
		.data		= &sysctl_min_unmapped_ratio,
		.maxlen		= sizeof(sysctl_min_unmapped_ratio),
		.mode		= 0644,
		.proc_handler	= sysctl_min_unmapped_ratio_sysctl_handler,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE_HUNDRED,
	},
	{
		.procname	= "min_slab_ratio",
		.data		= &sysctl_min_slab_ratio,
		.maxlen		= sizeof(sysctl_min_slab_ratio),
		.mode		= 0644,
		.proc_handler	= sysctl_min_slab_ratio_sysctl_handler,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE_HUNDRED,
	},
#endif
};

void __init page_alloc_sysctl_init(void)
{
	register_sysctl_init("vm", page_alloc_sysctl_table);
}
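
/*
 * Illustrative note (not part of the original source): the registration above
 * exposes each entry under /proc/sys/vm/, e.g. /proc/sys/vm/min_free_kbytes
 * and /proc/sys/vm/watermark_scale_factor, with writes range-checked by the
 * proc_dointvec_minmax-based handlers against the .extra1/.extra2 bounds.
 */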
#ifdef CONFIG_CONTIG_ALLOC
/* Usage: See admin-guide/dynamic-debug-howto.rst */
static void alloc_contig_dump_pages(struct list_head *page_list)
{
	DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure");

	if (DYNAMIC_DEBUG_BRANCH(descriptor)) {
		struct page *page;

		dump_stack();
		list_for_each_entry(page, page_list, lru)
			dump_page(page, "migration failure");
	}
}
/*
 * [start, end) must belong to a single zone.
 * @migratetype: using migratetype to filter the type of migration in
 *		trace_mm_alloc_contig_migrate_range_info.
 */
static int __alloc_contig_migrate_range(struct compact_control *cc,
					unsigned long start, unsigned long end,
					int migratetype)
{
	/* This function is based on compact_zone() from compaction.c. */
	unsigned int nr_reclaimed;
	unsigned long pfn = start;
	unsigned int tries = 0;
	int ret = 0;
	struct migration_target_control mtc = {
		.nid = zone_to_nid(cc->zone),
		.gfp_mask = cc->gfp_mask,
		.reason = MR_CONTIG_RANGE,
	};
	struct page *page;
	unsigned long total_mapped = 0;
	unsigned long total_migrated = 0;
	unsigned long total_reclaimed = 0;

	lru_cache_disable();

	while (pfn < end || !list_empty(&cc->migratepages)) {
		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		if (list_empty(&cc->migratepages)) {
			cc->nr_migratepages = 0;
			ret = isolate_migratepages_range(cc, pfn, end);
			if (ret && ret != -EAGAIN)
				break;
			pfn = cc->migrate_pfn;
			tries = 0;
		} else if (++tries == 5) {
			ret = -EBUSY;
			break;
		}

		nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
							     &cc->migratepages);
		cc->nr_migratepages -= nr_reclaimed;

		if (trace_mm_alloc_contig_migrate_range_info_enabled()) {
			total_reclaimed += nr_reclaimed;
			list_for_each_entry(page, &cc->migratepages, lru) {
				struct folio *folio = page_folio(page);

				total_mapped += folio_mapped(folio) *
						folio_nr_pages(folio);
			}
		}

		ret = migrate_pages(&cc->migratepages, alloc_migration_target,
			NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL);

		if (trace_mm_alloc_contig_migrate_range_info_enabled() && !ret)
			total_migrated += cc->nr_migratepages;

		/*
		 * On -ENOMEM, migrate_pages() bails out right away. It is
		 * pointless to retry over this error, so do the same here.
		 */
		if (ret == -ENOMEM)
			break;
	}

	lru_cache_enable();
	if (ret < 0) {
		if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY)
			alloc_contig_dump_pages(&cc->migratepages);
		putback_movable_pages(&cc->migratepages);
	}

	trace_mm_alloc_contig_migrate_range_info(start, end, migratetype,
						 total_migrated,
						 total_reclaimed,
						 total_mapped);
	return (ret < 0) ? ret : 0;
}
static void split_free_pages(struct list_head *list, gfp_t gfp_mask)
{
	int order;

	for (order = 0; order < NR_PAGE_ORDERS; order++) {
		struct page *page, *next;
		int nr_pages = 1 << order;

		list_for_each_entry_safe(page, next, &list[order], lru) {
			int i;

			post_alloc_hook(page, order, gfp_mask);
			set_page_refcounted(page);
			if (!order)
				continue;

			split_page(page, order);

			/* Add all subpages to the order-0 head, in sequence. */
			list_del(&page->lru);
			for (i = 0; i < nr_pages; i++)
				list_add_tail(&page[i].lru, &list[0]);
		}
	}
}
static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask)
{
	const gfp_t reclaim_mask = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
	const gfp_t action_mask = __GFP_COMP | __GFP_RETRY_MAYFAIL | __GFP_NOWARN |
				  __GFP_ZERO | __GFP_ZEROTAGS | __GFP_SKIP_ZERO;
	const gfp_t cc_action_mask = __GFP_RETRY_MAYFAIL | __GFP_NOWARN;

	/*
	 * We are given the range to allocate; node, mobility and placement
	 * hints are irrelevant at this point. We'll simply ignore them.
	 */
	gfp_mask &= ~(GFP_ZONEMASK | __GFP_RECLAIMABLE | __GFP_WRITE |
		      __GFP_HARDWALL | __GFP_THISNODE | __GFP_MOVABLE);

	/*
	 * We only support most reclaim flags (but not NOFAIL/NORETRY), and
	 * selected action flags.
	 */
	if (gfp_mask & ~(reclaim_mask | action_mask))
		return -EINVAL;

	/*
	 * Flags to control page compaction/migration/reclaim, to free up our
	 * page range. Migratable pages are movable, __GFP_MOVABLE is implied
	 * for them.
	 *
	 * Traditionally we always had __GFP_RETRY_MAYFAIL set, keep doing that
	 * to not degrade callers.
	 */
	*gfp_cc_mask = (gfp_mask & (reclaim_mask | cc_action_mask)) |
		       __GFP_MOVABLE | __GFP_RETRY_MAYFAIL;

	return 0;
}
/**
 * alloc_contig_range() -- tries to allocate given range of pages
 * @start:	start PFN to allocate
 * @end:	one-past-the-last PFN to allocate
 * @migratetype:	migratetype of the underlying pageblocks (either
 *			#MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
 *			in range must have the same migratetype and it must
 *			be either of the two.
 * @gfp_mask:	GFP mask. Node/zone/placement hints are ignored; only some
 *		action and reclaim modifiers are supported. Reclaim modifiers
 *		control allocation behavior during compaction/migration/reclaim.
 *
 * The PFN range does not have to be pageblock aligned. The PFN range must
 * belong to a single zone.
 *
 * The first thing this routine does is attempt to MIGRATE_ISOLATE all
 * pageblocks in the range. Once isolated, the pageblocks should not
 * be modified by others.
 *
 * Return: zero on success or negative error code. On success all
 * pages whose PFN is in [start, end) are allocated for the caller and
 * need to be freed with free_contig_range().
 */
int alloc_contig_range_noprof(unsigned long start, unsigned long end,
			      unsigned migratetype, gfp_t gfp_mask)
{
	unsigned long outer_start, outer_end;
	int ret = 0;

	struct compact_control cc = {
		.nr_migratepages = 0,
		.order = -1,
		.zone = page_zone(pfn_to_page(start)),
		.mode = MIGRATE_SYNC,
		.ignore_skip_hint = true,
		.no_set_skip_hint = true,
		.alloc_contig = true,
	};
	INIT_LIST_HEAD(&cc.migratepages);

	gfp_mask = current_gfp_context(gfp_mask);
	if (__alloc_contig_verify_gfp_mask(gfp_mask, (gfp_t *)&cc.gfp_mask))
		return -EINVAL;

	/*
	 * What we do here is we mark all pageblocks in range as
	 * MIGRATE_ISOLATE. Because pageblock and max order pages may
	 * have different sizes, and due to the way the page allocator
	 * works, start_isolate_page_range() has special handling for this.
	 *
	 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
	 * migrate the pages from an unaligned range (ie. pages that
	 * we are interested in). This will put all the pages in
	 * range back to page allocator as MIGRATE_ISOLATE.
	 *
	 * When this is done, we take the pages in range from page
	 * allocator removing them from the buddy system. This way
	 * page allocator will never consider using them.
	 *
	 * This lets us mark the pageblocks back as
	 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
	 * aligned range but not in the unaligned, original range are
	 * put back to page allocator so that buddy can use them.
	 */
	ret = start_isolate_page_range(start, end, migratetype, 0);
	if (ret)
		goto done;

	drain_all_pages(cc.zone);

	/*
	 * In case of -EBUSY, we'd like to know which page causes problem.
	 * So, just fall through. test_pages_isolated() has a tracepoint
	 * which will report the busy page.
	 *
	 * It is possible that busy pages could become available before
	 * the call to test_pages_isolated, and the range will actually be
	 * allocated. So, if we fall through be sure to clear ret so that
	 * -EBUSY is not accidentally used or returned to caller.
	 */
	ret = __alloc_contig_migrate_range(&cc, start, end, migratetype);
	if (ret && ret != -EBUSY)
		goto done;

	/*
	 * When in-use hugetlb pages are migrated, they may simply be released
	 * back into the free hugepage pool instead of being returned to the
	 * buddy system. After the migration of in-use huge pages is completed,
	 * we will invoke replace_free_hugepage_folios() to ensure that these
	 * hugepages are properly released to the buddy system.
	 */
	ret = replace_free_hugepage_folios(start, end);
	if (ret)
		goto done;

	/*
	 * Pages from [start, end) are within a pageblock_nr_pages
	 * aligned blocks that are marked as MIGRATE_ISOLATE. What's
	 * more, all pages in [start, end) are free in page allocator.
	 * What we are going to do is to allocate all pages from
	 * [start, end) (that is remove them from page allocator).
	 *
	 * The only problem is that pages at the beginning and at the
	 * end of interesting range may be not aligned with pages that
	 * page allocator holds, ie. they can be part of higher order
	 * pages. Because of this, we reserve the bigger range and
	 * once this is done free the pages we are not interested in.
	 *
	 * We don't have to hold zone->lock here because the pages are
	 * isolated thus they won't get removed from buddy.
	 */
	outer_start = find_large_buddy(start);

	/* Make sure the range is really isolated. */
	if (test_pages_isolated(outer_start, end, 0)) {
		ret = -EBUSY;
		goto done;
	}

	/* Grab isolated pages from freelists. */
	outer_end = isolate_freepages_range(&cc, outer_start, end);
	if (!outer_end) {
		ret = -EBUSY;
		goto done;
	}

	if (!(gfp_mask & __GFP_COMP)) {
		split_free_pages(cc.freepages, gfp_mask);

		/* Free head and tail (if any) */
		if (start != outer_start)
			free_contig_range(outer_start, start - outer_start);
		if (end != outer_end)
			free_contig_range(end, outer_end - end);
	} else if (start == outer_start && end == outer_end && is_power_of_2(end - start)) {
		struct page *head = pfn_to_page(start);
		int order = ilog2(end - start);

		check_new_pages(head, order);
		prep_new_page(head, order, gfp_mask, 0);
		set_page_refcounted(head);
	} else {
		ret = -EINVAL;
		WARN(true, "PFN range: requested [%lu, %lu), allocated [%lu, %lu)\n",
		     start, end, outer_start, outer_end);
	}
done:
	undo_isolate_page_range(start, end, migratetype);
	return ret;
}
EXPORT_SYMBOL(alloc_contig_range_noprof);
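
/*
 * Usage sketch (hypothetical, not part of the original file): a caller that
 * already knows an isolatable MIGRATE_MOVABLE PFN range could do roughly:
 *
 *	int err = alloc_contig_range(pfn, pfn + nr, MIGRATE_MOVABLE,
 *				     GFP_KERNEL | __GFP_NOWARN);
 *	if (!err) {
 *		use_the_range(pfn, nr);		// hypothetical helper
 *		free_contig_range(pfn, nr);
 *	}
 *
 * alloc_contig_range() is the allocation-hook wrapper around the _noprof
 * variant defined above.
 */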
static int __alloc_contig_pages(unsigned long start_pfn,
				unsigned long nr_pages, gfp_t gfp_mask)
{
	unsigned long end_pfn = start_pfn + nr_pages;

	return alloc_contig_range_noprof(start_pfn, end_pfn, MIGRATE_MOVABLE,
					 gfp_mask);
}

static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
				   unsigned long nr_pages)
{
	unsigned long i, end_pfn = start_pfn + nr_pages;
	struct page *page;

	for (i = start_pfn; i < end_pfn; i++) {
		page = pfn_to_online_page(i);
		if (!page)
			return false;

		if (page_zone(page) != z)
			return false;

		if (PageReserved(page))
			return false;

		if (PageHuge(page))
			return false;
	}
	return true;
}

static bool zone_spans_last_pfn(const struct zone *zone,
				unsigned long start_pfn, unsigned long nr_pages)
{
	unsigned long last_pfn = start_pfn + nr_pages - 1;

	return zone_spans_pfn(zone, last_pfn);
}

/**
 * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
 * @nr_pages:	Number of contiguous pages to allocate
 * @gfp_mask:	GFP mask. Node/zone/placement hints limit the search; only some
 *		action and reclaim modifiers are supported. Reclaim modifiers
 *		control allocation behavior during compaction/migration/reclaim.
 * @nid:	Target node
 * @nodemask:	Mask for other possible nodes
 *
 * This routine is a wrapper around alloc_contig_range(). It scans over zones
 * on an applicable zonelist to find a contiguous pfn range which can then be
 * tried for allocation with alloc_contig_range(). This routine is intended
 * for allocation requests which cannot be fulfilled with the buddy allocator.
 *
 * The allocated memory is always aligned to a page boundary. If nr_pages is a
 * power of two, then the allocated range is also guaranteed to be aligned to
 * the same nr_pages (e.g. a 1GB request would be aligned to 1GB).
 *
 * Allocated pages can be freed with free_contig_range() or by manually calling
 * __free_page() on each allocated page.
 *
 * Return: pointer to contiguous pages on success, or NULL if not successful.
 */
struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask,
				       int nid, nodemask_t *nodemask)
{
	unsigned long ret, pfn, flags;
	struct zonelist *zonelist;
	struct zone *zone;
	struct zoneref *z;

	zonelist = node_zonelist(nid, gfp_mask);
	for_each_zone_zonelist_nodemask(zone, z, zonelist,
					gfp_zone(gfp_mask), nodemask) {
		spin_lock_irqsave(&zone->lock, flags);

		pfn = ALIGN(zone->zone_start_pfn, nr_pages);
		while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
			if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
				/*
				 * We release the zone lock here because
				 * alloc_contig_range() will also lock the zone
				 * at some point. If there's an allocation
				 * spinning on this lock, it may win the race
				 * and cause alloc_contig_range() to fail...
				 */
				spin_unlock_irqrestore(&zone->lock, flags);
				ret = __alloc_contig_pages(pfn, nr_pages,
							   gfp_mask);
				if (!ret)
					return pfn_to_page(pfn);
				spin_lock_irqsave(&zone->lock, flags);
			}
			pfn += nr_pages;
		}
		spin_unlock_irqrestore(&zone->lock, flags);
	}
	return NULL;
}
#endif /* CONFIG_CONTIG_ALLOC */
void free_contig_range(unsigned long pfn, unsigned long nr_pages)
{
	unsigned long count = 0;
	struct folio *folio = pfn_folio(pfn);

	if (folio_test_large(folio)) {
		int expected = folio_nr_pages(folio);

		if (nr_pages == expected)
			folio_put(folio);
		else
			WARN(true, "PFN %lu: nr_pages %lu != expected %d\n",
			     pfn, nr_pages, expected);
		return;
	}

	for (; nr_pages--; pfn++) {
		struct page *page = pfn_to_page(pfn);

		count += page_count(page) != 1;
		__free_page(page);
	}
	WARN(count != 0, "%lu pages are still in use!\n", count);
}
EXPORT_SYMBOL(free_contig_range);
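
/*
 * Usage sketch (hypothetical, not part of the original file): callers that
 * only need "some" physically contiguous block, e.g. a large buffer on a
 * preferred node, typically pair the two exports in this file (with
 * CONFIG_CONTIG_ALLOC enabled):
 *
 *	struct page *page = alloc_contig_pages(1 << 9, GFP_KERNEL,
 *					       first_online_node, NULL);
 *	if (page)
 *		free_contig_range(page_to_pfn(page), 1 << 9);
 */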
/*
 * Effectively disable pcplists for the zone by setting the high limit to 0
 * and draining all cpus. A concurrent page freeing on another CPU that's about
 * to put the page on pcplist will either finish before the drain and the page
 * will be drained, or observe the new high limit and skip the pcplist.
 *
 * Must be paired with a call to zone_pcp_enable().
 */
void zone_pcp_disable(struct zone *zone)
{
	mutex_lock(&pcp_batch_high_lock);
	__zone_set_pageset_high_and_batch(zone, 0, 0, 1);
	__drain_all_pages(zone, true);
}

void zone_pcp_enable(struct zone *zone)
{
	__zone_set_pageset_high_and_batch(zone, zone->pageset_high_min,
		zone->pageset_high_max, zone->pageset_batch);
	mutex_unlock(&pcp_batch_high_lock);
}
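
/*
 * Usage sketch (illustrative, not from the original source): callers such as
 * memory offlining bracket their work with the pair above so no page can hide
 * on a per-cpu list while the range is being isolated:
 *
 *	zone_pcp_disable(zone);
 *	// ... isolate / offline pages belonging to this zone ...
 *	zone_pcp_enable(zone);
 *
 * Note that zone_pcp_disable() returns with pcp_batch_high_lock held, which
 * is why the two calls must always be paired.
 */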
void zone_pcp_reset(struct zone *zone)
{
	int cpu;
	struct per_cpu_zonestat *pzstats;

	if (zone->per_cpu_pageset != &boot_pageset) {
		for_each_online_cpu(cpu) {
			pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
			drain_zonestat(zone, pzstats);
		}
		free_percpu(zone->per_cpu_pageset);
		zone->per_cpu_pageset = &boot_pageset;
		if (zone->per_cpu_zonestats != &boot_zonestats) {
			free_percpu(zone->per_cpu_zonestats);
			zone->per_cpu_zonestats = &boot_zonestats;
		}
	}
}
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * All pages in the range must be in a single zone, must not contain holes,
 * must span full sections, and must be isolated before calling this function.
 *
 * Returns the number of managed (non-PageOffline()) pages in the range: the
 * number of pages for which memory offlining code must adjust managed page
 * counters using adjust_managed_page_count().
 */
unsigned long __offline_isolated_pages(unsigned long start_pfn,
				       unsigned long end_pfn)
{
	unsigned long already_offline = 0, flags;
	unsigned long pfn = start_pfn;
	struct page *page;
	struct zone *zone;
	unsigned int order;

	offline_mem_sections(pfn, end_pfn);
	zone = page_zone(pfn_to_page(pfn));
	spin_lock_irqsave(&zone->lock, flags);
	while (pfn < end_pfn) {
		page = pfn_to_page(pfn);
		/*
		 * The HWPoisoned page may not be in the buddy system, and
		 * page_count() is not 0.
		 */
		if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
			pfn++;
			continue;
		}
		/*
		 * At this point all remaining PageOffline() pages have a
		 * reference count of 0 and can simply be skipped.
		 */
		if (PageOffline(page)) {
			BUG_ON(page_count(page));
			BUG_ON(PageBuddy(page));
			already_offline++;
			pfn++;
			continue;
		}

		BUG_ON(page_count(page));
		BUG_ON(!PageBuddy(page));
		VM_WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE);
		order = buddy_order(page);
		del_page_from_free_list(page, zone, order, MIGRATE_ISOLATE);
		pfn += (1 << order);
	}
	spin_unlock_irqrestore(&zone->lock, flags);

	return end_pfn - start_pfn - already_offline;
}
#endif
/*
 * This function returns a stable result only if called under zone lock.
 */
bool is_free_buddy_page(const struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	unsigned int order;

	for (order = 0; order < NR_PAGE_ORDERS; order++) {
		const struct page *head = page - (pfn & ((1 << order) - 1));

		if (PageBuddy(head) &&
		    buddy_order_unsafe(head) >= order)
			break;
	}

	return order <= MAX_PAGE_ORDER;
}
EXPORT_SYMBOL(is_free_buddy_page);
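
/*
 * Worked example (illustrative only, not from the original source): for
 * pfn 0x1003 and order 2, pfn & ((1 << 2) - 1) == 3, so "head" is the page at
 * pfn 0x1000. If that page is PageBuddy with buddy_order() >= 2, the queried
 * page lies inside a free order-2 (or larger) block and the loop above breaks
 * with order <= MAX_PAGE_ORDER, i.e. the page is reported as free.
 */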
#ifdef CONFIG_MEMORY_FAILURE
static inline void add_to_free_list(struct page *page, struct zone *zone,
				    unsigned int order, int migratetype,
				    bool tail)
{
	__add_to_free_list(page, zone, order, migratetype, tail);
	account_freepages(zone, 1 << order, migratetype);
}

/*
 * Break down a higher-order page in sub-pages, and keep our target out of
 * buddy allocator.
 */
static void break_down_buddy_pages(struct zone *zone, struct page *page,
				   struct page *target, int low, int high,
				   int migratetype)
{
	unsigned long size = 1 << high;
	struct page *current_buddy;

	while (high > low) {
		high--;
		size >>= 1;

		if (target >= &page[size]) {
			current_buddy = page;
			page = page + size;
		} else {
			current_buddy = page + size;
		}

		if (set_page_guard(zone, current_buddy, high))
			continue;

		add_to_free_list(current_buddy, zone, high, migratetype, false);
		set_buddy_order(current_buddy, high);
	}
}

/*
 * Take a page that will be marked as poisoned off the buddy allocator.
 */
bool take_page_off_buddy(struct page *page)
{
	struct zone *zone = page_zone(page);
	unsigned long pfn = page_to_pfn(page);
	unsigned long flags;
	unsigned int order;
	bool ret = false;

	spin_lock_irqsave(&zone->lock, flags);
	for (order = 0; order < NR_PAGE_ORDERS; order++) {
		struct page *page_head = page - (pfn & ((1 << order) - 1));
		int page_order = buddy_order(page_head);

		if (PageBuddy(page_head) && page_order >= order) {
			unsigned long pfn_head = page_to_pfn(page_head);
			int migratetype = get_pfnblock_migratetype(page_head,
								   pfn_head);

			del_page_from_free_list(page_head, zone, page_order,
						migratetype);
			break_down_buddy_pages(zone, page_head, page, 0,
					       page_order, migratetype);
			SetPageHWPoisonTakenOff(page);
			ret = true;
			break;
		}
		if (page_count(page_head) > 0)
			break;
	}
	spin_unlock_irqrestore(&zone->lock, flags);
	return ret;
}

/*
 * Cancel takeoff done by take_page_off_buddy().
 */
bool put_page_back_buddy(struct page *page)
{
	struct zone *zone = page_zone(page);
	unsigned long flags;
	bool ret = false;

	spin_lock_irqsave(&zone->lock, flags);
	if (put_page_testzero(page)) {
		unsigned long pfn = page_to_pfn(page);
		int migratetype = get_pfnblock_migratetype(page, pfn);

		ClearPageHWPoisonTakenOff(page);
		__free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE);
		if (TestClearPageHWPoison(page))
			ret = true;
	}
	spin_unlock_irqrestore(&zone->lock, flags);

	return ret;
}
#endif /* CONFIG_MEMORY_FAILURE */
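
/*
 * Usage sketch (hypothetical, not part of the original file): memory-failure
 * handling takes a soon-to-be-poisoned free page off the buddy lists and can
 * later undo that if the poisoning is cancelled:
 *
 *	if (take_page_off_buddy(page)) {
 *		// ... record / handle the HWPoison page ...
 *		if (need_to_undo)		// hypothetical condition
 *			put_page_back_buddy(page);
 *	}
 */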
#ifdef CONFIG_ZONE_DMA
bool has_managed_dma(void)
{
	struct pglist_data *pgdat;

	for_each_online_pgdat(pgdat) {
		struct zone *zone = &pgdat->node_zones[ZONE_DMA];

		if (managed_zone(zone))
			return true;
	}
	return false;
}
#endif /* CONFIG_ZONE_DMA */
#ifdef CONFIG_UNACCEPTED_MEMORY

static bool lazy_accept = true;

static int __init accept_memory_parse(char *p)
{
	if (!strcmp(p, "lazy")) {
		lazy_accept = true;
		return 0;
	} else if (!strcmp(p, "eager")) {
		lazy_accept = false;
		return 0;
	} else {
		return -EINVAL;
	}
}
early_param("accept_memory", accept_memory_parse);

static bool page_contains_unaccepted(struct page *page, unsigned int order)
{
	phys_addr_t start = page_to_phys(page);

	return range_contains_unaccepted_memory(start, PAGE_SIZE << order);
}

static void __accept_page(struct zone *zone, unsigned long *flags,
			  struct page *page)
{
	list_del(&page->lru);
	account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
	__mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
	__ClearPageUnaccepted(page);
	spin_unlock_irqrestore(&zone->lock, *flags);

	accept_memory(page_to_phys(page), PAGE_SIZE << MAX_PAGE_ORDER);

	__free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL);
}
void accept_page(struct page *page)
{
	struct zone *zone = page_zone(page);
	unsigned long flags;

	spin_lock_irqsave(&zone->lock, flags);
	if (!PageUnaccepted(page)) {
		spin_unlock_irqrestore(&zone->lock, flags);
		return;
	}

	/* Unlocks zone->lock */
	__accept_page(zone, &flags, page);
}
static bool try_to_accept_memory_one(struct zone *zone)
{
	unsigned long flags;
	struct page *page;

	spin_lock_irqsave(&zone->lock, flags);
	page = list_first_entry_or_null(&zone->unaccepted_pages,
					struct page, lru);
	if (!page) {
		spin_unlock_irqrestore(&zone->lock, flags);
		return false;
	}

	/* Unlocks zone->lock */
	__accept_page(zone, &flags, page);

	return true;
}
static bool cond_accept_memory(struct zone *zone, unsigned int order,
			       int alloc_flags)
{
	long to_accept, wmark;
	bool ret = false;

	if (list_empty(&zone->unaccepted_pages))
		return false;

	/* Bailout, since try_to_accept_memory_one() needs to take a lock */
	if (alloc_flags & ALLOC_TRYLOCK)
		return false;

	wmark = promo_wmark_pages(zone);

	/*
	 * Watermarks have not been initialized yet.
	 *
	 * Accepting one MAX_ORDER page to ensure progress.
	 */
	if (!wmark)
		return try_to_accept_memory_one(zone);

	/* How much to accept to get to promo watermark? */
	to_accept = wmark -
		    (zone_page_state(zone, NR_FREE_PAGES) -
		    __zone_watermark_unusable_free(zone, order, 0) -
		    zone_page_state(zone, NR_UNACCEPTED));

	while (to_accept > 0) {
		if (!try_to_accept_memory_one(zone))
			break;
		ret = true;
		to_accept -= MAX_ORDER_NR_PAGES;
	}

	return ret;
}
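
/*
 * Worked example (illustrative only, numbers assumed): if the promo watermark
 * is 64k pages and NR_FREE_PAGES is 80k of which 40k are still NR_UNACCEPTED,
 * the usable accepted pool is roughly 80k - 40k = 40k pages, so to_accept is
 * about 64k - 40k = 24k pages (minus any unusable free pages) and the loop
 * above accepts MAX_ORDER blocks until that deficit is covered or acceptance
 * fails.
 */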
static bool __free_unaccepted(struct page *page)
{
	struct zone *zone = page_zone(page);
	unsigned long flags;

	if (!lazy_accept)
		return false;

	spin_lock_irqsave(&zone->lock, flags);
	list_add_tail(&page->lru, &zone->unaccepted_pages);
	account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
	__mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES);
	__SetPageUnaccepted(page);
	spin_unlock_irqrestore(&zone->lock, flags);

	return true;
}

#else

static bool page_contains_unaccepted(struct page *page, unsigned int order)
{
	return false;
}

static bool cond_accept_memory(struct zone *zone, unsigned int order,
			       int alloc_flags)
{
	return false;
}

static bool __free_unaccepted(struct page *page)
{
	BUILD_BUG();
	return false;
}

#endif /* CONFIG_UNACCEPTED_MEMORY */
/**
 * alloc_pages_nolock - opportunistic reentrant allocation from any context
 * @nid: node to allocate from
 * @order: allocation order size
 *
 * Allocates pages of a given order from the given node. This is safe to
 * call from any context (from atomic, NMI, and also reentrant
 * allocator -> tracepoint -> alloc_pages_nolock_noprof).
 * Allocation is best effort and expected to fail easily, so nobody should
 * rely on the success. Failures are not reported via warn_alloc().
 * See always fail conditions below.
 *
 * Return: allocated page or NULL on failure. NULL does not mean EBUSY or EAGAIN.
 * It means ENOMEM. There is no reason to call it again and expect !NULL.
 */
struct page *alloc_pages_nolock_noprof(int nid, unsigned int order)
{
	/*
	 * Do not specify __GFP_DIRECT_RECLAIM, since direct reclaim is not allowed.
	 * Do not specify __GFP_KSWAPD_RECLAIM either, since wake up of kswapd
	 * is not safe in arbitrary context.
	 *
	 * These two are the conditions for gfpflags_allow_spinning() being true.
	 *
	 * Specify __GFP_NOWARN since failing alloc_pages_nolock() is not a reason
	 * to warn. Also warn would trigger printk() which is unsafe from
	 * various contexts. We cannot use printk_deferred_enter() to mitigate,
	 * since the running context is unknown.
	 *
	 * Specify __GFP_ZERO to make sure that call to kmsan_alloc_page() below
	 * is safe in any context. Also zeroing the page is mandatory for
	 * BPF use cases.
	 *
	 * Though __GFP_NOMEMALLOC is not checked in the code path below,
	 * specify it here to highlight that alloc_pages_nolock()
	 * doesn't want to deplete reserves.
	 */
	gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC
			  | __GFP_ACCOUNT;
	unsigned int alloc_flags = ALLOC_TRYLOCK;
	struct alloc_context ac = { };
	struct page *page;

	/*
	 * In PREEMPT_RT spin_trylock() will call raw_spin_lock() which is
	 * unsafe in NMI. If spin_trylock() is called from hard IRQ the current
	 * task may be waiting for one rt_spin_lock, but rt_spin_trylock() will
	 * mark the task as the owner of another rt_spin_lock which will
	 * confuse PI logic, so return immediately if called from hard IRQ or
	 * NMI.
	 *
	 * Note, irqs_disabled() case is ok. This function can be called
	 * from raw_spin_lock_irqsave region.
	 */
	if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq()))
		return NULL;
	if (!pcp_allowed_order(order))
		return NULL;

	/* Bailout, since _deferred_grow_zone() needs to take a lock */
	if (deferred_pages_enabled())
		return NULL;

	if (nid == NUMA_NO_NODE)
		nid = numa_node_id();

	prepare_alloc_pages(alloc_gfp, order, nid, NULL, &ac,
			    &alloc_gfp, &alloc_flags);

	/*
	 * Best effort allocation from percpu free list.
	 * If it's empty attempt to spin_trylock zone->lock.
	 */
	page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);

	/* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */

	if (page)
		set_page_refcounted(page);

	if (memcg_kmem_online() && page &&
	    unlikely(__memcg_kmem_charge_page(page, alloc_gfp, order) != 0)) {
		free_pages_nolock(page, order);
		page = NULL;
	}
	trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
	kmsan_alloc_page(page, order, alloc_gfp);