HugeTLB Vmemmap Optimization (HVO) reduces memory usage by freeing most of
the vmemmap pages that back huge pages and remapping the freed range to a
single page holding the struct page metadata.
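For example, assuming x86-64 with 4 KiB base pages and a 64-byte struct
page, a 2 MiB huge page needs:

	512 struct pages * 64 bytes = 32 KiB = 8 vmemmap pages

HVO keeps one of those eight pages and remaps the other seven to it.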
With the new mask-based compound_info encoding (for power-of-2 struct page
sizes), all tail pages of the same order are now identical regardless of
which compound page they belong to. This means the tail pages can be
truly shared without fake heads.
Allocate a single page of initialized tail struct pages per zone, per
order, and store it in the vmemmap_tails[] array in struct zone. All huge
pages of that order in the zone share this tail page, which is mapped
read-only into their vmemmap. The head page remains unique per huge page.
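Looking up the shared tail is then a per-zone array index (see
vmemmap_get_tail()):

	tail = zone->vmemmap_tails[order - VMEMMAP_TAIL_MIN_ORDER];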
Redefine MAX_FOLIO_ORDER using ilog2(). The define has to produce a
compile-time constant, as it is used to size the vmemmap_tails[] array.
For some reason the compiler is not able to evaluate get_order() at
compile time, but ilog2() works; on 64-bit with 4 KiB pages this gives
ilog2(SZ_16G) - PAGE_SHIFT = 34 - 12 = 22.
Avoid PUD_ORDER when defining MAX_FOLIO_ORDER, as it would add a
dependency on <linux/pgtable.h> and create a hard-to-break include loop.
This eliminates fake heads while maintaining the same memory savings, and
simplifies compound_head() by removing fake head detection.
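For illustration, a sketch of what the simplified helper can look like.
The exact layout below (folio order in compound_info, bit 0 as the tail
flag) is assumed for the sketch, not taken verbatim from the series:

	static inline struct page *compound_head(struct page *page)
	{
		unsigned long info = READ_ONCE(page->compound_info);

		if (!(info & 1))	/* assumed tail flag */
			return page;
		/*
		 * Power-of-2 struct page: the head falls out of masking
		 * the tail's own vmemmap address.
		 */
		return (struct page *)((unsigned long)page &
				~((sizeof(struct page) << (info >> 1)) - 1));
	}

Because the head is recovered purely by masking, a tail carries no
per-folio state and can be shared by every folio of the same order in
the zone.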
Link: https://lkml.kernel.org/r/20260227194302.274384-13-kas@kernel.org
Signed-off-by: Kiryl Shutsemau <kas@kernel.org>
Reviewed-by: Vlastimil Babka (SUSE) <vbabka@kernel.org>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Baoquan He <bhe@redhat.com>
Cc: Christoph Lameter <cl@gentwo.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Frank van der Linden <fvdl@google.com>
Cc: Harry Yoo <harry.yoo@oracle.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Usama Arif <usamaarif642@gmail.com>
Cc: WANG Xuerui <kernel@xen0n.name>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
int node, struct vmem_altmap *altmap);
int vmemmap_populate(unsigned long start, unsigned long end, int node,
struct vmem_altmap *altmap);
-int vmemmap_populate_hvo(unsigned long start, unsigned long end, int node,
+int vmemmap_populate_hvo(unsigned long start, unsigned long end,
+ unsigned int order, struct zone *zone,
unsigned long headsize);
void vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node,
unsigned long headsize);
* currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect
* no folios larger than 16 GiB on 64bit and 1 GiB on 32bit.
*/
-#define MAX_FOLIO_ORDER get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G)
+#ifdef CONFIG_64BIT
+#define MAX_FOLIO_ORDER (ilog2(SZ_16G) - PAGE_SHIFT)
+#else
+#define MAX_FOLIO_ORDER (ilog2(SZ_1G) - PAGE_SHIFT)
+#endif
#else
/*
* Without hugetlb, gigantic folios that are bigger than a single PUD are
* currently impossible.
*/
-#define MAX_FOLIO_ORDER PUD_ORDER
+#define MAX_FOLIO_ORDER (PUD_SHIFT - PAGE_SHIFT)
#endif
#define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER)
is_power_of_2(sizeof(struct page)) ? \
MAX_FOLIO_NR_PAGES * sizeof(struct page) : 0)
+/*
+ * vmemmap optimization (like HVO) is only possible for page orders that fill
+ * two or more pages with struct pages.
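+ * With 4 KiB pages and a 64-byte struct page that is ilog2(128) = 7.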
+ */
+#define VMEMMAP_TAIL_MIN_ORDER (ilog2(2 * PAGE_SIZE / sizeof(struct page)))
+#define __NR_VMEMMAP_TAILS (MAX_FOLIO_ORDER - VMEMMAP_TAIL_MIN_ORDER + 1)
+#define NR_VMEMMAP_TAILS (__NR_VMEMMAP_TAILS > 0 ? __NR_VMEMMAP_TAILS : 0)
+
enum migratetype {
MIGRATE_UNMOVABLE,
MIGRATE_MOVABLE,
/* Zone statistics */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+ struct page *vmemmap_tails[NR_VMEMMAP_TAILS];
+#endif
} ____cacheline_internodealigned_in_smp;
enum pgdat_flags {
#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"
+#include "internal.h"
/**
* struct vmemmap_remap_walk - walk vmemmap page table
return true;
}
+static struct page *vmemmap_get_tail(unsigned int order, struct zone *zone)
+{
+ const unsigned int idx = order - VMEMMAP_TAIL_MIN_ORDER;
+ struct page *tail, *p;
+ int node = zone_to_nid(zone);
+
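+ /* Lockless fast path; pairs with the cmpxchg() install below. */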
+ tail = READ_ONCE(zone->vmemmap_tails[idx]);
+ if (likely(tail))
+ return tail;
+
+ tail = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+ if (!tail)
+ return NULL;
+
+ p = page_to_virt(tail);
+ for (int i = 0; i < PAGE_SIZE / sizeof(struct page); i++)
+ init_compound_tail(p + i, NULL, order, zone);
+
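+ /* If another CPU installed a tail first, free ours and use theirs. */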
+ if (cmpxchg(&zone->vmemmap_tails[idx], NULL, tail)) {
+ __free_page(tail);
+ tail = READ_ONCE(zone->vmemmap_tails[idx]);
+ }
+
+ return tail;
+}
+
static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
struct folio *folio,
struct list_head *vmemmap_pages,
if (!vmemmap_should_optimize_folio(h, folio))
return ret;
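+ /* Get the shared tail up front so we fail before any remapping work. */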
+ nid = folio_nid(folio);
+ vmemmap_tail = vmemmap_get_tail(h->order, folio_zone(folio));
+ if (!vmemmap_tail)
+ return -ENOMEM;
+
static_branch_inc(&hugetlb_optimize_vmemmap_key);
if (flags & VMEMMAP_SYNCHRONIZE_RCU)
*/
folio_set_hugetlb_vmemmap_optimized(folio);
- nid = folio_nid(folio);
vmemmap_head = alloc_pages_node(nid, GFP_KERNEL, 0);
if (!vmemmap_head) {
ret = -ENOMEM;
list_add(&vmemmap_head->lru, vmemmap_pages);
memmap_pages_add(1);
- vmemmap_tail = vmemmap_head;
vmemmap_start = (unsigned long)&folio->page;
vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
}
}
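+/* Find the zone on @nid that spans @pfn; NULL if no zone does. */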
+static struct zone *pfn_to_zone(int nid, unsigned long pfn)
+{
+ struct zone *zone;
+ enum zone_type zone_type;
+
+ for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
+ zone = &NODE_DATA(nid)->node_zones[zone_type];
+ if (zone_spans_pfn(zone, pfn))
+ return zone;
+ }
+
+ return NULL;
+}
+
void __init hugetlb_vmemmap_init_late(int nid)
{
struct huge_bootmem_page *m, *tm;
unsigned long phys, nr_pages, start, end;
unsigned long pfn, nr_mmap;
+ struct zone *zone = NULL;
struct hstate *h;
void *map;
continue;
}
- if (vmemmap_populate_hvo(start, end, nid,
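+ /* Cache the zone; consecutive bootmem huge pages usually share one. */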
+ if (!zone || !zone_spans_pfn(zone, pfn))
+ zone = pfn_to_zone(nid, pfn);
+ if (WARN_ON_ONCE(!zone))
+ continue;
+
+ if (vmemmap_populate_hvo(start, end, huge_page_order(h), zone,
HUGETLB_VMEMMAP_RESERVE_SIZE) < 0) {
/* Fallback if HVO population fails */
vmemmap_populate(start, end, nid, NULL);
static int __init hugetlb_vmemmap_init(void)
{
const struct hstate *h;
+ struct zone *zone;
/* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES);
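+
+ /*
+ * Shared tails allocated before memmap_init() were deliberately left
+ * uninitialized (see the boot-time vmemmap_get_tail()); initialize
+ * them now.
+ */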
+ for_each_zone(zone) {
+ for (int i = 0; i < NR_VMEMMAP_TAILS; i++) {
+ struct page *tail, *p;
+ unsigned int order;
+
+ tail = zone->vmemmap_tails[i];
+ if (!tail)
+ continue;
+
+ order = i + VMEMMAP_TAIL_MIN_ORDER;
+ p = page_to_virt(tail);
+ for (int j = 0; j < PAGE_SIZE / sizeof(struct page); j++)
+ init_compound_tail(p + j, NULL, order, zone);
+ }
+ }
+
for_each_hstate(h) {
if (hugetlb_vmemmap_optimizable(h)) {
register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
set_page_private(tail, 0);
}
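+/* Set the zone/node links and tail state on one shared tail struct page. */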
+static inline void init_compound_tail(struct page *tail,
+ const struct page *head, unsigned int order, struct zone *zone)
+{
+ atomic_set(&tail->_mapcount, -1);
+ set_page_node(tail, zone_to_nid(zone));
+ set_page_zone(tail, zone_idx(zone));
+ prep_compound_tail(tail, head, order);
+}
+
void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags);
extern bool free_pages_prepare(struct page *page, unsigned int order);
}
}
-/*
- * Populate vmemmap pages HVO-style. The first page contains the head
- * page and needed tail pages, the other ones are mirrors of the first
- * page.
- */
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+static __meminit struct page *vmemmap_get_tail(unsigned int order, struct zone *zone)
+{
+ struct page *p, *tail;
+ unsigned int idx;
+ int node = zone_to_nid(zone);
+
+ if (WARN_ON_ONCE(order < VMEMMAP_TAIL_MIN_ORDER))
+ return NULL;
+ if (WARN_ON_ONCE(order > MAX_FOLIO_ORDER))
+ return NULL;
+
+ idx = order - VMEMMAP_TAIL_MIN_ORDER;
+ tail = zone->vmemmap_tails[idx];
+ if (tail)
+ return tail;
+
+ /*
+ * Only allocate the page, but do not initialize it.
+ *
+ * Any initialization done here will be overwritten by memmap_init().
+ *
+ * hugetlb_vmemmap_init() will take care of initialization after
+ * memmap_init().
+ */
+
+ p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
+ if (!p)
+ return NULL;
+
+ tail = virt_to_page(p);
+ zone->vmemmap_tails[idx] = tail;
+
+ return tail;
+}
+
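+/*
+ * Populate vmemmap pages HVO-style: real pages back the head portion, and
+ * the zone's shared tail page is mapped for the rest of the range.
+ */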
int __meminit vmemmap_populate_hvo(unsigned long addr, unsigned long end,
- int node, unsigned long headsize)
+ unsigned int order, struct zone *zone,
+ unsigned long headsize)
{
- pte_t *pte;
unsigned long maddr;
+ struct page *tail;
+ pte_t *pte;
+ int node = zone_to_nid(zone);
+
+ tail = vmemmap_get_tail(order, zone);
+ if (!tail)
+ return -ENOMEM;
for (maddr = addr; maddr < addr + headsize; maddr += PAGE_SIZE) {
pte = vmemmap_populate_address(maddr, node, NULL, -1, 0);
- * Reuse the last page struct page mapped above for the rest.
+ * Map the zone's shared tail page for the rest of the range.
*/
return vmemmap_populate_range(maddr, end, node, NULL,
- pte_pfn(ptep_get(pte)), 0);
+ page_to_pfn(tail), 0);
}
+#endif
void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
unsigned long addr, unsigned long next)