From 3e43e260f1e44d21861815faa905a1829027600f Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 28 Apr 2025 16:28:17 +0100 Subject: [PATCH] mm: perform VMA allocation, freeing, duplication in mm Right now these are performed in kernel/fork.c which is odd and a violation of separation of concerns, as well as preventing us from integrating this and related logic into userland VMA testing going forward. There is a fly in the ointment - nommu - mmap.c is not compiled if CONFIG_MMU not set, and neither is vma.c. To square the circle, let's add a new file - vma_init.c. This will be compiled for both CONFIG_MMU and nommu builds, and will also form part of the VMA userland testing. This allows us to de-duplicate code, while maintaining separation of concerns and the ability for us to userland test this logic. Update the VMA userland tests accordingly, additionally adding a detach_free_vma() helper function to correctly detach VMAs before freeing them in test code, as this change was triggering the assert for this. [akpm@linux-foundation.org: remove stray newline, per Liam] Link: https://lkml.kernel.org/r/f97b3a85a6da0196b28070df331b99e22b263be8.1745853549.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Reviewed-by: Pedro Falcato Reviewed-by: David Hildenbrand Reviewed-by: Kees Cook Reviewed-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Cc: Al Viro Cc: Christian Brauner Cc: Jan Kara Cc: Jann Horn Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + kernel/fork.c | 88 ------------------- mm/Makefile | 2 +- mm/mmap.c | 3 +- mm/nommu.c | 4 +- mm/vma.h | 6 ++ mm/vma_init.c | 101 ++++++++++++++++++++++ tools/testing/vma/Makefile | 2 +- tools/testing/vma/vma.c | 26 ++++-- tools/testing/vma/vma_internal.h | 143 +++++++++++++++++++++++++------ 10 files changed, 250 insertions(+), 126 deletions(-) create mode 100644 mm/vma_init.c diff --git a/MAINTAINERS b/MAINTAINERS index 8361050234955..ab6b08dbc779c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15683,6 +15683,7 @@ F: mm/mseal.c F: mm/vma.c F: mm/vma.h F: mm/vma_exec.c +F: mm/vma_init.c F: mm/vma_internal.h F: tools/testing/selftests/mm/merge.c F: tools/testing/vma/ diff --git a/kernel/fork.c b/kernel/fork.c index ac9f9267a473d..9e4616dacd825 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -431,88 +431,9 @@ struct kmem_cache *files_cachep; /* SLAB cache for fs_struct structures (tsk->fs) */ struct kmem_cache *fs_cachep; -/* SLAB cache for vm_area_struct structures */ -static struct kmem_cache *vm_area_cachep; - /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) -{ - struct vm_area_struct *vma; - - vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - if (!vma) - return NULL; - - vma_init(vma, mm); - - return vma; -} - -static void vm_area_init_from(const struct vm_area_struct *src, - struct vm_area_struct *dest) -{ - dest->vm_mm = src->vm_mm; - dest->vm_ops = src->vm_ops; - dest->vm_start = src->vm_start; - dest->vm_end = src->vm_end; - dest->anon_vma = src->anon_vma; - dest->vm_pgoff = src->vm_pgoff; - dest->vm_file = src->vm_file; - dest->vm_private_data = src->vm_private_data; - vm_flags_init(dest, src->vm_flags); - memcpy(&dest->vm_page_prot, &src->vm_page_prot, - sizeof(dest->vm_page_prot)); - /* - * src->shared.rb may be modified concurrently when called from - * dup_mmap(), but the clone will reinitialize it. - */ - data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared))); - memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx, - sizeof(dest->vm_userfaultfd_ctx)); -#ifdef CONFIG_ANON_VMA_NAME - dest->anon_name = src->anon_name; -#endif -#ifdef CONFIG_SWAP - memcpy(&dest->swap_readahead_info, &src->swap_readahead_info, - sizeof(dest->swap_readahead_info)); -#endif -#ifndef CONFIG_MMU - dest->vm_region = src->vm_region; -#endif -#ifdef CONFIG_NUMA - dest->vm_policy = src->vm_policy; -#endif -} - -struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) -{ - struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - - if (!new) - return NULL; - - ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); - ASSERT_EXCLUSIVE_WRITER(orig->vm_file); - vm_area_init_from(orig, new); - vma_lock_init(new, true); - INIT_LIST_HEAD(&new->anon_vma_chain); - vma_numab_state_init(new); - dup_anon_vma_name(orig, new); - - return new; -} - -void vm_area_free(struct vm_area_struct *vma) -{ - /* The vma should be detached while being destroyed. */ - vma_assert_detached(vma); - vma_numab_state_free(vma); - free_anon_vma_name(vma); - kmem_cache_free(vm_area_cachep, vma); -} - static void account_kernel_stack(struct task_struct *tsk, int account) { if (IS_ENABLED(CONFIG_VMAP_STACK)) { @@ -3033,11 +2954,6 @@ void __init mm_cache_init(void) void __init proc_caches_init(void) { - struct kmem_cache_args args = { - .use_freeptr_offset = true, - .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr), - }; - sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| @@ -3054,10 +2970,6 @@ void __init proc_caches_init(void) sizeof(struct fs_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); - vm_area_cachep = kmem_cache_create("vm_area_struct", - sizeof(struct vm_area_struct), &args, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| - SLAB_ACCOUNT); mmap_init(); nsproxy_cache_init(); } diff --git a/mm/Makefile b/mm/Makefile index 7aadec97c37bf..1a7a11d4933d1 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -55,7 +55,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ mm_init.o percpu.o slab_common.o \ compaction.o show_mem.o \ interval_tree.o list_lru.o workingset.o \ - debug.o gup.o mmap_lock.o $(mmu-y) + debug.o gup.o mmap_lock.o vma_init.o $(mmu-y) # Give 'page_alloc' its own module-parameter namespace page-alloc-y := page_alloc.o diff --git a/mm/mmap.c b/mm/mmap.c index 5259df031e15d..81dd962a1cfca 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1554,7 +1554,7 @@ static const struct ctl_table mmap_table[] = { #endif /* CONFIG_SYSCTL */ /* - * initialise the percpu counter for VM + * initialise the percpu counter for VM, initialise VMA state. */ void __init mmap_init(void) { @@ -1565,6 +1565,7 @@ void __init mmap_init(void) #ifdef CONFIG_SYSCTL register_sysctl_init("vm", mmap_table); #endif + vma_state_init(); } /* diff --git a/mm/nommu.c b/mm/nommu.c index a142fc258d398..0bf4849b8204f 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -399,7 +399,8 @@ static const struct ctl_table nommu_table[] = { }; /* - * initialise the percpu counter for VM and region record slabs + * initialise the percpu counter for VM and region record slabs, initialise VMA + * state. */ void __init mmap_init(void) { @@ -409,6 +410,7 @@ void __init mmap_init(void) VM_BUG_ON(ret); vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT); register_sysctl_init("vm", nommu_table); + vma_state_init(); } /* diff --git a/mm/vma.h b/mm/vma.h index 151e292263fdc..9a8af9be29a87 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -550,6 +550,12 @@ int __vm_munmap(unsigned long start, size_t len, bool unlock); int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma); +/* vma_init.h, shared between CONFIG_MMU and nommu. */ +void __init vma_state_init(void); +struct vm_area_struct *vm_area_alloc(struct mm_struct *mm); +struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig); +void vm_area_free(struct vm_area_struct *vma); + /* vma_exec.c */ #ifdef CONFIG_MMU int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, diff --git a/mm/vma_init.c b/mm/vma_init.c new file mode 100644 index 0000000000000..967ca85179864 --- /dev/null +++ b/mm/vma_init.c @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* + * Functions for initialisaing, allocating, freeing and duplicating VMAs. Shared + * between CONFIG_MMU and non-CONFIG_MMU kernel configurations. + */ + +#include "vma_internal.h" +#include "vma.h" + +/* SLAB cache for vm_area_struct structures */ +static struct kmem_cache *vm_area_cachep; + +void __init vma_state_init(void) +{ + struct kmem_cache_args args = { + .use_freeptr_offset = true, + .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr), + }; + + vm_area_cachep = kmem_cache_create("vm_area_struct", + sizeof(struct vm_area_struct), &args, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| + SLAB_ACCOUNT); +} + +struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + if (!vma) + return NULL; + + vma_init(vma, mm); + + return vma; +} + +static void vm_area_init_from(const struct vm_area_struct *src, + struct vm_area_struct *dest) +{ + dest->vm_mm = src->vm_mm; + dest->vm_ops = src->vm_ops; + dest->vm_start = src->vm_start; + dest->vm_end = src->vm_end; + dest->anon_vma = src->anon_vma; + dest->vm_pgoff = src->vm_pgoff; + dest->vm_file = src->vm_file; + dest->vm_private_data = src->vm_private_data; + vm_flags_init(dest, src->vm_flags); + memcpy(&dest->vm_page_prot, &src->vm_page_prot, + sizeof(dest->vm_page_prot)); + /* + * src->shared.rb may be modified concurrently when called from + * dup_mmap(), but the clone will reinitialize it. + */ + data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared))); + memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx, + sizeof(dest->vm_userfaultfd_ctx)); +#ifdef CONFIG_ANON_VMA_NAME + dest->anon_name = src->anon_name; +#endif +#ifdef CONFIG_SWAP + memcpy(&dest->swap_readahead_info, &src->swap_readahead_info, + sizeof(dest->swap_readahead_info)); +#endif +#ifndef CONFIG_MMU + dest->vm_region = src->vm_region; +#endif +#ifdef CONFIG_NUMA + dest->vm_policy = src->vm_policy; +#endif +} + +struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) +{ + struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + + if (!new) + return NULL; + + ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); + ASSERT_EXCLUSIVE_WRITER(orig->vm_file); + vm_area_init_from(orig, new); + vma_lock_init(new, true); + INIT_LIST_HEAD(&new->anon_vma_chain); + vma_numab_state_init(new); + dup_anon_vma_name(orig, new); + + return new; +} + +void vm_area_free(struct vm_area_struct *vma) +{ + /* The vma should be detached while being destroyed. */ + vma_assert_detached(vma); + vma_numab_state_free(vma); + free_anon_vma_name(vma); + kmem_cache_free(vm_area_cachep, vma); +} diff --git a/tools/testing/vma/Makefile b/tools/testing/vma/Makefile index 624040fcf193f..66f3831a668f2 100644 --- a/tools/testing/vma/Makefile +++ b/tools/testing/vma/Makefile @@ -9,7 +9,7 @@ include ../shared/shared.mk OFILES = $(SHARED_OFILES) vma.o maple-shim.o TARGETS = vma -vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma_exec.c ../../../mm/vma.h +vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma_init.c ../../../mm/vma_exec.c ../../../mm/vma.h vma: $(OFILES) $(CC) $(CFLAGS) -o $@ $(OFILES) $(LDLIBS) diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index 5832ae5d797df..2be7597a2ac2d 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -28,6 +28,7 @@ unsigned long stack_guard_gap = 256UL<vm_pgoff, 0); ASSERT_EQ(vma->vm_flags, flags); - vm_area_free(vma); + detach_free_vma(vma); mtree_destroy(&mm.mm_mt); return true; @@ -361,7 +368,7 @@ static bool test_simple_modify(void) ASSERT_EQ(vma->vm_end, 0x1000); ASSERT_EQ(vma->vm_pgoff, 0); - vm_area_free(vma); + detach_free_vma(vma); vma_iter_clear(&vmi); vma = vma_next(&vmi); @@ -370,7 +377,7 @@ static bool test_simple_modify(void) ASSERT_EQ(vma->vm_end, 0x2000); ASSERT_EQ(vma->vm_pgoff, 1); - vm_area_free(vma); + detach_free_vma(vma); vma_iter_clear(&vmi); vma = vma_next(&vmi); @@ -379,7 +386,7 @@ static bool test_simple_modify(void) ASSERT_EQ(vma->vm_end, 0x3000); ASSERT_EQ(vma->vm_pgoff, 2); - vm_area_free(vma); + detach_free_vma(vma); mtree_destroy(&mm.mm_mt); return true; @@ -407,7 +414,7 @@ static bool test_simple_expand(void) ASSERT_EQ(vma->vm_end, 0x3000); ASSERT_EQ(vma->vm_pgoff, 0); - vm_area_free(vma); + detach_free_vma(vma); mtree_destroy(&mm.mm_mt); return true; @@ -428,7 +435,7 @@ static bool test_simple_shrink(void) ASSERT_EQ(vma->vm_end, 0x1000); ASSERT_EQ(vma->vm_pgoff, 0); - vm_area_free(vma); + detach_free_vma(vma); mtree_destroy(&mm.mm_mt); return true; @@ -619,7 +626,7 @@ static bool test_merge_new(void) ASSERT_EQ(vma->vm_pgoff, 0); ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); - vm_area_free(vma); + detach_free_vma(vma); count++; } @@ -1668,6 +1675,7 @@ int main(void) int num_tests = 0, num_fail = 0; maple_tree_init(); + vma_state_init(); #define TEST(name) \ do { \ diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 32e9903131583..198abe66de5a9 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -155,6 +155,10 @@ typedef __bitwise unsigned int vm_fault_t; */ #define pr_warn_once pr_err +#define data_race(expr) expr + +#define ASSERT_EXCLUSIVE_WRITER(x) + struct kref { refcount_t refcount; }; @@ -255,6 +259,8 @@ struct file { #define VMA_LOCK_OFFSET 0x40000000 +typedef struct { unsigned long v; } freeptr_t; + struct vm_area_struct { /* The first cache line has the info for VMA tree walking. */ @@ -264,9 +270,7 @@ struct vm_area_struct { unsigned long vm_start; unsigned long vm_end; }; -#ifdef CONFIG_PER_VMA_LOCK - struct rcu_head vm_rcu; /* Used for deferred freeing. */ -#endif + freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */ }; struct mm_struct *vm_mm; /* The address space we belong to. */ @@ -463,6 +467,65 @@ struct pagetable_move_control { .len_in = len_, \ } +struct kmem_cache_args { + /** + * @align: The required alignment for the objects. + * + * %0 means no specific alignment is requested. + */ + unsigned int align; + /** + * @useroffset: Usercopy region offset. + * + * %0 is a valid offset, when @usersize is non-%0 + */ + unsigned int useroffset; + /** + * @usersize: Usercopy region size. + * + * %0 means no usercopy region is specified. + */ + unsigned int usersize; + /** + * @freeptr_offset: Custom offset for the free pointer + * in &SLAB_TYPESAFE_BY_RCU caches + * + * By default &SLAB_TYPESAFE_BY_RCU caches place the free pointer + * outside of the object. This might cause the object to grow in size. + * Cache creators that have a reason to avoid this can specify a custom + * free pointer offset in their struct where the free pointer will be + * placed. + * + * Note that placing the free pointer inside the object requires the + * caller to ensure that no fields are invalidated that are required to + * guard against object recycling (See &SLAB_TYPESAFE_BY_RCU for + * details). + * + * Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset + * is specified, %use_freeptr_offset must be set %true. + * + * Note that @ctor currently isn't supported with custom free pointers + * as a @ctor requires an external free pointer. + */ + unsigned int freeptr_offset; + /** + * @use_freeptr_offset: Whether a @freeptr_offset is used. + */ + bool use_freeptr_offset; + /** + * @ctor: A constructor for the objects. + * + * The constructor is invoked for each object in a newly allocated slab + * page. It is the cache user's responsibility to free object in the + * same state as after calling the constructor, or deal appropriately + * with any differences between a freshly constructed and a reallocated + * object. + * + * %NULL means no constructor. + */ + void (*ctor)(void *); +}; + static inline void vma_iter_invalidate(struct vma_iterator *vmi) { mas_pause(&vmi->mas); @@ -547,31 +610,38 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) vma->vm_lock_seq = UINT_MAX; } -static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) -{ - struct vm_area_struct *vma = calloc(1, sizeof(struct vm_area_struct)); +struct kmem_cache { + const char *name; + size_t object_size; + struct kmem_cache_args *args; +}; - if (!vma) - return NULL; +static inline struct kmem_cache *__kmem_cache_create(const char *name, + size_t object_size, + struct kmem_cache_args *args) +{ + struct kmem_cache *ret = malloc(sizeof(struct kmem_cache)); - vma_init(vma, mm); + ret->name = name; + ret->object_size = object_size; + ret->args = args; - return vma; + return ret; } -static inline struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) -{ - struct vm_area_struct *new = calloc(1, sizeof(struct vm_area_struct)); +#define kmem_cache_create(__name, __object_size, __args, ...) \ + __kmem_cache_create((__name), (__object_size), (__args)) - if (!new) - return NULL; +static inline void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) +{ + (void)gfpflags; - memcpy(new, orig, sizeof(*new)); - refcount_set(&new->vm_refcnt, 0); - new->vm_lock_seq = UINT_MAX; - INIT_LIST_HEAD(&new->anon_vma_chain); + return calloc(s->object_size, 1); +} - return new; +static inline void kmem_cache_free(struct kmem_cache *s, void *x) +{ + free(x); } /* @@ -738,11 +808,6 @@ static inline void mpol_put(struct mempolicy *) { } -static inline void vm_area_free(struct vm_area_struct *vma) -{ - free(vma); -} - static inline void lru_add_drain(void) { } @@ -1312,4 +1377,32 @@ static inline void ksm_exit(struct mm_struct *mm) (void)mm; } +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) +{ + (void)vma; + (void)reset_refcnt; +} + +static inline void vma_numab_state_init(struct vm_area_struct *vma) +{ + (void)vma; +} + +static inline void vma_numab_state_free(struct vm_area_struct *vma) +{ + (void)vma; +} + +static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) +{ + (void)orig_vma; + (void)new_vma; +} + +static inline void free_anon_vma_name(struct vm_area_struct *vma) +{ + (void)vma; +} + #endif /* __MM_VMA_INTERNAL_H */ -- 2.39.5