mm: abstract initial stack setup to mm subsystem
author Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Mon, 28 Apr 2025 15:28:15 +0000 (16:28 +0100)
committer Andrew Morton <akpm@linux-foundation.org>
Tue, 13 May 2025 06:50:48 +0000 (23:50 -0700)
There are peculiarities within the kernel where what is very clearly mm
code is arbitrarily performed elsewhere.

This violates separation of concerns and makes it harder to refactor how
fundamental mm initialisation and operation are performed.

One such case is the creation of the VMA containing the initial stack upon
execve()'ing a new process.  This is currently performed in
__bprm_mm_init() in fs/exec.c.

Abstract this operation to create_init_stack_vma().  This allows us to
limit use of vma allocation and free code to fork and mm only.
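
In effect, the MMU-enabled path of bprm_mm_init() now reduces to a single
call into mm (a sketch of the new call site; the full change is in the
fs/exec.c hunk below):

        /* fs/exec.c, CONFIG_MMU: mm now owns initial stack VMA creation. */
        err = create_init_stack_vma(bprm->mm, &bprm->vma, &bprm->p);
        if (err)
                goto err;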

We previously did the same for the step at which we relocate the initial
stack VMA downwards via relocate_vma_down(); now we move the initial VMA
establishment too.
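
Conceptually, exec's stack setup is thus two mm-owned steps (a sketch
only; the shift is computed later, e.g. in setup_arg_pages(), once the
final stack layout is known):

        /* 1. Temporarily place a one-page stack VMA at STACK_TOP_MAX. */
        err = create_init_stack_vma(mm, &vma, &top);

        /* 2. Later, shift the stack VMA down to its final location. */
        if (!err)
                err = relocate_vma_down(vma, shift);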

Take the opportunity to also move insert_vm_struct() to mm/vma.c as it's
no longer needed anywhere outside of mm.
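
For reference, an mm-internal caller of insert_vm_struct() follows roughly
this pattern (a sketch distilled from create_init_stack_vma() below; addr
and flags are placeholders, and the caller must hold the mmap write lock):

        struct vm_area_struct *vma = vm_area_alloc(mm);

        if (!vma)
                return -ENOMEM;
        vma_set_anonymous(vma);
        vma->vm_start = addr;
        vma->vm_end = addr + PAGE_SIZE;
        vm_flags_init(vma, flags);
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);

        if (insert_vm_struct(mm, vma)) {
                vm_area_free(vma);
                return -ENOMEM;
        }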

Link: https://lkml.kernel.org/r/118c950ef7a8dd19ab20a23a68c3603751acd30e.1745853549.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
fs/exec.c
mm/mmap.c
mm/vma.c
mm/vma.h
mm/vma_exec.c
tools/testing/vma/vma_internal.h

index 477bc3f2e9660c1c79d94b461bfd21da20ef36fd..f9bbcf0016a4b8a54de0dd92eefb95b4130cf6fe 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -245,60 +245,6 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
        flush_cache_page(bprm->vma, pos, page_to_pfn(page));
 }
 
-static int __bprm_mm_init(struct linux_binprm *bprm)
-{
-       int err;
-       struct vm_area_struct *vma = NULL;
-       struct mm_struct *mm = bprm->mm;
-
-       bprm->vma = vma = vm_area_alloc(mm);
-       if (!vma)
-               return -ENOMEM;
-       vma_set_anonymous(vma);
-
-       if (mmap_write_lock_killable(mm)) {
-               err = -EINTR;
-               goto err_free;
-       }
-
-       /*
-        * Need to be called with mmap write lock
-        * held, to avoid race with ksmd.
-        */
-       err = ksm_execve(mm);
-       if (err)
-               goto err_ksm;
-
-       /*
-        * Place the stack at the largest stack address the architecture
-        * supports. Later, we'll move this to an appropriate place. We don't
-        * use STACK_TOP because that can depend on attributes which aren't
-        * configured yet.
-        */
-       BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
-       vma->vm_end = STACK_TOP_MAX;
-       vma->vm_start = vma->vm_end - PAGE_SIZE;
-       vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP);
-       vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
-
-       err = insert_vm_struct(mm, vma);
-       if (err)
-               goto err;
-
-       mm->stack_vm = mm->total_vm = 1;
-       mmap_write_unlock(mm);
-       bprm->p = vma->vm_end - sizeof(void *);
-       return 0;
-err:
-       ksm_exit(mm);
-err_ksm:
-       mmap_write_unlock(mm);
-err_free:
-       bprm->vma = NULL;
-       vm_area_free(vma);
-       return err;
-}
-
 static bool valid_arg_len(struct linux_binprm *bprm, long len)
 {
        return len <= MAX_ARG_STRLEN;
@@ -351,12 +297,6 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
 {
 }
 
-static int __bprm_mm_init(struct linux_binprm *bprm)
-{
-       bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
-       return 0;
-}
-
 static bool valid_arg_len(struct linux_binprm *bprm, long len)
 {
        return len <= bprm->p;
@@ -385,9 +325,13 @@ static int bprm_mm_init(struct linux_binprm *bprm)
        bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK];
        task_unlock(current->group_leader);
 
-       err = __bprm_mm_init(bprm);
+#ifndef CONFIG_MMU
+       bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
+#else
+       err = create_init_stack_vma(bprm->mm, &bprm->vma, &bprm->p);
        if (err)
                goto err;
+#endif
 
        return 0;
 
index 1794bf6f4dc0cfcfb862d0021c0de630c3cd9ac1..9e09eac0021c9c9e8839fa9fd74f2d8ab801a4f5 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1321,48 +1321,6 @@ destroy:
        vm_unacct_memory(nr_accounted);
 }
 
-/* Insert vm structure into process list sorted by address
- * and into the inode's i_mmap tree.  If vm_file is non-NULL
- * then i_mmap_rwsem is taken here.
- */
-int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
-{
-       unsigned long charged = vma_pages(vma);
-
-
-       if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
-               return -ENOMEM;
-
-       if ((vma->vm_flags & VM_ACCOUNT) &&
-            security_vm_enough_memory_mm(mm, charged))
-               return -ENOMEM;
-
-       /*
-        * The vm_pgoff of a purely anonymous vma should be irrelevant
-        * until its first write fault, when page's anon_vma and index
-        * are set.  But now set the vm_pgoff it will almost certainly
-        * end up with (unless mremap moves it elsewhere before that
-        * first wfault), so /proc/pid/maps tells a consistent story.
-        *
-        * By setting it to reflect the virtual start address of the
-        * vma, merges and splits can happen in a seamless way, just
-        * using the existing file pgoff checks and manipulations.
-        * Similarly in do_mmap and in do_brk_flags.
-        */
-       if (vma_is_anonymous(vma)) {
-               BUG_ON(vma->anon_vma);
-               vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
-       }
-
-       if (vma_link(mm, vma)) {
-               if (vma->vm_flags & VM_ACCOUNT)
-                       vm_unacct_memory(charged);
-               return -ENOMEM;
-       }
-
-       return 0;
-}
-
 /*
  * Return true if the calling process may expand its vm space by the passed
  * number of pages
index 8a6c5e83575934d3255076bced044138a47dcb01..1f2634b2956845e9c5aba0b38cc6da5d6f1b07b4 100644 (file)
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -3052,3 +3052,46 @@ int __vm_munmap(unsigned long start, size_t len, bool unlock)
        userfaultfd_unmap_complete(mm, &uf);
        return ret;
 }
+
+
+/* Insert vm structure into process list sorted by address
+ * and into the inode's i_mmap tree.  If vm_file is non-NULL
+ * then i_mmap_rwsem is taken here.
+ */
+int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+       unsigned long charged = vma_pages(vma);
+
+
+       if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
+               return -ENOMEM;
+
+       if ((vma->vm_flags & VM_ACCOUNT) &&
+            security_vm_enough_memory_mm(mm, charged))
+               return -ENOMEM;
+
+       /*
+        * The vm_pgoff of a purely anonymous vma should be irrelevant
+        * until its first write fault, when page's anon_vma and index
+        * are set.  But now set the vm_pgoff it will almost certainly
+        * end up with (unless mremap moves it elsewhere before that
+        * first wfault), so /proc/pid/maps tells a consistent story.
+        *
+        * By setting it to reflect the virtual start address of the
+        * vma, merges and splits can happen in a seamless way, just
+        * using the existing file pgoff checks and manipulations.
+        * Similarly in do_mmap and in do_brk_flags.
+        */
+       if (vma_is_anonymous(vma)) {
+               BUG_ON(vma->anon_vma);
+               vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
+       }
+
+       if (vma_link(mm, vma)) {
+               if (vma->vm_flags & VM_ACCOUNT)
+                       vm_unacct_memory(charged);
+               return -ENOMEM;
+       }
+
+       return 0;
+}
index 4413445e074bdfbdf3352c26ab0e268159ebd82c..151e292263fdc5e27f3ad73021290835563c5c2d 100644 (file)
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -548,8 +548,12 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address);
 
 int __vm_munmap(unsigned long start, size_t len, bool unlock);
 
+int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma);
+
 /* vma_exec.c */
 #ifdef CONFIG_MMU
+int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
+                         unsigned long *top_mem_p);
 int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
 #endif
 
index 6736ae37f748b3706b28dc6acb05c3ba6db96a1c..2dffb02ed6a2c88f03bdbda54528fc1358fe1946 100644 (file)
--- a/mm/vma_exec.c
+++ b/mm/vma_exec.c
@@ -90,3 +90,72 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
        /* Shrink the vma to just the new range */
        return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
 }
+
+/*
+ * Establish the stack VMA in an execve'd process, located temporarily at the
+ * maximum stack address provided by the architecture.
+ *
+ * We later relocate this downwards in relocate_vma_down().
+ *
+ * This function is almost certainly NOT what you want for anything other than
+ * early executable initialisation.
+ *
+ * On success, returns 0 and sets *vmap to the stack VMA and *top_mem_p to the
+ * maximum addressable location in the stack (that is capable of storing a
+ * system word of data).
+ */
+int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
+                         unsigned long *top_mem_p)
+{
+       int err;
+       struct vm_area_struct *vma = vm_area_alloc(mm);
+
+       if (!vma)
+               return -ENOMEM;
+
+       vma_set_anonymous(vma);
+
+       if (mmap_write_lock_killable(mm)) {
+               err = -EINTR;
+               goto err_free;
+       }
+
+       /*
+        * Need to be called with mmap write lock
+        * held, to avoid race with ksmd.
+        */
+       err = ksm_execve(mm);
+       if (err)
+               goto err_ksm;
+
+       /*
+        * Place the stack at the largest stack address the architecture
+        * supports. Later, we'll move this to an appropriate place. We don't
+        * use STACK_TOP because that can depend on attributes which aren't
+        * configured yet.
+        */
+       BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
+       vma->vm_end = STACK_TOP_MAX;
+       vma->vm_start = vma->vm_end - PAGE_SIZE;
+       vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP);
+       vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
+
+       err = insert_vm_struct(mm, vma);
+       if (err)
+               goto err;
+
+       mm->stack_vm = mm->total_vm = 1;
+       mmap_write_unlock(mm);
+       *vmap = vma;
+       *top_mem_p = vma->vm_end - sizeof(void *);
+       return 0;
+
+err:
+       ksm_exit(mm);
+err_ksm:
+       mmap_write_unlock(mm);
+err_free:
+       *vmap = NULL;
+       vm_area_free(vma);
+       return err;
+}
index 0df19ca0000a6cac719e0b4c4e75d7939036f201..32e99031315836eac035929dd0d02479be7a46c6 100644 (file)
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -56,6 +56,8 @@ extern unsigned long dac_mmap_min_addr;
 #define VM_PFNMAP      0x00000400
 #define VM_LOCKED      0x00002000
 #define VM_IO           0x00004000
+#define VM_SEQ_READ    0x00008000      /* App will access data sequentially */
+#define VM_RAND_READ   0x00010000      /* App will not benefit from clustered reads */
 #define VM_DONTEXPAND  0x00040000
 #define VM_LOCKONFAULT 0x00080000
 #define VM_ACCOUNT     0x00100000
@@ -70,6 +72,20 @@ extern unsigned long dac_mmap_min_addr;
 #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC)
 #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)
 
+#ifdef CONFIG_STACK_GROWSUP
+#define VM_STACK       VM_GROWSUP
+#define VM_STACK_EARLY VM_GROWSDOWN
+#else
+#define VM_STACK       VM_GROWSDOWN
+#define VM_STACK_EARLY 0
+#endif
+
+#define DEFAULT_MAP_WINDOW     ((1UL << 47) - PAGE_SIZE)
+#define TASK_SIZE_LOW          DEFAULT_MAP_WINDOW
+#define TASK_SIZE_MAX          DEFAULT_MAP_WINDOW
+#define STACK_TOP              TASK_SIZE_LOW
+#define STACK_TOP_MAX          TASK_SIZE_MAX
+
 /* This mask represents all the VMA flag bits used by mlock */
 #define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT)
 
@@ -82,6 +98,10 @@ extern unsigned long dac_mmap_min_addr;
 
 #define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK)
 
+#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
+#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
+#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY)
+
 #define RLIMIT_STACK           3       /* max stack size */
 #define RLIMIT_MEMLOCK         8       /* max locked-in-memory address space */
 
@@ -1280,4 +1300,16 @@ static inline void free_pgd_range(struct mmu_gather *tlb,
        (void)ceiling;
 }
 
+static inline int ksm_execve(struct mm_struct *mm)
+{
+       (void)mm;
+
+       return 0;
+}
+
+static inline void ksm_exit(struct mm_struct *mm)
+{
+       (void)mm;
+}
+
 #endif /* __MM_VMA_INTERNAL_H */