mm: move dup_mmap() to mm
author     Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
           Mon, 28 Apr 2025 15:28:16 +0000 (16:28 +0100)
committer  Andrew Morton <akpm@linux-foundation.org>
           Tue, 13 May 2025 06:50:48 +0000 (23:50 -0700)
This is a key step in our being able to abstract and isolate VMA
allocation and destruction logic.

This function is the last one that directly references vm_area_free() and
vm_area_dup() outside of the mmap logic, so having it in mm allows us to
isolate these helpers.

We do the same for the nommu version, which is substantially simpler.

We place the declaration for dup_mmap() in mm/internal.h and have
kernel/fork.c include it, in order to prevent improper use of this
functionality elsewhere in the kernel.
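
As a note on the approach: keeping the declaration in a header that lives
inside the subsystem directory, rather than under include/, means only code
that deliberately reaches into that directory can call the function. Below is
a small, self-contained sketch of this "internal header" pattern using
hypothetical names (subsys_ctx, subsys_clone_ctx and the file names are
invented for illustration and are not kernel code); the real hunks for
mm/internal.h and kernel/fork.c appear further down.

	/* subsys/internal.h - private to the subsystem; deliberately not
	 * installed under include/, so nothing outside the subsystem picks
	 * it up by accident. (Hypothetical example, not kernel code.)
	 */
	#ifndef SUBSYS_INTERNAL_H
	#define SUBSYS_INTERNAL_H

	struct subsys_ctx;	/* opaque outside the subsystem */

	int subsys_clone_ctx(struct subsys_ctx *dst, const struct subsys_ctx *src);

	#endif /* SUBSYS_INTERNAL_H */

	/* subsys/ctx.c - the definition stays inside the subsystem. */
	#include "internal.h"

	struct subsys_ctx { int id; };

	int subsys_clone_ctx(struct subsys_ctx *dst, const struct subsys_ctx *src)
	{
		*dst = *src;
		return 0;
	}

	/* core/clone.c - the one sanctioned external caller reaches into the
	 * private header with a relative include, mirroring kernel/fork.c's
	 * #include "../mm/internal.h" in this commit.
	 */
	#include "../subsys/internal.h"

	int clone_context(struct subsys_ctx *dst, const struct subsys_ctx *src)
	{
		return subsys_clone_ctx(dst, src);
	}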

While we're here, we remove the useless #ifdef CONFIG_MMU check around
mmap_read_lock_maybe_expand() in mmap.c, as mmap.c is compiled only when
CONFIG_MMU is set.

Link: https://lkml.kernel.org/r/e49aad3d00212f5539d9fa5769bfda4ce451db3e.1745853549.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Suggested-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
kernel/fork.c
mm/internal.h
mm/mmap.c
mm/nommu.c

diff --git a/kernel/fork.c b/kernel/fork.c
index 168681fc4b25a9fddcb90ce155c027551455f4ee..ac9f9267a473dc6355f5fc504ef15b71331a0d2d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
+/* For dup_mmap(). */
+#include "../mm/internal.h"
+
 #include <trace/events/sched.h>
 
 #define CREATE_TRACE_POINTS
@@ -589,7 +592,7 @@ void free_task(struct task_struct *tsk)
 }
 EXPORT_SYMBOL(free_task);
 
-static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
+void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
 {
        struct file *exe_file;
 
@@ -604,183 +607,6 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
 }
 
 #ifdef CONFIG_MMU
-static __latent_entropy int dup_mmap(struct mm_struct *mm,
-                                       struct mm_struct *oldmm)
-{
-       struct vm_area_struct *mpnt, *tmp;
-       int retval;
-       unsigned long charge = 0;
-       LIST_HEAD(uf);
-       VMA_ITERATOR(vmi, mm, 0);
-
-       if (mmap_write_lock_killable(oldmm))
-               return -EINTR;
-       flush_cache_dup_mm(oldmm);
-       uprobe_dup_mmap(oldmm, mm);
-       /*
-        * Not linked in yet - no deadlock potential:
-        */
-       mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
-
-       /* No ordering required: file already has been exposed. */
-       dup_mm_exe_file(mm, oldmm);
-
-       mm->total_vm = oldmm->total_vm;
-       mm->data_vm = oldmm->data_vm;
-       mm->exec_vm = oldmm->exec_vm;
-       mm->stack_vm = oldmm->stack_vm;
-
-       /* Use __mt_dup() to efficiently build an identical maple tree. */
-       retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
-       if (unlikely(retval))
-               goto out;
-
-       mt_clear_in_rcu(vmi.mas.tree);
-       for_each_vma(vmi, mpnt) {
-               struct file *file;
-
-               vma_start_write(mpnt);
-               if (mpnt->vm_flags & VM_DONTCOPY) {
-                       retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
-                                                   mpnt->vm_end, GFP_KERNEL);
-                       if (retval)
-                               goto loop_out;
-
-                       vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
-                       continue;
-               }
-               charge = 0;
-               /*
-                * Don't duplicate many vmas if we've been oom-killed (for
-                * example)
-                */
-               if (fatal_signal_pending(current)) {
-                       retval = -EINTR;
-                       goto loop_out;
-               }
-               if (mpnt->vm_flags & VM_ACCOUNT) {
-                       unsigned long len = vma_pages(mpnt);
-
-                       if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
-                               goto fail_nomem;
-                       charge = len;
-               }
-               tmp = vm_area_dup(mpnt);
-               if (!tmp)
-                       goto fail_nomem;
-
-               /* track_pfn_copy() will later take care of copying internal state. */
-               if (unlikely(tmp->vm_flags & VM_PFNMAP))
-                       untrack_pfn_clear(tmp);
-
-               retval = vma_dup_policy(mpnt, tmp);
-               if (retval)
-                       goto fail_nomem_policy;
-               tmp->vm_mm = mm;
-               retval = dup_userfaultfd(tmp, &uf);
-               if (retval)
-                       goto fail_nomem_anon_vma_fork;
-               if (tmp->vm_flags & VM_WIPEONFORK) {
-                       /*
-                        * VM_WIPEONFORK gets a clean slate in the child.
-                        * Don't prepare anon_vma until fault since we don't
-                        * copy page for current vma.
-                        */
-                       tmp->anon_vma = NULL;
-               } else if (anon_vma_fork(tmp, mpnt))
-                       goto fail_nomem_anon_vma_fork;
-               vm_flags_clear(tmp, VM_LOCKED_MASK);
-               /*
-                * Copy/update hugetlb private vma information.
-                */
-               if (is_vm_hugetlb_page(tmp))
-                       hugetlb_dup_vma_private(tmp);
-
-               /*
-                * Link the vma into the MT. After using __mt_dup(), memory
-                * allocation is not necessary here, so it cannot fail.
-                */
-               vma_iter_bulk_store(&vmi, tmp);
-
-               mm->map_count++;
-
-               if (tmp->vm_ops && tmp->vm_ops->open)
-                       tmp->vm_ops->open(tmp);
-
-               file = tmp->vm_file;
-               if (file) {
-                       struct address_space *mapping = file->f_mapping;
-
-                       get_file(file);
-                       i_mmap_lock_write(mapping);
-                       if (vma_is_shared_maywrite(tmp))
-                               mapping_allow_writable(mapping);
-                       flush_dcache_mmap_lock(mapping);
-                       /* insert tmp into the share list, just after mpnt */
-                       vma_interval_tree_insert_after(tmp, mpnt,
-                                       &mapping->i_mmap);
-                       flush_dcache_mmap_unlock(mapping);
-                       i_mmap_unlock_write(mapping);
-               }
-
-               if (!(tmp->vm_flags & VM_WIPEONFORK))
-                       retval = copy_page_range(tmp, mpnt);
-
-               if (retval) {
-                       mpnt = vma_next(&vmi);
-                       goto loop_out;
-               }
-       }
-       /* a new mm has just been created */
-       retval = arch_dup_mmap(oldmm, mm);
-loop_out:
-       vma_iter_free(&vmi);
-       if (!retval) {
-               mt_set_in_rcu(vmi.mas.tree);
-               ksm_fork(mm, oldmm);
-               khugepaged_fork(mm, oldmm);
-       } else {
-
-               /*
-                * The entire maple tree has already been duplicated. If the
-                * mmap duplication fails, mark the failure point with
-                * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
-                * stop releasing VMAs that have not been duplicated after this
-                * point.
-                */
-               if (mpnt) {
-                       mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
-                       mas_store(&vmi.mas, XA_ZERO_ENTRY);
-                       /* Avoid OOM iterating a broken tree */
-                       set_bit(MMF_OOM_SKIP, &mm->flags);
-               }
-               /*
-                * The mm_struct is going to exit, but the locks will be dropped
-                * first.  Set the mm_struct as unstable is advisable as it is
-                * not fully initialised.
-                */
-               set_bit(MMF_UNSTABLE, &mm->flags);
-       }
-out:
-       mmap_write_unlock(mm);
-       flush_tlb_mm(oldmm);
-       mmap_write_unlock(oldmm);
-       if (!retval)
-               dup_userfaultfd_complete(&uf);
-       else
-               dup_userfaultfd_fail(&uf);
-       return retval;
-
-fail_nomem_anon_vma_fork:
-       mpol_put(vma_policy(tmp));
-fail_nomem_policy:
-       vm_area_free(tmp);
-fail_nomem:
-       retval = -ENOMEM;
-       vm_unacct_memory(charge);
-       goto loop_out;
-}
-
 static inline int mm_alloc_pgd(struct mm_struct *mm)
 {
        mm->pgd = pgd_alloc(mm);
@@ -794,13 +620,6 @@ static inline void mm_free_pgd(struct mm_struct *mm)
        pgd_free(mm, mm->pgd);
 }
 #else
-static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
-{
-       mmap_write_lock(oldmm);
-       dup_mm_exe_file(mm, oldmm);
-       mmap_write_unlock(oldmm);
-       return 0;
-}
 #define mm_alloc_pgd(mm)       (0)
 #define mm_free_pgd(mm)
 #endif /* CONFIG_MMU */
diff --git a/mm/internal.h b/mm/internal.h
index cf7c0e9ef7ece45b284d4c18f2ecaf23a26ea4a5..6b8ed20177432581deb86bb8664ef968f3665a3f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1624,5 +1624,7 @@ static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
 }
 #endif /* CONFIG_PT_RECLAIM */
 
+void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm);
+int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm);
 
 #endif /* __MM_INTERNAL_H */
diff --git a/mm/mmap.c b/mm/mmap.c
index 9e09eac0021c9c9e8839fa9fd74f2d8ab801a4f5..5259df031e15d2541aa549177d558283bfc8db12 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1675,7 +1675,6 @@ static int __meminit init_reserve_notifier(void)
 }
 subsys_initcall(init_reserve_notifier);
 
-#ifdef CONFIG_MMU
 /*
  * Obtain a read lock on mm->mmap_lock, if the specified address is below the
  * start of the VMA, the intent is to perform a write, and it is a
@@ -1719,10 +1718,180 @@ bool mmap_read_lock_maybe_expand(struct mm_struct *mm,
        mmap_write_downgrade(mm);
        return true;
 }
-#else
-bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
-                                unsigned long addr, bool write)
+
+__latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 {
-       return false;
+       struct vm_area_struct *mpnt, *tmp;
+       int retval;
+       unsigned long charge = 0;
+       LIST_HEAD(uf);
+       VMA_ITERATOR(vmi, mm, 0);
+
+       if (mmap_write_lock_killable(oldmm))
+               return -EINTR;
+       flush_cache_dup_mm(oldmm);
+       uprobe_dup_mmap(oldmm, mm);
+       /*
+        * Not linked in yet - no deadlock potential:
+        */
+       mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
+
+       /* No ordering required: file already has been exposed. */
+       dup_mm_exe_file(mm, oldmm);
+
+       mm->total_vm = oldmm->total_vm;
+       mm->data_vm = oldmm->data_vm;
+       mm->exec_vm = oldmm->exec_vm;
+       mm->stack_vm = oldmm->stack_vm;
+
+       /* Use __mt_dup() to efficiently build an identical maple tree. */
+       retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
+       if (unlikely(retval))
+               goto out;
+
+       mt_clear_in_rcu(vmi.mas.tree);
+       for_each_vma(vmi, mpnt) {
+               struct file *file;
+
+               vma_start_write(mpnt);
+               if (mpnt->vm_flags & VM_DONTCOPY) {
+                       retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
+                                                   mpnt->vm_end, GFP_KERNEL);
+                       if (retval)
+                               goto loop_out;
+
+                       vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
+                       continue;
+               }
+               charge = 0;
+               /*
+                * Don't duplicate many vmas if we've been oom-killed (for
+                * example)
+                */
+               if (fatal_signal_pending(current)) {
+                       retval = -EINTR;
+                       goto loop_out;
+               }
+               if (mpnt->vm_flags & VM_ACCOUNT) {
+                       unsigned long len = vma_pages(mpnt);
+
+                       if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
+                               goto fail_nomem;
+                       charge = len;
+               }
+
+               tmp = vm_area_dup(mpnt);
+               if (!tmp)
+                       goto fail_nomem;
+
+               /* track_pfn_copy() will later take care of copying internal state. */
+               if (unlikely(tmp->vm_flags & VM_PFNMAP))
+                       untrack_pfn_clear(tmp);
+
+               retval = vma_dup_policy(mpnt, tmp);
+               if (retval)
+                       goto fail_nomem_policy;
+               tmp->vm_mm = mm;
+               retval = dup_userfaultfd(tmp, &uf);
+               if (retval)
+                       goto fail_nomem_anon_vma_fork;
+               if (tmp->vm_flags & VM_WIPEONFORK) {
+                       /*
+                        * VM_WIPEONFORK gets a clean slate in the child.
+                        * Don't prepare anon_vma until fault since we don't
+                        * copy page for current vma.
+                        */
+                       tmp->anon_vma = NULL;
+               } else if (anon_vma_fork(tmp, mpnt))
+                       goto fail_nomem_anon_vma_fork;
+               vm_flags_clear(tmp, VM_LOCKED_MASK);
+               /*
+                * Copy/update hugetlb private vma information.
+                */
+               if (is_vm_hugetlb_page(tmp))
+                       hugetlb_dup_vma_private(tmp);
+
+               /*
+                * Link the vma into the MT. After using __mt_dup(), memory
+                * allocation is not necessary here, so it cannot fail.
+                */
+               vma_iter_bulk_store(&vmi, tmp);
+
+               mm->map_count++;
+
+               if (tmp->vm_ops && tmp->vm_ops->open)
+                       tmp->vm_ops->open(tmp);
+
+               file = tmp->vm_file;
+               if (file) {
+                       struct address_space *mapping = file->f_mapping;
+
+                       get_file(file);
+                       i_mmap_lock_write(mapping);
+                       if (vma_is_shared_maywrite(tmp))
+                               mapping_allow_writable(mapping);
+                       flush_dcache_mmap_lock(mapping);
+                       /* insert tmp into the share list, just after mpnt */
+                       vma_interval_tree_insert_after(tmp, mpnt,
+                                       &mapping->i_mmap);
+                       flush_dcache_mmap_unlock(mapping);
+                       i_mmap_unlock_write(mapping);
+               }
+
+               if (!(tmp->vm_flags & VM_WIPEONFORK))
+                       retval = copy_page_range(tmp, mpnt);
+
+               if (retval) {
+                       mpnt = vma_next(&vmi);
+                       goto loop_out;
+               }
+       }
+       /* a new mm has just been created */
+       retval = arch_dup_mmap(oldmm, mm);
+loop_out:
+       vma_iter_free(&vmi);
+       if (!retval) {
+               mt_set_in_rcu(vmi.mas.tree);
+               ksm_fork(mm, oldmm);
+               khugepaged_fork(mm, oldmm);
+       } else {
+
+               /*
+                * The entire maple tree has already been duplicated. If the
+                * mmap duplication fails, mark the failure point with
+                * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
+                * stop releasing VMAs that have not been duplicated after this
+                * point.
+                */
+               if (mpnt) {
+                       mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
+                       mas_store(&vmi.mas, XA_ZERO_ENTRY);
+                       /* Avoid OOM iterating a broken tree */
+                       set_bit(MMF_OOM_SKIP, &mm->flags);
+               }
+               /*
+                * The mm_struct is going to exit, but the locks will be dropped
+                * first.  Set the mm_struct as unstable is advisable as it is
+                * not fully initialised.
+                */
+               set_bit(MMF_UNSTABLE, &mm->flags);
+       }
+out:
+       mmap_write_unlock(mm);
+       flush_tlb_mm(oldmm);
+       mmap_write_unlock(oldmm);
+       if (!retval)
+               dup_userfaultfd_complete(&uf);
+       else
+               dup_userfaultfd_fail(&uf);
+       return retval;
+
+fail_nomem_anon_vma_fork:
+       mpol_put(vma_policy(tmp));
+fail_nomem_policy:
+       vm_area_free(tmp);
+fail_nomem:
+       retval = -ENOMEM;
+       vm_unacct_memory(charge);
+       goto loop_out;
 }
-#endif
diff --git a/mm/nommu.c b/mm/nommu.c
index 2b4d304c6445ba5fbae29efbb7c638e4c2d9249b..a142fc258d3982db03cb11ceeaf0d16549d6416c 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1874,3 +1874,11 @@ static int __meminit init_admin_reserve(void)
        return 0;
 }
 subsys_initcall(init_admin_reserve);
+
+int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+       mmap_write_lock(oldmm);
+       dup_mm_exe_file(mm, oldmm);
+       mmap_write_unlock(oldmm);
+       return 0;
+}
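
For context, the intended consumer of the new declarations is the fork path in
kernel/fork.c, which this patch otherwise leaves in place. Below is a heavily
abridged sketch of that call site, shown for orientation only: the real
dup_mm() does considerably more setup, accounting and error unwinding than
this, so treat it as an illustration of how dup_mmap() is reached on fork
rather than as the actual source.

	/*
	 * Abridged illustration of the caller in kernel/fork.c (not verbatim;
	 * allocation details, counters and binfmt handling elided). dup_mmap()
	 * is now picked up via the explicit #include "../mm/internal.h".
	 */
	static struct mm_struct *dup_mm(struct task_struct *tsk,
					struct mm_struct *oldmm)
	{
		struct mm_struct *mm;
		int err;

		mm = allocate_mm();		/* new, empty mm_struct */
		if (!mm)
			return NULL;

		memcpy(mm, oldmm, sizeof(*mm));
		if (!mm_init(mm, tsk, mm->user_ns))
			return NULL;

		err = dup_mmap(mm, oldmm);	/* duplicate the VMA tree */
		if (err)
			goto free_pt;

		/* ... remaining setup elided ... */
		return mm;

	free_pt:
		mmput(mm);			/* teardown of the partial mm */
		return NULL;
	}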