From: Lorenzo Stoakes Date: Mon, 28 Apr 2025 15:28:16 +0000 (+0100) Subject: mm: move dup_mmap() to mm X-Git-Tag: v6.16-rc1~92^2~71 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=26a8f57760c12ccd860b36ac5b73daede4396aa3;p=thirdparty%2Flinux.git mm: move dup_mmap() to mm This is a key step in our being able to abstract and isolate VMA allocation and destruction logic. This function is the last one where vm_area_free() and vm_area_dup() are directly referenced outside of mmap, so having this in mm allows us to isolate these. We do the same for the nommu version which is substantially simpler. We place the declaration for dup_mmap() in mm/internal.h and have kernel/fork.c import this in order to prevent improper use of this functionality elsewhere in the kernel. While we're here, we remove the useless #ifdef CONFIG_MMU check around mmap_read_lock_maybe_expand() in mmap.c, mmap.c is compiled only if CONFIG_MMU is set. Link: https://lkml.kernel.org/r/e49aad3d00212f5539d9fa5769bfda4ce451db3e.1745853549.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Suggested-by: Pedro Falcato Reviewed-by: Pedro Falcato Reviewed-by: Liam R. Howlett Reviewed-by: Suren Baghdasaryan Reviewed-by: David Hildenbrand Reviewed-by: Kees Cook Reviewed-by: Vlastimil Babka Cc: Al Viro Cc: Christian Brauner Cc: Jan Kara Cc: Jann Horn Signed-off-by: Andrew Morton --- diff --git a/kernel/fork.c b/kernel/fork.c index 168681fc4b25a..ac9f9267a473d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -112,6 +112,9 @@ #include #include +/* For dup_mmap(). */ +#include "../mm/internal.h" + #include #define CREATE_TRACE_POINTS @@ -589,7 +592,7 @@ void free_task(struct task_struct *tsk) } EXPORT_SYMBOL(free_task); -static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) +void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) { struct file *exe_file; @@ -604,183 +607,6 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) } #ifdef CONFIG_MMU -static __latent_entropy int dup_mmap(struct mm_struct *mm, - struct mm_struct *oldmm) -{ - struct vm_area_struct *mpnt, *tmp; - int retval; - unsigned long charge = 0; - LIST_HEAD(uf); - VMA_ITERATOR(vmi, mm, 0); - - if (mmap_write_lock_killable(oldmm)) - return -EINTR; - flush_cache_dup_mm(oldmm); - uprobe_dup_mmap(oldmm, mm); - /* - * Not linked in yet - no deadlock potential: - */ - mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); - - /* No ordering required: file already has been exposed. */ - dup_mm_exe_file(mm, oldmm); - - mm->total_vm = oldmm->total_vm; - mm->data_vm = oldmm->data_vm; - mm->exec_vm = oldmm->exec_vm; - mm->stack_vm = oldmm->stack_vm; - - /* Use __mt_dup() to efficiently build an identical maple tree. */ - retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL); - if (unlikely(retval)) - goto out; - - mt_clear_in_rcu(vmi.mas.tree); - for_each_vma(vmi, mpnt) { - struct file *file; - - vma_start_write(mpnt); - if (mpnt->vm_flags & VM_DONTCOPY) { - retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start, - mpnt->vm_end, GFP_KERNEL); - if (retval) - goto loop_out; - - vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); - continue; - } - charge = 0; - /* - * Don't duplicate many vmas if we've been oom-killed (for - * example) - */ - if (fatal_signal_pending(current)) { - retval = -EINTR; - goto loop_out; - } - if (mpnt->vm_flags & VM_ACCOUNT) { - unsigned long len = vma_pages(mpnt); - - if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ - goto fail_nomem; - charge = len; - } - tmp = vm_area_dup(mpnt); - if (!tmp) - goto fail_nomem; - - /* track_pfn_copy() will later take care of copying internal state. */ - if (unlikely(tmp->vm_flags & VM_PFNMAP)) - untrack_pfn_clear(tmp); - - retval = vma_dup_policy(mpnt, tmp); - if (retval) - goto fail_nomem_policy; - tmp->vm_mm = mm; - retval = dup_userfaultfd(tmp, &uf); - if (retval) - goto fail_nomem_anon_vma_fork; - if (tmp->vm_flags & VM_WIPEONFORK) { - /* - * VM_WIPEONFORK gets a clean slate in the child. - * Don't prepare anon_vma until fault since we don't - * copy page for current vma. - */ - tmp->anon_vma = NULL; - } else if (anon_vma_fork(tmp, mpnt)) - goto fail_nomem_anon_vma_fork; - vm_flags_clear(tmp, VM_LOCKED_MASK); - /* - * Copy/update hugetlb private vma information. - */ - if (is_vm_hugetlb_page(tmp)) - hugetlb_dup_vma_private(tmp); - - /* - * Link the vma into the MT. After using __mt_dup(), memory - * allocation is not necessary here, so it cannot fail. - */ - vma_iter_bulk_store(&vmi, tmp); - - mm->map_count++; - - if (tmp->vm_ops && tmp->vm_ops->open) - tmp->vm_ops->open(tmp); - - file = tmp->vm_file; - if (file) { - struct address_space *mapping = file->f_mapping; - - get_file(file); - i_mmap_lock_write(mapping); - if (vma_is_shared_maywrite(tmp)) - mapping_allow_writable(mapping); - flush_dcache_mmap_lock(mapping); - /* insert tmp into the share list, just after mpnt */ - vma_interval_tree_insert_after(tmp, mpnt, - &mapping->i_mmap); - flush_dcache_mmap_unlock(mapping); - i_mmap_unlock_write(mapping); - } - - if (!(tmp->vm_flags & VM_WIPEONFORK)) - retval = copy_page_range(tmp, mpnt); - - if (retval) { - mpnt = vma_next(&vmi); - goto loop_out; - } - } - /* a new mm has just been created */ - retval = arch_dup_mmap(oldmm, mm); -loop_out: - vma_iter_free(&vmi); - if (!retval) { - mt_set_in_rcu(vmi.mas.tree); - ksm_fork(mm, oldmm); - khugepaged_fork(mm, oldmm); - } else { - - /* - * The entire maple tree has already been duplicated. If the - * mmap duplication fails, mark the failure point with - * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, - * stop releasing VMAs that have not been duplicated after this - * point. - */ - if (mpnt) { - mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1); - mas_store(&vmi.mas, XA_ZERO_ENTRY); - /* Avoid OOM iterating a broken tree */ - set_bit(MMF_OOM_SKIP, &mm->flags); - } - /* - * The mm_struct is going to exit, but the locks will be dropped - * first. Set the mm_struct as unstable is advisable as it is - * not fully initialised. - */ - set_bit(MMF_UNSTABLE, &mm->flags); - } -out: - mmap_write_unlock(mm); - flush_tlb_mm(oldmm); - mmap_write_unlock(oldmm); - if (!retval) - dup_userfaultfd_complete(&uf); - else - dup_userfaultfd_fail(&uf); - return retval; - -fail_nomem_anon_vma_fork: - mpol_put(vma_policy(tmp)); -fail_nomem_policy: - vm_area_free(tmp); -fail_nomem: - retval = -ENOMEM; - vm_unacct_memory(charge); - goto loop_out; -} - static inline int mm_alloc_pgd(struct mm_struct *mm) { mm->pgd = pgd_alloc(mm); @@ -794,13 +620,6 @@ static inline void mm_free_pgd(struct mm_struct *mm) pgd_free(mm, mm->pgd); } #else -static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) -{ - mmap_write_lock(oldmm); - dup_mm_exe_file(mm, oldmm); - mmap_write_unlock(oldmm); - return 0; -} #define mm_alloc_pgd(mm) (0) #define mm_free_pgd(mm) #endif /* CONFIG_MMU */ diff --git a/mm/internal.h b/mm/internal.h index cf7c0e9ef7ece..6b8ed20177432 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1624,5 +1624,7 @@ static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, } #endif /* CONFIG_PT_RECLAIM */ +void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm); +int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm); #endif /* __MM_INTERNAL_H */ diff --git a/mm/mmap.c b/mm/mmap.c index 9e09eac0021c9..5259df031e15d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1675,7 +1675,6 @@ static int __meminit init_reserve_notifier(void) } subsys_initcall(init_reserve_notifier); -#ifdef CONFIG_MMU /* * Obtain a read lock on mm->mmap_lock, if the specified address is below the * start of the VMA, the intent is to perform a write, and it is a @@ -1719,10 +1718,180 @@ bool mmap_read_lock_maybe_expand(struct mm_struct *mm, mmap_write_downgrade(mm); return true; } -#else -bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, bool write) + +__latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { - return false; + struct vm_area_struct *mpnt, *tmp; + int retval; + unsigned long charge = 0; + LIST_HEAD(uf); + VMA_ITERATOR(vmi, mm, 0); + + if (mmap_write_lock_killable(oldmm)) + return -EINTR; + flush_cache_dup_mm(oldmm); + uprobe_dup_mmap(oldmm, mm); + /* + * Not linked in yet - no deadlock potential: + */ + mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); + + /* No ordering required: file already has been exposed. */ + dup_mm_exe_file(mm, oldmm); + + mm->total_vm = oldmm->total_vm; + mm->data_vm = oldmm->data_vm; + mm->exec_vm = oldmm->exec_vm; + mm->stack_vm = oldmm->stack_vm; + + /* Use __mt_dup() to efficiently build an identical maple tree. */ + retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL); + if (unlikely(retval)) + goto out; + + mt_clear_in_rcu(vmi.mas.tree); + for_each_vma(vmi, mpnt) { + struct file *file; + + vma_start_write(mpnt); + if (mpnt->vm_flags & VM_DONTCOPY) { + retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start, + mpnt->vm_end, GFP_KERNEL); + if (retval) + goto loop_out; + + vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); + continue; + } + charge = 0; + /* + * Don't duplicate many vmas if we've been oom-killed (for + * example) + */ + if (fatal_signal_pending(current)) { + retval = -EINTR; + goto loop_out; + } + if (mpnt->vm_flags & VM_ACCOUNT) { + unsigned long len = vma_pages(mpnt); + + if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ + goto fail_nomem; + charge = len; + } + + tmp = vm_area_dup(mpnt); + if (!tmp) + goto fail_nomem; + + /* track_pfn_copy() will later take care of copying internal state. */ + if (unlikely(tmp->vm_flags & VM_PFNMAP)) + untrack_pfn_clear(tmp); + + retval = vma_dup_policy(mpnt, tmp); + if (retval) + goto fail_nomem_policy; + tmp->vm_mm = mm; + retval = dup_userfaultfd(tmp, &uf); + if (retval) + goto fail_nomem_anon_vma_fork; + if (tmp->vm_flags & VM_WIPEONFORK) { + /* + * VM_WIPEONFORK gets a clean slate in the child. + * Don't prepare anon_vma until fault since we don't + * copy page for current vma. + */ + tmp->anon_vma = NULL; + } else if (anon_vma_fork(tmp, mpnt)) + goto fail_nomem_anon_vma_fork; + vm_flags_clear(tmp, VM_LOCKED_MASK); + /* + * Copy/update hugetlb private vma information. + */ + if (is_vm_hugetlb_page(tmp)) + hugetlb_dup_vma_private(tmp); + + /* + * Link the vma into the MT. After using __mt_dup(), memory + * allocation is not necessary here, so it cannot fail. + */ + vma_iter_bulk_store(&vmi, tmp); + + mm->map_count++; + + if (tmp->vm_ops && tmp->vm_ops->open) + tmp->vm_ops->open(tmp); + + file = tmp->vm_file; + if (file) { + struct address_space *mapping = file->f_mapping; + + get_file(file); + i_mmap_lock_write(mapping); + if (vma_is_shared_maywrite(tmp)) + mapping_allow_writable(mapping); + flush_dcache_mmap_lock(mapping); + /* insert tmp into the share list, just after mpnt */ + vma_interval_tree_insert_after(tmp, mpnt, + &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); + i_mmap_unlock_write(mapping); + } + + if (!(tmp->vm_flags & VM_WIPEONFORK)) + retval = copy_page_range(tmp, mpnt); + + if (retval) { + mpnt = vma_next(&vmi); + goto loop_out; + } + } + /* a new mm has just been created */ + retval = arch_dup_mmap(oldmm, mm); +loop_out: + vma_iter_free(&vmi); + if (!retval) { + mt_set_in_rcu(vmi.mas.tree); + ksm_fork(mm, oldmm); + khugepaged_fork(mm, oldmm); + } else { + + /* + * The entire maple tree has already been duplicated. If the + * mmap duplication fails, mark the failure point with + * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, + * stop releasing VMAs that have not been duplicated after this + * point. + */ + if (mpnt) { + mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1); + mas_store(&vmi.mas, XA_ZERO_ENTRY); + /* Avoid OOM iterating a broken tree */ + set_bit(MMF_OOM_SKIP, &mm->flags); + } + /* + * The mm_struct is going to exit, but the locks will be dropped + * first. Set the mm_struct as unstable is advisable as it is + * not fully initialised. + */ + set_bit(MMF_UNSTABLE, &mm->flags); + } +out: + mmap_write_unlock(mm); + flush_tlb_mm(oldmm); + mmap_write_unlock(oldmm); + if (!retval) + dup_userfaultfd_complete(&uf); + else + dup_userfaultfd_fail(&uf); + return retval; + +fail_nomem_anon_vma_fork: + mpol_put(vma_policy(tmp)); +fail_nomem_policy: + vm_area_free(tmp); +fail_nomem: + retval = -ENOMEM; + vm_unacct_memory(charge); + goto loop_out; } -#endif diff --git a/mm/nommu.c b/mm/nommu.c index 2b4d304c6445b..a142fc258d398 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1874,3 +1874,11 @@ static int __meminit init_admin_reserve(void) return 0; } subsys_initcall(init_admin_reserve); + +int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) +{ + mmap_write_lock(oldmm); + dup_mm_exe_file(mm, oldmm); + mmap_write_unlock(oldmm); + return 0; +}