From: Lorenzo Stoakes
Date: Fri, 20 Jun 2025 15:33:01 +0000 (+0100)
Subject: mm/madvise: remove the visitor pattern and thread anon_vma state
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=58fc12f77eb962360ecf80abd3e1c2790b50786d;p=thirdparty%2Fkernel%2Flinux.git

mm/madvise: remove the visitor pattern and thread anon_vma state

Patch series "madvise cleanup", v2.

This is a series of patches that helps address a number of historic
problems in the madvise() implementation:

* Eliminate the visitor pattern and have the code which is implemented
  for both the anon_vma_name implementation and ordinary madvise()
  operations use the same madvise_vma_behavior() implementation.

* Thread state through the madvise_behavior state object - this object,
  very usefully introduced by SJ, is already used to transmit state
  through operations.  This series extends this by having all madvise()
  operations use this, including anon_vma_name.

* Thread range, VMA state through madvise_behavior - this helps avoid a
  lot of the confusing code around range and VMA state and again keeps
  things consistent and with a single 'source of truth'.

* Address the very strange behaviour around the passed around struct
  vm_area_struct **prev pointer - all read-only users do absolutely
  nothing with the prev pointer.  The only function that uses it is
  madvise_update_vma(), and in all cases prev is always reset to the
  VMA.  Fix this by no longer having anything but madvise_update_vma()
  reference prev, and having madvise_walk_vmas() update prev in each
  instance.  Additionally, make it clear that the meaningful change in
  VMA state is when madvise_update_vma() potentially merges a VMA, so
  explicitly retrieve the VMA in this case.

* Update and clarify the madvise_walk_vmas() function - this is a source
  of a great deal of confusion, so simplify it, stop using prev = NULL
  to signify that the mmap lock has been dropped (!), make that
  explicit, and add some comments to explain what's going on.

This patch (of 5):

Now that we have the madvise_behavior helper struct we no longer need to
mess around with void * pointers in order to propagate anon_vma_name,
and this means we can get rid of the confusing and inconsistent visitor
pattern implementation in madvise_vma_anon_name().

This means we now have a single state object that threads through most
of madvise()'s logic and a single code path which executes the majority
of madvise() behaviour (we maintain separate logic for failure injection
and memory population for the time being).

We are able to remove the visitor pattern by handling the anon_vma_name
setting logic via an internal madvise flag - __MADV_SET_ANON_VMA_NAME.
This uses a negative value, so it is not reasonable that we will ever
add this as a UAPI flag.  Additionally, the madvise_behavior_valid()
check ensures that user-specified behaviours are strictly only those we
permit, which, of course, this flag will be excluded from.

We are able to propagate the anon_vma_name object through use of the
madvise_behavior helper struct.  Doing this results in a
can_modify_vma_madv() check for anonymous VMA name changes, however this
will cause no issues as this operation is not prohibited.

We can also then reuse more code and drop the redundant
madvise_vma_anon_name() function altogether.

Additionally, separate out behaviours that update VMAs from those that
do not.
Link: https://lkml.kernel.org/r/cover.1750433500.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/c5094bfccb41ecd19d4e9bcaa1c4a11e00158bba.1750433500.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes
Reviewed-by: Zi Yan
Reviewed-by: SeongJae Park
Reviewed-by: Barry Song
Acked-by: David Hildenbrand
Cc: Baolin Wang
Cc: Dev Jain
Cc: Jann Horn
Cc: Lance Yang
Cc: Liam Howlett
Cc: Mariano Pache
Cc: Ryan Roberts
Cc: Suren Baghdasaryan
Cc: Vlastimil Babka
Signed-off-by: Andrew Morton
---

diff --git a/mm/madvise.c b/mm/madvise.c
index 070132f9842b3..93837b980cc20 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -37,6 +37,8 @@
 #include "internal.h"
 #include "swap.h"
 
+#define __MADV_SET_ANON_VMA_NAME (-1)
+
 /*
  * Maximum number of attempts we make to install guard pages before we give up
  * and return -ERESTARTNOINTR to have userspace try again.
@@ -59,9 +61,13 @@ struct madvise_behavior {
 	int behavior;
 	struct mmu_gather *tlb;
 	enum madvise_lock_mode lock_mode;
+	struct anon_vma_name *anon_name;
 };
 
 #ifdef CONFIG_ANON_VMA_NAME
+static int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
+		unsigned long end, struct madvise_behavior *madv_behavior);
+
 struct anon_vma_name *anon_vma_name_alloc(const char *name)
 {
 	struct anon_vma_name *anon_name;
@@ -112,6 +118,35 @@ static int replace_anon_vma_name(struct vm_area_struct *vma,
 
 	return 0;
 }
+
+int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
+		unsigned long len_in, struct anon_vma_name *anon_name)
+{
+	unsigned long end;
+	unsigned long len;
+	struct madvise_behavior madv_behavior = {
+		.behavior = __MADV_SET_ANON_VMA_NAME,
+		.lock_mode = MADVISE_MMAP_WRITE_LOCK,
+		.anon_name = anon_name,
+	};
+
+	if (start & ~PAGE_MASK)
+		return -EINVAL;
+	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
+
+	/* Check to see whether len was rounded up from small -ve to zero */
+	if (len_in && !len)
+		return -EINVAL;
+
+	end = start + len;
+	if (end < start)
+		return -EINVAL;
+
+	if (end == start)
+		return 0;
+
+	return madvise_walk_vmas(mm, start, end, &madv_behavior);
+}
 #else /* CONFIG_ANON_VMA_NAME */
 static int replace_anon_vma_name(struct vm_area_struct *vma,
 		struct anon_vma_name *anon_name)
@@ -1252,13 +1287,13 @@ static long madvise_guard_remove(struct vm_area_struct *vma,
 static int madvise_vma_behavior(struct vm_area_struct *vma,
 				struct vm_area_struct **prev,
 				unsigned long start, unsigned long end,
-				void *behavior_arg)
+				struct madvise_behavior *madv_behavior)
 {
-	struct madvise_behavior *arg = behavior_arg;
-	int behavior = arg->behavior;
-	int error;
-	struct anon_vma_name *anon_name;
+	int behavior = madv_behavior->behavior;
+	struct anon_vma_name *anon_name = madv_behavior->anon_name;
 	vm_flags_t new_flags = vma->vm_flags;
+	bool set_new_anon_name = behavior == __MADV_SET_ANON_VMA_NAME;
+	int error;
 
 	if (unlikely(!can_modify_vma_madv(vma, behavior)))
 		return -EPERM;
@@ -1275,7 +1310,17 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
 	case MADV_FREE:
 	case MADV_DONTNEED:
 	case MADV_DONTNEED_LOCKED:
-		return madvise_dontneed_free(vma, prev, start, end, arg);
+		return madvise_dontneed_free(vma, prev, start, end,
+					     madv_behavior);
+	case MADV_COLLAPSE:
+		return madvise_collapse(vma, prev, start, end);
+	case MADV_GUARD_INSTALL:
+		return madvise_guard_install(vma, prev, start, end);
+	case MADV_GUARD_REMOVE:
+		return madvise_guard_remove(vma, prev, start, end);
+
+	/* The below behaviours update VMAs via madvise_update_vma(). */
+
 	case MADV_NORMAL:
 		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
 		break;
@@ -1325,21 +1370,29 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
 		if (error)
 			goto out;
 		break;
-	case MADV_COLLAPSE:
-		return madvise_collapse(vma, prev, start, end);
-	case MADV_GUARD_INSTALL:
-		return madvise_guard_install(vma, prev, start, end);
-	case MADV_GUARD_REMOVE:
-		return madvise_guard_remove(vma, prev, start, end);
+	case __MADV_SET_ANON_VMA_NAME:
+		/* Only anonymous mappings can be named */
+		if (vma->vm_file && !vma_is_anon_shmem(vma))
+			return -EBADF;
+		break;
 	}
 
 	/* We cannot provide prev in this lock mode. */
-	VM_WARN_ON_ONCE(arg->lock_mode == MADVISE_VMA_READ_LOCK);
-	anon_name = anon_vma_name(vma);
-	anon_vma_name_get(anon_name);
+	VM_WARN_ON_ONCE(madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK);
+
+	/*
+	 * madvise_update_vma() might cause a VMA merge which could put an
+	 * anon_vma_name, so we must hold an additional reference on the
+	 * anon_vma_name so it doesn't disappear from under us.
+	 */
+	if (!set_new_anon_name) {
+		anon_name = anon_vma_name(vma);
+		anon_vma_name_get(anon_name);
+	}
 	error = madvise_update_vma(vma, prev, start, end, new_flags,
 				   anon_name);
-	anon_vma_name_put(anon_name);
+	if (!set_new_anon_name)
+		anon_vma_name_put(anon_name);
 
 out:
 	/*
@@ -1523,20 +1576,17 @@ take_mmap_read_lock:
 }
 
 /*
- * Walk the vmas in range [start,end), and call the visit function on each one.
- * The visit function will get start and end parameters that cover the overlap
- * between the current vma and the original range. Any unmapped regions in the
- * original range will result in this function returning -ENOMEM while still
- * calling the visit function on all of the existing vmas in the range.
- * Must be called with the mmap_lock held for reading or writing.
+ * Walk the vmas in range [start,end), and call the madvise_vma_behavior
+ * function on each one. The function will get start and end parameters that
+ * cover the overlap between the current vma and the original range. Any
+ * unmapped regions in the original range will result in this function returning
+ * -ENOMEM while still calling the madvise_vma_behavior function on all of the
+ * existing vmas in the range. Must be called with the mmap_lock held for
+ * reading or writing.
  */
 static int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
-		unsigned long end, struct madvise_behavior *madv_behavior,
-		void *arg,
-		int (*visit)(struct vm_area_struct *vma,
-			     struct vm_area_struct **prev, unsigned long start,
-			     unsigned long end, void *arg))
+		unsigned long end, struct madvise_behavior *madv_behavior)
 {
 	struct vm_area_struct *vma;
 	struct vm_area_struct *prev;
@@ -1548,11 +1598,12 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
 	 * If VMA read lock is supported, apply madvise to a single VMA
 	 * tentatively, avoiding walking VMAs.
 	 */
-	if (madv_behavior && madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK) {
+	if (madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK) {
 		vma = try_vma_read_lock(mm, madv_behavior, start, end);
 		if (vma) {
 			prev = vma;
-			error = visit(vma, &prev, start, end, arg);
+			error = madvise_vma_behavior(vma, &prev, start, end,
+						     madv_behavior);
 			vma_end_read(vma);
 			return error;
 		}
@@ -1586,7 +1637,8 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
 			tmp = end;
 
 		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
-		error = visit(vma, &prev, start, tmp, arg);
+		error = madvise_vma_behavior(vma, &prev, start, tmp,
+					     madv_behavior);
 		if (error)
 			return error;
 		start = tmp;
@@ -1603,57 +1655,6 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
 	return unmapped_error;
 }
 
-#ifdef CONFIG_ANON_VMA_NAME
-static int madvise_vma_anon_name(struct vm_area_struct *vma,
-				 struct vm_area_struct **prev,
-				 unsigned long start, unsigned long end,
-				 void *anon_name)
-{
-	int error;
-
-	/* Only anonymous mappings can be named */
-	if (vma->vm_file && !vma_is_anon_shmem(vma))
-		return -EBADF;
-
-	error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
-				   anon_name);
-
-	/*
-	 * madvise() returns EAGAIN if kernel resources, such as
-	 * slab, are temporarily unavailable.
-	 */
-	if (error == -ENOMEM)
-		error = -EAGAIN;
-	return error;
-}
-
-int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
-		unsigned long len_in, struct anon_vma_name *anon_name)
-{
-	unsigned long end;
-	unsigned long len;
-
-	if (start & ~PAGE_MASK)
-		return -EINVAL;
-	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
-
-	/* Check to see whether len was rounded up from small -ve to zero */
-	if (len_in && !len)
-		return -EINVAL;
-
-	end = start + len;
-	if (end < start)
-		return -EINVAL;
-
-	if (end == start)
-		return 0;
-
-	return madvise_walk_vmas(mm, start, end, NULL, anon_name,
-			madvise_vma_anon_name);
-}
-#endif /* CONFIG_ANON_VMA_NAME */
-
-
 /*
  * Any behaviour which results in changes to the vma->vm_flags needs to
  * take mmap_lock for writing. Others, which simply traverse vmas, need
@@ -1845,8 +1846,7 @@ static int madvise_do_behavior(struct mm_struct *mm,
 	if (is_madvise_populate(behavior))
 		error = madvise_populate(mm, start, end, behavior);
 	else
-		error = madvise_walk_vmas(mm, start, end, madv_behavior,
-				madv_behavior, madvise_vma_behavior);
+		error = madvise_walk_vmas(mm, start, end, madv_behavior);
 	blk_finish_plug(&plug);
 	return error;
 }
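
For context, the madvise_set_anon_name() path reworked above is reached from
userspace via prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, ...) on kernels built
with CONFIG_ANON_VMA_NAME.  A minimal illustrative sketch follows; it is not
part of this patch, and the mapping size and the "example buffer" name are
arbitrary:

/*
 * Illustrative userspace example: name an anonymous mapping.  On success the
 * mapping appears as [anon:example buffer] in /proc/<pid>/maps; the kernel
 * side of this request ends up in madvise_set_anon_name().
 */
#include <linux/prctl.h>	/* PR_SET_VMA, PR_SET_VMA_ANON_NAME */
#include <sys/mman.h>
#include <sys/prctl.h>
#include <stdio.h>

int main(void)
{
	size_t len = 4UL * 4096;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Name the anonymous mapping just created. */
	if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
		  (unsigned long)p, len, "example buffer")) {
		perror("PR_SET_VMA_ANON_NAME");
		return 1;
	}

	return 0;
}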