mm: madvise: use per_vma lock for MADV_FREE
author    Barry Song <v-songbaohua@oppo.com>
          Wed, 11 Jun 2025 10:47:45 +0000 (22:47 +1200)
committer Andrew Morton <akpm@linux-foundation.org>
          Thu, 10 Jul 2025 05:42:07 +0000 (22:42 -0700)

MADV_FREE is an alternative to MADV_DONTNEED for freeing memory
dynamically in user-space native and Java heap management.  For example,
jemalloc can be configured to use MADV_FREE, and recent versions of the
Android Java heap have increasingly adopted it.  Supporting per-VMA
locking for MADV_FREE is therefore increasingly necessary.
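
As background, a minimal user-space sketch of how an allocator might
apply MADV_FREE to a freed region (the helper name and structure are
illustrative, not part of this patch):

	#include <sys/mman.h>
	#include <stddef.h>

	/* Hypothetical helper: lazily release a freed heap region.
	 * Unlike MADV_DONTNEED, the kernel reclaims these pages only
	 * under memory pressure, and their contents remain valid until
	 * reclaim actually happens. */
	static int heap_free_lazy(void *addr, size_t len)
	{
		return madvise(addr, len, MADV_FREE);
	}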

We have replaced walk_page_range() with walk_page_range_vma().  Together
with the madvise_lock_mode infrastructure proposed by Lorenzo, everything
needed is now in place to support per-VMA locking for MADV_FREE, and
potentially for other madvise operations that use walk_page_range_vma().
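
For orientation, a pmd_entry callback as consumed by
walk_page_range_vma() has the shape sketched below; the function name
and body are placeholders, while madvise_free_pte_range in the diff
follows the same signature:

	static int example_pmd_entry(pmd_t *pmd, unsigned long addr,
				     unsigned long next, struct mm_walk *walk)
	{
		/* walk->private carries the caller's cookie (the
		 * mmu_gather in this patch); the page table entries
		 * under *pmd for [addr, next) are processed here. */
		return 0;
	}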

This patch adds support for the PGWALK_VMA_RDLOCK_VERIFY walk_lock mode
in walk_page_range_vma(), and uses the madvise_lock_mode from
madv_behavior to select the appropriate walk_lock, either the mmap_lock
or the per-VMA lock, depending on the context.

Because the walk_ops->walk_lock field is now set dynamically per call,
sharing a single global structure would not be thread-safe.
madvise_free_walk_ops is therefore built as an on-stack variable instead
of a global constant.
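
As a sketch of the resulting pattern (illustrative only; the real code
is in the diff below), each caller fills in its own ops on the stack
before walking, so no writer ever touches a shared mm_walk_ops:

	static int walk_one_vma(struct vm_area_struct *vma,
				unsigned long start, unsigned long end,
				enum page_walk_lock lock, void *priv)
	{
		/* a private copy per call: safe to set walk_lock here */
		struct mm_walk_ops ops = {
			.pmd_entry = madvise_free_pte_range,
			.walk_lock = lock,
		};

		return walk_page_range_vma(vma, start, end, &ops, priv);
	}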

Link: https://lkml.kernel.org/r/20250611104745.57405-1-21cnbao@gmail.com
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: SeongJae Park <sj@kernel.org>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Tangquan Zheng <zhengtangquan@oppo.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
include/linux/pagewalk.h
mm/madvise.c
mm/pagewalk.c

diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index 8ac2f6d6d2a344d53376e83f4ee64e179fcbf8cf..682472c15495263c001c2cb9abc904f420b493b5 100644
@@ -14,6 +14,8 @@ enum page_walk_lock {
        PGWALK_WRLOCK = 1,
        /* vma is expected to be already write-locked during the walk */
        PGWALK_WRLOCK_VERIFY = 2,
+       /* vma is expected to be already read-locked during the walk */
+       PGWALK_VMA_RDLOCK_VERIFY = 3,
 };
 
 /**
diff --git a/mm/madvise.c b/mm/madvise.c
index 7d78d4b5fb1848a8234ad7063de3e47386c330bf..d451438af999f3e51b07a40a22300b40c0b1f954 100644
@@ -777,10 +777,19 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
        return 0;
 }
 
-static const struct mm_walk_ops madvise_free_walk_ops = {
-       .pmd_entry              = madvise_free_pte_range,
-       .walk_lock              = PGWALK_RDLOCK,
-};
+static inline enum page_walk_lock get_walk_lock(enum madvise_lock_mode mode)
+{
+       switch (mode) {
+       case MADVISE_VMA_READ_LOCK:
+               return PGWALK_VMA_RDLOCK_VERIFY;
+       case MADVISE_MMAP_READ_LOCK:
+               return PGWALK_RDLOCK;
+       default:
+               /* Other modes don't require fixing up the walk_lock */
+               WARN_ON_ONCE(1);
+               return PGWALK_RDLOCK;
+       }
+}
 
 static int madvise_free_single_vma(struct madvise_behavior *madv_behavior,
                        struct vm_area_struct *vma,
@@ -789,6 +798,9 @@ static int madvise_free_single_vma(struct madvise_behavior *madv_behavior,
        struct mm_struct *mm = vma->vm_mm;
        struct mmu_notifier_range range;
        struct mmu_gather *tlb = madv_behavior->tlb;
+       struct mm_walk_ops walk_ops = {
+               .pmd_entry              = madvise_free_pte_range,
+       };
 
        /* MADV_FREE works for only anon vma at the moment */
        if (!vma_is_anonymous(vma))
@@ -808,8 +820,9 @@ static int madvise_free_single_vma(struct madvise_behavior *madv_behavior,
 
        mmu_notifier_invalidate_range_start(&range);
        tlb_start_vma(tlb, vma);
+       walk_ops.walk_lock = get_walk_lock(madv_behavior->lock_mode);
        walk_page_range_vma(vma, range.start, range.end,
-                       &madvise_free_walk_ops, tlb);
+                       &walk_ops, tlb);
        tlb_end_vma(tlb, vma);
        mmu_notifier_invalidate_range_end(&range);
        return 0;
@@ -1658,7 +1671,6 @@ static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavi
        case MADV_WILLNEED:
        case MADV_COLD:
        case MADV_PAGEOUT:
-       case MADV_FREE:
        case MADV_POPULATE_READ:
        case MADV_POPULATE_WRITE:
        case MADV_COLLAPSE:
@@ -1667,6 +1679,7 @@ static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavi
                return MADVISE_MMAP_READ_LOCK;
        case MADV_DONTNEED:
        case MADV_DONTNEED_LOCKED:
+       case MADV_FREE:
                return MADVISE_VMA_READ_LOCK;
        default:
                return MADVISE_MMAP_WRITE_LOCK;
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index ff5299eca687fca5a6aa6497aca7f97a744bd969..a214a2b40ab9faf65db97898d1ea359acfdffc68 100644
@@ -422,7 +422,7 @@ static inline void process_mm_walk_lock(struct mm_struct *mm,
 {
        if (walk_lock == PGWALK_RDLOCK)
                mmap_assert_locked(mm);
-       else
+       else if (walk_lock != PGWALK_VMA_RDLOCK_VERIFY)
                mmap_assert_write_locked(mm);
 }
 
@@ -437,6 +437,9 @@ static inline void process_vma_walk_lock(struct vm_area_struct *vma,
        case PGWALK_WRLOCK_VERIFY:
                vma_assert_write_locked(vma);
                break;
+       case PGWALK_VMA_RDLOCK_VERIFY:
+               vma_assert_locked(vma);
+               break;
        case PGWALK_RDLOCK:
                /* PGWALK_RDLOCK is handled by process_mm_walk_lock */
                break;