mm: move mmap/vma locking logic into specific files

author Lorenzo Stoakes <lorenzo.stoakes@oracle.com>

Wed, 16 Apr 2025 10:38:36 +0000 (11:38 +0100)

committer Andrew Morton <akpm@linux-foundation.org>

Mon, 12 May 2025 00:48:33 +0000 (17:48 -0700)
author Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Wed, 16 Apr 2025 10:38:36 +0000 (11:38 +0100)
committer Andrew Morton <akpm@linux-foundation.org>
Mon, 12 May 2025 00:48:33 +0000 (17:48 -0700)
diff --git a/include/linux/mm.h b/include/linux/mm.h

index 5eb0d77c443898d83598d833a63ba447032cb41a..9b701cfbef22375826f2386ff74ff7a5112751ee 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -671,204 +671,11 @@ static inline void vma_numab_state_init(struct vm_area_struct *vma) {}
  static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
  #endif /* CONFIG_NUMA_BALANCING */
  
-#ifdef CONFIG_PER_VMA_LOCK
-static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
-{
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-       static struct lock_class_key lockdep_key;
-
-       lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0);
-#endif
-       if (reset_refcnt)
-               refcount_set(&vma->vm_refcnt, 0);
-       vma->vm_lock_seq = UINT_MAX;
-}
-
-static inline bool is_vma_writer_only(int refcnt)
-{
-       /*
-        * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma
-        * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on
-        * a detached vma happens only in vma_mark_detached() and is a rare
-        * case, therefore most of the time there will be no unnecessary wakeup.
-        */
-       return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1;
-}
-
-static inline void vma_refcount_put(struct vm_area_struct *vma)
-{
-       /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */
-       struct mm_struct *mm = vma->vm_mm;
-       int oldcnt;
-
-       rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
-       if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) {
-
-               if (is_vma_writer_only(oldcnt - 1))
-                       rcuwait_wake_up(&mm->vma_writer_wait);
-       }
-}
-
-/*
- * Try to read-lock a vma. The function is allowed to occasionally yield false
- * locked result to avoid performance overhead, in which case we fall back to
- * using mmap_lock. The function should never yield false unlocked result.
- * False locked result is possible if mm_lock_seq overflows or if vma gets
- * reused and attached to a different mm before we lock it.
- * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got
- * detached.
- */
-static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
-                                                   struct vm_area_struct *vma)
-{
-       int oldcnt;
-
-       /*
-        * Check before locking. A race might cause false locked result.
-        * We can use READ_ONCE() for the mm_lock_seq here, and don't need
-        * ACQUIRE semantics, because this is just a lockless check whose result
-        * we don't rely on for anything - the mm_lock_seq read against which we
-        * need ordering is below.
-        */
-       if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence))
-               return NULL;
-
-       /*
-        * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
-        * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
-        * Acquire fence is required here to avoid reordering against later
-        * vm_lock_seq check and checks inside lock_vma_under_rcu().
-        */
-       if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
-                                                             VMA_REF_LIMIT))) {
-               /* return EAGAIN if vma got detached from under us */
-               return oldcnt ? NULL : ERR_PTR(-EAGAIN);
-       }
-
-       rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
-       /*
-        * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
-        * False unlocked result is impossible because we modify and check
-        * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
-        * modification invalidates all existing locks.
-        *
-        * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
-        * racing with vma_end_write_all(), we only start reading from the VMA
-        * after it has been unlocked.
-        * This pairs with RELEASE semantics in vma_end_write_all().
-        */
-       if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
-               vma_refcount_put(vma);
-               return NULL;
-       }
-
-       return vma;
-}
-
-/*
- * Use only while holding mmap read lock which guarantees that locking will not
- * fail (nobody can concurrently write-lock the vma). vma_start_read() should
- * not be used in such cases because it might fail due to mm_lock_seq overflow.
- * This functionality is used to obtain vma read lock and drop the mmap read lock.
- */
-static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass)
-{
-       int oldcnt;
-
-       mmap_assert_locked(vma->vm_mm);
-       if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
-                                                             VMA_REF_LIMIT)))
-               return false;
-
-       rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
-       return true;
-}
-
-/*
- * Use only while holding mmap read lock which guarantees that locking will not
- * fail (nobody can concurrently write-lock the vma). vma_start_read() should
- * not be used in such cases because it might fail due to mm_lock_seq overflow.
- * This functionality is used to obtain vma read lock and drop the mmap read lock.
- */
-static inline bool vma_start_read_locked(struct vm_area_struct *vma)
-{
-       return vma_start_read_locked_nested(vma, 0);
-}
-
-static inline void vma_end_read(struct vm_area_struct *vma)
-{
-       vma_refcount_put(vma);
-}
-
-/* WARNING! Can only be used if mmap_lock is expected to be write-locked */
-static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq)
-{
-       mmap_assert_write_locked(vma->vm_mm);
-
-       /*
-        * current task is holding mmap_write_lock, both vma->vm_lock_seq and
-        * mm->mm_lock_seq can't be concurrently modified.
-        */
-       *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence;
-       return (vma->vm_lock_seq == *mm_lock_seq);
-}
-
-void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq);
-
-/*
- * Begin writing to a VMA.
- * Exclude concurrent readers under the per-VMA lock until the currently
- * write-locked mmap_lock is dropped or downgraded.
- */
-static inline void vma_start_write(struct vm_area_struct *vma)
-{
-       unsigned int mm_lock_seq;
-
-       if (__is_vma_write_locked(vma, &mm_lock_seq))
-               return;
-
-       __vma_start_write(vma, mm_lock_seq);
-}
-
-static inline void vma_assert_write_locked(struct vm_area_struct *vma)
-{
-       unsigned int mm_lock_seq;
-
-       VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
-}
-
-static inline void vma_assert_locked(struct vm_area_struct *vma)
-{
-       unsigned int mm_lock_seq;
-
-       VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 &&
-                     !__is_vma_write_locked(vma, &mm_lock_seq), vma);
-}
-
  /*
- * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these
- * assertions should be made either under mmap_write_lock or when the object
- * has been isolated under mmap_write_lock, ensuring no competing writers.
+ * These must be here rather than mmap_lock.h as dependent on vm_fault type,
+ * declared in this header.
   */
-static inline void vma_assert_attached(struct vm_area_struct *vma)
-{
-       WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt));
-}
-
-static inline void vma_assert_detached(struct vm_area_struct *vma)
-{
-       WARN_ON_ONCE(refcount_read(&vma->vm_refcnt));
-}
-
-static inline void vma_mark_attached(struct vm_area_struct *vma)
-{
-       vma_assert_write_locked(vma);
-       vma_assert_detached(vma);
-       refcount_set_release(&vma->vm_refcnt, 1);
-}
-
-void vma_mark_detached(struct vm_area_struct *vma);
-
+#ifdef CONFIG_PER_VMA_LOCK
  static inline void release_fault_lock(struct vm_fault *vmf)
  {
         if (vmf->flags & FAULT_FLAG_VMA_LOCK)
@@ -884,36 +691,7 @@ static inline void assert_fault_locked(struct vm_fault *vmf)
         else
                 mmap_assert_locked(vmf->vma->vm_mm);
  }
-
-struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
-                                         unsigned long address);
-
-#else /* CONFIG_PER_VMA_LOCK */
-
-static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {}
-static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
-                                                   struct vm_area_struct *vma)
-               { return NULL; }
-static inline void vma_end_read(struct vm_area_struct *vma) {}
-static inline void vma_start_write(struct vm_area_struct *vma) {}
-static inline void vma_assert_write_locked(struct vm_area_struct *vma)
-               { mmap_assert_write_locked(vma->vm_mm); }
-static inline void vma_assert_attached(struct vm_area_struct *vma) {}
-static inline void vma_assert_detached(struct vm_area_struct *vma) {}
-static inline void vma_mark_attached(struct vm_area_struct *vma) {}
-static inline void vma_mark_detached(struct vm_area_struct *vma) {}
-
-static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
-               unsigned long address)
-{
-       return NULL;
-}
-
-static inline void vma_assert_locked(struct vm_area_struct *vma)
-{
-       mmap_assert_locked(vma->vm_mm);
-}
-
+#else
  static inline void release_fault_lock(struct vm_fault *vmf)
  {
         mmap_read_unlock(vmf->vma->vm_mm);
@@ -923,7 +701,6 @@ static inline void assert_fault_locked(struct vm_fault *vmf)
  {
         mmap_assert_locked(vmf->vma->vm_mm);
  }
-
  #endif /* CONFIG_PER_VMA_LOCK */
  
  extern const struct vm_operations_struct vma_dummy_vm_ops;
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h

index 4706c676990275d480fcf3e20437462f7858cdd8..7983b2efe9bf090a31e196206f1ac3567405c133 100644 (file)
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -1,6 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
  #ifndef _LINUX_MMAP_LOCK_H
  #define _LINUX_MMAP_LOCK_H
  
+/* Avoid a dependency loop by declaring here. */
+extern int rcuwait_wake_up(struct rcuwait *w);
+
  #include <linux/lockdep.h>
  #include <linux/mm_types.h>
  #include <linux/mmdebug.h>
@@ -104,6 +108,206 @@ static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int
         return read_seqcount_retry(&mm->mm_lock_seq, seq);
  }
  
+static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+       static struct lock_class_key lockdep_key;
+
+       lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0);
+#endif
+       if (reset_refcnt)
+               refcount_set(&vma->vm_refcnt, 0);
+       vma->vm_lock_seq = UINT_MAX;
+}
+
+static inline bool is_vma_writer_only(int refcnt)
+{
+       /*
+        * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma
+        * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on
+        * a detached vma happens only in vma_mark_detached() and is a rare
+        * case, therefore most of the time there will be no unnecessary wakeup.
+        */
+       return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1;
+}
+
+static inline void vma_refcount_put(struct vm_area_struct *vma)
+{
+       /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */
+       struct mm_struct *mm = vma->vm_mm;
+       int oldcnt;
+
+       rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
+       if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) {
+
+               if (is_vma_writer_only(oldcnt - 1))
+                       rcuwait_wake_up(&mm->vma_writer_wait);
+       }
+}
+
+/*
+ * Try to read-lock a vma. The function is allowed to occasionally yield false
+ * locked result to avoid performance overhead, in which case we fall back to
+ * using mmap_lock. The function should never yield false unlocked result.
+ * False locked result is possible if mm_lock_seq overflows or if vma gets
+ * reused and attached to a different mm before we lock it.
+ * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got
+ * detached.
+ */
+static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
+                                                   struct vm_area_struct *vma)
+{
+       int oldcnt;
+
+       /*
+        * Check before locking. A race might cause false locked result.
+        * We can use READ_ONCE() for the mm_lock_seq here, and don't need
+        * ACQUIRE semantics, because this is just a lockless check whose result
+        * we don't rely on for anything - the mm_lock_seq read against which we
+        * need ordering is below.
+        */
+       if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence))
+               return NULL;
+
+       /*
+        * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
+        * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
+        * Acquire fence is required here to avoid reordering against later
+        * vm_lock_seq check and checks inside lock_vma_under_rcu().
+        */
+       if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
+                                                             VMA_REF_LIMIT))) {
+               /* return EAGAIN if vma got detached from under us */
+               return oldcnt ? NULL : ERR_PTR(-EAGAIN);
+       }
+
+       rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
+       /*
+        * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
+        * False unlocked result is impossible because we modify and check
+        * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
+        * modification invalidates all existing locks.
+        *
+        * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
+        * racing with vma_end_write_all(), we only start reading from the VMA
+        * after it has been unlocked.
+        * This pairs with RELEASE semantics in vma_end_write_all().
+        */
+       if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
+               vma_refcount_put(vma);
+               return NULL;
+       }
+
+       return vma;
+}
+
+/*
+ * Use only while holding mmap read lock which guarantees that locking will not
+ * fail (nobody can concurrently write-lock the vma). vma_start_read() should
+ * not be used in such cases because it might fail due to mm_lock_seq overflow.
+ * This functionality is used to obtain vma read lock and drop the mmap read lock.
+ */
+static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass)
+{
+       int oldcnt;
+
+       mmap_assert_locked(vma->vm_mm);
+       if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
+                                                             VMA_REF_LIMIT)))
+               return false;
+
+       rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
+       return true;
+}
+
+/*
+ * Use only while holding mmap read lock which guarantees that locking will not
+ * fail (nobody can concurrently write-lock the vma). vma_start_read() should
+ * not be used in such cases because it might fail due to mm_lock_seq overflow.
+ * This functionality is used to obtain vma read lock and drop the mmap read lock.
+ */
+static inline bool vma_start_read_locked(struct vm_area_struct *vma)
+{
+       return vma_start_read_locked_nested(vma, 0);
+}
+
+static inline void vma_end_read(struct vm_area_struct *vma)
+{
+       vma_refcount_put(vma);
+}
+
+/* WARNING! Can only be used if mmap_lock is expected to be write-locked */
+static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq)
+{
+       mmap_assert_write_locked(vma->vm_mm);
+
+       /*
+        * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+        * mm->mm_lock_seq can't be concurrently modified.
+        */
+       *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence;
+       return (vma->vm_lock_seq == *mm_lock_seq);
+}
+
+void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq);
+
+/*
+ * Begin writing to a VMA.
+ * Exclude concurrent readers under the per-VMA lock until the currently
+ * write-locked mmap_lock is dropped or downgraded.
+ */
+static inline void vma_start_write(struct vm_area_struct *vma)
+{
+       unsigned int mm_lock_seq;
+
+       if (__is_vma_write_locked(vma, &mm_lock_seq))
+               return;
+
+       __vma_start_write(vma, mm_lock_seq);
+}
+
+static inline void vma_assert_write_locked(struct vm_area_struct *vma)
+{
+       unsigned int mm_lock_seq;
+
+       VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
+}
+
+static inline void vma_assert_locked(struct vm_area_struct *vma)
+{
+       unsigned int mm_lock_seq;
+
+       VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 &&
+                     !__is_vma_write_locked(vma, &mm_lock_seq), vma);
+}
+
+/*
+ * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these
+ * assertions should be made either under mmap_write_lock or when the object
+ * has been isolated under mmap_write_lock, ensuring no competing writers.
+ */
+static inline void vma_assert_attached(struct vm_area_struct *vma)
+{
+       WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt));
+}
+
+static inline void vma_assert_detached(struct vm_area_struct *vma)
+{
+       WARN_ON_ONCE(refcount_read(&vma->vm_refcnt));
+}
+
+static inline void vma_mark_attached(struct vm_area_struct *vma)
+{
+       vma_assert_write_locked(vma);
+       vma_assert_detached(vma);
+       refcount_set_release(&vma->vm_refcnt, 1);
+}
+
+void vma_mark_detached(struct vm_area_struct *vma);
+
+struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+                                         unsigned long address);
+
  #else /* CONFIG_PER_VMA_LOCK */
  
  static inline void mm_lock_seqcount_init(struct mm_struct *mm) {}
@@ -119,6 +323,29 @@ static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int
  {
         return true;
  }
+static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {}
+static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
+                                                   struct vm_area_struct *vma)
+               { return NULL; }
+static inline void vma_end_read(struct vm_area_struct *vma) {}
+static inline void vma_start_write(struct vm_area_struct *vma) {}
+static inline void vma_assert_write_locked(struct vm_area_struct *vma)
+               { mmap_assert_write_locked(vma->vm_mm); }
+static inline void vma_assert_attached(struct vm_area_struct *vma) {}
+static inline void vma_assert_detached(struct vm_area_struct *vma) {}
+static inline void vma_mark_attached(struct vm_area_struct *vma) {}
+static inline void vma_mark_detached(struct vm_area_struct *vma) {}
+
+static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+               unsigned long address)
+{
+       return NULL;
+}
+
+static inline void vma_assert_locked(struct vm_area_struct *vma)
+{
+       mmap_assert_locked(vma->vm_mm);
+}
  
  #endif /* CONFIG_PER_VMA_LOCK */
  
diff --git a/mm/memory.c b/mm/memory.c

index 71c255f3fdccf8ba35ca0037023235fe89fe6bf6..f18266b5a0a934db16a9efa0121de824fac44b63 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6378,258 +6378,6 @@ out:
  }
  EXPORT_SYMBOL_GPL(handle_mm_fault);
  
-#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
-#include <linux/extable.h>
-
-static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
-{
-       if (likely(mmap_read_trylock(mm)))
-               return true;
-
-       if (regs && !user_mode(regs)) {
-               unsigned long ip = exception_ip(regs);
-               if (!search_exception_tables(ip))
-                       return false;
-       }
-
-       return !mmap_read_lock_killable(mm);
-}
-
-static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
-{
-       /*
-        * We don't have this operation yet.
-        *
-        * It should be easy enough to do: it's basically a
-        *    atomic_long_try_cmpxchg_acquire()
-        * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
-        * it also needs the proper lockdep magic etc.
-        */
-       return false;
-}
-
-static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
-{
-       mmap_read_unlock(mm);
-       if (regs && !user_mode(regs)) {
-               unsigned long ip = exception_ip(regs);
-               if (!search_exception_tables(ip))
-                       return false;
-       }
-       return !mmap_write_lock_killable(mm);
-}
-
-/*
- * Helper for page fault handling.
- *
- * This is kind of equivalent to "mmap_read_lock()" followed
- * by "find_extend_vma()", except it's a lot more careful about
- * the locking (and will drop the lock on failure).
- *
- * For example, if we have a kernel bug that causes a page
- * fault, we don't want to just use mmap_read_lock() to get
- * the mm lock, because that would deadlock if the bug were
- * to happen while we're holding the mm lock for writing.
- *
- * So this checks the exception tables on kernel faults in
- * order to only do this all for instructions that are actually
- * expected to fault.
- *
- * We can also actually take the mm lock for writing if we
- * need to extend the vma, which helps the VM layer a lot.
- */
-struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
-                       unsigned long addr, struct pt_regs *regs)
-{
-       struct vm_area_struct *vma;
-
-       if (!get_mmap_lock_carefully(mm, regs))
-               return NULL;
-
-       vma = find_vma(mm, addr);
-       if (likely(vma && (vma->vm_start <= addr)))
-               return vma;
-
-       /*
-        * Well, dang. We might still be successful, but only
-        * if we can extend a vma to do so.
-        */
-       if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
-               mmap_read_unlock(mm);
-               return NULL;
-       }
-
-       /*
-        * We can try to upgrade the mmap lock atomically,
-        * in which case we can continue to use the vma
-        * we already looked up.
-        *
-        * Otherwise we'll have to drop the mmap lock and
-        * re-take it, and also look up the vma again,
-        * re-checking it.
-        */
-       if (!mmap_upgrade_trylock(mm)) {
-               if (!upgrade_mmap_lock_carefully(mm, regs))
-                       return NULL;
-
-               vma = find_vma(mm, addr);
-               if (!vma)
-                       goto fail;
-               if (vma->vm_start <= addr)
-                       goto success;
-               if (!(vma->vm_flags & VM_GROWSDOWN))
-                       goto fail;
-       }
-
-       if (expand_stack_locked(vma, addr))
-               goto fail;
-
-success:
-       mmap_write_downgrade(mm);
-       return vma;
-
-fail:
-       mmap_write_unlock(mm);
-       return NULL;
-}
-#endif
-
-#ifdef CONFIG_PER_VMA_LOCK
-static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
-{
-       unsigned int tgt_refcnt = VMA_LOCK_OFFSET;
-
-       /* Additional refcnt if the vma is attached. */
-       if (!detaching)
-               tgt_refcnt++;
-
-       /*
-        * If vma is detached then only vma_mark_attached() can raise the
-        * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
-        */
-       if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
-               return false;
-
-       rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
-       rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
-                  refcount_read(&vma->vm_refcnt) == tgt_refcnt,
-                  TASK_UNINTERRUPTIBLE);
-       lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
-
-       return true;
-}
-
-static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
-{
-       *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
-       rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
-}
-
-void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
-{
-       bool locked;
-
-       /*
-        * __vma_enter_locked() returns false immediately if the vma is not
-        * attached, otherwise it waits until refcnt is indicating that vma
-        * is attached with no readers.
-        */
-       locked = __vma_enter_locked(vma, false);
-
-       /*
-        * We should use WRITE_ONCE() here because we can have concurrent reads
-        * from the early lockless pessimistic check in vma_start_read().
-        * We don't really care about the correctness of that early check, but
-        * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
-        */
-       WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
-
-       if (locked) {
-               bool detached;
-
-               __vma_exit_locked(vma, &detached);
-               WARN_ON_ONCE(detached); /* vma should remain attached */
-       }
-}
-EXPORT_SYMBOL_GPL(__vma_start_write);
-
-void vma_mark_detached(struct vm_area_struct *vma)
-{
-       vma_assert_write_locked(vma);
-       vma_assert_attached(vma);
-
-       /*
-        * We are the only writer, so no need to use vma_refcount_put().
-        * The condition below is unlikely because the vma has been already
-        * write-locked and readers can increment vm_refcnt only temporarily
-        * before they check vm_lock_seq, realize the vma is locked and drop
-        * back the vm_refcnt. That is a narrow window for observing a raised
-        * vm_refcnt.
-        */
-       if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
-               /* Wait until vma is detached with no readers. */
-               if (__vma_enter_locked(vma, true)) {
-                       bool detached;
-
-                       __vma_exit_locked(vma, &detached);
-                       WARN_ON_ONCE(!detached);
-               }
-       }
-}
-
-/*
- * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
- * stable and not isolated. If the VMA is not found or is being modified the
- * function returns NULL.
- */
-struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
-                                         unsigned long address)
-{
-       MA_STATE(mas, &mm->mm_mt, address, address);
-       struct vm_area_struct *vma;
-
-       rcu_read_lock();
-retry:
-       vma = mas_walk(&mas);
-       if (!vma)
-               goto inval;
-
-       vma = vma_start_read(mm, vma);
-       if (IS_ERR_OR_NULL(vma)) {
-               /* Check if the VMA got isolated after we found it */
-               if (PTR_ERR(vma) == -EAGAIN) {
-                       count_vm_vma_lock_event(VMA_LOCK_MISS);
-                       /* The area was replaced with another one */
-                       goto retry;
-               }
-
-               /* Failed to lock the VMA */
-               goto inval;
-       }
-       /*
-        * At this point, we have a stable reference to a VMA: The VMA is
-        * locked and we know it hasn't already been isolated.
-        * From here on, we can access the VMA without worrying about which
-        * fields are accessible for RCU readers.
-        */
-
-       /* Check if the vma we locked is the right one. */
-       if (unlikely(vma->vm_mm != mm ||
-                    address < vma->vm_start || address >= vma->vm_end))
-               goto inval_end_read;
-
-       rcu_read_unlock();
-       return vma;
-
-inval_end_read:
-       vma_end_read(vma);
-inval:
-       rcu_read_unlock();
-       count_vm_vma_lock_event(VMA_LOCK_ABORT);
-       return NULL;
-}
-#endif /* CONFIG_PER_VMA_LOCK */
-
  #ifndef __PAGETABLE_P4D_FOLDED
  /*
   * Allocate p4d page table.
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c

index e7dbaf96aa172a9044d7363010a802864f430e03..5f725cc67334f6de5fb0448045f580c1c0f1d7a9 100644 (file)
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -42,3 +42,276 @@ void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
  }
  EXPORT_SYMBOL(__mmap_lock_do_trace_released);
  #endif /* CONFIG_TRACING */
+
+#ifdef CONFIG_MMU
+#ifdef CONFIG_PER_VMA_LOCK
+static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
+{
+       unsigned int tgt_refcnt = VMA_LOCK_OFFSET;
+
+       /* Additional refcnt if the vma is attached. */
+       if (!detaching)
+               tgt_refcnt++;
+
+       /*
+        * If vma is detached then only vma_mark_attached() can raise the
+        * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
+        */
+       if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
+               return false;
+
+       rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
+       rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
+                  refcount_read(&vma->vm_refcnt) == tgt_refcnt,
+                  TASK_UNINTERRUPTIBLE);
+       lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
+
+       return true;
+}
+
+static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
+{
+       *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
+       rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
+}
+
+void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
+{
+       bool locked;
+
+       /*
+        * __vma_enter_locked() returns false immediately if the vma is not
+        * attached, otherwise it waits until refcnt is indicating that vma
+        * is attached with no readers.
+        */
+       locked = __vma_enter_locked(vma, false);
+
+       /*
+        * We should use WRITE_ONCE() here because we can have concurrent reads
+        * from the early lockless pessimistic check in vma_start_read().
+        * We don't really care about the correctness of that early check, but
+        * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
+        */
+       WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
+
+       if (locked) {
+               bool detached;
+
+               __vma_exit_locked(vma, &detached);
+               WARN_ON_ONCE(detached); /* vma should remain attached */
+       }
+}
+EXPORT_SYMBOL_GPL(__vma_start_write);
+
+void vma_mark_detached(struct vm_area_struct *vma)
+{
+       vma_assert_write_locked(vma);
+       vma_assert_attached(vma);
+
+       /*
+        * We are the only writer, so no need to use vma_refcount_put().
+        * The condition below is unlikely because the vma has been already
+        * write-locked and readers can increment vm_refcnt only temporarily
+        * before they check vm_lock_seq, realize the vma is locked and drop
+        * back the vm_refcnt. That is a narrow window for observing a raised
+        * vm_refcnt.
+        */
+       if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
+               /* Wait until vma is detached with no readers. */
+               if (__vma_enter_locked(vma, true)) {
+                       bool detached;
+
+                       __vma_exit_locked(vma, &detached);
+                       WARN_ON_ONCE(!detached);
+               }
+       }
+}
+
+/*
+ * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
+ * stable and not isolated. If the VMA is not found or is being modified the
+ * function returns NULL.
+ */
+struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+                                         unsigned long address)
+{
+       MA_STATE(mas, &mm->mm_mt, address, address);
+       struct vm_area_struct *vma;
+
+       rcu_read_lock();
+retry:
+       vma = mas_walk(&mas);
+       if (!vma)
+               goto inval;
+
+       vma = vma_start_read(mm, vma);
+       if (IS_ERR_OR_NULL(vma)) {
+               /* Check if the VMA got isolated after we found it */
+               if (PTR_ERR(vma) == -EAGAIN) {
+                       count_vm_vma_lock_event(VMA_LOCK_MISS);
+                       /* The area was replaced with another one */
+                       goto retry;
+               }
+
+               /* Failed to lock the VMA */
+               goto inval;
+       }
+       /*
+        * At this point, we have a stable reference to a VMA: The VMA is
+        * locked and we know it hasn't already been isolated.
+        * From here on, we can access the VMA without worrying about which
+        * fields are accessible for RCU readers.
+        */
+
+       /* Check if the vma we locked is the right one. */
+       if (unlikely(vma->vm_mm != mm ||
+                    address < vma->vm_start || address >= vma->vm_end))
+               goto inval_end_read;
+
+       rcu_read_unlock();
+       return vma;
+
+inval_end_read:
+       vma_end_read(vma);
+inval:
+       rcu_read_unlock();
+       count_vm_vma_lock_event(VMA_LOCK_ABORT);
+       return NULL;
+}
+#endif /* CONFIG_PER_VMA_LOCK */
+
+#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
+#include <linux/extable.h>
+
+static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
+{
+       if (likely(mmap_read_trylock(mm)))
+               return true;
+
+       if (regs && !user_mode(regs)) {
+               unsigned long ip = exception_ip(regs);
+               if (!search_exception_tables(ip))
+                       return false;
+       }
+
+       return !mmap_read_lock_killable(mm);
+}
+
+static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
+{
+       /*
+        * We don't have this operation yet.
+        *
+        * It should be easy enough to do: it's basically a
+        *    atomic_long_try_cmpxchg_acquire()
+        * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
+        * it also needs the proper lockdep magic etc.
+        */
+       return false;
+}
+
+static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
+{
+       mmap_read_unlock(mm);
+       if (regs && !user_mode(regs)) {
+               unsigned long ip = exception_ip(regs);
+               if (!search_exception_tables(ip))
+                       return false;
+       }
+       return !mmap_write_lock_killable(mm);
+}
+
+/*
+ * Helper for page fault handling.
+ *
+ * This is kind of equivalent to "mmap_read_lock()" followed
+ * by "find_extend_vma()", except it's a lot more careful about
+ * the locking (and will drop the lock on failure).
+ *
+ * For example, if we have a kernel bug that causes a page
+ * fault, we don't want to just use mmap_read_lock() to get
+ * the mm lock, because that would deadlock if the bug were
+ * to happen while we're holding the mm lock for writing.
+ *
+ * So this checks the exception tables on kernel faults in
+ * order to only do this all for instructions that are actually
+ * expected to fault.
+ *
+ * We can also actually take the mm lock for writing if we
+ * need to extend the vma, which helps the VM layer a lot.
+ */
+struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
+                       unsigned long addr, struct pt_regs *regs)
+{
+       struct vm_area_struct *vma;
+
+       if (!get_mmap_lock_carefully(mm, regs))
+               return NULL;
+
+       vma = find_vma(mm, addr);
+       if (likely(vma && (vma->vm_start <= addr)))
+               return vma;
+
+       /*
+        * Well, dang. We might still be successful, but only
+        * if we can extend a vma to do so.
+        */
+       if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
+               mmap_read_unlock(mm);
+               return NULL;
+       }
+
+       /*
+        * We can try to upgrade the mmap lock atomically,
+        * in which case we can continue to use the vma
+        * we already looked up.
+        *
+        * Otherwise we'll have to drop the mmap lock and
+        * re-take it, and also look up the vma again,
+        * re-checking it.
+        */
+       if (!mmap_upgrade_trylock(mm)) {
+               if (!upgrade_mmap_lock_carefully(mm, regs))
+                       return NULL;
+
+               vma = find_vma(mm, addr);
+               if (!vma)
+                       goto fail;
+               if (vma->vm_start <= addr)
+                       goto success;
+               if (!(vma->vm_flags & VM_GROWSDOWN))
+                       goto fail;
+       }
+
+       if (expand_stack_locked(vma, addr))
+               goto fail;
+
+success:
+       mmap_write_downgrade(mm);
+       return vma;
+
+fail:
+       mmap_write_unlock(mm);
+       return NULL;
+}
+#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */
+
+#else /* CONFIG_MMU */
+
+/*
+ * At least xtensa ends up having protection faults even with no
+ * MMU.. No stack expansion, at least.
+ */
+struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
+                       unsigned long addr, struct pt_regs *regs)
+{
+       struct vm_area_struct *vma;
+
+       mmap_read_lock(mm);
+       vma = vma_lookup(mm, addr);
+       if (!vma)
+               mmap_read_unlock(mm);
+       return vma;
+}
+
+#endif /* CONFIG_MMU */
diff --git a/mm/nommu.c b/mm/nommu.c

index 617e7ba8022f5bd56b738a8a9ec350718d0a6525..2b4d304c6445ba5fbae29efbb7c638e4c2d9249b 100644 (file)
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -626,22 +626,6 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
  }
  EXPORT_SYMBOL(find_vma);
  
-/*
- * At least xtensa ends up having protection faults even with no
- * MMU.. No stack expansion, at least.
- */
-struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
-                       unsigned long addr, struct pt_regs *regs)
-{
-       struct vm_area_struct *vma;
-
-       mmap_read_lock(mm);
-       vma = vma_lookup(mm, addr);
-       if (!vma)
-               mmap_read_unlock(mm);
-       return vma;
-}
-
  /*
   * expand a stack to a given address
   * - not supported under NOMMU conditions
author	Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
	Wed, 16 Apr 2025 10:38:36 +0000 (11:38 +0100)
committer	Andrew Morton <akpm@linux-foundation.org>
	Mon, 12 May 2025 00:48:33 +0000 (17:48 -0700)
include/linux/mm.h		patch \| blob \| blame \| history
include/linux/mmap_lock.h		patch \| blob \| blame \| history
mm/memory.c		patch \| blob \| blame \| history
mm/mmap_lock.c		patch \| blob \| blame \| history
mm/nommu.c		patch \| blob \| blame \| history