bool pageout;
};
+enum madvise_lock_mode {
+ /* No lock required (e.g. memory failure handling). */
+ MADVISE_NO_LOCK,
+ /* mmap_lock taken for reading. */
+ MADVISE_MMAP_READ_LOCK,
+ /* mmap_lock taken for writing. */
+ MADVISE_MMAP_WRITE_LOCK,
+ /* Per-VMA read lock, acquired in madvise_walk_vmas(). */
+ MADVISE_VMA_READ_LOCK,
+};
+
struct madvise_behavior {
int behavior;
struct mmu_gather *tlb;
+ enum madvise_lock_mode lock_mode;
};
-/*
- * Any behaviour which results in changes to the vma->vm_flags needs to
- * take mmap_lock for writing. Others, which simply traverse vmas, need
- * to only take it for reading.
- */
-static int madvise_need_mmap_write(int behavior)
-{
- switch (behavior) {
- case MADV_REMOVE:
- case MADV_WILLNEED:
- case MADV_DONTNEED:
- case MADV_DONTNEED_LOCKED:
- case MADV_COLD:
- case MADV_PAGEOUT:
- case MADV_FREE:
- case MADV_POPULATE_READ:
- case MADV_POPULATE_WRITE:
- case MADV_COLLAPSE:
- case MADV_GUARD_INSTALL:
- case MADV_GUARD_REMOVE:
- return 0;
- default:
- /* be safe, default to 1. list exceptions explicitly */
- return 1;
- }
-}
-
#ifdef CONFIG_ANON_VMA_NAME
struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
return madvise_guard_remove(vma, prev, start, end);
}
+ /* We cannot provide prev in this lock mode. */
+ VM_WARN_ON_ONCE(arg->lock_mode == MADVISE_VMA_READ_LOCK);
anon_name = anon_vma_name(vma);
anon_vma_name_get(anon_name);
error = madvise_update_vma(vma, prev, start, end, new_flags,
}
}
+/*
+ * Try to acquire a VMA read lock if possible.
+ *
+ * We only support this lock over a single VMA, which the input range must
+ * cover either partially or fully, but must not extend beyond.
+ *
+ * This function always returns with an appropriate lock held. If a VMA read
+ * lock could be acquired, we return the locked VMA.
+ *
+ * If a VMA read lock could not be acquired, we take the mmap read lock
+ * instead, return NULL, and expect the caller to fall back to mmap lock
+ * behaviour.
+ */
+static struct vm_area_struct *try_vma_read_lock(struct mm_struct *mm,
+ struct madvise_behavior *madv_behavior,
+ unsigned long start, unsigned long end)
+{
+ struct vm_area_struct *vma;
+
+ vma = lock_vma_under_rcu(mm, start);
+ if (!vma)
+ goto take_mmap_read_lock;
+ /*
+ * The range must lie within a single VMA; userfaultfd-armed VMAs and
+ * remote mms are unsupported.
+ */
+ if (end > vma->vm_end || current->mm != mm ||
+ userfaultfd_armed(vma)) {
+ vma_end_read(vma);
+ goto take_mmap_read_lock;
+ }
+ return vma;
+
+take_mmap_read_lock:
+ mmap_read_lock(mm);
+ madv_behavior->lock_mode = MADVISE_MMAP_READ_LOCK;
+ return NULL;
+}
+
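
For illustration, a hypothetical userspace view of the single-VMA restriction above (not part of this patch): splitting a mapping with mprotect() creates two VMAs, so a MADV_DONTNEED spanning both fails the end > vma->vm_end check and falls back to the mmap read lock, while a range inside one VMA remains eligible for the per-VMA path.

    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            long pg = sysconf(_SC_PAGESIZE);
            char *p = mmap(NULL, 4 * pg, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (p == MAP_FAILED)
                    return 1;

            /* Split the mapping into two VMAs with differing protections. */
            mprotect(p + 2 * pg, 2 * pg, PROT_READ);

            /* Crosses the VMA boundary: end > vma->vm_end, mmap lock fallback. */
            madvise(p, 4 * pg, MADV_DONTNEED);

            /* Contained within the first VMA: per-VMA read lock path. */
            madvise(p, pg, MADV_DONTNEED);
            return 0;
    }
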
/*
* Walk the vmas in range [start,end), and call the visit function on each one.
* The visit function will get start and end parameters that cover the overlap
*/
static
int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
- unsigned long end, void *arg,
+ unsigned long end, struct madvise_behavior *madv_behavior,
+ void *arg,
int (*visit)(struct vm_area_struct *vma,
struct vm_area_struct **prev, unsigned long start,
unsigned long end, void *arg))
struct vm_area_struct *prev;
unsigned long tmp;
int unmapped_error = 0;
+ int error;
+
+ /*
+ * If the per-VMA read lock is supported, tentatively apply madvise
+ * to the single VMA covering the range, avoiding the VMA walk.
+ */
+ if (madv_behavior && madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK) {
+ vma = try_vma_read_lock(mm, madv_behavior, start, end);
+ if (vma) {
+ prev = vma;
+ error = visit(vma, &prev, start, end, arg);
+ vma_end_read(vma);
+ return error;
+ }
+ }
/*
* If the interval [start,end) covers some unmapped address
prev = vma;
for (;;) {
- int error;
-
/* Still start < end. */
if (!vma)
return -ENOMEM;
if (end == start)
return 0;
- return madvise_walk_vmas(mm, start, end, anon_name,
+ return madvise_walk_vmas(mm, start, end, NULL, anon_name,
madvise_vma_anon_name);
}
#endif /* CONFIG_ANON_VMA_NAME */
-static int madvise_lock(struct mm_struct *mm, int behavior)
+
+/*
+ * Any behaviour which results in changes to the vma->vm_flags needs to
+ * take mmap_lock for writing. Others, which simply traverse vmas, need
+ * only take it for reading. MADV_DONTNEED[_LOCKED] can instead run under
+ * a per-VMA read lock, provided the range lies within a single VMA.
+ */
+static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavior)
{
+ int behavior = madv_behavior->behavior;
+
if (is_memory_failure(behavior))
- return 0;
+ return MADVISE_NO_LOCK;
- if (madvise_need_mmap_write(behavior)) {
+ switch (behavior) {
+ case MADV_REMOVE:
+ case MADV_WILLNEED:
+ case MADV_COLD:
+ case MADV_PAGEOUT:
+ case MADV_FREE:
+ case MADV_POPULATE_READ:
+ case MADV_POPULATE_WRITE:
+ case MADV_COLLAPSE:
+ case MADV_GUARD_INSTALL:
+ case MADV_GUARD_REMOVE:
+ return MADVISE_MMAP_READ_LOCK;
+ case MADV_DONTNEED:
+ case MADV_DONTNEED_LOCKED:
+ return MADVISE_VMA_READ_LOCK;
+ default:
+ return MADVISE_MMAP_WRITE_LOCK;
+ }
+}
+
+static int madvise_lock(struct mm_struct *mm,
+ struct madvise_behavior *madv_behavior)
+{
+ enum madvise_lock_mode lock_mode = get_lock_mode(madv_behavior);
+
+ switch (lock_mode) {
+ case MADVISE_NO_LOCK:
+ break;
+ case MADVISE_MMAP_WRITE_LOCK:
if (mmap_write_lock_killable(mm))
return -EINTR;
- } else {
+ break;
+ case MADVISE_MMAP_READ_LOCK:
mmap_read_lock(mm);
+ break;
+ case MADVISE_VMA_READ_LOCK:
+ /* We will acquire the lock per-VMA in madvise_walk_vmas(). */
+ break;
}
+
+ madv_behavior->lock_mode = lock_mode;
return 0;
}
-static void madvise_unlock(struct mm_struct *mm, int behavior)
+static void madvise_unlock(struct mm_struct *mm,
+ struct madvise_behavior *madv_behavior)
{
- if (is_memory_failure(behavior))
+ switch (madv_behavior->lock_mode) {
+ case MADVISE_NO_LOCK:
return;
-
- if (madvise_need_mmap_write(behavior))
+ case MADVISE_MMAP_WRITE_LOCK:
mmap_write_unlock(mm);
- else
+ break;
+ case MADVISE_MMAP_READ_LOCK:
mmap_read_unlock(mm);
+ break;
+ case MADVISE_VMA_READ_LOCK:
+ /* We will drop the lock per-VMA in madvise_walk_vmas(). */
+ break;
+ }
+
+ madv_behavior->lock_mode = MADVISE_NO_LOCK;
}
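
The case this series targets is threads issuing MADV_DONTNEED concurrently on their own mappings; with MADVISE_VMA_READ_LOCK they no longer serialise on the process-wide mmap_lock. A minimal sketch of such a workload (illustrative only, not from the patch; compile with -pthread):

    #include <pthread.h>
    #include <sys/mman.h>

    #define NR_THREADS 8
    #define CHUNK (1UL << 20)

    static void *worker(void *arg)
    {
            char *buf;

            (void)arg;
            buf = mmap(NULL, CHUNK, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            if (buf == MAP_FAILED)
                    return NULL;

            buf[0] = 1; /* fault the first page in */

            /*
             * The range lies within a single VMA of the calling process, so
             * this takes the MADVISE_VMA_READ_LOCK path rather than mmap_lock.
             */
            madvise(buf, CHUNK, MADV_DONTNEED);
            munmap(buf, CHUNK);
            return NULL;
    }

    int main(void)
    {
            pthread_t tids[NR_THREADS];
            int i;

            for (i = 0; i < NR_THREADS; i++)
                    pthread_create(&tids[i], NULL, worker, NULL);
            for (i = 0; i < NR_THREADS; i++)
                    pthread_join(tids[i], NULL);
            return 0;
    }
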
static bool madvise_batch_tlb_flush(int behavior)
}
}
+/*
+ * untagged_addr_remote() assumes mmap_lock is already held. On
+ * architectures like x86 and RISC-V, tagging is tricky because each
+ * mm may have a different tagging mask. However, we might only hold
+ * the per-VMA lock (currently only local processes are supported),
+ * so untagged_addr() is used instead to avoid tripping the mmap_lock
+ * assertion for local processes.
+ */
+static inline unsigned long get_untagged_addr(struct mm_struct *mm,
+ unsigned long start)
+{
+ return current->mm == mm ? untagged_addr(start) :
+ untagged_addr_remote(mm, start);
+}
+
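
For context on the tagging issue, a hedged userspace sketch (assuming an arm64 kernel with the tagged address ABI available; PR_SET_TAGGED_ADDR_CTRL will fail elsewhere): userspace may carry a tag in the pointer's top byte, which must be stripped before the address is compared against VMA bounds, and that is what get_untagged_addr() does on entry to madvise_do_behavior():

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <sys/prctl.h>

    int main(void)
    {
            void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (p == MAP_FAILED)
                    return 1;

            /* Opt in to the tagged address ABI (arm64 TBI). */
            if (prctl(PR_SET_TAGGED_ADDR_CTRL, PR_TAGGED_ADDR_ENABLE, 0, 0, 0))
                    perror("prctl");

            /* Place an arbitrary tag in the pointer's top byte. */
            void *tagged = (void *)((uintptr_t)p | (0x2aUL << 56));

            /* The kernel strips the tag before the VMA lookup. */
            if (madvise(tagged, 4096, MADV_DONTNEED))
                    perror("madvise");
            return 0;
    }
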
static int madvise_do_behavior(struct mm_struct *mm,
unsigned long start, size_t len_in,
struct madvise_behavior *madv_behavior)
if (is_memory_failure(behavior))
return madvise_inject_error(behavior, start, start + len_in);
- start = untagged_addr_remote(mm, start);
+ start = get_untagged_addr(mm, start);
end = start + PAGE_ALIGN(len_in);
blk_start_plug(&plug);
error = madvise_populate(mm, start, end, behavior);
else
error = madvise_walk_vmas(mm, start, end, madv_behavior,
- madvise_vma_behavior);
+ madv_behavior, madvise_vma_behavior);
blk_finish_plug(&plug);
return error;
}
if (madvise_should_skip(start, len_in, behavior, &error))
return error;
- error = madvise_lock(mm, behavior);
+ error = madvise_lock(mm, &madv_behavior);
if (error)
return error;
madvise_init_tlb(&madv_behavior, mm);
error = madvise_do_behavior(mm, start, len_in, &madv_behavior);
madvise_finish_tlb(&madv_behavior);
- madvise_unlock(mm, behavior);
+ madvise_unlock(mm, &madv_behavior);
return error;
}
total_len = iov_iter_count(iter);
- ret = madvise_lock(mm, behavior);
+ ret = madvise_lock(mm, &madv_behavior);
if (ret)
return ret;
madvise_init_tlb(&madv_behavior, mm);
/* Drop and reacquire lock to unwind race. */
madvise_finish_tlb(&madv_behavior);
- madvise_unlock(mm, behavior);
- ret = madvise_lock(mm, behavior);
+ madvise_unlock(mm, &madv_behavior);
+ ret = madvise_lock(mm, &madv_behavior);
if (ret)
goto out;
madvise_init_tlb(&madv_behavior, mm);
iov_iter_advance(iter, iter_iov_len(iter));
}
madvise_finish_tlb(&madv_behavior);
- madvise_unlock(mm, behavior);
+ madvise_unlock(mm, &madv_behavior);
out:
ret = (total_len - iov_iter_count(iter)) ? : ret;