--- /dev/null
+From f8c4e35d716b886d05595706af1be757fede502d Mon Sep 17 00:00:00 2001
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Thu, 22 Jun 2023 21:24:30 +0200
+Subject: arm/mm: Convert to using lock_mm_and_find_vma()
+
+From: Ben Hutchings <ben@decadent.org.uk>
+
+commit 8b35ca3e45e35a26a21427f35d4093606e93ad0a upstream.
+
+arm has an additional check for address < FIRST_USER_ADDRESS before
+expanding the stack. Since FIRST_USER_ADDRESS is defined everywhere
+(generally as 0), move that check to the generic expand_downwards().
+
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Samuel Mendoza-Jonas <samjonas@amazon.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm/Kconfig | 1
+ arch/arm/mm/fault.c | 63 +++++++++++-----------------------------------------
+ mm/mmap.c | 2 -
+ 3 files changed, 16 insertions(+), 50 deletions(-)
+
+--- a/arch/arm/Kconfig
++++ b/arch/arm/Kconfig
+@@ -122,6 +122,7 @@ config ARM
+ select HAVE_UID16
+ select HAVE_VIRT_CPU_ACCOUNTING_GEN
+ select IRQ_FORCED_THREADING
++ select LOCK_MM_AND_FIND_VMA
+ select MODULES_USE_ELF_REL
+ select NEED_DMA_MAP_STATE
+ select OF_EARLY_FLATTREE if OF
+--- a/arch/arm/mm/fault.c
++++ b/arch/arm/mm/fault.c
+@@ -231,37 +231,11 @@ static inline bool is_permission_fault(u
+ return false;
+ }
+
+-static vm_fault_t __kprobes
+-__do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int flags,
+- unsigned long vma_flags, struct pt_regs *regs)
+-{
+- struct vm_area_struct *vma = find_vma(mm, addr);
+- if (unlikely(!vma))
+- return VM_FAULT_BADMAP;
+-
+- if (unlikely(vma->vm_start > addr)) {
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- return VM_FAULT_BADMAP;
+- if (addr < FIRST_USER_ADDRESS)
+- return VM_FAULT_BADMAP;
+- if (expand_stack(vma, addr))
+- return VM_FAULT_BADMAP;
+- }
+-
+- /*
+- * ok, we have a good vm_area for this memory access, check the
+- * permissions on the VMA allow for the fault which occurred.
+- */
+- if (!(vma->vm_flags & vma_flags))
+- return VM_FAULT_BADACCESS;
+-
+- return handle_mm_fault(vma, addr & PAGE_MASK, flags, regs);
+-}
+-
+ static int __kprobes
+ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
+ {
+ struct mm_struct *mm = current->mm;
++ struct vm_area_struct *vma;
+ int sig, code;
+ vm_fault_t fault;
+ unsigned int flags = FAULT_FLAG_DEFAULT;
+@@ -300,31 +274,21 @@ do_page_fault(unsigned long addr, unsign
+
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
+
+- /*
+- * As per x86, we may deadlock here. However, since the kernel only
+- * validly references user space from well defined areas of the code,
+- * we can bug out early if this is from code which shouldn't.
+- */
+- if (!mmap_read_trylock(mm)) {
+- if (!user_mode(regs) && !search_exception_tables(regs->ARM_pc))
+- goto no_context;
+ retry:
+- mmap_read_lock(mm);
+- } else {
+- /*
+- * The above down_read_trylock() might have succeeded in
+- * which case, we'll have missed the might_sleep() from
+- * down_read()
+- */
+- might_sleep();
+-#ifdef CONFIG_DEBUG_VM
+- if (!user_mode(regs) &&
+- !search_exception_tables(regs->ARM_pc))
+- goto no_context;
+-#endif
++ vma = lock_mm_and_find_vma(mm, addr, regs);
++ if (unlikely(!vma)) {
++ fault = VM_FAULT_BADMAP;
++ goto bad_area;
+ }
+
+- fault = __do_page_fault(mm, addr, flags, vm_flags, regs);
++ /*
++ * ok, we have a good vm_area for this memory access, check the
++ * permissions on the VMA allow for the fault which occurred.
++ */
++ if (!(vma->vm_flags & vm_flags))
++ fault = VM_FAULT_BADACCESS;
++ else
++ fault = handle_mm_fault(vma, addr & PAGE_MASK, flags, regs);
+
+ /* If we need to retry but a fatal signal is pending, handle the
+ * signal first. We do not need to release the mmap_lock because
+@@ -355,6 +319,7 @@ retry:
+ if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS))))
+ return 0;
+
++bad_area:
+ /*
+ * If we are in kernel mode at this point, we
+ * have no context to handle this fault with.
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -2045,7 +2045,7 @@ int expand_downwards(struct vm_area_stru
+ int error = 0;
+
+ address &= PAGE_MASK;
+- if (address < mmap_min_addr)
++ if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
+ return -EPERM;
+
+ /* Enforce stack_guard_gap */
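
The arm hunks above show the conversion pattern that the rest of this series repeats for other architectures: the open-coded trylock/exception-table/find_vma()/expand_stack() sequence in the fault handler collapses into a single lock_mm_and_find_vma() call, which either returns a usable vma with the mmap lock held for reading or returns NULL with the lock already dropped. Condensed from the fault.c hunk above (kernel context assumed, not a standalone compilable unit), the converted path is roughly:

	retry:
		vma = lock_mm_and_find_vma(mm, addr, regs);
		if (unlikely(!vma)) {
			fault = VM_FAULT_BADMAP;
			goto bad_area;	/* reached with mmap_lock already released */
		}

		/* permission check against the vma, as before */
		if (!(vma->vm_flags & vm_flags))
			fault = VM_FAULT_BADACCESS;
		else
			fault = handle_mm_fault(vma, addr & PAGE_MASK, flags, regs);

The error label has to be one that does not call mmap_read_unlock(), since the helper already released the lock on failure; that is why the new bad_area label is placed after the existing unlock in this patch.
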
--- /dev/null
+From a45a8a9f70fe70c7c4479b9256b1eb1b5774df64 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Thu, 15 Jun 2023 17:11:44 -0700
+Subject: arm64/mm: Convert to using lock_mm_and_find_vma()
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit ae870a68b5d13d67cf4f18d47bb01ee3fee40acb upstream.
+
+This converts arm64 to use the new page fault helper. It was very
+straightforward, but still needed a fix for the "obvious" conversion I
+initially did. Thanks to Suren for the fix and testing.
+
+Fixed-and-tested-by: Suren Baghdasaryan <surenb@google.com>
+Unnecessary-code-removal-by: Liam R. Howlett <Liam.Howlett@oracle.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+[6.1: Ignore CONFIG_PER_VMA_LOCK context]
+Signed-off-by: Samuel Mendoza-Jonas <samjonas@amazon.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/Kconfig | 1 +
+ arch/arm64/mm/fault.c | 46 +++++++++-------------------------------------
+ 2 files changed, 10 insertions(+), 37 deletions(-)
+
+--- a/arch/arm64/Kconfig
++++ b/arch/arm64/Kconfig
+@@ -211,6 +211,7 @@ config ARM64
+ select IRQ_DOMAIN
+ select IRQ_FORCED_THREADING
+ select KASAN_VMALLOC if KASAN
++ select LOCK_MM_AND_FIND_VMA
+ select MODULES_USE_ELF_RELA
+ select NEED_DMA_MAP_STATE
+ select NEED_SG_DMA_LENGTH
+--- a/arch/arm64/mm/fault.c
++++ b/arch/arm64/mm/fault.c
+@@ -483,27 +483,14 @@ static void do_bad_area(unsigned long fa
+ #define VM_FAULT_BADMAP ((__force vm_fault_t)0x010000)
+ #define VM_FAULT_BADACCESS ((__force vm_fault_t)0x020000)
+
+-static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr,
++static vm_fault_t __do_page_fault(struct mm_struct *mm,
++ struct vm_area_struct *vma, unsigned long addr,
+ unsigned int mm_flags, unsigned long vm_flags,
+ struct pt_regs *regs)
+ {
+- struct vm_area_struct *vma = find_vma(mm, addr);
+-
+- if (unlikely(!vma))
+- return VM_FAULT_BADMAP;
+-
+ /*
+ * Ok, we have a good vm_area for this memory access, so we can handle
+ * it.
+- */
+- if (unlikely(vma->vm_start > addr)) {
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- return VM_FAULT_BADMAP;
+- if (expand_stack(vma, addr))
+- return VM_FAULT_BADMAP;
+- }
+-
+- /*
+ * Check that the permissions on the VMA allow for the fault which
+ * occurred.
+ */
+@@ -535,6 +522,7 @@ static int __kprobes do_page_fault(unsig
+ unsigned long vm_flags;
+ unsigned int mm_flags = FAULT_FLAG_DEFAULT;
+ unsigned long addr = untagged_addr(far);
++ struct vm_area_struct *vma;
+
+ if (kprobe_page_fault(regs, esr))
+ return 0;
+@@ -585,31 +573,14 @@ static int __kprobes do_page_fault(unsig
+
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
+
+- /*
+- * As per x86, we may deadlock here. However, since the kernel only
+- * validly references user space from well defined areas of the code,
+- * we can bug out early if this is from code which shouldn't.
+- */
+- if (!mmap_read_trylock(mm)) {
+- if (!user_mode(regs) && !search_exception_tables(regs->pc))
+- goto no_context;
+ retry:
+- mmap_read_lock(mm);
+- } else {
+- /*
+- * The above mmap_read_trylock() might have succeeded in which
+- * case, we'll have missed the might_sleep() from down_read().
+- */
+- might_sleep();
+-#ifdef CONFIG_DEBUG_VM
+- if (!user_mode(regs) && !search_exception_tables(regs->pc)) {
+- mmap_read_unlock(mm);
+- goto no_context;
+- }
+-#endif
++ vma = lock_mm_and_find_vma(mm, addr, regs);
++ if (unlikely(!vma)) {
++ fault = VM_FAULT_BADMAP;
++ goto done;
+ }
+
+- fault = __do_page_fault(mm, addr, mm_flags, vm_flags, regs);
++ fault = __do_page_fault(mm, vma, addr, mm_flags, vm_flags, regs);
+
+ /* Quick path to respond to signals */
+ if (fault_signal_pending(fault, regs)) {
+@@ -628,6 +599,7 @@ retry:
+ }
+ mmap_read_unlock(mm);
+
++done:
+ /*
+ * Handle the "normal" (no error) case first.
+ */
--- /dev/null
+From 9e1f3d01ba1f6ffa0ad902d594b1b44619568b74 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Mon, 19 Jun 2023 11:34:15 -0700
+Subject: execve: expand new process stack manually ahead of time
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit f313c51d26aa87e69633c9b46efb37a930faca71 upstream.
+
+This is a small step towards a model where GUP itself would not expand
+the stack, and any user that needs GUP to not look up existing mappings,
+but actually expand on them, would have to do so manually before-hand,
+and with the mm lock held for writing.
+
+It turns out that execve() already did almost exactly that, except it
+didn't take the mm lock at all (it's single-threaded so no locking
+technically needed, but it could cause lockdep errors). And it only did
+it for the CONFIG_STACK_GROWSUP case, since in that case GUP has
+obviously never expanded the stack downwards.
+
+So just make that CONFIG_STACK_GROWSUP case do the right thing with
+locking, and enable it generally. This will eventually help GUP, and in
+the meantime avoids a special case and the lockdep issue.
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+[6.1 Minor context from still having FOLL_FORCE flags set]
+Signed-off-by: Samuel Mendoza-Jonas <samjonas@amazon.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/exec.c | 37 +++++++++++++++++++++----------------
+ 1 file changed, 21 insertions(+), 16 deletions(-)
+
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -198,34 +198,39 @@ static struct page *get_arg_page(struct
+ int write)
+ {
+ struct page *page;
++ struct vm_area_struct *vma = bprm->vma;
++ struct mm_struct *mm = bprm->mm;
+ int ret;
+- unsigned int gup_flags = FOLL_FORCE;
+
+-#ifdef CONFIG_STACK_GROWSUP
+- if (write) {
+- /* We claim to hold the lock - nobody to race with */
+- ret = expand_downwards(bprm->vma, pos, true);
+- if (ret < 0)
++ /*
++ * Avoid relying on expanding the stack down in GUP (which
++ * does not work for STACK_GROWSUP anyway), and just do it
++ * by hand ahead of time.
++ */
++ if (write && pos < vma->vm_start) {
++ mmap_write_lock(mm);
++ ret = expand_downwards(vma, pos, true);
++ if (unlikely(ret < 0)) {
++ mmap_write_unlock(mm);
+ return NULL;
+- }
+-#endif
+-
+- if (write)
+- gup_flags |= FOLL_WRITE;
++ }
++ mmap_write_downgrade(mm);
++ } else
++ mmap_read_lock(mm);
+
+ /*
+ * We are doing an exec(). 'current' is the process
+- * doing the exec and bprm->mm is the new process's mm.
++ * doing the exec and 'mm' is the new process's mm.
+ */
+- mmap_read_lock(bprm->mm);
+- ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags,
++ ret = get_user_pages_remote(mm, pos, 1,
++ write ? FOLL_WRITE : 0,
+ &page, NULL, NULL);
+- mmap_read_unlock(bprm->mm);
++ mmap_read_unlock(mm);
+ if (ret <= 0)
+ return NULL;
+
+ if (write)
+- acct_arg_size(bprm, vma_pages(bprm->vma));
++ acct_arg_size(bprm, vma_pages(vma));
+
+ return page;
+ }
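
The execve change above pre-expands the new process stack by hand instead of relying on get_user_pages() to do it: when a write below vma->vm_start is requested, get_arg_page() takes the mmap lock for writing, expands the vma downwards, and then downgrades to a read lock before the GUP call. In outline (condensed from the hunk above; the extra 'true' argument to expand_downwards() is the transitional write_locked flag, dropped again by the "mm: always expand the stack with the mmap write lock held" patch in this series):

	if (write && pos < vma->vm_start) {
		mmap_write_lock(mm);
		if (expand_downwards(vma, pos, true) < 0) {
			mmap_write_unlock(mm);
			return NULL;
		}
		mmap_write_downgrade(mm);	/* keep it read-locked for GUP */
	} else
		mmap_read_lock(mm);

	ret = get_user_pages_remote(mm, pos, 1, write ? FOLL_WRITE : 0,
				    &page, NULL, NULL);
	mmap_read_unlock(mm);
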
--- /dev/null
+From f9ced2ac8976a6560505cc4bf14ffdf1c076e475 Mon Sep 17 00:00:00 2001
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Thu, 22 Jun 2023 18:47:40 +0200
+Subject: mips/mm: Convert to using lock_mm_and_find_vma()
+
+From: Ben Hutchings <ben@decadent.org.uk>
+
+commit 4bce37a68ff884e821a02a731897a8119e0c37b7 upstream.
+
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Samuel Mendoza-Jonas <samjonas@amazon.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/mips/Kconfig | 1 +
+ arch/mips/mm/fault.c | 12 ++----------
+ 2 files changed, 3 insertions(+), 10 deletions(-)
+
+--- a/arch/mips/Kconfig
++++ b/arch/mips/Kconfig
+@@ -94,6 +94,7 @@ config MIPS
+ select HAVE_VIRT_CPU_ACCOUNTING_GEN if 64BIT || !SMP
+ select IRQ_FORCED_THREADING
+ select ISA if EISA
++ select LOCK_MM_AND_FIND_VMA
+ select MODULES_USE_ELF_REL if MODULES
+ select MODULES_USE_ELF_RELA if MODULES && 64BIT
+ select PERF_USE_VMALLOC
+--- a/arch/mips/mm/fault.c
++++ b/arch/mips/mm/fault.c
+@@ -99,21 +99,13 @@ static void __do_page_fault(struct pt_re
+
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+- mmap_read_lock(mm);
+- vma = find_vma(mm, address);
++ vma = lock_mm_and_find_vma(mm, address, regs);
+ if (!vma)
+- goto bad_area;
+- if (vma->vm_start <= address)
+- goto good_area;
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto bad_area;
+- if (expand_stack(vma, address))
+- goto bad_area;
++ goto bad_area_nosemaphore;
+ /*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+-good_area:
+ si_code = SEGV_ACCERR;
+
+ if (write) {
--- /dev/null
+From 2956a81444985ffb601685f3a796e79470b56353 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Sat, 24 Jun 2023 13:45:51 -0700
+Subject: mm: always expand the stack with the mmap write lock held
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit 8d7071af890768438c14db6172cc8f9f4d04e184 upstream
+
+This finishes the job of always holding the mmap write lock when
+extending the user stack vma, and removes the 'write_locked' argument
+from the vm helper functions again.
+
+For some cases, we just avoid expanding the stack at all: drivers and
+page pinning really shouldn't be extending any stacks. Let's see if any
+strange users really wanted that.
+
+It's worth noting that architectures that weren't converted to the new
+lock_mm_and_find_vma() helper function are left using the legacy
+"expand_stack()" function, but it has been changed to drop the mmap_lock
+and take it for writing while expanding the vma. This makes it fairly
+straightforward to convert the remaining architectures.
+
+As a result of dropping and re-taking the lock, the calling conventions
+for this function have also changed, since the old vma may no longer be
+valid. So it will now return the new vma if successful, and NULL - and
+the lock dropped - if the area could not be extended.
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+[6.1: Patch drivers/iommu/io-pgfault.c instead]
+Signed-off-by: Samuel Mendoza-Jonas <samjonas@amazon.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/ia64/mm/fault.c | 36 ++----------
+ arch/m68k/mm/fault.c | 9 ++-
+ arch/microblaze/mm/fault.c | 5 +
+ arch/openrisc/mm/fault.c | 5 +
+ arch/parisc/mm/fault.c | 23 +++-----
+ arch/s390/mm/fault.c | 5 +
+ arch/sparc/mm/fault_64.c | 8 +-
+ arch/um/kernel/trap.c | 11 ++-
+ drivers/iommu/amd/iommu_v2.c | 4 -
+ drivers/iommu/io-pgfault.c | 2
+ fs/binfmt_elf.c | 2
+ fs/exec.c | 4 -
+ include/linux/mm.h | 16 +----
+ mm/gup.c | 6 +-
+ mm/memory.c | 10 +++
+ mm/mmap.c | 121 ++++++++++++++++++++++++++++++++++---------
+ mm/nommu.c | 18 ++----
+ 17 files changed, 169 insertions(+), 116 deletions(-)
+
+--- a/arch/ia64/mm/fault.c
++++ b/arch/ia64/mm/fault.c
+@@ -110,10 +110,12 @@ retry:
+ * register backing store that needs to expand upwards, in
+ * this case vma will be null, but prev_vma will ne non-null
+ */
+- if (( !vma && prev_vma ) || (address < vma->vm_start) )
+- goto check_expansion;
++ if (( !vma && prev_vma ) || (address < vma->vm_start) ) {
++ vma = expand_stack(mm, address);
++ if (!vma)
++ goto bad_area_nosemaphore;
++ }
+
+- good_area:
+ code = SEGV_ACCERR;
+
+ /* OK, we've got a good vm_area for this memory area. Check the access permissions: */
+@@ -174,35 +176,9 @@ retry:
+ mmap_read_unlock(mm);
+ return;
+
+- check_expansion:
+- if (!(prev_vma && (prev_vma->vm_flags & VM_GROWSUP) && (address == prev_vma->vm_end))) {
+- if (!vma)
+- goto bad_area;
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto bad_area;
+- if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start)
+- || REGION_OFFSET(address) >= RGN_MAP_LIMIT)
+- goto bad_area;
+- if (expand_stack(vma, address))
+- goto bad_area;
+- } else {
+- vma = prev_vma;
+- if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start)
+- || REGION_OFFSET(address) >= RGN_MAP_LIMIT)
+- goto bad_area;
+- /*
+- * Since the register backing store is accessed sequentially,
+- * we disallow growing it by more than a page at a time.
+- */
+- if (address > vma->vm_end + PAGE_SIZE - sizeof(long))
+- goto bad_area;
+- if (expand_upwards(vma, address))
+- goto bad_area;
+- }
+- goto good_area;
+-
+ bad_area:
+ mmap_read_unlock(mm);
++ bad_area_nosemaphore:
+ if ((isr & IA64_ISR_SP)
+ || ((isr & IA64_ISR_NA) && (isr & IA64_ISR_CODE_MASK) == IA64_ISR_CODE_LFETCH))
+ {
+--- a/arch/m68k/mm/fault.c
++++ b/arch/m68k/mm/fault.c
+@@ -105,8 +105,9 @@ retry:
+ if (address + 256 < rdusp())
+ goto map_err;
+ }
+- if (expand_stack(vma, address))
+- goto map_err;
++ vma = expand_stack(mm, address);
++ if (!vma)
++ goto map_err_nosemaphore;
+
+ /*
+ * Ok, we have a good vm_area for this memory access, so
+@@ -193,10 +194,12 @@ bus_err:
+ goto send_sig;
+
+ map_err:
++ mmap_read_unlock(mm);
++map_err_nosemaphore:
+ current->thread.signo = SIGSEGV;
+ current->thread.code = SEGV_MAPERR;
+ current->thread.faddr = address;
+- goto send_sig;
++ return send_fault_sig(regs);
+
+ acc_err:
+ current->thread.signo = SIGSEGV;
+--- a/arch/microblaze/mm/fault.c
++++ b/arch/microblaze/mm/fault.c
+@@ -192,8 +192,9 @@ retry:
+ && (kernel_mode(regs) || !store_updates_sp(regs)))
+ goto bad_area;
+ }
+- if (expand_stack(vma, address))
+- goto bad_area;
++ vma = expand_stack(mm, address);
++ if (!vma)
++ goto bad_area_nosemaphore;
+
+ good_area:
+ code = SEGV_ACCERR;
+--- a/arch/openrisc/mm/fault.c
++++ b/arch/openrisc/mm/fault.c
+@@ -127,8 +127,9 @@ retry:
+ if (address + PAGE_SIZE < regs->sp)
+ goto bad_area;
+ }
+- if (expand_stack(vma, address))
+- goto bad_area;
++ vma = expand_stack(mm, address);
++ if (!vma)
++ goto bad_area_nosemaphore;
+
+ /*
+ * Ok, we have a good vm_area for this memory access, so
+--- a/arch/parisc/mm/fault.c
++++ b/arch/parisc/mm/fault.c
+@@ -288,15 +288,19 @@ void do_page_fault(struct pt_regs *regs,
+ retry:
+ mmap_read_lock(mm);
+ vma = find_vma_prev(mm, address, &prev_vma);
+- if (!vma || address < vma->vm_start)
+- goto check_expansion;
++ if (!vma || address < vma->vm_start) {
++ if (!prev_vma || !(prev_vma->vm_flags & VM_GROWSUP))
++ goto bad_area;
++ vma = expand_stack(mm, address);
++ if (!vma)
++ goto bad_area_nosemaphore;
++ }
++
+ /*
+ * Ok, we have a good vm_area for this memory access. We still need to
+ * check the access permissions.
+ */
+
+-good_area:
+-
+ if ((vma->vm_flags & acc_type) != acc_type)
+ goto bad_area;
+
+@@ -342,17 +346,13 @@ good_area:
+ mmap_read_unlock(mm);
+ return;
+
+-check_expansion:
+- vma = prev_vma;
+- if (vma && (expand_stack(vma, address) == 0))
+- goto good_area;
+-
+ /*
+ * Something tried to access memory that isn't in our memory map..
+ */
+ bad_area:
+ mmap_read_unlock(mm);
+
++bad_area_nosemaphore:
+ if (user_mode(regs)) {
+ int signo, si_code;
+
+@@ -444,7 +444,7 @@ handle_nadtlb_fault(struct pt_regs *regs
+ {
+ unsigned long insn = regs->iir;
+ int breg, treg, xreg, val = 0;
+- struct vm_area_struct *vma, *prev_vma;
++ struct vm_area_struct *vma;
+ struct task_struct *tsk;
+ struct mm_struct *mm;
+ unsigned long address;
+@@ -480,7 +480,7 @@ handle_nadtlb_fault(struct pt_regs *regs
+ /* Search for VMA */
+ address = regs->ior;
+ mmap_read_lock(mm);
+- vma = find_vma_prev(mm, address, &prev_vma);
++ vma = vma_lookup(mm, address);
+ mmap_read_unlock(mm);
+
+ /*
+@@ -489,7 +489,6 @@ handle_nadtlb_fault(struct pt_regs *regs
+ */
+ acc_type = (insn & 0x40) ? VM_WRITE : VM_READ;
+ if (vma
+- && address >= vma->vm_start
+ && (vma->vm_flags & acc_type) == acc_type)
+ val = 1;
+ }
+--- a/arch/s390/mm/fault.c
++++ b/arch/s390/mm/fault.c
+@@ -429,8 +429,9 @@ retry:
+ if (unlikely(vma->vm_start > address)) {
+ if (!(vma->vm_flags & VM_GROWSDOWN))
+ goto out_up;
+- if (expand_stack(vma, address))
+- goto out_up;
++ vma = expand_stack(mm, address);
++ if (!vma)
++ goto out;
+ }
+
+ /*
+--- a/arch/sparc/mm/fault_64.c
++++ b/arch/sparc/mm/fault_64.c
+@@ -383,8 +383,9 @@ continue_fault:
+ goto bad_area;
+ }
+ }
+- if (expand_stack(vma, address))
+- goto bad_area;
++ vma = expand_stack(mm, address);
++ if (!vma)
++ goto bad_area_nosemaphore;
+ /*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+@@ -482,8 +483,9 @@ exit_exception:
+ * Fix it, but check if it's kernel or user first..
+ */
+ bad_area:
+- insn = get_fault_insn(regs, insn);
+ mmap_read_unlock(mm);
++bad_area_nosemaphore:
++ insn = get_fault_insn(regs, insn);
+
+ handle_kernel_fault:
+ do_kernel_fault(regs, si_code, fault_code, insn, address);
+--- a/arch/um/kernel/trap.c
++++ b/arch/um/kernel/trap.c
+@@ -47,14 +47,15 @@ retry:
+ vma = find_vma(mm, address);
+ if (!vma)
+ goto out;
+- else if (vma->vm_start <= address)
++ if (vma->vm_start <= address)
+ goto good_area;
+- else if (!(vma->vm_flags & VM_GROWSDOWN))
++ if (!(vma->vm_flags & VM_GROWSDOWN))
+ goto out;
+- else if (is_user && !ARCH_IS_STACKGROW(address))
+- goto out;
+- else if (expand_stack(vma, address))
++ if (is_user && !ARCH_IS_STACKGROW(address))
+ goto out;
++ vma = expand_stack(mm, address);
++ if (!vma)
++ goto out_nosemaphore;
+
+ good_area:
+ *code_out = SEGV_ACCERR;
+--- a/drivers/iommu/amd/iommu_v2.c
++++ b/drivers/iommu/amd/iommu_v2.c
+@@ -485,8 +485,8 @@ static void do_fault(struct work_struct
+ flags |= FAULT_FLAG_REMOTE;
+
+ mmap_read_lock(mm);
+- vma = find_extend_vma(mm, address);
+- if (!vma || address < vma->vm_start)
++ vma = vma_lookup(mm, address);
++ if (!vma)
+ /* failed to get a vma in the right range */
+ goto out;
+
+--- a/drivers/iommu/io-pgfault.c
++++ b/drivers/iommu/io-pgfault.c
+@@ -89,7 +89,7 @@ iopf_handle_single(struct iopf_fault *io
+
+ mmap_read_lock(mm);
+
+- vma = find_extend_vma(mm, prm->addr);
++ vma = vma_lookup(mm, prm->addr);
+ if (!vma)
+ /* Unmapped area */
+ goto out_put_mm;
+--- a/fs/binfmt_elf.c
++++ b/fs/binfmt_elf.c
+@@ -317,7 +317,7 @@ create_elf_tables(struct linux_binprm *b
+ */
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+- vma = find_extend_vma_locked(mm, bprm->p, true);
++ vma = find_extend_vma_locked(mm, bprm->p);
+ mmap_write_unlock(mm);
+ if (!vma)
+ return -EFAULT;
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -209,7 +209,7 @@ static struct page *get_arg_page(struct
+ */
+ if (write && pos < vma->vm_start) {
+ mmap_write_lock(mm);
+- ret = expand_downwards(vma, pos, true);
++ ret = expand_downwards(vma, pos);
+ if (unlikely(ret < 0)) {
+ mmap_write_unlock(mm);
+ return NULL;
+@@ -860,7 +860,7 @@ int setup_arg_pages(struct linux_binprm
+ stack_base = vma->vm_start - stack_expand;
+ #endif
+ current->mm->start_stack = bprm->p;
+- ret = expand_stack_locked(vma, stack_base, true);
++ ret = expand_stack_locked(vma, stack_base);
+ if (ret)
+ ret = -EFAULT;
+
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -2810,18 +2810,11 @@ extern vm_fault_t filemap_page_mkwrite(s
+
+ extern unsigned long stack_guard_gap;
+ /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
+-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
+- bool write_locked);
+-#define expand_stack(vma,addr) expand_stack_locked(vma,addr,false)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address);
++struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr);
+
+ /* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */
+-int expand_downwards(struct vm_area_struct *vma, unsigned long address,
+- bool write_locked);
+-#if VM_GROWSUP
+-extern int expand_upwards(struct vm_area_struct *vma, unsigned long address);
+-#else
+- #define expand_upwards(vma, address) (0)
+-#endif
++int expand_downwards(struct vm_area_struct *vma, unsigned long address);
+
+ /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
+ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
+@@ -2916,9 +2909,8 @@ unsigned long change_prot_numa(struct vm
+ unsigned long start, unsigned long end);
+ #endif
+
+-struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
+ struct vm_area_struct *find_extend_vma_locked(struct mm_struct *,
+- unsigned long addr, bool write_locked);
++ unsigned long addr);
+ int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t);
+ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+--- a/mm/gup.c
++++ b/mm/gup.c
+@@ -1182,7 +1182,7 @@ static long __get_user_pages(struct mm_s
+
+ /* first iteration or cross vma bound */
+ if (!vma || start >= vma->vm_end) {
+- vma = find_extend_vma(mm, start);
++ vma = vma_lookup(mm, start);
+ if (!vma && in_gate_area(mm, start)) {
+ ret = get_gate_page(mm, start & PAGE_MASK,
+ gup_flags, &vma,
+@@ -1351,8 +1351,8 @@ int fixup_user_fault(struct mm_struct *m
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+
+ retry:
+- vma = find_extend_vma(mm, address);
+- if (!vma || address < vma->vm_start)
++ vma = vma_lookup(mm, address);
++ if (!vma)
+ return -EFAULT;
+
+ if (!vma_permits_fault(vma, fault_flags))
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -5352,7 +5352,7 @@ struct vm_area_struct *lock_mm_and_find_
+ goto fail;
+ }
+
+- if (expand_stack_locked(vma, addr, true))
++ if (expand_stack_locked(vma, addr))
+ goto fail;
+
+ success:
+@@ -5636,6 +5636,14 @@ int __access_remote_vm(struct mm_struct
+ if (mmap_read_lock_killable(mm))
+ return 0;
+
++ /* We might need to expand the stack to access it */
++ vma = vma_lookup(mm, addr);
++ if (!vma) {
++ vma = expand_stack(mm, addr);
++ if (!vma)
++ return 0;
++ }
++
+ /* ignore errors, just check how much was successfully transferred */
+ while (len) {
+ int bytes, ret, offset;
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -1945,8 +1945,7 @@ static int acct_stack_growth(struct vm_a
+ * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
+ * vma is the last one with address > vma->vm_end. Have to extend vma.
+ */
+-int expand_upwards(struct vm_area_struct *vma, unsigned long address,
+- bool write_locked)
++static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
+ {
+ struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *next;
+@@ -1970,8 +1969,6 @@ int expand_upwards(struct vm_area_struct
+ if (gap_addr < address || gap_addr > TASK_SIZE)
+ gap_addr = TASK_SIZE;
+
+- if (!write_locked)
+- return -EAGAIN;
+ next = find_vma_intersection(mm, vma->vm_end, gap_addr);
+ if (next && vma_is_accessible(next)) {
+ if (!(next->vm_flags & VM_GROWSUP))
+@@ -2039,15 +2036,18 @@ int expand_upwards(struct vm_area_struct
+
+ /*
+ * vma is the first one with address < vma->vm_start. Have to extend vma.
++ * mmap_lock held for writing.
+ */
+-int expand_downwards(struct vm_area_struct *vma, unsigned long address,
+- bool write_locked)
++int expand_downwards(struct vm_area_struct *vma, unsigned long address)
+ {
+ struct mm_struct *mm = vma->vm_mm;
+ MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start);
+ struct vm_area_struct *prev;
+ int error = 0;
+
++ if (!(vma->vm_flags & VM_GROWSDOWN))
++ return -EFAULT;
++
+ address &= PAGE_MASK;
+ if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
+ return -EPERM;
+@@ -2060,8 +2060,6 @@ int expand_downwards(struct vm_area_stru
+ vma_is_accessible(prev) &&
+ (address - prev->vm_end < stack_guard_gap))
+ return -ENOMEM;
+- if (!write_locked && (prev->vm_end == address))
+- return -EAGAIN;
+ }
+
+ if (mas_preallocate(&mas, vma, GFP_KERNEL))
+@@ -2139,14 +2137,12 @@ static int __init cmdline_parse_stack_gu
+ __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
+
+ #ifdef CONFIG_STACK_GROWSUP
+-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
+- bool write_locked)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
+ {
+- return expand_upwards(vma, address, write_locked);
++ return expand_upwards(vma, address);
+ }
+
+-struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm,
+- unsigned long addr, bool write_locked)
++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
+ {
+ struct vm_area_struct *vma, *prev;
+
+@@ -2156,23 +2152,21 @@ struct vm_area_struct *find_extend_vma_l
+ return vma;
+ if (!prev)
+ return NULL;
+- if (expand_stack_locked(prev, addr, write_locked))
++ if (expand_stack_locked(prev, addr))
+ return NULL;
+ if (prev->vm_flags & VM_LOCKED)
+ populate_vma_page_range(prev, addr, prev->vm_end, NULL);
+ return prev;
+ }
+ #else
+-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
+- bool write_locked)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
+ {
+ if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
+ return -EINVAL;
+- return expand_downwards(vma, address, write_locked);
++ return expand_downwards(vma, address);
+ }
+
+-struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm,
+- unsigned long addr, bool write_locked)
++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
+ {
+ struct vm_area_struct *vma;
+ unsigned long start;
+@@ -2184,7 +2178,7 @@ struct vm_area_struct *find_extend_vma_l
+ if (vma->vm_start <= addr)
+ return vma;
+ start = vma->vm_start;
+- if (expand_stack_locked(vma, addr, write_locked))
++ if (expand_stack_locked(vma, addr))
+ return NULL;
+ if (vma->vm_flags & VM_LOCKED)
+ populate_vma_page_range(vma, addr, start, NULL);
+@@ -2192,12 +2186,91 @@ struct vm_area_struct *find_extend_vma_l
+ }
+ #endif
+
+-struct vm_area_struct *find_extend_vma(struct mm_struct *mm,
+- unsigned long addr)
++/*
++ * IA64 has some horrid mapping rules: it can expand both up and down,
++ * but with various special rules.
++ *
++ * We'll get rid of this architecture eventually, so the ugliness is
++ * temporary.
++ */
++#ifdef CONFIG_IA64
++static inline bool vma_expand_ok(struct vm_area_struct *vma, unsigned long addr)
++{
++ return REGION_NUMBER(addr) == REGION_NUMBER(vma->vm_start) &&
++ REGION_OFFSET(addr) < RGN_MAP_LIMIT;
++}
++
++/*
++ * IA64 stacks grow down, but there's a special register backing store
++ * that can grow up. Only sequentially, though, so the new address must
++ * match vm_end.
++ */
++static inline int vma_expand_up(struct vm_area_struct *vma, unsigned long addr)
++{
++ if (!vma_expand_ok(vma, addr))
++ return -EFAULT;
++ if (vma->vm_end != (addr & PAGE_MASK))
++ return -EFAULT;
++ return expand_upwards(vma, addr);
++}
++
++static inline bool vma_expand_down(struct vm_area_struct *vma, unsigned long addr)
++{
++ if (!vma_expand_ok(vma, addr))
++ return -EFAULT;
++ return expand_downwards(vma, addr);
++}
++
++#elif defined(CONFIG_STACK_GROWSUP)
++
++#define vma_expand_up(vma,addr) expand_upwards(vma, addr)
++#define vma_expand_down(vma, addr) (-EFAULT)
++
++#else
++
++#define vma_expand_up(vma,addr) (-EFAULT)
++#define vma_expand_down(vma, addr) expand_downwards(vma, addr)
++
++#endif
++
++/*
++ * expand_stack(): legacy interface for page faulting. Don't use unless
++ * you have to.
++ *
++ * This is called with the mm locked for reading, drops the lock, takes
++ * the lock for writing, tries to look up a vma again, expands it if
++ * necessary, and downgrades the lock to reading again.
++ *
++ * If no vma is found or it can't be expanded, it returns NULL and has
++ * dropped the lock.
++ */
++struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr)
+ {
+- return find_extend_vma_locked(mm, addr, false);
++ struct vm_area_struct *vma, *prev;
++
++ mmap_read_unlock(mm);
++ if (mmap_write_lock_killable(mm))
++ return NULL;
++
++ vma = find_vma_prev(mm, addr, &prev);
++ if (vma && vma->vm_start <= addr)
++ goto success;
++
++ if (prev && !vma_expand_up(prev, addr)) {
++ vma = prev;
++ goto success;
++ }
++
++ if (vma && !vma_expand_down(vma, addr))
++ goto success;
++
++ mmap_write_unlock(mm);
++ return NULL;
++
++success:
++ mmap_write_downgrade(mm);
++ return vma;
+ }
+-EXPORT_SYMBOL_GPL(find_extend_vma);
+
+ /*
+ * Ok - we have the memory areas we should free on a maple tree so release them,
+--- a/mm/nommu.c
++++ b/mm/nommu.c
+@@ -682,24 +682,20 @@ struct vm_area_struct *find_vma(struct m
+ EXPORT_SYMBOL(find_vma);
+
+ /*
+- * find a VMA
+- * - we don't extend stack VMAs under NOMMU conditions
+- */
+-struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
+-{
+- return find_vma(mm, addr);
+-}
+-
+-/*
+ * expand a stack to a given address
+ * - not supported under NOMMU conditions
+ */
+-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
+- bool write_locked)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long addr)
+ {
+ return -ENOMEM;
+ }
+
++struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr)
++{
++ mmap_read_unlock(mm);
++ return NULL;
++}
++
+ /*
+ * look up the first VMA exactly that exactly matches addr
+ * - should be called with mm->mmap_lock at least held readlocked
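
For the architectures that are not converted to lock_mm_and_find_vma() in this series, the patch above changes the legacy calling convention instead: expand_stack() now takes the mm rather than the vma, may drop and re-take the mmap lock internally (upgrading to a write lock for the actual expansion), and returns the resulting vma on success or NULL, with the lock already dropped, on failure. The per-architecture edit is therefore mechanical, as the m68k/microblaze/openrisc/sparc64/um hunks above show (schematic only):

	/* old: int return, mmap read lock still held on failure */
	if (expand_stack(vma, address))
		goto bad_area;

	/* new: vma return; on NULL the mmap lock has already been dropped */
	vma = expand_stack(mm, address);
	if (!vma)
		goto bad_area_nosemaphore;
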
--- /dev/null
+From f128a1b1b5a6b39471d62f1398196631160a24a2 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Sat, 24 Jun 2023 10:55:38 -0700
+Subject: mm/fault: convert remaining simple cases to lock_mm_and_find_vma()
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit a050ba1e7422f2cc60ff8bfde3f96d34d00cb585 upstream.
+
+This does the simple pattern conversion of alpha, arc, csky, hexagon,
+loongarch, nios2, sh, sparc32, and xtensa to the lock_mm_and_find_vma()
+helper. They all have the regular fault handling pattern without odd
+special cases.
+
+The remaining architectures all have something that keeps us from a
+straightforward conversion: ia64 and parisc have stacks that can grow
+both up as well as down (and ia64 has special address region checks).
+
+And m68k, microblaze, openrisc, sparc64, and um end up having extra
+rules about only expanding the stack down a limited amount below the
+user space stack pointer. That is something that x86 used to do too
+(long long ago), and it probably could just be skipped, but it still
+makes the conversion less than trivial.
+
+Note that this conversion was done manually and with the exception of
+alpha without any build testing, because I have a fairly limited cross-
+building environment. The cases are all simple, and I went through the
+changes several times, but...
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Samuel Mendoza-Jonas <samjonas@amazon.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/alpha/Kconfig | 1 +
+ arch/alpha/mm/fault.c | 13 +++----------
+ arch/arc/Kconfig | 1 +
+ arch/arc/mm/fault.c | 11 +++--------
+ arch/csky/Kconfig | 1 +
+ arch/csky/mm/fault.c | 22 +++++-----------------
+ arch/hexagon/Kconfig | 1 +
+ arch/hexagon/mm/vm_fault.c | 18 ++++--------------
+ arch/loongarch/Kconfig | 1 +
+ arch/loongarch/mm/fault.c | 16 ++++++----------
+ arch/nios2/Kconfig | 1 +
+ arch/nios2/mm/fault.c | 17 ++---------------
+ arch/sh/Kconfig | 1 +
+ arch/sh/mm/fault.c | 17 ++---------------
+ arch/sparc/Kconfig | 1 +
+ arch/sparc/mm/fault_32.c | 32 ++++++++------------------------
+ arch/xtensa/Kconfig | 1 +
+ arch/xtensa/mm/fault.c | 14 +++-----------
+ 18 files changed, 45 insertions(+), 124 deletions(-)
+
+--- a/arch/alpha/Kconfig
++++ b/arch/alpha/Kconfig
+@@ -28,6 +28,7 @@ config ALPHA
+ select GENERIC_SMP_IDLE_THREAD
+ select HAVE_ARCH_AUDITSYSCALL
+ select HAVE_MOD_ARCH_SPECIFIC
++ select LOCK_MM_AND_FIND_VMA
+ select MODULES_USE_ELF_RELA
+ select ODD_RT_SIGACTION
+ select OLD_SIGSUSPEND
+--- a/arch/alpha/mm/fault.c
++++ b/arch/alpha/mm/fault.c
+@@ -119,20 +119,12 @@ do_page_fault(unsigned long address, uns
+ flags |= FAULT_FLAG_USER;
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+- mmap_read_lock(mm);
+- vma = find_vma(mm, address);
++ vma = lock_mm_and_find_vma(mm, address, regs);
+ if (!vma)
+- goto bad_area;
+- if (vma->vm_start <= address)
+- goto good_area;
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto bad_area;
+- if (expand_stack(vma, address))
+- goto bad_area;
++ goto bad_area_nosemaphore;
+
+ /* Ok, we have a good vm_area for this memory access, so
+ we can handle it. */
+- good_area:
+ si_code = SEGV_ACCERR;
+ if (cause < 0) {
+ if (!(vma->vm_flags & VM_EXEC))
+@@ -189,6 +181,7 @@ retry:
+ bad_area:
+ mmap_read_unlock(mm);
+
++ bad_area_nosemaphore:
+ if (user_mode(regs))
+ goto do_sigsegv;
+
+--- a/arch/arc/Kconfig
++++ b/arch/arc/Kconfig
+@@ -41,6 +41,7 @@ config ARC
+ select HAVE_PERF_EVENTS
+ select HAVE_SYSCALL_TRACEPOINTS
+ select IRQ_DOMAIN
++ select LOCK_MM_AND_FIND_VMA
+ select MODULES_USE_ELF_RELA
+ select OF
+ select OF_EARLY_FLATTREE
+--- a/arch/arc/mm/fault.c
++++ b/arch/arc/mm/fault.c
+@@ -113,15 +113,9 @@ void do_page_fault(unsigned long address
+
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+- mmap_read_lock(mm);
+-
+- vma = find_vma(mm, address);
++ vma = lock_mm_and_find_vma(mm, address, regs);
+ if (!vma)
+- goto bad_area;
+- if (unlikely(address < vma->vm_start)) {
+- if (!(vma->vm_flags & VM_GROWSDOWN) || expand_stack(vma, address))
+- goto bad_area;
+- }
++ goto bad_area_nosemaphore;
+
+ /*
+ * vm_area is good, now check permissions for this memory access
+@@ -161,6 +155,7 @@ retry:
+ bad_area:
+ mmap_read_unlock(mm);
+
++bad_area_nosemaphore:
+ /*
+ * Major/minor page fault accounting
+ * (in case of retry we only land here once)
+--- a/arch/csky/Kconfig
++++ b/arch/csky/Kconfig
+@@ -96,6 +96,7 @@ config CSKY
+ select HAVE_RSEQ
+ select HAVE_STACKPROTECTOR
+ select HAVE_SYSCALL_TRACEPOINTS
++ select LOCK_MM_AND_FIND_VMA
+ select MAY_HAVE_SPARSE_IRQ
+ select MODULES_USE_ELF_RELA if MODULES
+ select OF
+--- a/arch/csky/mm/fault.c
++++ b/arch/csky/mm/fault.c
+@@ -97,13 +97,12 @@ static inline void mm_fault_error(struct
+ BUG();
+ }
+
+-static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr)
++static inline void bad_area_nosemaphore(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr)
+ {
+ /*
+ * Something tried to access memory that isn't in our memory map.
+ * Fix it, but check if it's kernel or user first.
+ */
+- mmap_read_unlock(mm);
+ /* User mode accesses just cause a SIGSEGV */
+ if (user_mode(regs)) {
+ do_trap(regs, SIGSEGV, code, addr);
+@@ -238,20 +237,9 @@ asmlinkage void do_page_fault(struct pt_
+ if (is_write(regs))
+ flags |= FAULT_FLAG_WRITE;
+ retry:
+- mmap_read_lock(mm);
+- vma = find_vma(mm, addr);
++ vma = lock_mm_and_find_vma(mm, addr, regs);
+ if (unlikely(!vma)) {
+- bad_area(regs, mm, code, addr);
+- return;
+- }
+- if (likely(vma->vm_start <= addr))
+- goto good_area;
+- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+- bad_area(regs, mm, code, addr);
+- return;
+- }
+- if (unlikely(expand_stack(vma, addr))) {
+- bad_area(regs, mm, code, addr);
++ bad_area_nosemaphore(regs, mm, code, addr);
+ return;
+ }
+
+@@ -259,11 +247,11 @@ retry:
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it.
+ */
+-good_area:
+ code = SEGV_ACCERR;
+
+ if (unlikely(access_error(regs, vma))) {
+- bad_area(regs, mm, code, addr);
++ mmap_read_unlock(mm);
++ bad_area_nosemaphore(regs, mm, code, addr);
+ return;
+ }
+
+--- a/arch/hexagon/Kconfig
++++ b/arch/hexagon/Kconfig
+@@ -28,6 +28,7 @@ config HEXAGON
+ select GENERIC_SMP_IDLE_THREAD
+ select STACKTRACE_SUPPORT
+ select GENERIC_CLOCKEVENTS_BROADCAST
++ select LOCK_MM_AND_FIND_VMA
+ select MODULES_USE_ELF_RELA
+ select GENERIC_CPU_DEVICES
+ select ARCH_WANT_LD_ORPHAN_WARN
+--- a/arch/hexagon/mm/vm_fault.c
++++ b/arch/hexagon/mm/vm_fault.c
+@@ -57,21 +57,10 @@ void do_page_fault(unsigned long address
+
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+- mmap_read_lock(mm);
+- vma = find_vma(mm, address);
+- if (!vma)
+- goto bad_area;
++ vma = lock_mm_and_find_vma(mm, address, regs);
++ if (unlikely(!vma))
++ goto bad_area_nosemaphore;
+
+- if (vma->vm_start <= address)
+- goto good_area;
+-
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto bad_area;
+-
+- if (expand_stack(vma, address))
+- goto bad_area;
+-
+-good_area:
+ /* Address space is OK. Now check access rights. */
+ si_code = SEGV_ACCERR;
+
+@@ -140,6 +129,7 @@ good_area:
+ bad_area:
+ mmap_read_unlock(mm);
+
++bad_area_nosemaphore:
+ if (user_mode(regs)) {
+ force_sig_fault(SIGSEGV, si_code, (void __user *)address);
+ return;
+--- a/arch/loongarch/Kconfig
++++ b/arch/loongarch/Kconfig
+@@ -107,6 +107,7 @@ config LOONGARCH
+ select HAVE_VIRT_CPU_ACCOUNTING_GEN if !SMP
+ select IRQ_FORCED_THREADING
+ select IRQ_LOONGARCH_CPU
++ select LOCK_MM_AND_FIND_VMA
+ select MMU_GATHER_MERGE_VMAS if MMU
+ select MODULES_USE_ELF_RELA if MODULES
+ select NEED_PER_CPU_EMBED_FIRST_CHUNK
+--- a/arch/loongarch/mm/fault.c
++++ b/arch/loongarch/mm/fault.c
+@@ -166,22 +166,18 @@ static void __kprobes __do_page_fault(st
+
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+- mmap_read_lock(mm);
+- vma = find_vma(mm, address);
+- if (!vma)
+- goto bad_area;
+- if (vma->vm_start <= address)
+- goto good_area;
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto bad_area;
+- if (!expand_stack(vma, address))
+- goto good_area;
++ vma = lock_mm_and_find_vma(mm, address, regs);
++ if (unlikely(!vma))
++ goto bad_area_nosemaphore;
++ goto good_area;
++
+ /*
+ * Something tried to access memory that isn't in our memory map..
+ * Fix it, but check if it's kernel or user first..
+ */
+ bad_area:
+ mmap_read_unlock(mm);
++bad_area_nosemaphore:
+ do_sigsegv(regs, write, address, si_code);
+ return;
+
+--- a/arch/nios2/Kconfig
++++ b/arch/nios2/Kconfig
+@@ -16,6 +16,7 @@ config NIOS2
+ select HAVE_ARCH_TRACEHOOK
+ select HAVE_ARCH_KGDB
+ select IRQ_DOMAIN
++ select LOCK_MM_AND_FIND_VMA
+ select MODULES_USE_ELF_RELA
+ select OF
+ select OF_EARLY_FLATTREE
+--- a/arch/nios2/mm/fault.c
++++ b/arch/nios2/mm/fault.c
+@@ -86,27 +86,14 @@ asmlinkage void do_page_fault(struct pt_
+
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+
+- if (!mmap_read_trylock(mm)) {
+- if (!user_mode(regs) && !search_exception_tables(regs->ea))
+- goto bad_area_nosemaphore;
+ retry:
+- mmap_read_lock(mm);
+- }
+-
+- vma = find_vma(mm, address);
++ vma = lock_mm_and_find_vma(mm, address, regs);
+ if (!vma)
+- goto bad_area;
+- if (vma->vm_start <= address)
+- goto good_area;
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto bad_area;
+- if (expand_stack(vma, address))
+- goto bad_area;
++ goto bad_area_nosemaphore;
+ /*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+-good_area:
+ code = SEGV_ACCERR;
+
+ switch (cause) {
+--- a/arch/sh/Kconfig
++++ b/arch/sh/Kconfig
+@@ -56,6 +56,7 @@ config SUPERH
+ select HAVE_STACKPROTECTOR
+ select HAVE_SYSCALL_TRACEPOINTS
+ select IRQ_FORCED_THREADING
++ select LOCK_MM_AND_FIND_VMA
+ select MODULES_USE_ELF_RELA
+ select NEED_SG_DMA_LENGTH
+ select NO_DMA if !MMU && !DMA_COHERENT
+--- a/arch/sh/mm/fault.c
++++ b/arch/sh/mm/fault.c
+@@ -439,21 +439,9 @@ asmlinkage void __kprobes do_page_fault(
+ }
+
+ retry:
+- mmap_read_lock(mm);
+-
+- vma = find_vma(mm, address);
++ vma = lock_mm_and_find_vma(mm, address, regs);
+ if (unlikely(!vma)) {
+- bad_area(regs, error_code, address);
+- return;
+- }
+- if (likely(vma->vm_start <= address))
+- goto good_area;
+- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+- bad_area(regs, error_code, address);
+- return;
+- }
+- if (unlikely(expand_stack(vma, address))) {
+- bad_area(regs, error_code, address);
++ bad_area_nosemaphore(regs, error_code, address);
+ return;
+ }
+
+@@ -461,7 +449,6 @@ retry:
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+-good_area:
+ if (unlikely(access_error(error_code, vma))) {
+ bad_area_access_error(regs, error_code, address);
+ return;
+--- a/arch/sparc/Kconfig
++++ b/arch/sparc/Kconfig
+@@ -56,6 +56,7 @@ config SPARC32
+ select DMA_DIRECT_REMAP
+ select GENERIC_ATOMIC64
+ select HAVE_UID16
++ select LOCK_MM_AND_FIND_VMA
+ select OLD_SIGACTION
+ select ZONE_DMA
+
+--- a/arch/sparc/mm/fault_32.c
++++ b/arch/sparc/mm/fault_32.c
+@@ -143,28 +143,19 @@ asmlinkage void do_sparc_fault(struct pt
+ if (pagefault_disabled() || !mm)
+ goto no_context;
+
++ if (!from_user && address >= PAGE_OFFSET)
++ goto no_context;
++
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+
+ retry:
+- mmap_read_lock(mm);
+-
+- if (!from_user && address >= PAGE_OFFSET)
+- goto bad_area;
+-
+- vma = find_vma(mm, address);
++ vma = lock_mm_and_find_vma(mm, address, regs);
+ if (!vma)
+- goto bad_area;
+- if (vma->vm_start <= address)
+- goto good_area;
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto bad_area;
+- if (expand_stack(vma, address))
+- goto bad_area;
++ goto bad_area_nosemaphore;
+ /*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+-good_area:
+ code = SEGV_ACCERR;
+ if (write) {
+ if (!(vma->vm_flags & VM_WRITE))
+@@ -318,17 +309,9 @@ static void force_user_fault(unsigned lo
+
+ code = SEGV_MAPERR;
+
+- mmap_read_lock(mm);
+- vma = find_vma(mm, address);
++ vma = lock_mm_and_find_vma(mm, address, NULL);
+ if (!vma)
+- goto bad_area;
+- if (vma->vm_start <= address)
+- goto good_area;
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto bad_area;
+- if (expand_stack(vma, address))
+- goto bad_area;
+-good_area:
++ goto bad_area_nosemaphore;
+ code = SEGV_ACCERR;
+ if (write) {
+ if (!(vma->vm_flags & VM_WRITE))
+@@ -347,6 +330,7 @@ good_area:
+ return;
+ bad_area:
+ mmap_read_unlock(mm);
++bad_area_nosemaphore:
+ __do_fault_siginfo(code, SIGSEGV, tsk->thread.kregs, address);
+ return;
+
+--- a/arch/xtensa/Kconfig
++++ b/arch/xtensa/Kconfig
+@@ -49,6 +49,7 @@ config XTENSA
+ select HAVE_SYSCALL_TRACEPOINTS
+ select HAVE_VIRT_CPU_ACCOUNTING_GEN
+ select IRQ_DOMAIN
++ select LOCK_MM_AND_FIND_VMA
+ select MODULES_USE_ELF_RELA
+ select PERF_USE_VMALLOC
+ select TRACE_IRQFLAGS_SUPPORT
+--- a/arch/xtensa/mm/fault.c
++++ b/arch/xtensa/mm/fault.c
+@@ -130,23 +130,14 @@ void do_page_fault(struct pt_regs *regs)
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+
+ retry:
+- mmap_read_lock(mm);
+- vma = find_vma(mm, address);
+-
++ vma = lock_mm_and_find_vma(mm, address, regs);
+ if (!vma)
+- goto bad_area;
+- if (vma->vm_start <= address)
+- goto good_area;
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto bad_area;
+- if (expand_stack(vma, address))
+- goto bad_area;
++ goto bad_area_nosemaphore;
+
+ /* Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+
+-good_area:
+ code = SEGV_ACCERR;
+
+ if (is_write) {
+@@ -205,6 +196,7 @@ good_area:
+ */
+ bad_area:
+ mmap_read_unlock(mm);
++bad_area_nosemaphore:
+ if (user_mode(regs)) {
+ current->thread.bad_vaddr = address;
+ current->thread.error_code = is_write;
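
All of the conversions in this patch follow the same error-path rule: because lock_mm_and_find_vma() releases the mmap lock itself when it fails, the handler must branch to a path that skips mmap_read_unlock(). Schematically (label and function names as used in the hunks above):

	vma = lock_mm_and_find_vma(mm, address, regs);
	if (unlikely(!vma))
		goto bad_area_nosemaphore;	/* lock already released by the helper */
	...
	bad_area:
		mmap_read_unlock(mm);
	bad_area_nosemaphore:
		/* SIGSEGV / no_context handling as before */
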
--- /dev/null
+From 088826669e9cadc96824a9523a799bd6854a31ec Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Thu, 15 Jun 2023 15:17:36 -0700
+Subject: mm: introduce new 'lock_mm_and_find_vma()' page fault helper
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit c2508ec5a58db67093f4fb8bf89a9a7c53a109e9 upstream.
+
+.. and make x86 use it.
+
+This basically extracts the existing x86 "find and expand faulting vma"
+code, but extends it to also take the mmap lock for writing in case we
+actually do need to expand the vma.
+
+We've historically short-circuited that case, and have some rather ugly
+special logic to serialize the stack segment expansion (since we only
+hold the mmap lock for reading) that doesn't match the normal VM
+locking.
+
+That slight violation of locking worked well, right up until it didn't:
+the maple tree code really does want proper locking even for simple
+extension of an existing vma.
+
+So extract the code for "look up the vma of the fault" from x86, fix it
+up to do the necessary write locking, and make it available as a helper
+function for other architectures that can use the common helper.
+
+Note: I say "common helper", but it really only handles the normal
+stack-grows-down case. Which is all architectures except for PA-RISC
+and IA64. So some rare architectures can't use the helper, but if they
+care they'll just need to open-code this logic.
+
+It's also worth pointing out that this code really would like to have an
+optimistic "mmap_upgrade_trylock()" to make it quicker to go from a
+read-lock (for the common case) to taking the write lock (for having to
+extend the vma) in the normal single-threaded situation where there is
+no other locking activity.
+
+But that _is_ all the very uncommon special case, so while it would be
+nice to have such an operation, it probably doesn't matter in reality.
+I did put in the skeleton code for such a possible future expansion,
+even if it only acts as pseudo-documentation for what we're doing.
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+[6.1: Ignore CONFIG_PER_VMA_LOCK context]
+Signed-off-by: Samuel Mendoza-Jonas <samjonas@amazon.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/Kconfig | 1
+ arch/x86/mm/fault.c | 52 ----------------------
+ include/linux/mm.h | 2
+ mm/Kconfig | 4 +
+ mm/memory.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++++++++
+ 5 files changed, 130 insertions(+), 50 deletions(-)
+
+--- a/arch/x86/Kconfig
++++ b/arch/x86/Kconfig
+@@ -271,6 +271,7 @@ config X86
+ select HAVE_GENERIC_VDSO
+ select HOTPLUG_SMT if SMP
+ select IRQ_FORCED_THREADING
++ select LOCK_MM_AND_FIND_VMA
+ select NEED_PER_CPU_EMBED_FIRST_CHUNK
+ select NEED_PER_CPU_PAGE_FIRST_CHUNK
+ select NEED_SG_DMA_LENGTH
+--- a/arch/x86/mm/fault.c
++++ b/arch/x86/mm/fault.c
+@@ -900,12 +900,6 @@ __bad_area(struct pt_regs *regs, unsigne
+ __bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
+ }
+
+-static noinline void
+-bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+-{
+- __bad_area(regs, error_code, address, 0, SEGV_MAPERR);
+-}
+-
+ static inline bool bad_area_access_from_pkeys(unsigned long error_code,
+ struct vm_area_struct *vma)
+ {
+@@ -1354,51 +1348,10 @@ void do_user_addr_fault(struct pt_regs *
+ }
+ #endif
+
+- /*
+- * Kernel-mode access to the user address space should only occur
+- * on well-defined single instructions listed in the exception
+- * tables. But, an erroneous kernel fault occurring outside one of
+- * those areas which also holds mmap_lock might deadlock attempting
+- * to validate the fault against the address space.
+- *
+- * Only do the expensive exception table search when we might be at
+- * risk of a deadlock. This happens if we
+- * 1. Failed to acquire mmap_lock, and
+- * 2. The access did not originate in userspace.
+- */
+- if (unlikely(!mmap_read_trylock(mm))) {
+- if (!user_mode(regs) && !search_exception_tables(regs->ip)) {
+- /*
+- * Fault from code in kernel from
+- * which we do not expect faults.
+- */
+- bad_area_nosemaphore(regs, error_code, address);
+- return;
+- }
+ retry:
+- mmap_read_lock(mm);
+- } else {
+- /*
+- * The above down_read_trylock() might have succeeded in
+- * which case we'll have missed the might_sleep() from
+- * down_read():
+- */
+- might_sleep();
+- }
+-
+- vma = find_vma(mm, address);
++ vma = lock_mm_and_find_vma(mm, address, regs);
+ if (unlikely(!vma)) {
+- bad_area(regs, error_code, address);
+- return;
+- }
+- if (likely(vma->vm_start <= address))
+- goto good_area;
+- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+- bad_area(regs, error_code, address);
+- return;
+- }
+- if (unlikely(expand_stack(vma, address))) {
+- bad_area(regs, error_code, address);
++ bad_area_nosemaphore(regs, error_code, address);
+ return;
+ }
+
+@@ -1406,7 +1359,6 @@ retry:
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+-good_area:
+ if (unlikely(access_error(error_code, vma))) {
+ bad_area_access_error(regs, error_code, address, vma);
+ return;
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -1932,6 +1932,8 @@ void unmap_mapping_pages(struct address_
+ pgoff_t start, pgoff_t nr, bool even_cows);
+ void unmap_mapping_range(struct address_space *mapping,
+ loff_t const holebegin, loff_t const holelen, int even_cows);
++struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
++ unsigned long address, struct pt_regs *regs);
+ #else
+ static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags,
+--- a/mm/Kconfig
++++ b/mm/Kconfig
+@@ -1150,6 +1150,10 @@ config LRU_GEN_STATS
+ This option has a per-memcg and per-node memory overhead.
+ # }
+
++config LOCK_MM_AND_FIND_VMA
++ bool
++ depends on !STACK_GROWSUP
++
+ source "mm/damon/Kconfig"
+
+ endmenu
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -5246,6 +5246,127 @@ vm_fault_t handle_mm_fault(struct vm_are
+ }
+ EXPORT_SYMBOL_GPL(handle_mm_fault);
+
++#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
++#include <linux/extable.h>
++
++static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
++{
++ /* Even if this succeeds, make it clear we *might* have slept */
++ if (likely(mmap_read_trylock(mm))) {
++ might_sleep();
++ return true;
++ }
++
++ if (regs && !user_mode(regs)) {
++ unsigned long ip = instruction_pointer(regs);
++ if (!search_exception_tables(ip))
++ return false;
++ }
++
++ mmap_read_lock(mm);
++ return true;
++}
++
++static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
++{
++ /*
++ * We don't have this operation yet.
++ *
++ * It should be easy enough to do: it's basically a
++ * atomic_long_try_cmpxchg_acquire()
++ * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
++ * it also needs the proper lockdep magic etc.
++ */
++ return false;
++}
++
++static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
++{
++ mmap_read_unlock(mm);
++ if (regs && !user_mode(regs)) {
++ unsigned long ip = instruction_pointer(regs);
++ if (!search_exception_tables(ip))
++ return false;
++ }
++ mmap_write_lock(mm);
++ return true;
++}
++
++/*
++ * Helper for page fault handling.
++ *
++ * This is kind of equivalent to "mmap_read_lock()" followed
++ * by "find_extend_vma()", except it's a lot more careful about
++ * the locking (and will drop the lock on failure).
++ *
++ * For example, if we have a kernel bug that causes a page
++ * fault, we don't want to just use mmap_read_lock() to get
++ * the mm lock, because that would deadlock if the bug were
++ * to happen while we're holding the mm lock for writing.
++ *
++ * So this checks the exception tables on kernel faults in
++ * order to only do this all for instructions that are actually
++ * expected to fault.
++ *
++ * We can also actually take the mm lock for writing if we
++ * need to extend the vma, which helps the VM layer a lot.
++ */
++struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
++ unsigned long addr, struct pt_regs *regs)
++{
++ struct vm_area_struct *vma;
++
++ if (!get_mmap_lock_carefully(mm, regs))
++ return NULL;
++
++ vma = find_vma(mm, addr);
++ if (likely(vma && (vma->vm_start <= addr)))
++ return vma;
++
++ /*
++ * Well, dang. We might still be successful, but only
++ * if we can extend a vma to do so.
++ */
++ if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
++ mmap_read_unlock(mm);
++ return NULL;
++ }
++
++ /*
++ * We can try to upgrade the mmap lock atomically,
++ * in which case we can continue to use the vma
++ * we already looked up.
++ *
++ * Otherwise we'll have to drop the mmap lock and
++ * re-take it, and also look up the vma again,
++ * re-checking it.
++ */
++ if (!mmap_upgrade_trylock(mm)) {
++ if (!upgrade_mmap_lock_carefully(mm, regs))
++ return NULL;
++
++ vma = find_vma(mm, addr);
++ if (!vma)
++ goto fail;
++ if (vma->vm_start <= addr)
++ goto success;
++ if (!(vma->vm_flags & VM_GROWSDOWN))
++ goto fail;
++ }
++
++ if (expand_stack(vma, addr))
++ goto fail;
++
++success:
++ mmap_write_downgrade(mm);
++ return vma;
++
++fail:
++ mmap_write_unlock(mm);
++ return NULL;
++}
++#endif
++
+ #ifndef __PAGETABLE_P4D_FOLDED
+ /*
+ * Allocate p4d page table.
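
One detail worth noting about the helper introduced above: the pt_regs argument is only consulted for kernel-mode faults, where the exception tables are checked before blocking on (or write-locking) the mmap lock; a caller that has no register state, such as sparc32's force_user_fault() in the "convert remaining simple cases" patch, simply passes NULL and gets the plain locking behaviour. The relevant check, condensed from the get_mmap_lock_carefully() hunk above:

	if (regs && !user_mode(regs)) {
		unsigned long ip = instruction_pointer(regs);
		if (!search_exception_tables(ip))
			return false;	/* unexpected kernel-mode fault: give up */
	}
	mmap_read_lock(mm);
	return true;
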
--- /dev/null
+From 37a9a30aeabe9fd620bcda2bb333f28a1593820d Mon Sep 17 00:00:00 2001
+From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
+Date: Fri, 16 Jun 2023 15:58:54 -0700
+Subject: mm: make find_extend_vma() fail if write lock not held
+
+From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
+
+commit f440fa1ac955e2898893f9301568435eb5cdfc4b upstream.
+
+Make calls to extend_vma() and find_extend_vma() fail if the write lock
+is required.
+
+To avoid making this a flag-day event, this still allows the old
+read-locking case for the trivial situations, and passes in a flag to
+say "is it write-locked". That way write-lockers can say "yes, I'm
+being careful", and legacy users will continue to work in all the common
+cases until they have been fully converted to the new world order.
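+
+To illustrate the resulting interface (sketch only, not part of the change;
+the wrapper names below are made up, the called functions are the ones this
+patch introduces):
+
+    /* Legacy caller: mmap_lock held for reading; expansion may now be
+     * refused with -EAGAIN instead of racing with other readers. */
+    static int sketch_read_locked(struct vm_area_struct *vma, unsigned long addr)
+    {
+        return expand_stack_locked(vma, addr, false); /* == expand_stack(vma, addr) */
+    }
+
+    /* Converted caller: mmap_lock held for writing, expansion is allowed. */
+    static int sketch_write_locked(struct vm_area_struct *vma, unsigned long addr)
+    {
+        return expand_stack_locked(vma, addr, true);
+    }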
+
+Co-Developed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
+Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
+Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Samuel Mendoza-Jonas <samjonas@amazon.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/binfmt_elf.c | 6 +++---
+ fs/exec.c | 5 +++--
+ include/linux/mm.h | 10 +++++++---
+ mm/memory.c | 2 +-
+ mm/mmap.c | 50 +++++++++++++++++++++++++++++++++-----------------
+ mm/nommu.c | 3 ++-
+ 6 files changed, 49 insertions(+), 27 deletions(-)
+
+--- a/fs/binfmt_elf.c
++++ b/fs/binfmt_elf.c
+@@ -315,10 +315,10 @@ create_elf_tables(struct linux_binprm *b
+ * Grow the stack manually; some architectures have a limit on how
+ * far ahead a user-space access may be in order to grow the stack.
+ */
+- if (mmap_read_lock_killable(mm))
++ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+- vma = find_extend_vma(mm, bprm->p);
+- mmap_read_unlock(mm);
++ vma = find_extend_vma_locked(mm, bprm->p, true);
++ mmap_write_unlock(mm);
+ if (!vma)
+ return -EFAULT;
+
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -203,7 +203,8 @@ static struct page *get_arg_page(struct
+
+ #ifdef CONFIG_STACK_GROWSUP
+ if (write) {
+- ret = expand_downwards(bprm->vma, pos);
++ /* We claim to hold the lock - nobody to race with */
++ ret = expand_downwards(bprm->vma, pos, true);
+ if (ret < 0)
+ return NULL;
+ }
+@@ -854,7 +855,7 @@ int setup_arg_pages(struct linux_binprm
+ stack_base = vma->vm_start - stack_expand;
+ #endif
+ current->mm->start_stack = bprm->p;
+- ret = expand_stack(vma, stack_base);
++ ret = expand_stack_locked(vma, stack_base, true);
+ if (ret)
+ ret = -EFAULT;
+
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -2810,11 +2810,13 @@ extern vm_fault_t filemap_page_mkwrite(s
+
+ extern unsigned long stack_guard_gap;
+ /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
+-extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
++ bool write_locked);
++#define expand_stack(vma,addr) expand_stack_locked(vma,addr,false)
+
+ /* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */
+-extern int expand_downwards(struct vm_area_struct *vma,
+- unsigned long address);
++int expand_downwards(struct vm_area_struct *vma, unsigned long address,
++ bool write_locked);
+ #if VM_GROWSUP
+ extern int expand_upwards(struct vm_area_struct *vma, unsigned long address);
+ #else
+@@ -2915,6 +2917,8 @@ unsigned long change_prot_numa(struct vm
+ #endif
+
+ struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *,
++ unsigned long addr, bool write_locked);
+ int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t);
+ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -5352,7 +5352,7 @@ struct vm_area_struct *lock_mm_and_find_
+ goto fail;
+ }
+
+- if (expand_stack(vma, addr))
++ if (expand_stack_locked(vma, addr, true))
+ goto fail;
+
+ success:
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -1945,7 +1945,8 @@ static int acct_stack_growth(struct vm_a
+ * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
+ * vma is the last one with address > vma->vm_end. Have to extend vma.
+ */
+-int expand_upwards(struct vm_area_struct *vma, unsigned long address)
++int expand_upwards(struct vm_area_struct *vma, unsigned long address,
++ bool write_locked)
+ {
+ struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *next;
+@@ -1969,6 +1970,8 @@ int expand_upwards(struct vm_area_struct
+ if (gap_addr < address || gap_addr > TASK_SIZE)
+ gap_addr = TASK_SIZE;
+
++ if (!write_locked)
++ return -EAGAIN;
+ next = find_vma_intersection(mm, vma->vm_end, gap_addr);
+ if (next && vma_is_accessible(next)) {
+ if (!(next->vm_flags & VM_GROWSUP))
+@@ -2037,7 +2040,8 @@ int expand_upwards(struct vm_area_struct
+ /*
+ * vma is the first one with address < vma->vm_start. Have to extend vma.
+ */
+-int expand_downwards(struct vm_area_struct *vma, unsigned long address)
++int expand_downwards(struct vm_area_struct *vma, unsigned long address,
++ bool write_locked)
+ {
+ struct mm_struct *mm = vma->vm_mm;
+ MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start);
+@@ -2051,10 +2055,13 @@ int expand_downwards(struct vm_area_stru
+ /* Enforce stack_guard_gap */
+ prev = mas_prev(&mas, 0);
+ /* Check that both stack segments have the same anon_vma? */
+- if (prev && !(prev->vm_flags & VM_GROWSDOWN) &&
+- vma_is_accessible(prev)) {
+- if (address - prev->vm_end < stack_guard_gap)
++ if (prev) {
++ if (!(prev->vm_flags & VM_GROWSDOWN) &&
++ vma_is_accessible(prev) &&
++ (address - prev->vm_end < stack_guard_gap))
+ return -ENOMEM;
++ if (!write_locked && (prev->vm_end == address))
++ return -EAGAIN;
+ }
+
+ if (mas_preallocate(&mas, vma, GFP_KERNEL))
+@@ -2132,13 +2139,14 @@ static int __init cmdline_parse_stack_gu
+ __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
+
+ #ifdef CONFIG_STACK_GROWSUP
+-int expand_stack(struct vm_area_struct *vma, unsigned long address)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
++ bool write_locked)
+ {
+- return expand_upwards(vma, address);
++ return expand_upwards(vma, address, write_locked);
+ }
+
+-struct vm_area_struct *
+-find_extend_vma(struct mm_struct *mm, unsigned long addr)
++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm,
++ unsigned long addr, bool write_locked)
+ {
+ struct vm_area_struct *vma, *prev;
+
+@@ -2146,20 +2154,25 @@ find_extend_vma(struct mm_struct *mm, un
+ vma = find_vma_prev(mm, addr, &prev);
+ if (vma && (vma->vm_start <= addr))
+ return vma;
+- if (!prev || expand_stack(prev, addr))
++ if (!prev)
++ return NULL;
++ if (expand_stack_locked(prev, addr, write_locked))
+ return NULL;
+ if (prev->vm_flags & VM_LOCKED)
+ populate_vma_page_range(prev, addr, prev->vm_end, NULL);
+ return prev;
+ }
+ #else
+-int expand_stack(struct vm_area_struct *vma, unsigned long address)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
++ bool write_locked)
+ {
+- return expand_downwards(vma, address);
++ if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
++ return -EINVAL;
++ return expand_downwards(vma, address, write_locked);
+ }
+
+-struct vm_area_struct *
+-find_extend_vma(struct mm_struct *mm, unsigned long addr)
++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm,
++ unsigned long addr, bool write_locked)
+ {
+ struct vm_area_struct *vma;
+ unsigned long start;
+@@ -2170,10 +2183,8 @@ find_extend_vma(struct mm_struct *mm, un
+ return NULL;
+ if (vma->vm_start <= addr)
+ return vma;
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- return NULL;
+ start = vma->vm_start;
+- if (expand_stack(vma, addr))
++ if (expand_stack_locked(vma, addr, write_locked))
+ return NULL;
+ if (vma->vm_flags & VM_LOCKED)
+ populate_vma_page_range(vma, addr, start, NULL);
+@@ -2181,6 +2192,11 @@ find_extend_vma(struct mm_struct *mm, un
+ }
+ #endif
+
++struct vm_area_struct *find_extend_vma(struct mm_struct *mm,
++ unsigned long addr)
++{
++ return find_extend_vma_locked(mm, addr, false);
++}
+ EXPORT_SYMBOL_GPL(find_extend_vma);
+
+ /*
+--- a/mm/nommu.c
++++ b/mm/nommu.c
+@@ -694,7 +694,8 @@ struct vm_area_struct *find_extend_vma(s
+ * expand a stack to a given address
+ * - not supported under NOMMU conditions
+ */
+-int expand_stack(struct vm_area_struct *vma, unsigned long address)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
++ bool write_locked)
+ {
+ return -ENOMEM;
+ }
--- /dev/null
+From 92a6879f1c3fc7fdf6660b10be045c457ec697c6 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Thu, 15 Jun 2023 16:17:48 -0700
+Subject: mm: make the page fault mmap locking killable
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit eda0047296a16d65a7f2bc60a408f70d178b2014 upstream.
+
+This is done as a separate patch from introducing the new
+lock_mm_and_find_vma() helper, because while it's an obvious change,
+it's not what x86 used to do in this area.
+
+We already abort the page fault on fatal signals anyway, so why should
+we wait for the mmap lock only to then abort later? With the new helper
+function that returns without the lock held on failure anyway, this is
+particularly easy and straightforward.
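+
+Sketch of the resulting lock-acquisition pattern (illustration only; the
+helper name here is invented, the real change is in get_mmap_lock_carefully()
+and upgrade_mmap_lock_carefully() below):
+
+    static bool sketch_lock_mm_killable(struct mm_struct *mm)
+    {
+        if (mmap_read_trylock(mm))
+            return true;
+        /* returns 0 on success, -EINTR if a fatal signal arrives while waiting */
+        return !mmap_read_lock_killable(mm);
+    }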
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Samuel Mendoza-Jonas <samjonas@amazon.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/memory.c | 6 ++----
+ 1 file changed, 2 insertions(+), 4 deletions(-)
+
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -5263,8 +5263,7 @@ static inline bool get_mmap_lock_careful
+ return false;
+ }
+
+- mmap_read_lock(mm);
+- return true;
++ return !mmap_read_lock_killable(mm);
+ }
+
+ static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
+@@ -5288,8 +5287,7 @@ static inline bool upgrade_mmap_lock_car
+ if (!search_exception_tables(ip))
+ return false;
+ }
+- mmap_write_lock(mm);
+- return true;
++ return !mmap_write_lock_killable(mm);
+ }
+
+ /*
--- /dev/null
+From d47a1e567e9744a2a097ae2a39a2b028619d1f15 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Sat, 24 Jun 2023 11:17:05 -0700
+Subject: powerpc/mm: convert coprocessor fault to lock_mm_and_find_vma()
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit 2cd76c50d0b41cec5c87abfcdf25b236a2793fb6 upstream.
+
+This is one of the simple cases, except there's no pt_regs pointer.
+Which is fine, as lock_mm_and_find_vma() is set up to work fine with a
+NULL pt_regs.
+
+Powerpc already enabled LOCK_MM_AND_FIND_VMA for the main CPU faulting,
+so we can just use the helper without any extra work.
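+
+For reference, a sketch of why NULL works (illustration only, condensed from
+the helper's locking path earlier in this series): the exception-table check
+is guarded by the regs pointer, so a NULL regs simply falls through to a
+normal blocking (killable) read lock:
+
+    /* inside get_mmap_lock_carefully(): */
+    if (regs && !user_mode(regs)) {        /* not taken when regs == NULL */
+        if (!search_exception_tables(instruction_pointer(regs)))
+            return false;
+    }
+    return !mmap_read_lock_killable(mm);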
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Samuel Mendoza-Jonas <samjonas@amazon.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/mm/copro_fault.c | 14 +++-----------
+ 1 file changed, 3 insertions(+), 11 deletions(-)
+
+--- a/arch/powerpc/mm/copro_fault.c
++++ b/arch/powerpc/mm/copro_fault.c
+@@ -33,19 +33,11 @@ int copro_handle_mm_fault(struct mm_stru
+ if (mm->pgd == NULL)
+ return -EFAULT;
+
+- mmap_read_lock(mm);
+- ret = -EFAULT;
+- vma = find_vma(mm, ea);
++ vma = lock_mm_and_find_vma(mm, ea, NULL);
+ if (!vma)
+- goto out_unlock;
+-
+- if (ea < vma->vm_start) {
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto out_unlock;
+- if (expand_stack(vma, ea))
+- goto out_unlock;
+- }
++ return -EFAULT;
+
++ ret = -EFAULT;
+ is_write = dsisr & DSISR_ISSTORE;
+ if (is_write) {
+ if (!(vma->vm_flags & VM_WRITE))
--- /dev/null
+From 689298e7d498f2c6d3e8116bce0a7c769e5369dc Mon Sep 17 00:00:00 2001
+From: Michael Ellerman <mpe@ellerman.id.au>
+Date: Fri, 16 Jun 2023 15:51:29 +1000
+Subject: powerpc/mm: Convert to using lock_mm_and_find_vma()
+
+From: Michael Ellerman <mpe@ellerman.id.au>
+
+commit e6fe228c4ffafdfc970cf6d46883a1f481baf7ea upstream.
+
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Samuel Mendoza-Jonas <samjonas@amazon.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/Kconfig | 1 +
+ arch/powerpc/mm/fault.c | 41 ++++-------------------------------------
+ 2 files changed, 5 insertions(+), 37 deletions(-)
+
+--- a/arch/powerpc/Kconfig
++++ b/arch/powerpc/Kconfig
+@@ -257,6 +257,7 @@ config PPC
+ select IRQ_DOMAIN
+ select IRQ_FORCED_THREADING
+ select KASAN_VMALLOC if KASAN && MODULES
++ select LOCK_MM_AND_FIND_VMA
+ select MMU_GATHER_PAGE_SIZE
+ select MMU_GATHER_RCU_TABLE_FREE
+ select MMU_GATHER_MERGE_VMAS
+--- a/arch/powerpc/mm/fault.c
++++ b/arch/powerpc/mm/fault.c
+@@ -84,11 +84,6 @@ static int __bad_area(struct pt_regs *re
+ return __bad_area_nosemaphore(regs, address, si_code);
+ }
+
+-static noinline int bad_area(struct pt_regs *regs, unsigned long address)
+-{
+- return __bad_area(regs, address, SEGV_MAPERR);
+-}
+-
+ static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address,
+ struct vm_area_struct *vma)
+ {
+@@ -481,40 +476,12 @@ static int ___do_page_fault(struct pt_re
+ * we will deadlock attempting to validate the fault against the
+ * address space. Luckily the kernel only validly references user
+ * space from well defined areas of code, which are listed in the
+- * exceptions table.
+- *
+- * As the vast majority of faults will be valid we will only perform
+- * the source reference check when there is a possibility of a deadlock.
+- * Attempt to lock the address space, if we cannot we then validate the
+- * source. If this is invalid we can skip the address space check,
+- * thus avoiding the deadlock.
+- */
+- if (unlikely(!mmap_read_trylock(mm))) {
+- if (!is_user && !search_exception_tables(regs->nip))
+- return bad_area_nosemaphore(regs, address);
+-
++ * exceptions table. lock_mm_and_find_vma() handles that logic.
++ */
+ retry:
+- mmap_read_lock(mm);
+- } else {
+- /*
+- * The above down_read_trylock() might have succeeded in
+- * which case we'll have missed the might_sleep() from
+- * down_read():
+- */
+- might_sleep();
+- }
+-
+- vma = find_vma(mm, address);
++ vma = lock_mm_and_find_vma(mm, address, regs);
+ if (unlikely(!vma))
+- return bad_area(regs, address);
+-
+- if (unlikely(vma->vm_start > address)) {
+- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
+- return bad_area(regs, address);
+-
+- if (unlikely(expand_stack(vma, address)))
+- return bad_area(regs, address);
+- }
++ return bad_area_nosemaphore(regs, address);
+
+ if (unlikely(access_pkey_error(is_write, is_exec,
+ (error_code & DSISR_KEYFAULT), vma)))
--- /dev/null
+From a907c689b4e7014c73c7c34fb1520431b75c787c Mon Sep 17 00:00:00 2001
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Thu, 22 Jun 2023 20:18:18 +0200
+Subject: riscv/mm: Convert to using lock_mm_and_find_vma()
+
+From: Ben Hutchings <ben@decadent.org.uk>
+
+commit 7267ef7b0b77f4ed23b7b3c87d8eca7bd9c2d007 upstream.
+
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+[6.1: Kconfig context]
+Signed-off-by: Samuel Mendoza-Jonas <samjonas@amazon.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/riscv/Kconfig | 1 +
+ arch/riscv/mm/fault.c | 31 +++++++++++++------------------
+ 2 files changed, 14 insertions(+), 18 deletions(-)
+
+--- a/arch/riscv/Kconfig
++++ b/arch/riscv/Kconfig
+@@ -114,6 +114,7 @@ config RISCV
+ select HAVE_RSEQ
+ select IRQ_DOMAIN
+ select IRQ_FORCED_THREADING
++ select LOCK_MM_AND_FIND_VMA
+ select MODULES_USE_ELF_RELA if MODULES
+ select MODULE_SECTIONS if MODULES
+ select OF
+--- a/arch/riscv/mm/fault.c
++++ b/arch/riscv/mm/fault.c
+@@ -83,13 +83,13 @@ static inline void mm_fault_error(struct
+ BUG();
+ }
+
+-static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr)
++static inline void
++bad_area_nosemaphore(struct pt_regs *regs, int code, unsigned long addr)
+ {
+ /*
+ * Something tried to access memory that isn't in our memory map.
+ * Fix it, but check if it's kernel or user first.
+ */
+- mmap_read_unlock(mm);
+ /* User mode accesses just cause a SIGSEGV */
+ if (user_mode(regs)) {
+ do_trap(regs, SIGSEGV, code, addr);
+@@ -99,6 +99,15 @@ static inline void bad_area(struct pt_re
+ no_context(regs, addr);
+ }
+
++static inline void
++bad_area(struct pt_regs *regs, struct mm_struct *mm, int code,
++ unsigned long addr)
++{
++ mmap_read_unlock(mm);
++
++ bad_area_nosemaphore(regs, code, addr);
++}
++
+ static inline void vmalloc_fault(struct pt_regs *regs, int code, unsigned long addr)
+ {
+ pgd_t *pgd, *pgd_k;
+@@ -281,23 +290,10 @@ asmlinkage void do_page_fault(struct pt_
+ else if (cause == EXC_INST_PAGE_FAULT)
+ flags |= FAULT_FLAG_INSTRUCTION;
+ retry:
+- mmap_read_lock(mm);
+- vma = find_vma(mm, addr);
++ vma = lock_mm_and_find_vma(mm, addr, regs);
+ if (unlikely(!vma)) {
+ tsk->thread.bad_cause = cause;
+- bad_area(regs, mm, code, addr);
+- return;
+- }
+- if (likely(vma->vm_start <= addr))
+- goto good_area;
+- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+- tsk->thread.bad_cause = cause;
+- bad_area(regs, mm, code, addr);
+- return;
+- }
+- if (unlikely(expand_stack(vma, addr))) {
+- tsk->thread.bad_cause = cause;
+- bad_area(regs, mm, code, addr);
++ bad_area_nosemaphore(regs, code, addr);
+ return;
+ }
+
+@@ -305,7 +301,6 @@ retry:
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it.
+ */
+-good_area:
+ code = SEGV_ACCERR;
+
+ if (unlikely(access_error(cause, vma))) {
x86-smp-cure-kexec-vs.-mwait_play_dead-breakage.patch
can-isotp-isotp_sendmsg-fix-return-error-fix-on-tx-path.patch
maple_tree-fix-potential-out-of-bounds-access-in-mas_wr_end_piv.patch
+
+mm-introduce-new-lock_mm_and_find_vma-page-fault-helper.patch
+mm-make-the-page-fault-mmap-locking-killable.patch
+arm64-mm-convert-to-using-lock_mm_and_find_vma.patch
+powerpc-mm-convert-to-using-lock_mm_and_find_vma.patch
+mips-mm-convert-to-using-lock_mm_and_find_vma.patch
+riscv-mm-convert-to-using-lock_mm_and_find_vma.patch
+arm-mm-convert-to-using-lock_mm_and_find_vma.patch
+mm-fault-convert-remaining-simple-cases-to-lock_mm_and_find_vma.patch
+powerpc-mm-convert-coprocessor-fault-to-lock_mm_and_find_vma.patch
+mm-make-find_extend_vma-fail-if-write-lock-not-held.patch
+execve-expand-new-process-stack-manually-ahead-of-time.patch
+mm-always-expand-the-stack-with-the-mmap-write-lock-held.patch
--- /dev/null
+From 8b35ca3e45e35a26a21427f35d4093606e93ad0a Mon Sep 17 00:00:00 2001
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Thu, 22 Jun 2023 21:24:30 +0200
+Subject: arm/mm: Convert to using lock_mm_and_find_vma()
+
+From: Ben Hutchings <ben@decadent.org.uk>
+
+commit 8b35ca3e45e35a26a21427f35d4093606e93ad0a upstream.
+
+arm has an additional check for address < FIRST_USER_ADDRESS before
+expanding the stack. Since FIRST_USER_ADDRESS is defined everywhere
+(generally as 0), move that check to the generic expand_downwards().
+
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm/Kconfig | 1
+ arch/arm/mm/fault.c | 63 +++++++++++-----------------------------------------
+ mm/mmap.c | 2 -
+ 3 files changed, 16 insertions(+), 50 deletions(-)
+
+--- a/arch/arm/Kconfig
++++ b/arch/arm/Kconfig
+@@ -125,6 +125,7 @@ config ARM
+ select HAVE_UID16
+ select HAVE_VIRT_CPU_ACCOUNTING_GEN
+ select IRQ_FORCED_THREADING
++ select LOCK_MM_AND_FIND_VMA
+ select MODULES_USE_ELF_REL
+ select NEED_DMA_MAP_STATE
+ select OF_EARLY_FLATTREE if OF
+--- a/arch/arm/mm/fault.c
++++ b/arch/arm/mm/fault.c
+@@ -232,37 +232,11 @@ static inline bool is_permission_fault(u
+ return false;
+ }
+
+-static vm_fault_t __kprobes
+-__do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int flags,
+- unsigned long vma_flags, struct pt_regs *regs)
+-{
+- struct vm_area_struct *vma = find_vma(mm, addr);
+- if (unlikely(!vma))
+- return VM_FAULT_BADMAP;
+-
+- if (unlikely(vma->vm_start > addr)) {
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- return VM_FAULT_BADMAP;
+- if (addr < FIRST_USER_ADDRESS)
+- return VM_FAULT_BADMAP;
+- if (expand_stack(vma, addr))
+- return VM_FAULT_BADMAP;
+- }
+-
+- /*
+- * ok, we have a good vm_area for this memory access, check the
+- * permissions on the VMA allow for the fault which occurred.
+- */
+- if (!(vma->vm_flags & vma_flags))
+- return VM_FAULT_BADACCESS;
+-
+- return handle_mm_fault(vma, addr & PAGE_MASK, flags, regs);
+-}
+-
+ static int __kprobes
+ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
+ {
+ struct mm_struct *mm = current->mm;
++ struct vm_area_struct *vma;
+ int sig, code;
+ vm_fault_t fault;
+ unsigned int flags = FAULT_FLAG_DEFAULT;
+@@ -301,31 +275,21 @@ do_page_fault(unsigned long addr, unsign
+
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
+
+- /*
+- * As per x86, we may deadlock here. However, since the kernel only
+- * validly references user space from well defined areas of the code,
+- * we can bug out early if this is from code which shouldn't.
+- */
+- if (!mmap_read_trylock(mm)) {
+- if (!user_mode(regs) && !search_exception_tables(regs->ARM_pc))
+- goto no_context;
+ retry:
+- mmap_read_lock(mm);
+- } else {
+- /*
+- * The above down_read_trylock() might have succeeded in
+- * which case, we'll have missed the might_sleep() from
+- * down_read()
+- */
+- might_sleep();
+-#ifdef CONFIG_DEBUG_VM
+- if (!user_mode(regs) &&
+- !search_exception_tables(regs->ARM_pc))
+- goto no_context;
+-#endif
++ vma = lock_mm_and_find_vma(mm, addr, regs);
++ if (unlikely(!vma)) {
++ fault = VM_FAULT_BADMAP;
++ goto bad_area;
+ }
+
+- fault = __do_page_fault(mm, addr, flags, vm_flags, regs);
++ /*
++ * ok, we have a good vm_area for this memory access, check the
++ * permissions on the VMA allow for the fault which occurred.
++ */
++ if (!(vma->vm_flags & vm_flags))
++ fault = VM_FAULT_BADACCESS;
++ else
++ fault = handle_mm_fault(vma, addr & PAGE_MASK, flags, regs);
+
+ /* If we need to retry but a fatal signal is pending, handle the
+ * signal first. We do not need to release the mmap_lock because
+@@ -356,6 +320,7 @@ retry:
+ if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS))))
+ return 0;
+
++bad_area:
+ /*
+ * If we are in kernel mode at this point, we
+ * have no context to handle this fault with.
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -1999,7 +1999,7 @@ int expand_downwards(struct vm_area_stru
+ int error = 0;
+
+ address &= PAGE_MASK;
+- if (address < mmap_min_addr)
++ if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
+ return -EPERM;
+
+ /* Enforce stack_guard_gap */
--- /dev/null
+From ae870a68b5d13d67cf4f18d47bb01ee3fee40acb Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Thu, 15 Jun 2023 17:11:44 -0700
+Subject: arm64/mm: Convert to using lock_mm_and_find_vma()
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit ae870a68b5d13d67cf4f18d47bb01ee3fee40acb upstream.
+
+This converts arm64 to use the new page fault helper. It was very
+straightforward, but still needed a fix for the "obvious" conversion I
+initially did. Thanks to Suren for the fix and testing.
+
+Fixed-and-tested-by: Suren Baghdasaryan <surenb@google.com>
+Unnecessary-code-removal-by: Liam R. Howlett <Liam.Howlett@oracle.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/Kconfig | 1 +
+ arch/arm64/mm/fault.c | 44 +++++++-------------------------------------
+ 2 files changed, 8 insertions(+), 37 deletions(-)
+
+--- a/arch/arm64/Kconfig
++++ b/arch/arm64/Kconfig
+@@ -219,6 +219,7 @@ config ARM64
+ select IRQ_DOMAIN
+ select IRQ_FORCED_THREADING
+ select KASAN_VMALLOC if KASAN
++ select LOCK_MM_AND_FIND_VMA
+ select MODULES_USE_ELF_RELA
+ select NEED_DMA_MAP_STATE
+ select NEED_SG_DMA_LENGTH
+--- a/arch/arm64/mm/fault.c
++++ b/arch/arm64/mm/fault.c
+@@ -483,27 +483,14 @@ static void do_bad_area(unsigned long fa
+ #define VM_FAULT_BADMAP ((__force vm_fault_t)0x010000)
+ #define VM_FAULT_BADACCESS ((__force vm_fault_t)0x020000)
+
+-static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr,
++static vm_fault_t __do_page_fault(struct mm_struct *mm,
++ struct vm_area_struct *vma, unsigned long addr,
+ unsigned int mm_flags, unsigned long vm_flags,
+ struct pt_regs *regs)
+ {
+- struct vm_area_struct *vma = find_vma(mm, addr);
+-
+- if (unlikely(!vma))
+- return VM_FAULT_BADMAP;
+-
+ /*
+ * Ok, we have a good vm_area for this memory access, so we can handle
+ * it.
+- */
+- if (unlikely(vma->vm_start > addr)) {
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- return VM_FAULT_BADMAP;
+- if (expand_stack(vma, addr))
+- return VM_FAULT_BADMAP;
+- }
+-
+- /*
+ * Check that the permissions on the VMA allow for the fault which
+ * occurred.
+ */
+@@ -585,31 +572,14 @@ static int __kprobes do_page_fault(unsig
+
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
+
+- /*
+- * As per x86, we may deadlock here. However, since the kernel only
+- * validly references user space from well defined areas of the code,
+- * we can bug out early if this is from code which shouldn't.
+- */
+- if (!mmap_read_trylock(mm)) {
+- if (!user_mode(regs) && !search_exception_tables(regs->pc))
+- goto no_context;
+ retry:
+- mmap_read_lock(mm);
+- } else {
+- /*
+- * The above mmap_read_trylock() might have succeeded in which
+- * case, we'll have missed the might_sleep() from down_read().
+- */
+- might_sleep();
+-#ifdef CONFIG_DEBUG_VM
+- if (!user_mode(regs) && !search_exception_tables(regs->pc)) {
+- mmap_read_unlock(mm);
+- goto no_context;
+- }
+-#endif
++ vma = lock_mm_and_find_vma(mm, addr, regs);
++ if (unlikely(!vma)) {
++ fault = VM_FAULT_BADMAP;
++ goto done;
+ }
+
+- fault = __do_page_fault(mm, addr, mm_flags, vm_flags, regs);
++ fault = __do_page_fault(mm, vma, addr, mm_flags, vm_flags, regs);
+
+ /* Quick path to respond to signals */
+ if (fault_signal_pending(fault, regs)) {
--- /dev/null
+From f313c51d26aa87e69633c9b46efb37a930faca71 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Mon, 19 Jun 2023 11:34:15 -0700
+Subject: execve: expand new process stack manually ahead of time
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit f313c51d26aa87e69633c9b46efb37a930faca71 upstream.
+
+This is a small step towards a model where GUP itself would not expand
+the stack, and any user that needs GUP to not look up existing mappings,
+but actually expand on them, would have to do so manually before-hand,
+and with the mm lock held for writing.
+
+It turns out that execve() already did almost exactly that, except it
+didn't take the mm lock at all (it's single-threaded so no locking
+technically needed, but it could cause lockdep errors). And it only did
+it for the CONFIG_STACK_GROWSUP case, since in that case GUP has
+obviously never expanded the stack downwards.
+
+So just make that CONFIG_STACK_GROWSUP case do the right thing with
+locking, and enable it generally. This will eventually help GUP, and in
+the meantime avoids a special case and the lockdep issue.
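+
+The locking shape used in get_arg_page() below, as a standalone sketch
+(illustration only; the helper name and the -ENOMEM choice are arbitrary):
+
+    static int sketch_expand_then_downgrade(struct mm_struct *mm,
+                                            struct vm_area_struct *vma,
+                                            unsigned long pos)
+    {
+        mmap_write_lock(mm);
+        /* expand_downwards() still takes the write_locked flag at this point */
+        if (expand_downwards(vma, pos, true) < 0) {
+            mmap_write_unlock(mm);
+            return -ENOMEM;
+        }
+        /* keep only the read lock for the get_user_pages_remote() call */
+        mmap_write_downgrade(mm);
+        return 0;
+    }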
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/exec.c | 37 +++++++++++++++++++++----------------
+ 1 file changed, 21 insertions(+), 16 deletions(-)
+
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -199,34 +199,39 @@ static struct page *get_arg_page(struct
+ int write)
+ {
+ struct page *page;
++ struct vm_area_struct *vma = bprm->vma;
++ struct mm_struct *mm = bprm->mm;
+ int ret;
+- unsigned int gup_flags = 0;
+
+-#ifdef CONFIG_STACK_GROWSUP
+- if (write) {
+- /* We claim to hold the lock - nobody to race with */
+- ret = expand_downwards(bprm->vma, pos, true);
+- if (ret < 0)
++ /*
++ * Avoid relying on expanding the stack down in GUP (which
++ * does not work for STACK_GROWSUP anyway), and just do it
++ * by hand ahead of time.
++ */
++ if (write && pos < vma->vm_start) {
++ mmap_write_lock(mm);
++ ret = expand_downwards(vma, pos, true);
++ if (unlikely(ret < 0)) {
++ mmap_write_unlock(mm);
+ return NULL;
+- }
+-#endif
+-
+- if (write)
+- gup_flags |= FOLL_WRITE;
++ }
++ mmap_write_downgrade(mm);
++ } else
++ mmap_read_lock(mm);
+
+ /*
+ * We are doing an exec(). 'current' is the process
+- * doing the exec and bprm->mm is the new process's mm.
++ * doing the exec and 'mm' is the new process's mm.
+ */
+- mmap_read_lock(bprm->mm);
+- ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags,
++ ret = get_user_pages_remote(mm, pos, 1,
++ write ? FOLL_WRITE : 0,
+ &page, NULL, NULL);
+- mmap_read_unlock(bprm->mm);
++ mmap_read_unlock(mm);
+ if (ret <= 0)
+ return NULL;
+
+ if (write)
+- acct_arg_size(bprm, vma_pages(bprm->vma));
++ acct_arg_size(bprm, vma_pages(vma));
+
+ return page;
+ }
--- /dev/null
+From a425ac5365f6cb3cc47bf83e6bff0213c10445f7 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Sun, 25 Jun 2023 14:02:25 -0700
+Subject: gup: add warning if some caller would seem to want stack expansion
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit a425ac5365f6cb3cc47bf83e6bff0213c10445f7 upstream.
+
+It feels very unlikely that anybody would want to do a GUP in an
+unmapped area under the stack pointer, but real users sometimes do some
+really strange things. So add a (temporary) warning for the case where
+a GUP fails and expanding the stack might have made it work.
+
+It's trivial to do the expansion in the caller as part of getting the mm
+lock in the first place - see __access_remote_vm() for ptrace, for
+example - it's just that it's unnecessarily painful to do it deep in the
+guts of the GUP lookup when we might have to drop and re-take the lock.
+
+I doubt anybody actually does anything quite this strange, but let's be
+proactive: adding these warnings is simple, and will make debugging it
+much easier if they trigger.
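+
+What the warning amounts to, as a sketch (illustration only; the helper name
+is invented, both hunks below open-code this):
+
+    static struct vm_area_struct *sketch_lookup_no_expand(struct mm_struct *mm,
+                                                          unsigned long addr)
+    {
+        struct vm_area_struct *vma = find_vma(mm, addr);
+
+        if (vma && addr < vma->vm_start) {
+            /* only a VM_GROWSDOWN vma above addr could have been expanded to cover it */
+            WARN_ON_ONCE(vma->vm_flags & VM_GROWSDOWN);
+            return NULL;
+        }
+        return vma;
+    }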
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/gup.c | 12 ++++++++++--
+ 1 file changed, 10 insertions(+), 2 deletions(-)
+
+--- a/mm/gup.c
++++ b/mm/gup.c
+@@ -1096,7 +1096,11 @@ static long __get_user_pages(struct mm_s
+
+ /* first iteration or cross vma bound */
+ if (!vma || start >= vma->vm_end) {
+- vma = vma_lookup(mm, start);
++ vma = find_vma(mm, start);
++ if (vma && (start < vma->vm_start)) {
++ WARN_ON_ONCE(vma->vm_flags & VM_GROWSDOWN);
++ vma = NULL;
++ }
+ if (!vma && in_gate_area(mm, start)) {
+ ret = get_gate_page(mm, start & PAGE_MASK,
+ gup_flags, &vma,
+@@ -1265,9 +1269,13 @@ int fixup_user_fault(struct mm_struct *m
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+
+ retry:
+- vma = vma_lookup(mm, address);
++ vma = find_vma(mm, address);
+ if (!vma)
+ return -EFAULT;
++ if (address < vma->vm_start ) {
++ WARN_ON_ONCE(vma->vm_flags & VM_GROWSDOWN);
++ return -EFAULT;
++ }
+
+ if (!vma_permits_fault(vma, fault_flags))
+ return -EFAULT;
--- /dev/null
+From 4bce37a68ff884e821a02a731897a8119e0c37b7 Mon Sep 17 00:00:00 2001
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Thu, 22 Jun 2023 18:47:40 +0200
+Subject: mips/mm: Convert to using lock_mm_and_find_vma()
+
+From: Ben Hutchings <ben@decadent.org.uk>
+
+commit 4bce37a68ff884e821a02a731897a8119e0c37b7 upstream.
+
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/mips/Kconfig | 1 +
+ arch/mips/mm/fault.c | 12 ++----------
+ 2 files changed, 3 insertions(+), 10 deletions(-)
+
+--- a/arch/mips/Kconfig
++++ b/arch/mips/Kconfig
+@@ -94,6 +94,7 @@ config MIPS
+ select HAVE_VIRT_CPU_ACCOUNTING_GEN if 64BIT || !SMP
+ select IRQ_FORCED_THREADING
+ select ISA if EISA
++ select LOCK_MM_AND_FIND_VMA
+ select MODULES_USE_ELF_REL if MODULES
+ select MODULES_USE_ELF_RELA if MODULES && 64BIT
+ select PERF_USE_VMALLOC
+--- a/arch/mips/mm/fault.c
++++ b/arch/mips/mm/fault.c
+@@ -99,21 +99,13 @@ static void __do_page_fault(struct pt_re
+
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+- mmap_read_lock(mm);
+- vma = find_vma(mm, address);
++ vma = lock_mm_and_find_vma(mm, address, regs);
+ if (!vma)
+- goto bad_area;
+- if (vma->vm_start <= address)
+- goto good_area;
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto bad_area;
+- if (expand_stack(vma, address))
+- goto bad_area;
++ goto bad_area_nosemaphore;
+ /*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+-good_area:
+ si_code = SEGV_ACCERR;
+
+ if (write) {
--- /dev/null
+From 8d7071af890768438c14db6172cc8f9f4d04e184 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Sat, 24 Jun 2023 13:45:51 -0700
+Subject: mm: always expand the stack with the mmap write lock held
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit 8d7071af890768438c14db6172cc8f9f4d04e184 upstream.
+
+This finishes the job of always holding the mmap write lock when
+extending the user stack vma, and removes the 'write_locked' argument
+from the vm helper functions again.
+
+For some cases, we just avoid expanding the stack at all: drivers and
+page pinning really shouldn't be extending any stacks. Let's see if any
+strange users really wanted that.
+
+It's worth noting that architectures that weren't converted to the new
+lock_mm_and_find_vma() helper function are left using the legacy
+"expand_stack()" function, but it has been changed to drop the mmap_lock
+and take it for writing while expanding the vma. This makes it fairly
+straightforward to convert the remaining architectures.
+
+As a result of dropping and re-taking the lock, the calling conventions
+for this function have also changed, since the old vma may no longer be
+valid. So it will now return the new vma if successful, and NULL - and
+the lock dropped - if the area could not be extended.
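+
+In sketch form, the new calling convention for the legacy interface
+(illustration only; the arch hunks below all follow this shape):
+
+    vma = expand_stack(mm, address);    /* may drop and re-take mmap_lock */
+    if (!vma)
+        goto bad_area_nosemaphore;      /* mmap_lock has already been dropped */
+    /* success: mmap_lock is held for reading again and vma covers address */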
+
+Tested-by: Vegard Nossum <vegard.nossum@oracle.com>
+Tested-by: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de> # ia64
+Tested-by: Frank Scheiner <frank.scheiner@web.de> # ia64
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/ia64/mm/fault.c | 36 ++----------
+ arch/m68k/mm/fault.c | 9 ++-
+ arch/microblaze/mm/fault.c | 5 +
+ arch/openrisc/mm/fault.c | 5 +
+ arch/parisc/mm/fault.c | 23 +++-----
+ arch/s390/mm/fault.c | 5 +
+ arch/sparc/mm/fault_64.c | 8 +-
+ arch/um/kernel/trap.c | 11 ++-
+ drivers/iommu/amd/iommu_v2.c | 4 -
+ drivers/iommu/iommu-sva.c | 2
+ fs/binfmt_elf.c | 2
+ fs/exec.c | 4 -
+ include/linux/mm.h | 16 +----
+ mm/gup.c | 6 +-
+ mm/memory.c | 10 +++
+ mm/mmap.c | 121 ++++++++++++++++++++++++++++++++++---------
+ mm/nommu.c | 18 ++----
+ 17 files changed, 169 insertions(+), 116 deletions(-)
+
+--- a/arch/ia64/mm/fault.c
++++ b/arch/ia64/mm/fault.c
+@@ -110,10 +110,12 @@ retry:
+ * register backing store that needs to expand upwards, in
+ * this case vma will be null, but prev_vma will be non-null
+ */
+- if (( !vma && prev_vma ) || (address < vma->vm_start) )
+- goto check_expansion;
++ if (( !vma && prev_vma ) || (address < vma->vm_start) ) {
++ vma = expand_stack(mm, address);
++ if (!vma)
++ goto bad_area_nosemaphore;
++ }
+
+- good_area:
+ code = SEGV_ACCERR;
+
+ /* OK, we've got a good vm_area for this memory area. Check the access permissions: */
+@@ -177,35 +179,9 @@ retry:
+ mmap_read_unlock(mm);
+ return;
+
+- check_expansion:
+- if (!(prev_vma && (prev_vma->vm_flags & VM_GROWSUP) && (address == prev_vma->vm_end))) {
+- if (!vma)
+- goto bad_area;
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto bad_area;
+- if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start)
+- || REGION_OFFSET(address) >= RGN_MAP_LIMIT)
+- goto bad_area;
+- if (expand_stack(vma, address))
+- goto bad_area;
+- } else {
+- vma = prev_vma;
+- if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start)
+- || REGION_OFFSET(address) >= RGN_MAP_LIMIT)
+- goto bad_area;
+- /*
+- * Since the register backing store is accessed sequentially,
+- * we disallow growing it by more than a page at a time.
+- */
+- if (address > vma->vm_end + PAGE_SIZE - sizeof(long))
+- goto bad_area;
+- if (expand_upwards(vma, address))
+- goto bad_area;
+- }
+- goto good_area;
+-
+ bad_area:
+ mmap_read_unlock(mm);
++ bad_area_nosemaphore:
+ if ((isr & IA64_ISR_SP)
+ || ((isr & IA64_ISR_NA) && (isr & IA64_ISR_CODE_MASK) == IA64_ISR_CODE_LFETCH))
+ {
+--- a/arch/m68k/mm/fault.c
++++ b/arch/m68k/mm/fault.c
+@@ -105,8 +105,9 @@ retry:
+ if (address + 256 < rdusp())
+ goto map_err;
+ }
+- if (expand_stack(vma, address))
+- goto map_err;
++ vma = expand_stack(mm, address);
++ if (!vma)
++ goto map_err_nosemaphore;
+
+ /*
+ * Ok, we have a good vm_area for this memory access, so
+@@ -196,10 +197,12 @@ bus_err:
+ goto send_sig;
+
+ map_err:
++ mmap_read_unlock(mm);
++map_err_nosemaphore:
+ current->thread.signo = SIGSEGV;
+ current->thread.code = SEGV_MAPERR;
+ current->thread.faddr = address;
+- goto send_sig;
++ return send_fault_sig(regs);
+
+ acc_err:
+ current->thread.signo = SIGSEGV;
+--- a/arch/microblaze/mm/fault.c
++++ b/arch/microblaze/mm/fault.c
+@@ -192,8 +192,9 @@ retry:
+ && (kernel_mode(regs) || !store_updates_sp(regs)))
+ goto bad_area;
+ }
+- if (expand_stack(vma, address))
+- goto bad_area;
++ vma = expand_stack(mm, address);
++ if (!vma)
++ goto bad_area_nosemaphore;
+
+ good_area:
+ code = SEGV_ACCERR;
+--- a/arch/openrisc/mm/fault.c
++++ b/arch/openrisc/mm/fault.c
+@@ -127,8 +127,9 @@ retry:
+ if (address + PAGE_SIZE < regs->sp)
+ goto bad_area;
+ }
+- if (expand_stack(vma, address))
+- goto bad_area;
++ vma = expand_stack(mm, address);
++ if (!vma)
++ goto bad_area_nosemaphore;
+
+ /*
+ * Ok, we have a good vm_area for this memory access, so
+--- a/arch/parisc/mm/fault.c
++++ b/arch/parisc/mm/fault.c
+@@ -288,15 +288,19 @@ void do_page_fault(struct pt_regs *regs,
+ retry:
+ mmap_read_lock(mm);
+ vma = find_vma_prev(mm, address, &prev_vma);
+- if (!vma || address < vma->vm_start)
+- goto check_expansion;
++ if (!vma || address < vma->vm_start) {
++ if (!prev_vma || !(prev_vma->vm_flags & VM_GROWSUP))
++ goto bad_area;
++ vma = expand_stack(mm, address);
++ if (!vma)
++ goto bad_area_nosemaphore;
++ }
++
+ /*
+ * Ok, we have a good vm_area for this memory access. We still need to
+ * check the access permissions.
+ */
+
+-good_area:
+-
+ if ((vma->vm_flags & acc_type) != acc_type)
+ goto bad_area;
+
+@@ -347,17 +351,13 @@ good_area:
+ mmap_read_unlock(mm);
+ return;
+
+-check_expansion:
+- vma = prev_vma;
+- if (vma && (expand_stack(vma, address) == 0))
+- goto good_area;
+-
+ /*
+ * Something tried to access memory that isn't in our memory map..
+ */
+ bad_area:
+ mmap_read_unlock(mm);
+
++bad_area_nosemaphore:
+ if (user_mode(regs)) {
+ int signo, si_code;
+
+@@ -449,7 +449,7 @@ handle_nadtlb_fault(struct pt_regs *regs
+ {
+ unsigned long insn = regs->iir;
+ int breg, treg, xreg, val = 0;
+- struct vm_area_struct *vma, *prev_vma;
++ struct vm_area_struct *vma;
+ struct task_struct *tsk;
+ struct mm_struct *mm;
+ unsigned long address;
+@@ -485,7 +485,7 @@ handle_nadtlb_fault(struct pt_regs *regs
+ /* Search for VMA */
+ address = regs->ior;
+ mmap_read_lock(mm);
+- vma = find_vma_prev(mm, address, &prev_vma);
++ vma = vma_lookup(mm, address);
+ mmap_read_unlock(mm);
+
+ /*
+@@ -494,7 +494,6 @@ handle_nadtlb_fault(struct pt_regs *regs
+ */
+ acc_type = (insn & 0x40) ? VM_WRITE : VM_READ;
+ if (vma
+- && address >= vma->vm_start
+ && (vma->vm_flags & acc_type) == acc_type)
+ val = 1;
+ }
+--- a/arch/s390/mm/fault.c
++++ b/arch/s390/mm/fault.c
+@@ -433,8 +433,9 @@ retry:
+ if (unlikely(vma->vm_start > address)) {
+ if (!(vma->vm_flags & VM_GROWSDOWN))
+ goto out_up;
+- if (expand_stack(vma, address))
+- goto out_up;
++ vma = expand_stack(mm, address);
++ if (!vma)
++ goto out;
+ }
+
+ /*
+--- a/arch/sparc/mm/fault_64.c
++++ b/arch/sparc/mm/fault_64.c
+@@ -383,8 +383,9 @@ continue_fault:
+ goto bad_area;
+ }
+ }
+- if (expand_stack(vma, address))
+- goto bad_area;
++ vma = expand_stack(mm, address);
++ if (!vma)
++ goto bad_area_nosemaphore;
+ /*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+@@ -487,8 +488,9 @@ exit_exception:
+ * Fix it, but check if it's kernel or user first..
+ */
+ bad_area:
+- insn = get_fault_insn(regs, insn);
+ mmap_read_unlock(mm);
++bad_area_nosemaphore:
++ insn = get_fault_insn(regs, insn);
+
+ handle_kernel_fault:
+ do_kernel_fault(regs, si_code, fault_code, insn, address);
+--- a/arch/um/kernel/trap.c
++++ b/arch/um/kernel/trap.c
+@@ -47,14 +47,15 @@ retry:
+ vma = find_vma(mm, address);
+ if (!vma)
+ goto out;
+- else if (vma->vm_start <= address)
++ if (vma->vm_start <= address)
+ goto good_area;
+- else if (!(vma->vm_flags & VM_GROWSDOWN))
++ if (!(vma->vm_flags & VM_GROWSDOWN))
+ goto out;
+- else if (is_user && !ARCH_IS_STACKGROW(address))
+- goto out;
+- else if (expand_stack(vma, address))
++ if (is_user && !ARCH_IS_STACKGROW(address))
+ goto out;
++ vma = expand_stack(mm, address);
++ if (!vma)
++ goto out_nosemaphore;
+
+ good_area:
+ *code_out = SEGV_ACCERR;
+--- a/drivers/iommu/amd/iommu_v2.c
++++ b/drivers/iommu/amd/iommu_v2.c
+@@ -485,8 +485,8 @@ static void do_fault(struct work_struct
+ flags |= FAULT_FLAG_REMOTE;
+
+ mmap_read_lock(mm);
+- vma = find_extend_vma(mm, address);
+- if (!vma || address < vma->vm_start)
++ vma = vma_lookup(mm, address);
++ if (!vma)
+ /* failed to get a vma in the right range */
+ goto out;
+
+--- a/drivers/iommu/iommu-sva.c
++++ b/drivers/iommu/iommu-sva.c
+@@ -203,7 +203,7 @@ iommu_sva_handle_iopf(struct iommu_fault
+
+ mmap_read_lock(mm);
+
+- vma = find_extend_vma(mm, prm->addr);
++ vma = vma_lookup(mm, prm->addr);
+ if (!vma)
+ /* Unmapped area */
+ goto out_put_mm;
+--- a/fs/binfmt_elf.c
++++ b/fs/binfmt_elf.c
+@@ -322,7 +322,7 @@ create_elf_tables(struct linux_binprm *b
+ */
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+- vma = find_extend_vma_locked(mm, bprm->p, true);
++ vma = find_extend_vma_locked(mm, bprm->p);
+ mmap_write_unlock(mm);
+ if (!vma)
+ return -EFAULT;
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -210,7 +210,7 @@ static struct page *get_arg_page(struct
+ */
+ if (write && pos < vma->vm_start) {
+ mmap_write_lock(mm);
+- ret = expand_downwards(vma, pos, true);
++ ret = expand_downwards(vma, pos);
+ if (unlikely(ret < 0)) {
+ mmap_write_unlock(mm);
+ return NULL;
+@@ -858,7 +858,7 @@ int setup_arg_pages(struct linux_binprm
+ stack_base = vma->vm_end - stack_expand;
+ #endif
+ current->mm->start_stack = bprm->p;
+- ret = expand_stack_locked(vma, stack_base, true);
++ ret = expand_stack_locked(vma, stack_base);
+ if (ret)
+ ret = -EFAULT;
+
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -3065,18 +3065,11 @@ extern vm_fault_t filemap_page_mkwrite(s
+
+ extern unsigned long stack_guard_gap;
+ /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
+-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
+- bool write_locked);
+-#define expand_stack(vma,addr) expand_stack_locked(vma,addr,false)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address);
++struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr);
+
+ /* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */
+-int expand_downwards(struct vm_area_struct *vma, unsigned long address,
+- bool write_locked);
+-#if VM_GROWSUP
+-extern int expand_upwards(struct vm_area_struct *vma, unsigned long address);
+-#else
+- #define expand_upwards(vma, address) (0)
+-#endif
++int expand_downwards(struct vm_area_struct *vma, unsigned long address);
+
+ /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
+ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
+@@ -3171,9 +3164,8 @@ unsigned long change_prot_numa(struct vm
+ unsigned long start, unsigned long end);
+ #endif
+
+-struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
+ struct vm_area_struct *find_extend_vma_locked(struct mm_struct *,
+- unsigned long addr, bool write_locked);
++ unsigned long addr);
+ int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t);
+ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+--- a/mm/gup.c
++++ b/mm/gup.c
+@@ -1096,7 +1096,7 @@ static long __get_user_pages(struct mm_s
+
+ /* first iteration or cross vma bound */
+ if (!vma || start >= vma->vm_end) {
+- vma = find_extend_vma(mm, start);
++ vma = vma_lookup(mm, start);
+ if (!vma && in_gate_area(mm, start)) {
+ ret = get_gate_page(mm, start & PAGE_MASK,
+ gup_flags, &vma,
+@@ -1265,8 +1265,8 @@ int fixup_user_fault(struct mm_struct *m
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+
+ retry:
+- vma = find_extend_vma(mm, address);
+- if (!vma || address < vma->vm_start)
++ vma = vma_lookup(mm, address);
++ if (!vma)
+ return -EFAULT;
+
+ if (!vma_permits_fault(vma, fault_flags))
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -5336,7 +5336,7 @@ struct vm_area_struct *lock_mm_and_find_
+ goto fail;
+ }
+
+- if (expand_stack_locked(vma, addr, true))
++ if (expand_stack_locked(vma, addr))
+ goto fail;
+
+ success:
+@@ -5620,6 +5620,14 @@ int __access_remote_vm(struct mm_struct
+ if (mmap_read_lock_killable(mm))
+ return 0;
+
++ /* We might need to expand the stack to access it */
++ vma = vma_lookup(mm, addr);
++ if (!vma) {
++ vma = expand_stack(mm, addr);
++ if (!vma)
++ return 0;
++ }
++
+ /* ignore errors, just check how much was successfully transferred */
+ while (len) {
+ int bytes, ret, offset;
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -1898,8 +1898,7 @@ static int acct_stack_growth(struct vm_a
+ * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
+ * vma is the last one with address > vma->vm_end. Have to extend vma.
+ */
+-int expand_upwards(struct vm_area_struct *vma, unsigned long address,
+- bool write_locked)
++static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
+ {
+ struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *next;
+@@ -1923,8 +1922,6 @@ int expand_upwards(struct vm_area_struct
+ if (gap_addr < address || gap_addr > TASK_SIZE)
+ gap_addr = TASK_SIZE;
+
+- if (!write_locked)
+- return -EAGAIN;
+ next = find_vma_intersection(mm, vma->vm_end, gap_addr);
+ if (next && vma_is_accessible(next)) {
+ if (!(next->vm_flags & VM_GROWSUP))
+@@ -1993,15 +1990,18 @@ int expand_upwards(struct vm_area_struct
+
+ /*
+ * vma is the first one with address < vma->vm_start. Have to extend vma.
++ * mmap_lock held for writing.
+ */
+-int expand_downwards(struct vm_area_struct *vma, unsigned long address,
+- bool write_locked)
++int expand_downwards(struct vm_area_struct *vma, unsigned long address)
+ {
+ struct mm_struct *mm = vma->vm_mm;
+ MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start);
+ struct vm_area_struct *prev;
+ int error = 0;
+
++ if (!(vma->vm_flags & VM_GROWSDOWN))
++ return -EFAULT;
++
+ address &= PAGE_MASK;
+ if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
+ return -EPERM;
+@@ -2014,8 +2014,6 @@ int expand_downwards(struct vm_area_stru
+ vma_is_accessible(prev) &&
+ (address - prev->vm_end < stack_guard_gap))
+ return -ENOMEM;
+- if (!write_locked && (prev->vm_end == address))
+- return -EAGAIN;
+ }
+
+ if (mas_preallocate(&mas, GFP_KERNEL))
+@@ -2094,14 +2092,12 @@ static int __init cmdline_parse_stack_gu
+ __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
+
+ #ifdef CONFIG_STACK_GROWSUP
+-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
+- bool write_locked)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
+ {
+- return expand_upwards(vma, address, write_locked);
++ return expand_upwards(vma, address);
+ }
+
+-struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm,
+- unsigned long addr, bool write_locked)
++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
+ {
+ struct vm_area_struct *vma, *prev;
+
+@@ -2111,23 +2107,21 @@ struct vm_area_struct *find_extend_vma_l
+ return vma;
+ if (!prev)
+ return NULL;
+- if (expand_stack_locked(prev, addr, write_locked))
++ if (expand_stack_locked(prev, addr))
+ return NULL;
+ if (prev->vm_flags & VM_LOCKED)
+ populate_vma_page_range(prev, addr, prev->vm_end, NULL);
+ return prev;
+ }
+ #else
+-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
+- bool write_locked)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
+ {
+ if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
+ return -EINVAL;
+- return expand_downwards(vma, address, write_locked);
++ return expand_downwards(vma, address);
+ }
+
+-struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm,
+- unsigned long addr, bool write_locked)
++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
+ {
+ struct vm_area_struct *vma;
+ unsigned long start;
+@@ -2139,7 +2133,7 @@ struct vm_area_struct *find_extend_vma_l
+ if (vma->vm_start <= addr)
+ return vma;
+ start = vma->vm_start;
+- if (expand_stack_locked(vma, addr, write_locked))
++ if (expand_stack_locked(vma, addr))
+ return NULL;
+ if (vma->vm_flags & VM_LOCKED)
+ populate_vma_page_range(vma, addr, start, NULL);
+@@ -2147,12 +2141,91 @@ struct vm_area_struct *find_extend_vma_l
+ }
+ #endif
+
+-struct vm_area_struct *find_extend_vma(struct mm_struct *mm,
+- unsigned long addr)
++/*
++ * IA64 has some horrid mapping rules: it can expand both up and down,
++ * but with various special rules.
++ *
++ * We'll get rid of this architecture eventually, so the ugliness is
++ * temporary.
++ */
++#ifdef CONFIG_IA64
++static inline bool vma_expand_ok(struct vm_area_struct *vma, unsigned long addr)
++{
++ return REGION_NUMBER(addr) == REGION_NUMBER(vma->vm_start) &&
++ REGION_OFFSET(addr) < RGN_MAP_LIMIT;
++}
++
++/*
++ * IA64 stacks grow down, but there's a special register backing store
++ * that can grow up. Only sequentially, though, so the new address must
++ * match vm_end.
++ */
++static inline int vma_expand_up(struct vm_area_struct *vma, unsigned long addr)
++{
++ if (!vma_expand_ok(vma, addr))
++ return -EFAULT;
++ if (vma->vm_end != (addr & PAGE_MASK))
++ return -EFAULT;
++ return expand_upwards(vma, addr);
++}
++
++static inline bool vma_expand_down(struct vm_area_struct *vma, unsigned long addr)
++{
++ if (!vma_expand_ok(vma, addr))
++ return -EFAULT;
++ return expand_downwards(vma, addr);
++}
++
++#elif defined(CONFIG_STACK_GROWSUP)
++
++#define vma_expand_up(vma,addr) expand_upwards(vma, addr)
++#define vma_expand_down(vma, addr) (-EFAULT)
++
++#else
++
++#define vma_expand_up(vma,addr) (-EFAULT)
++#define vma_expand_down(vma, addr) expand_downwards(vma, addr)
++
++#endif
++
++/*
++ * expand_stack(): legacy interface for page faulting. Don't use unless
++ * you have to.
++ *
++ * This is called with the mm locked for reading, drops the lock, takes
++ * the lock for writing, tries to look up a vma again, expands it if
++ * necessary, and downgrades the lock to reading again.
++ *
++ * If no vma is found or it can't be expanded, it returns NULL and has
++ * dropped the lock.
++ */
++struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr)
+ {
+- return find_extend_vma_locked(mm, addr, false);
++ struct vm_area_struct *vma, *prev;
++
++ mmap_read_unlock(mm);
++ if (mmap_write_lock_killable(mm))
++ return NULL;
++
++ vma = find_vma_prev(mm, addr, &prev);
++ if (vma && vma->vm_start <= addr)
++ goto success;
++
++ if (prev && !vma_expand_up(prev, addr)) {
++ vma = prev;
++ goto success;
++ }
++
++ if (vma && !vma_expand_down(vma, addr))
++ goto success;
++
++ mmap_write_unlock(mm);
++ return NULL;
++
++success:
++ mmap_write_downgrade(mm);
++ return vma;
+ }
+-EXPORT_SYMBOL_GPL(find_extend_vma);
+
+ /*
+ * Ok - we have the memory areas we should free on a maple tree so release them,
+--- a/mm/nommu.c
++++ b/mm/nommu.c
+@@ -631,24 +631,20 @@ struct vm_area_struct *find_vma(struct m
+ EXPORT_SYMBOL(find_vma);
+
+ /*
+- * find a VMA
+- * - we don't extend stack VMAs under NOMMU conditions
+- */
+-struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
+-{
+- return find_vma(mm, addr);
+-}
+-
+-/*
+ * expand a stack to a given address
+ * - not supported under NOMMU conditions
+ */
+-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
+- bool write_locked)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long addr)
+ {
+ return -ENOMEM;
+ }
+
++struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr)
++{
++ mmap_read_unlock(mm);
++ return NULL;
++}
++
+ /*
+ * look up the first VMA that exactly matches addr
+ * - should be called with mm->mmap_lock at least held readlocked
--- /dev/null
+From a050ba1e7422f2cc60ff8bfde3f96d34d00cb585 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Sat, 24 Jun 2023 10:55:38 -0700
+Subject: mm/fault: convert remaining simple cases to lock_mm_and_find_vma()
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit a050ba1e7422f2cc60ff8bfde3f96d34d00cb585 upstream.
+
+This does the simple pattern conversion of alpha, arc, csky, hexagon,
+loongarch, nios2, sh, sparc32, and xtensa to the lock_mm_and_find_vma()
+helper. They all have the regular fault handling pattern without odd
+special cases.
+
+The remaining architectures all have something that keeps us from a
+straightforward conversion: ia64 and parisc have stacks that can grow
+both up as well as down (and ia64 has special address region checks).
+
+And m68k, microblaze, openrisc, sparc64, and um end up having extra
+rules about only expanding the stack down a limited amount below the
+user space stack pointer. That is something that x86 used to do too
+(long long ago), and it probably could just be skipped, but it still
+makes the conversion less than trivial.
+
+Note that this conversion was done manually and, with the exception of
+alpha, without any build testing, because I have a fairly limited cross-
+building environment. The cases are all simple, and I went through the
+changes several times, but...
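+
+The pattern being replaced and its replacement, in sketch form (illustration
+only; every arch hunk below is a variation of this):
+
+    /* before */
+    mmap_read_lock(mm);
+    vma = find_vma(mm, address);
+    if (!vma)
+        goto bad_area;
+    if (address < vma->vm_start) {
+        if (!(vma->vm_flags & VM_GROWSDOWN) || expand_stack(vma, address))
+            goto bad_area;
+    }
+
+    /* after */
+    vma = lock_mm_and_find_vma(mm, address, regs);
+    if (!vma)
+        goto bad_area_nosemaphore;    /* helper already dropped mmap_lock */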
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/alpha/Kconfig | 1 +
+ arch/alpha/mm/fault.c | 13 +++----------
+ arch/arc/Kconfig | 1 +
+ arch/arc/mm/fault.c | 11 +++--------
+ arch/csky/Kconfig | 1 +
+ arch/csky/mm/fault.c | 22 +++++-----------------
+ arch/hexagon/Kconfig | 1 +
+ arch/hexagon/mm/vm_fault.c | 18 ++++--------------
+ arch/loongarch/Kconfig | 1 +
+ arch/loongarch/mm/fault.c | 16 ++++++----------
+ arch/nios2/Kconfig | 1 +
+ arch/nios2/mm/fault.c | 17 ++---------------
+ arch/sh/Kconfig | 1 +
+ arch/sh/mm/fault.c | 17 ++---------------
+ arch/sparc/Kconfig | 1 +
+ arch/sparc/mm/fault_32.c | 32 ++++++++------------------------
+ arch/xtensa/Kconfig | 1 +
+ arch/xtensa/mm/fault.c | 14 +++-----------
+ 18 files changed, 45 insertions(+), 124 deletions(-)
+
+--- a/arch/alpha/Kconfig
++++ b/arch/alpha/Kconfig
+@@ -29,6 +29,7 @@ config ALPHA
+ select GENERIC_SMP_IDLE_THREAD
+ select HAVE_ARCH_AUDITSYSCALL
+ select HAVE_MOD_ARCH_SPECIFIC
++ select LOCK_MM_AND_FIND_VMA
+ select MODULES_USE_ELF_RELA
+ select ODD_RT_SIGACTION
+ select OLD_SIGSUSPEND
+--- a/arch/alpha/mm/fault.c
++++ b/arch/alpha/mm/fault.c
+@@ -119,20 +119,12 @@ do_page_fault(unsigned long address, uns
+ flags |= FAULT_FLAG_USER;
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+- mmap_read_lock(mm);
+- vma = find_vma(mm, address);
++ vma = lock_mm_and_find_vma(mm, address, regs);
+ if (!vma)
+- goto bad_area;
+- if (vma->vm_start <= address)
+- goto good_area;
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto bad_area;
+- if (expand_stack(vma, address))
+- goto bad_area;
++ goto bad_area_nosemaphore;
+
+ /* Ok, we have a good vm_area for this memory access, so
+ we can handle it. */
+- good_area:
+ si_code = SEGV_ACCERR;
+ if (cause < 0) {
+ if (!(vma->vm_flags & VM_EXEC))
+@@ -192,6 +184,7 @@ retry:
+ bad_area:
+ mmap_read_unlock(mm);
+
++ bad_area_nosemaphore:
+ if (user_mode(regs))
+ goto do_sigsegv;
+
+--- a/arch/arc/Kconfig
++++ b/arch/arc/Kconfig
+@@ -41,6 +41,7 @@ config ARC
+ select HAVE_PERF_EVENTS
+ select HAVE_SYSCALL_TRACEPOINTS
+ select IRQ_DOMAIN
++ select LOCK_MM_AND_FIND_VMA
+ select MODULES_USE_ELF_RELA
+ select OF
+ select OF_EARLY_FLATTREE
+--- a/arch/arc/mm/fault.c
++++ b/arch/arc/mm/fault.c
+@@ -113,15 +113,9 @@ void do_page_fault(unsigned long address
+
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+- mmap_read_lock(mm);
+-
+- vma = find_vma(mm, address);
++ vma = lock_mm_and_find_vma(mm, address, regs);
+ if (!vma)
+- goto bad_area;
+- if (unlikely(address < vma->vm_start)) {
+- if (!(vma->vm_flags & VM_GROWSDOWN) || expand_stack(vma, address))
+- goto bad_area;
+- }
++ goto bad_area_nosemaphore;
+
+ /*
+ * vm_area is good, now check permissions for this memory access
+@@ -161,6 +155,7 @@ retry:
+ bad_area:
+ mmap_read_unlock(mm);
+
++bad_area_nosemaphore:
+ /*
+ * Major/minor page fault accounting
+ * (in case of retry we only land here once)
+--- a/arch/csky/Kconfig
++++ b/arch/csky/Kconfig
+@@ -96,6 +96,7 @@ config CSKY
+ select HAVE_REGS_AND_STACK_ACCESS_API
+ select HAVE_STACKPROTECTOR
+ select HAVE_SYSCALL_TRACEPOINTS
++ select LOCK_MM_AND_FIND_VMA
+ select MAY_HAVE_SPARSE_IRQ
+ select MODULES_USE_ELF_RELA if MODULES
+ select OF
+--- a/arch/csky/mm/fault.c
++++ b/arch/csky/mm/fault.c
+@@ -97,13 +97,12 @@ static inline void mm_fault_error(struct
+ BUG();
+ }
+
+-static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr)
++static inline void bad_area_nosemaphore(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr)
+ {
+ /*
+ * Something tried to access memory that isn't in our memory map.
+ * Fix it, but check if it's kernel or user first.
+ */
+- mmap_read_unlock(mm);
+ /* User mode accesses just cause a SIGSEGV */
+ if (user_mode(regs)) {
+ do_trap(regs, SIGSEGV, code, addr);
+@@ -238,20 +237,9 @@ asmlinkage void do_page_fault(struct pt_
+ if (is_write(regs))
+ flags |= FAULT_FLAG_WRITE;
+ retry:
+- mmap_read_lock(mm);
+- vma = find_vma(mm, addr);
++ vma = lock_mm_and_find_vma(mm, addr, regs);
+ if (unlikely(!vma)) {
+- bad_area(regs, mm, code, addr);
+- return;
+- }
+- if (likely(vma->vm_start <= addr))
+- goto good_area;
+- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+- bad_area(regs, mm, code, addr);
+- return;
+- }
+- if (unlikely(expand_stack(vma, addr))) {
+- bad_area(regs, mm, code, addr);
++ bad_area_nosemaphore(regs, mm, code, addr);
+ return;
+ }
+
+@@ -259,11 +247,11 @@ retry:
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it.
+ */
+-good_area:
+ code = SEGV_ACCERR;
+
+ if (unlikely(access_error(regs, vma))) {
+- bad_area(regs, mm, code, addr);
++ mmap_read_unlock(mm);
++ bad_area_nosemaphore(regs, mm, code, addr);
+ return;
+ }
+
+--- a/arch/hexagon/Kconfig
++++ b/arch/hexagon/Kconfig
+@@ -28,6 +28,7 @@ config HEXAGON
+ select GENERIC_SMP_IDLE_THREAD
+ select STACKTRACE_SUPPORT
+ select GENERIC_CLOCKEVENTS_BROADCAST
++ select LOCK_MM_AND_FIND_VMA
+ select MODULES_USE_ELF_RELA
+ select GENERIC_CPU_DEVICES
+ select ARCH_WANT_LD_ORPHAN_WARN
+--- a/arch/hexagon/mm/vm_fault.c
++++ b/arch/hexagon/mm/vm_fault.c
+@@ -57,21 +57,10 @@ void do_page_fault(unsigned long address
+
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+- mmap_read_lock(mm);
+- vma = find_vma(mm, address);
+- if (!vma)
+- goto bad_area;
++ vma = lock_mm_and_find_vma(mm, address, regs);
++ if (unlikely(!vma))
++ goto bad_area_nosemaphore;
+
+- if (vma->vm_start <= address)
+- goto good_area;
+-
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto bad_area;
+-
+- if (expand_stack(vma, address))
+- goto bad_area;
+-
+-good_area:
+ /* Address space is OK. Now check access rights. */
+ si_code = SEGV_ACCERR;
+
+@@ -143,6 +132,7 @@ good_area:
+ bad_area:
+ mmap_read_unlock(mm);
+
++bad_area_nosemaphore:
+ if (user_mode(regs)) {
+ force_sig_fault(SIGSEGV, si_code, (void __user *)address);
+ return;
+--- a/arch/loongarch/Kconfig
++++ b/arch/loongarch/Kconfig
+@@ -125,6 +125,7 @@ config LOONGARCH
+ select HAVE_VIRT_CPU_ACCOUNTING_GEN if !SMP
+ select IRQ_FORCED_THREADING
+ select IRQ_LOONGARCH_CPU
++ select LOCK_MM_AND_FIND_VMA
+ select MMU_GATHER_MERGE_VMAS if MMU
+ select MODULES_USE_ELF_RELA if MODULES
+ select NEED_PER_CPU_EMBED_FIRST_CHUNK
+--- a/arch/loongarch/mm/fault.c
++++ b/arch/loongarch/mm/fault.c
+@@ -169,22 +169,18 @@ static void __kprobes __do_page_fault(st
+
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+- mmap_read_lock(mm);
+- vma = find_vma(mm, address);
+- if (!vma)
+- goto bad_area;
+- if (vma->vm_start <= address)
+- goto good_area;
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto bad_area;
+- if (!expand_stack(vma, address))
+- goto good_area;
++ vma = lock_mm_and_find_vma(mm, address, regs);
++ if (unlikely(!vma))
++ goto bad_area_nosemaphore;
++ goto good_area;
++
+ /*
+ * Something tried to access memory that isn't in our memory map..
+ * Fix it, but check if it's kernel or user first..
+ */
+ bad_area:
+ mmap_read_unlock(mm);
++bad_area_nosemaphore:
+ do_sigsegv(regs, write, address, si_code);
+ return;
+
+--- a/arch/nios2/Kconfig
++++ b/arch/nios2/Kconfig
+@@ -16,6 +16,7 @@ config NIOS2
+ select HAVE_ARCH_TRACEHOOK
+ select HAVE_ARCH_KGDB
+ select IRQ_DOMAIN
++ select LOCK_MM_AND_FIND_VMA
+ select MODULES_USE_ELF_RELA
+ select OF
+ select OF_EARLY_FLATTREE
+--- a/arch/nios2/mm/fault.c
++++ b/arch/nios2/mm/fault.c
+@@ -86,27 +86,14 @@ asmlinkage void do_page_fault(struct pt_
+
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+
+- if (!mmap_read_trylock(mm)) {
+- if (!user_mode(regs) && !search_exception_tables(regs->ea))
+- goto bad_area_nosemaphore;
+ retry:
+- mmap_read_lock(mm);
+- }
+-
+- vma = find_vma(mm, address);
++ vma = lock_mm_and_find_vma(mm, address, regs);
+ if (!vma)
+- goto bad_area;
+- if (vma->vm_start <= address)
+- goto good_area;
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto bad_area;
+- if (expand_stack(vma, address))
+- goto bad_area;
++ goto bad_area_nosemaphore;
+ /*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+-good_area:
+ code = SEGV_ACCERR;
+
+ switch (cause) {
+--- a/arch/sh/Kconfig
++++ b/arch/sh/Kconfig
+@@ -56,6 +56,7 @@ config SUPERH
+ select HAVE_STACKPROTECTOR
+ select HAVE_SYSCALL_TRACEPOINTS
+ select IRQ_FORCED_THREADING
++ select LOCK_MM_AND_FIND_VMA
+ select MODULES_USE_ELF_RELA
+ select NEED_SG_DMA_LENGTH
+ select NO_DMA if !MMU && !DMA_COHERENT
+--- a/arch/sh/mm/fault.c
++++ b/arch/sh/mm/fault.c
+@@ -439,21 +439,9 @@ asmlinkage void __kprobes do_page_fault(
+ }
+
+ retry:
+- mmap_read_lock(mm);
+-
+- vma = find_vma(mm, address);
++ vma = lock_mm_and_find_vma(mm, address, regs);
+ if (unlikely(!vma)) {
+- bad_area(regs, error_code, address);
+- return;
+- }
+- if (likely(vma->vm_start <= address))
+- goto good_area;
+- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+- bad_area(regs, error_code, address);
+- return;
+- }
+- if (unlikely(expand_stack(vma, address))) {
+- bad_area(regs, error_code, address);
++ bad_area_nosemaphore(regs, error_code, address);
+ return;
+ }
+
+@@ -461,7 +449,6 @@ retry:
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+-good_area:
+ if (unlikely(access_error(error_code, vma))) {
+ bad_area_access_error(regs, error_code, address);
+ return;
+--- a/arch/sparc/Kconfig
++++ b/arch/sparc/Kconfig
+@@ -56,6 +56,7 @@ config SPARC32
+ select DMA_DIRECT_REMAP
+ select GENERIC_ATOMIC64
+ select HAVE_UID16
++ select LOCK_MM_AND_FIND_VMA
+ select OLD_SIGACTION
+ select ZONE_DMA
+
+--- a/arch/sparc/mm/fault_32.c
++++ b/arch/sparc/mm/fault_32.c
+@@ -143,28 +143,19 @@ asmlinkage void do_sparc_fault(struct pt
+ if (pagefault_disabled() || !mm)
+ goto no_context;
+
++ if (!from_user && address >= PAGE_OFFSET)
++ goto no_context;
++
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+
+ retry:
+- mmap_read_lock(mm);
+-
+- if (!from_user && address >= PAGE_OFFSET)
+- goto bad_area;
+-
+- vma = find_vma(mm, address);
++ vma = lock_mm_and_find_vma(mm, address, regs);
+ if (!vma)
+- goto bad_area;
+- if (vma->vm_start <= address)
+- goto good_area;
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto bad_area;
+- if (expand_stack(vma, address))
+- goto bad_area;
++ goto bad_area_nosemaphore;
+ /*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+-good_area:
+ code = SEGV_ACCERR;
+ if (write) {
+ if (!(vma->vm_flags & VM_WRITE))
+@@ -321,17 +312,9 @@ static void force_user_fault(unsigned lo
+
+ code = SEGV_MAPERR;
+
+- mmap_read_lock(mm);
+- vma = find_vma(mm, address);
++ vma = lock_mm_and_find_vma(mm, address, regs);
+ if (!vma)
+- goto bad_area;
+- if (vma->vm_start <= address)
+- goto good_area;
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto bad_area;
+- if (expand_stack(vma, address))
+- goto bad_area;
+-good_area:
++ goto bad_area_nosemaphore;
+ code = SEGV_ACCERR;
+ if (write) {
+ if (!(vma->vm_flags & VM_WRITE))
+@@ -350,6 +333,7 @@ good_area:
+ return;
+ bad_area:
+ mmap_read_unlock(mm);
++bad_area_nosemaphore:
+ __do_fault_siginfo(code, SIGSEGV, tsk->thread.kregs, address);
+ return;
+
+--- a/arch/xtensa/Kconfig
++++ b/arch/xtensa/Kconfig
+@@ -49,6 +49,7 @@ config XTENSA
+ select HAVE_SYSCALL_TRACEPOINTS
+ select HAVE_VIRT_CPU_ACCOUNTING_GEN
+ select IRQ_DOMAIN
++ select LOCK_MM_AND_FIND_VMA
+ select MODULES_USE_ELF_RELA
+ select PERF_USE_VMALLOC
+ select TRACE_IRQFLAGS_SUPPORT
+--- a/arch/xtensa/mm/fault.c
++++ b/arch/xtensa/mm/fault.c
+@@ -130,23 +130,14 @@ void do_page_fault(struct pt_regs *regs)
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+
+ retry:
+- mmap_read_lock(mm);
+- vma = find_vma(mm, address);
+-
++ vma = lock_mm_and_find_vma(mm, address, regs);
+ if (!vma)
+- goto bad_area;
+- if (vma->vm_start <= address)
+- goto good_area;
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto bad_area;
+- if (expand_stack(vma, address))
+- goto bad_area;
++ goto bad_area_nosemaphore;
+
+ /* Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+
+-good_area:
+ code = SEGV_ACCERR;
+
+ if (is_write) {
+@@ -205,6 +196,7 @@ good_area:
+ */
+ bad_area:
+ mmap_read_unlock(mm);
++bad_area_nosemaphore:
+ if (user_mode(regs)) {
+ force_sig_fault(SIGSEGV, code, (void *) address);
+ return;
--- /dev/null
+From c2508ec5a58db67093f4fb8bf89a9a7c53a109e9 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Thu, 15 Jun 2023 15:17:36 -0700
+Subject: mm: introduce new 'lock_mm_and_find_vma()' page fault helper
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit c2508ec5a58db67093f4fb8bf89a9a7c53a109e9 upstream.
+
+.. and make x86 use it.
+
+This basically extracts the existing x86 "find and expand faulting vma"
+code, but extends it to also take the mmap lock for writing in case we
+actually do need to expand the vma.
+
+We've historically short-circuited that case, and have some rather ugly
+special logic to serialize the stack segment expansion (since we only
+hold the mmap lock for reading) that doesn't match the normal VM
+locking.
+
+That slight violation of locking worked well, right up until it didn't:
+the maple tree code really does want proper locking even for simple
+extension of an existing vma.
+
+So extract the code for "look up the vma of the fault" from x86, fix it
+up to do the necessary write locking, and make it available as a helper
+function for other architectures that can use the common helper.
+
+Note: I say "common helper", but it really only handles the normal
+stack-grows-down case. Which is all architectures except for PA-RISC
+and IA64. So some rare architectures can't use the helper, but if they
+care they'll just need to open-code this logic.
+
+It's also worth pointing out that this code really would like to have an
+optimistic "mmap_upgrade_trylock()" to make it quicker to go from a
+read-lock (for the common case) to taking the write lock (for having to
+extend the vma) in the normal single-threaded situation where there is
+no other locking activity.
+
+But that _is_ all the very uncommon special case, so while it would be
+nice to have such an operation, it probably doesn't matter in reality.
+I did put in the skeleton code for such a possible future expansion,
+even if it only acts as pseudo-documentation for what we're doing.
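
A condensed view of the contract the helper gives its callers may help here.
The handler below is hypothetical and heavily simplified; bad_area_nosemaphore()
and flags stand in for whatever the architecture actually uses:

    vma = lock_mm_and_find_vma(mm, address, regs);
    if (unlikely(!vma)) {
        /* lookup/expansion failed: mmap_lock is NOT held here,
         * so this error path must not unlock it */
        bad_area_nosemaphore(regs, address);
        return;
    }
    /* success: mmap_lock is held for reading (downgraded again if a
     * write-locked stack expansion was needed), exactly as if the
     * caller had done mmap_read_lock() + find_vma() itself */
    fault = handle_mm_fault(vma, address, flags, regs);
    /* ... retry/signal handling as before, then ... */
    mmap_read_unlock(mm);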
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/Kconfig | 1
+ arch/x86/mm/fault.c | 52 ----------------------
+ include/linux/mm.h | 2
+ mm/Kconfig | 4 +
+ mm/memory.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++++++++
+ 5 files changed, 130 insertions(+), 50 deletions(-)
+
+--- a/arch/x86/Kconfig
++++ b/arch/x86/Kconfig
+@@ -274,6 +274,7 @@ config X86
+ select HAVE_GENERIC_VDSO
+ select HOTPLUG_SMT if SMP
+ select IRQ_FORCED_THREADING
++ select LOCK_MM_AND_FIND_VMA
+ select NEED_PER_CPU_EMBED_FIRST_CHUNK
+ select NEED_PER_CPU_PAGE_FIRST_CHUNK
+ select NEED_SG_DMA_LENGTH
+--- a/arch/x86/mm/fault.c
++++ b/arch/x86/mm/fault.c
+@@ -879,12 +879,6 @@ __bad_area(struct pt_regs *regs, unsigne
+ __bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
+ }
+
+-static noinline void
+-bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+-{
+- __bad_area(regs, error_code, address, 0, SEGV_MAPERR);
+-}
+-
+ static inline bool bad_area_access_from_pkeys(unsigned long error_code,
+ struct vm_area_struct *vma)
+ {
+@@ -1333,51 +1327,10 @@ void do_user_addr_fault(struct pt_regs *
+ }
+ #endif
+
+- /*
+- * Kernel-mode access to the user address space should only occur
+- * on well-defined single instructions listed in the exception
+- * tables. But, an erroneous kernel fault occurring outside one of
+- * those areas which also holds mmap_lock might deadlock attempting
+- * to validate the fault against the address space.
+- *
+- * Only do the expensive exception table search when we might be at
+- * risk of a deadlock. This happens if we
+- * 1. Failed to acquire mmap_lock, and
+- * 2. The access did not originate in userspace.
+- */
+- if (unlikely(!mmap_read_trylock(mm))) {
+- if (!user_mode(regs) && !search_exception_tables(regs->ip)) {
+- /*
+- * Fault from code in kernel from
+- * which we do not expect faults.
+- */
+- bad_area_nosemaphore(regs, error_code, address);
+- return;
+- }
+ retry:
+- mmap_read_lock(mm);
+- } else {
+- /*
+- * The above down_read_trylock() might have succeeded in
+- * which case we'll have missed the might_sleep() from
+- * down_read():
+- */
+- might_sleep();
+- }
+-
+- vma = find_vma(mm, address);
++ vma = lock_mm_and_find_vma(mm, address, regs);
+ if (unlikely(!vma)) {
+- bad_area(regs, error_code, address);
+- return;
+- }
+- if (likely(vma->vm_start <= address))
+- goto good_area;
+- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+- bad_area(regs, error_code, address);
+- return;
+- }
+- if (unlikely(expand_stack(vma, address))) {
+- bad_area(regs, error_code, address);
++ bad_area_nosemaphore(regs, error_code, address);
+ return;
+ }
+
+@@ -1385,7 +1338,6 @@ retry:
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+-good_area:
+ if (unlikely(access_error(error_code, vma))) {
+ bad_area_access_error(regs, error_code, address, vma);
+ return;
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -2190,6 +2190,8 @@ void unmap_mapping_pages(struct address_
+ pgoff_t start, pgoff_t nr, bool even_cows);
+ void unmap_mapping_range(struct address_space *mapping,
+ loff_t const holebegin, loff_t const holelen, int even_cows);
++struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
++ unsigned long address, struct pt_regs *regs);
+ #else
+ static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags,
+--- a/mm/Kconfig
++++ b/mm/Kconfig
+@@ -1202,6 +1202,10 @@ config LRU_GEN_STATS
+ This option has a per-memcg and per-node memory overhead.
+ # }
+
++config LOCK_MM_AND_FIND_VMA
++ bool
++ depends on !STACK_GROWSUP
++
+ source "mm/damon/Kconfig"
+
+ endmenu
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -5230,6 +5230,127 @@ vm_fault_t handle_mm_fault(struct vm_are
+ }
+ EXPORT_SYMBOL_GPL(handle_mm_fault);
+
++#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
++#include <linux/extable.h>
++
++static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
++{
++ /* Even if this succeeds, make it clear we *might* have slept */
++ if (likely(mmap_read_trylock(mm))) {
++ might_sleep();
++ return true;
++ }
++
++ if (regs && !user_mode(regs)) {
++ unsigned long ip = instruction_pointer(regs);
++ if (!search_exception_tables(ip))
++ return false;
++ }
++
++ mmap_read_lock(mm);
++ return true;
++}
++
++static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
++{
++ /*
++ * We don't have this operation yet.
++ *
++ * It should be easy enough to do: it's basically a
++ * atomic_long_try_cmpxchg_acquire()
++ * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
++ * it also needs the proper lockdep magic etc.
++ */
++ return false;
++}
++
++static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
++{
++ mmap_read_unlock(mm);
++ if (regs && !user_mode(regs)) {
++ unsigned long ip = instruction_pointer(regs);
++ if (!search_exception_tables(ip))
++ return false;
++ }
++ mmap_write_lock(mm);
++ return true;
++}
++
++/*
++ * Helper for page fault handling.
++ *
++ * This is kind of equivalent to "mmap_read_lock()" followed
++ * by "find_extend_vma()", except it's a lot more careful about
++ * the locking (and will drop the lock on failure).
++ *
++ * For example, if we have a kernel bug that causes a page
++ * fault, we don't want to just use mmap_read_lock() to get
++ * the mm lock, because that would deadlock if the bug were
++ * to happen while we're holding the mm lock for writing.
++ *
++ * So this checks the exception tables on kernel faults in
++ * order to only do this all for instructions that are actually
++ * expected to fault.
++ *
++ * We can also actually take the mm lock for writing if we
++ * need to extend the vma, which helps the VM layer a lot.
++ */
++struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
++ unsigned long addr, struct pt_regs *regs)
++{
++ struct vm_area_struct *vma;
++
++ if (!get_mmap_lock_carefully(mm, regs))
++ return NULL;
++
++ vma = find_vma(mm, addr);
++ if (likely(vma && (vma->vm_start <= addr)))
++ return vma;
++
++ /*
++ * Well, dang. We might still be successful, but only
++ * if we can extend a vma to do so.
++ */
++ if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
++ mmap_read_unlock(mm);
++ return NULL;
++ }
++
++ /*
++ * We can try to upgrade the mmap lock atomically,
++ * in which case we can continue to use the vma
++ * we already looked up.
++ *
++ * Otherwise we'll have to drop the mmap lock and
++ * re-take it, and also look up the vma again,
++ * re-checking it.
++ */
++ if (!mmap_upgrade_trylock(mm)) {
++ if (!upgrade_mmap_lock_carefully(mm, regs))
++ return NULL;
++
++ vma = find_vma(mm, addr);
++ if (!vma)
++ goto fail;
++ if (vma->vm_start <= addr)
++ goto success;
++ if (!(vma->vm_flags & VM_GROWSDOWN))
++ goto fail;
++ }
++
++ if (expand_stack(vma, addr))
++ goto fail;
++
++success:
++ mmap_write_downgrade(mm);
++ return vma;
++
++fail:
++ mmap_write_unlock(mm);
++ return NULL;
++}
++#endif
++
+ #ifndef __PAGETABLE_P4D_FOLDED
+ /*
+ * Allocate p4d page table.
--- /dev/null
+From f440fa1ac955e2898893f9301568435eb5cdfc4b Mon Sep 17 00:00:00 2001
+From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
+Date: Fri, 16 Jun 2023 15:58:54 -0700
+Subject: mm: make find_extend_vma() fail if write lock not held
+
+From: Liam R. Howlett <Liam.Howlett@oracle.com>
+
+commit f440fa1ac955e2898893f9301568435eb5cdfc4b upstream.
+
+Make calls to extend_vma() and find_extend_vma() fail if the write lock
+is required.
+
+To avoid making this a flag-day event, this still allows the old
+read-locking case for the trivial situations, and passes in a flag to
+say "is it write-locked". That way write-lockers can say "yes, I'm
+being careful", and legacy users will continue to work in all the common
+cases until they have been fully converted to the new world order.
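
Roughly, the resulting calling conventions look like this (a sketch; the
write-locked side mirrors the binfmt_elf caller in the diff below):

    /* legacy read-locked callers keep the old spelling ... */
    #define expand_stack(vma, addr) expand_stack_locked(vma, addr, false)

    /* ... and may now see -EAGAIN in the cases that really need the write
     * lock, while converted callers assert that they actually hold it: */
    if (mmap_write_lock_killable(mm))
        return -EINTR;
    vma = find_extend_vma_locked(mm, addr, true);   /* may expand the stack */
    mmap_write_unlock(mm);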
+
+Co-Developed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
+Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
+Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/binfmt_elf.c | 6 +++---
+ fs/exec.c | 5 +++--
+ include/linux/mm.h | 10 +++++++---
+ mm/memory.c | 2 +-
+ mm/mmap.c | 50 +++++++++++++++++++++++++++++++++-----------------
+ mm/nommu.c | 3 ++-
+ 6 files changed, 49 insertions(+), 27 deletions(-)
+
+--- a/fs/binfmt_elf.c
++++ b/fs/binfmt_elf.c
+@@ -320,10 +320,10 @@ create_elf_tables(struct linux_binprm *b
+ * Grow the stack manually; some architectures have a limit on how
+ * far ahead a user-space access may be in order to grow the stack.
+ */
+- if (mmap_read_lock_killable(mm))
++ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+- vma = find_extend_vma(mm, bprm->p);
+- mmap_read_unlock(mm);
++ vma = find_extend_vma_locked(mm, bprm->p, true);
++ mmap_write_unlock(mm);
+ if (!vma)
+ return -EFAULT;
+
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -204,7 +204,8 @@ static struct page *get_arg_page(struct
+
+ #ifdef CONFIG_STACK_GROWSUP
+ if (write) {
+- ret = expand_downwards(bprm->vma, pos);
++ /* We claim to hold the lock - nobody to race with */
++ ret = expand_downwards(bprm->vma, pos, true);
+ if (ret < 0)
+ return NULL;
+ }
+@@ -852,7 +853,7 @@ int setup_arg_pages(struct linux_binprm
+ stack_base = vma->vm_end - stack_expand;
+ #endif
+ current->mm->start_stack = bprm->p;
+- ret = expand_stack(vma, stack_base);
++ ret = expand_stack_locked(vma, stack_base, true);
+ if (ret)
+ ret = -EFAULT;
+
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -3065,11 +3065,13 @@ extern vm_fault_t filemap_page_mkwrite(s
+
+ extern unsigned long stack_guard_gap;
+ /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
+-extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
++ bool write_locked);
++#define expand_stack(vma,addr) expand_stack_locked(vma,addr,false)
+
+ /* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */
+-extern int expand_downwards(struct vm_area_struct *vma,
+- unsigned long address);
++int expand_downwards(struct vm_area_struct *vma, unsigned long address,
++ bool write_locked);
+ #if VM_GROWSUP
+ extern int expand_upwards(struct vm_area_struct *vma, unsigned long address);
+ #else
+@@ -3170,6 +3172,8 @@ unsigned long change_prot_numa(struct vm
+ #endif
+
+ struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *,
++ unsigned long addr, bool write_locked);
+ int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t);
+ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -5336,7 +5336,7 @@ struct vm_area_struct *lock_mm_and_find_
+ goto fail;
+ }
+
+- if (expand_stack(vma, addr))
++ if (expand_stack_locked(vma, addr, true))
+ goto fail;
+
+ success:
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -1898,7 +1898,8 @@ static int acct_stack_growth(struct vm_a
+ * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
+ * vma is the last one with address > vma->vm_end. Have to extend vma.
+ */
+-int expand_upwards(struct vm_area_struct *vma, unsigned long address)
++int expand_upwards(struct vm_area_struct *vma, unsigned long address,
++ bool write_locked)
+ {
+ struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *next;
+@@ -1922,6 +1923,8 @@ int expand_upwards(struct vm_area_struct
+ if (gap_addr < address || gap_addr > TASK_SIZE)
+ gap_addr = TASK_SIZE;
+
++ if (!write_locked)
++ return -EAGAIN;
+ next = find_vma_intersection(mm, vma->vm_end, gap_addr);
+ if (next && vma_is_accessible(next)) {
+ if (!(next->vm_flags & VM_GROWSUP))
+@@ -1991,7 +1994,8 @@ int expand_upwards(struct vm_area_struct
+ /*
+ * vma is the first one with address < vma->vm_start. Have to extend vma.
+ */
+-int expand_downwards(struct vm_area_struct *vma, unsigned long address)
++int expand_downwards(struct vm_area_struct *vma, unsigned long address,
++ bool write_locked)
+ {
+ struct mm_struct *mm = vma->vm_mm;
+ MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start);
+@@ -2005,10 +2009,13 @@ int expand_downwards(struct vm_area_stru
+ /* Enforce stack_guard_gap */
+ prev = mas_prev(&mas, 0);
+ /* Check that both stack segments have the same anon_vma? */
+- if (prev && !(prev->vm_flags & VM_GROWSDOWN) &&
+- vma_is_accessible(prev)) {
+- if (address - prev->vm_end < stack_guard_gap)
++ if (prev) {
++ if (!(prev->vm_flags & VM_GROWSDOWN) &&
++ vma_is_accessible(prev) &&
++ (address - prev->vm_end < stack_guard_gap))
+ return -ENOMEM;
++ if (!write_locked && (prev->vm_end == address))
++ return -EAGAIN;
+ }
+
+ if (mas_preallocate(&mas, GFP_KERNEL))
+@@ -2087,13 +2094,14 @@ static int __init cmdline_parse_stack_gu
+ __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
+
+ #ifdef CONFIG_STACK_GROWSUP
+-int expand_stack(struct vm_area_struct *vma, unsigned long address)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
++ bool write_locked)
+ {
+- return expand_upwards(vma, address);
++ return expand_upwards(vma, address, write_locked);
+ }
+
+-struct vm_area_struct *
+-find_extend_vma(struct mm_struct *mm, unsigned long addr)
++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm,
++ unsigned long addr, bool write_locked)
+ {
+ struct vm_area_struct *vma, *prev;
+
+@@ -2101,20 +2109,25 @@ find_extend_vma(struct mm_struct *mm, un
+ vma = find_vma_prev(mm, addr, &prev);
+ if (vma && (vma->vm_start <= addr))
+ return vma;
+- if (!prev || expand_stack(prev, addr))
++ if (!prev)
++ return NULL;
++ if (expand_stack_locked(prev, addr, write_locked))
+ return NULL;
+ if (prev->vm_flags & VM_LOCKED)
+ populate_vma_page_range(prev, addr, prev->vm_end, NULL);
+ return prev;
+ }
+ #else
+-int expand_stack(struct vm_area_struct *vma, unsigned long address)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
++ bool write_locked)
+ {
+- return expand_downwards(vma, address);
++ if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
++ return -EINVAL;
++ return expand_downwards(vma, address, write_locked);
+ }
+
+-struct vm_area_struct *
+-find_extend_vma(struct mm_struct *mm, unsigned long addr)
++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm,
++ unsigned long addr, bool write_locked)
+ {
+ struct vm_area_struct *vma;
+ unsigned long start;
+@@ -2125,10 +2138,8 @@ find_extend_vma(struct mm_struct *mm, un
+ return NULL;
+ if (vma->vm_start <= addr)
+ return vma;
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- return NULL;
+ start = vma->vm_start;
+- if (expand_stack(vma, addr))
++ if (expand_stack_locked(vma, addr, write_locked))
+ return NULL;
+ if (vma->vm_flags & VM_LOCKED)
+ populate_vma_page_range(vma, addr, start, NULL);
+@@ -2136,6 +2147,11 @@ find_extend_vma(struct mm_struct *mm, un
+ }
+ #endif
+
++struct vm_area_struct *find_extend_vma(struct mm_struct *mm,
++ unsigned long addr)
++{
++ return find_extend_vma_locked(mm, addr, false);
++}
+ EXPORT_SYMBOL_GPL(find_extend_vma);
+
+ /*
+--- a/mm/nommu.c
++++ b/mm/nommu.c
+@@ -643,7 +643,8 @@ struct vm_area_struct *find_extend_vma(s
+ * expand a stack to a given address
+ * - not supported under NOMMU conditions
+ */
+-int expand_stack(struct vm_area_struct *vma, unsigned long address)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
++ bool write_locked)
+ {
+ return -ENOMEM;
+ }
--- /dev/null
+From eda0047296a16d65a7f2bc60a408f70d178b2014 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Thu, 15 Jun 2023 16:17:48 -0700
+Subject: mm: make the page fault mmap locking killable
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit eda0047296a16d65a7f2bc60a408f70d178b2014 upstream.
+
+This is done as a separate patch from introducing the new
+lock_mm_and_find_vma() helper, because while it's an obvious change,
+it's not what x86 used to do in this area.
+
+We already abort the page fault on fatal signals anyway, so why should
+we wait for the mmap lock only to then abort later? With the new helper
+function that returns without the lock held on failure anyway, this is
+particularly easy and straightforward.
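
The change is mechanical: the killable lock primitives return 0 on success and
-EINTR if a fatal signal arrives while sleeping, so they map directly onto the
helper's bool "did we get the lock" result. In sketch form:

    /* before */
    mmap_read_lock(mm);
    return true;

    /* after: a fatal signal simply makes the helper report failure,
     * and the fault is aborted instead of waiting for the lock */
    return !mmap_read_lock_killable(mm);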
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/memory.c | 6 ++----
+ 1 file changed, 2 insertions(+), 4 deletions(-)
+
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -5247,8 +5247,7 @@ static inline bool get_mmap_lock_careful
+ return false;
+ }
+
+- mmap_read_lock(mm);
+- return true;
++ return !mmap_read_lock_killable(mm);
+ }
+
+ static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
+@@ -5272,8 +5271,7 @@ static inline bool upgrade_mmap_lock_car
+ if (!search_exception_tables(ip))
+ return false;
+ }
+- mmap_write_lock(mm);
+- return true;
++ return !mmap_write_lock_killable(mm);
+ }
+
+ /*
--- /dev/null
+From 2cd76c50d0b41cec5c87abfcdf25b236a2793fb6 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Sat, 24 Jun 2023 11:17:05 -0700
+Subject: powerpc/mm: convert coprocessor fault to lock_mm_and_find_vma()
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit 2cd76c50d0b41cec5c87abfcdf25b236a2793fb6 upstream.
+
+This is one of the simple cases, except there's no pt_regs pointer.
+Which is fine, as lock_mm_and_find_vma() is set up to work fine with a
+NULL pt_regs.
+
+Powerpc already enabled LOCK_MM_AND_FIND_VMA for the main CPU faulting,
+so we can just use the helper without any extra work.
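
Condensed, the resulting code path looks like this (matching the diff below;
the only behavioural difference is that the lock is no longer held on failure):

    vma = lock_mm_and_find_vma(mm, ea, NULL);   /* NULL: no exception-table check */
    if (!vma)
        return -EFAULT;                         /* lock already dropped */

    /* mmap_lock is held for reading from here; the access checks and
     * handle_mm_fault() follow, then mmap_read_unlock(mm) as before */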
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/mm/copro_fault.c | 14 +++-----------
+ 1 file changed, 3 insertions(+), 11 deletions(-)
+
+--- a/arch/powerpc/mm/copro_fault.c
++++ b/arch/powerpc/mm/copro_fault.c
+@@ -33,19 +33,11 @@ int copro_handle_mm_fault(struct mm_stru
+ if (mm->pgd == NULL)
+ return -EFAULT;
+
+- mmap_read_lock(mm);
+- ret = -EFAULT;
+- vma = find_vma(mm, ea);
++ vma = lock_mm_and_find_vma(mm, ea, NULL);
+ if (!vma)
+- goto out_unlock;
+-
+- if (ea < vma->vm_start) {
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto out_unlock;
+- if (expand_stack(vma, ea))
+- goto out_unlock;
+- }
++ return -EFAULT;
+
++ ret = -EFAULT;
+ is_write = dsisr & DSISR_ISSTORE;
+ if (is_write) {
+ if (!(vma->vm_flags & VM_WRITE))
--- /dev/null
+From e6fe228c4ffafdfc970cf6d46883a1f481baf7ea Mon Sep 17 00:00:00 2001
+From: Michael Ellerman <mpe@ellerman.id.au>
+Date: Fri, 16 Jun 2023 15:51:29 +1000
+Subject: powerpc/mm: Convert to using lock_mm_and_find_vma()
+
+From: Michael Ellerman <mpe@ellerman.id.au>
+
+commit e6fe228c4ffafdfc970cf6d46883a1f481baf7ea upstream.
+
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/Kconfig | 1 +
+ arch/powerpc/mm/fault.c | 41 ++++-------------------------------------
+ 2 files changed, 5 insertions(+), 37 deletions(-)
+
+--- a/arch/powerpc/Kconfig
++++ b/arch/powerpc/Kconfig
+@@ -263,6 +263,7 @@ config PPC
+ select IRQ_DOMAIN
+ select IRQ_FORCED_THREADING
+ select KASAN_VMALLOC if KASAN && MODULES
++ select LOCK_MM_AND_FIND_VMA
+ select MMU_GATHER_PAGE_SIZE
+ select MMU_GATHER_RCU_TABLE_FREE
+ select MMU_GATHER_MERGE_VMAS
+--- a/arch/powerpc/mm/fault.c
++++ b/arch/powerpc/mm/fault.c
+@@ -84,11 +84,6 @@ static int __bad_area(struct pt_regs *re
+ return __bad_area_nosemaphore(regs, address, si_code);
+ }
+
+-static noinline int bad_area(struct pt_regs *regs, unsigned long address)
+-{
+- return __bad_area(regs, address, SEGV_MAPERR);
+-}
+-
+ static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address,
+ struct vm_area_struct *vma)
+ {
+@@ -481,40 +476,12 @@ static int ___do_page_fault(struct pt_re
+ * we will deadlock attempting to validate the fault against the
+ * address space. Luckily the kernel only validly references user
+ * space from well defined areas of code, which are listed in the
+- * exceptions table.
+- *
+- * As the vast majority of faults will be valid we will only perform
+- * the source reference check when there is a possibility of a deadlock.
+- * Attempt to lock the address space, if we cannot we then validate the
+- * source. If this is invalid we can skip the address space check,
+- * thus avoiding the deadlock.
+- */
+- if (unlikely(!mmap_read_trylock(mm))) {
+- if (!is_user && !search_exception_tables(regs->nip))
+- return bad_area_nosemaphore(regs, address);
+-
++ * exceptions table. lock_mm_and_find_vma() handles that logic.
++ */
+ retry:
+- mmap_read_lock(mm);
+- } else {
+- /*
+- * The above down_read_trylock() might have succeeded in
+- * which case we'll have missed the might_sleep() from
+- * down_read():
+- */
+- might_sleep();
+- }
+-
+- vma = find_vma(mm, address);
++ vma = lock_mm_and_find_vma(mm, address, regs);
+ if (unlikely(!vma))
+- return bad_area(regs, address);
+-
+- if (unlikely(vma->vm_start > address)) {
+- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
+- return bad_area(regs, address);
+-
+- if (unlikely(expand_stack(vma, address)))
+- return bad_area(regs, address);
+- }
++ return bad_area_nosemaphore(regs, address);
+
+ if (unlikely(access_pkey_error(is_write, is_exec,
+ (error_code & DSISR_KEYFAULT), vma)))
--- /dev/null
+From 7267ef7b0b77f4ed23b7b3c87d8eca7bd9c2d007 Mon Sep 17 00:00:00 2001
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Thu, 22 Jun 2023 20:18:18 +0200
+Subject: riscv/mm: Convert to using lock_mm_and_find_vma()
+
+From: Ben Hutchings <ben@decadent.org.uk>
+
+commit 7267ef7b0b77f4ed23b7b3c87d8eca7bd9c2d007 upstream.
+
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/riscv/Kconfig | 1 +
+ arch/riscv/mm/fault.c | 31 +++++++++++++------------------
+ 2 files changed, 14 insertions(+), 18 deletions(-)
+
+--- a/arch/riscv/Kconfig
++++ b/arch/riscv/Kconfig
+@@ -119,6 +119,7 @@ config RISCV
+ select HAVE_SYSCALL_TRACEPOINTS
+ select IRQ_DOMAIN
+ select IRQ_FORCED_THREADING
++ select LOCK_MM_AND_FIND_VMA
+ select MODULES_USE_ELF_RELA if MODULES
+ select MODULE_SECTIONS if MODULES
+ select OF
+--- a/arch/riscv/mm/fault.c
++++ b/arch/riscv/mm/fault.c
+@@ -83,13 +83,13 @@ static inline void mm_fault_error(struct
+ BUG();
+ }
+
+-static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr)
++static inline void
++bad_area_nosemaphore(struct pt_regs *regs, int code, unsigned long addr)
+ {
+ /*
+ * Something tried to access memory that isn't in our memory map.
+ * Fix it, but check if it's kernel or user first.
+ */
+- mmap_read_unlock(mm);
+ /* User mode accesses just cause a SIGSEGV */
+ if (user_mode(regs)) {
+ do_trap(regs, SIGSEGV, code, addr);
+@@ -99,6 +99,15 @@ static inline void bad_area(struct pt_re
+ no_context(regs, addr);
+ }
+
++static inline void
++bad_area(struct pt_regs *regs, struct mm_struct *mm, int code,
++ unsigned long addr)
++{
++ mmap_read_unlock(mm);
++
++ bad_area_nosemaphore(regs, code, addr);
++}
++
+ static inline void vmalloc_fault(struct pt_regs *regs, int code, unsigned long addr)
+ {
+ pgd_t *pgd, *pgd_k;
+@@ -286,23 +295,10 @@ asmlinkage void do_page_fault(struct pt_
+ else if (cause == EXC_INST_PAGE_FAULT)
+ flags |= FAULT_FLAG_INSTRUCTION;
+ retry:
+- mmap_read_lock(mm);
+- vma = find_vma(mm, addr);
++ vma = lock_mm_and_find_vma(mm, addr, regs);
+ if (unlikely(!vma)) {
+ tsk->thread.bad_cause = cause;
+- bad_area(regs, mm, code, addr);
+- return;
+- }
+- if (likely(vma->vm_start <= addr))
+- goto good_area;
+- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+- tsk->thread.bad_cause = cause;
+- bad_area(regs, mm, code, addr);
+- return;
+- }
+- if (unlikely(expand_stack(vma, addr))) {
+- tsk->thread.bad_cause = cause;
+- bad_area(regs, mm, code, addr);
++ bad_area_nosemaphore(regs, code, addr);
+ return;
+ }
+
+@@ -310,7 +306,6 @@ retry:
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it.
+ */
+-good_area:
+ code = SEGV_ACCERR;
+
+ if (unlikely(access_error(cause, vma))) {
cpufreq-amd-pstate-make-amd-pstate-epp-driver-name-hyphenated.patch
can-isotp-isotp_sendmsg-fix-return-error-fix-on-tx-path.patch
maple_tree-fix-potential-out-of-bounds-access-in-mas_wr_end_piv.patch
+
+mm-introduce-new-lock_mm_and_find_vma-page-fault-helper.patch
+mm-make-the-page-fault-mmap-locking-killable.patch
+arm64-mm-convert-to-using-lock_mm_and_find_vma.patch
+powerpc-mm-convert-to-using-lock_mm_and_find_vma.patch
+mips-mm-convert-to-using-lock_mm_and_find_vma.patch
+riscv-mm-convert-to-using-lock_mm_and_find_vma.patch
+arm-mm-convert-to-using-lock_mm_and_find_vma.patch
+mm-fault-convert-remaining-simple-cases-to-lock_mm_and_find_vma.patch
+powerpc-mm-convert-coprocessor-fault-to-lock_mm_and_find_vma.patch
+mm-make-find_extend_vma-fail-if-write-lock-not-held.patch
+execve-expand-new-process-stack-manually-ahead-of-time.patch
+mm-always-expand-the-stack-with-the-mmap-write-lock-held.patch
+gup-add-warning-if-some-caller-would-seem-to-want-stack-expansion.patch
--- /dev/null
+From 8b35ca3e45e35a26a21427f35d4093606e93ad0a Mon Sep 17 00:00:00 2001
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Thu, 22 Jun 2023 21:24:30 +0200
+Subject: arm/mm: Convert to using lock_mm_and_find_vma()
+
+From: Ben Hutchings <ben@decadent.org.uk>
+
+commit 8b35ca3e45e35a26a21427f35d4093606e93ad0a upstream.
+
+arm has an additional check for address < FIRST_USER_ADDRESS before
+expanding the stack. Since FIRST_USER_ADDRESS is defined everywhere
+(generally as 0), move that check to the generic expand_downwards().
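
In other words, the generic stack-expansion path now carries the check that
arm used to open-code, roughly as in the mm/mmap.c hunk below:

    /* in expand_downwards() */
    address &= PAGE_MASK;
    if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
        return -EPERM;      /* never grow a stack vma below either limit */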
+
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm/Kconfig | 1
+ arch/arm/mm/fault.c | 63 +++++++++++-----------------------------------------
+ mm/mmap.c | 2 -
+ 3 files changed, 16 insertions(+), 50 deletions(-)
+
+--- a/arch/arm/Kconfig
++++ b/arch/arm/Kconfig
+@@ -125,6 +125,7 @@ config ARM
+ select HAVE_UID16
+ select HAVE_VIRT_CPU_ACCOUNTING_GEN
+ select IRQ_FORCED_THREADING
++ select LOCK_MM_AND_FIND_VMA
+ select MODULES_USE_ELF_REL
+ select NEED_DMA_MAP_STATE
+ select OF_EARLY_FLATTREE if OF
+--- a/arch/arm/mm/fault.c
++++ b/arch/arm/mm/fault.c
+@@ -232,37 +232,11 @@ static inline bool is_permission_fault(u
+ return false;
+ }
+
+-static vm_fault_t __kprobes
+-__do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int flags,
+- unsigned long vma_flags, struct pt_regs *regs)
+-{
+- struct vm_area_struct *vma = find_vma(mm, addr);
+- if (unlikely(!vma))
+- return VM_FAULT_BADMAP;
+-
+- if (unlikely(vma->vm_start > addr)) {
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- return VM_FAULT_BADMAP;
+- if (addr < FIRST_USER_ADDRESS)
+- return VM_FAULT_BADMAP;
+- if (expand_stack(vma, addr))
+- return VM_FAULT_BADMAP;
+- }
+-
+- /*
+- * ok, we have a good vm_area for this memory access, check the
+- * permissions on the VMA allow for the fault which occurred.
+- */
+- if (!(vma->vm_flags & vma_flags))
+- return VM_FAULT_BADACCESS;
+-
+- return handle_mm_fault(vma, addr & PAGE_MASK, flags, regs);
+-}
+-
+ static int __kprobes
+ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
+ {
+ struct mm_struct *mm = current->mm;
++ struct vm_area_struct *vma;
+ int sig, code;
+ vm_fault_t fault;
+ unsigned int flags = FAULT_FLAG_DEFAULT;
+@@ -301,31 +275,21 @@ do_page_fault(unsigned long addr, unsign
+
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
+
+- /*
+- * As per x86, we may deadlock here. However, since the kernel only
+- * validly references user space from well defined areas of the code,
+- * we can bug out early if this is from code which shouldn't.
+- */
+- if (!mmap_read_trylock(mm)) {
+- if (!user_mode(regs) && !search_exception_tables(regs->ARM_pc))
+- goto no_context;
+ retry:
+- mmap_read_lock(mm);
+- } else {
+- /*
+- * The above down_read_trylock() might have succeeded in
+- * which case, we'll have missed the might_sleep() from
+- * down_read()
+- */
+- might_sleep();
+-#ifdef CONFIG_DEBUG_VM
+- if (!user_mode(regs) &&
+- !search_exception_tables(regs->ARM_pc))
+- goto no_context;
+-#endif
++ vma = lock_mm_and_find_vma(mm, addr, regs);
++ if (unlikely(!vma)) {
++ fault = VM_FAULT_BADMAP;
++ goto bad_area;
+ }
+
+- fault = __do_page_fault(mm, addr, flags, vm_flags, regs);
++ /*
++ * ok, we have a good vm_area for this memory access, check the
++ * permissions on the VMA allow for the fault which occurred.
++ */
++ if (!(vma->vm_flags & vm_flags))
++ fault = VM_FAULT_BADACCESS;
++ else
++ fault = handle_mm_fault(vma, addr & PAGE_MASK, flags, regs);
+
+ /* If we need to retry but a fatal signal is pending, handle the
+ * signal first. We do not need to release the mmap_lock because
+@@ -356,6 +320,7 @@ retry:
+ if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS))))
+ return 0;
+
++bad_area:
+ /*
+ * If we are in kernel mode at this point, we
+ * have no context to handle this fault with.
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -2036,7 +2036,7 @@ int expand_downwards(struct vm_area_stru
+ int error = 0;
+
+ address &= PAGE_MASK;
+- if (address < mmap_min_addr)
++ if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
+ return -EPERM;
+
+ /* Enforce stack_guard_gap */
--- /dev/null
+From ae870a68b5d13d67cf4f18d47bb01ee3fee40acb Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Thu, 15 Jun 2023 17:11:44 -0700
+Subject: arm64/mm: Convert to using lock_mm_and_find_vma()
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit ae870a68b5d13d67cf4f18d47bb01ee3fee40acb upstream.
+
+This converts arm64 to use the new page fault helper. It was very
+straightforward, but still needed a fix for the "obvious" conversion I
+initially did. Thanks to Suren for the fix and testing.
+
+Fixed-and-tested-by: Suren Baghdasaryan <surenb@google.com>
+Unnecessary-code-removal-by: Liam R. Howlett <Liam.Howlett@oracle.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/Kconfig | 1 +
+ arch/arm64/mm/fault.c | 47 ++++++++---------------------------------------
+ 2 files changed, 9 insertions(+), 39 deletions(-)
+
+--- a/arch/arm64/Kconfig
++++ b/arch/arm64/Kconfig
+@@ -225,6 +225,7 @@ config ARM64
+ select IRQ_DOMAIN
+ select IRQ_FORCED_THREADING
+ select KASAN_VMALLOC if KASAN
++ select LOCK_MM_AND_FIND_VMA
+ select MODULES_USE_ELF_RELA
+ select NEED_DMA_MAP_STATE
+ select NEED_SG_DMA_LENGTH
+--- a/arch/arm64/mm/fault.c
++++ b/arch/arm64/mm/fault.c
+@@ -483,27 +483,14 @@ static void do_bad_area(unsigned long fa
+ #define VM_FAULT_BADMAP ((__force vm_fault_t)0x010000)
+ #define VM_FAULT_BADACCESS ((__force vm_fault_t)0x020000)
+
+-static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr,
++static vm_fault_t __do_page_fault(struct mm_struct *mm,
++ struct vm_area_struct *vma, unsigned long addr,
+ unsigned int mm_flags, unsigned long vm_flags,
+ struct pt_regs *regs)
+ {
+- struct vm_area_struct *vma = find_vma(mm, addr);
+-
+- if (unlikely(!vma))
+- return VM_FAULT_BADMAP;
+-
+ /*
+ * Ok, we have a good vm_area for this memory access, so we can handle
+ * it.
+- */
+- if (unlikely(vma->vm_start > addr)) {
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- return VM_FAULT_BADMAP;
+- if (expand_stack(vma, addr))
+- return VM_FAULT_BADMAP;
+- }
+-
+- /*
+ * Check that the permissions on the VMA allow for the fault which
+ * occurred.
+ */
+@@ -617,31 +604,15 @@ static int __kprobes do_page_fault(unsig
+ }
+ lock_mmap:
+ #endif /* CONFIG_PER_VMA_LOCK */
+- /*
+- * As per x86, we may deadlock here. However, since the kernel only
+- * validly references user space from well defined areas of the code,
+- * we can bug out early if this is from code which shouldn't.
+- */
+- if (!mmap_read_trylock(mm)) {
+- if (!user_mode(regs) && !search_exception_tables(regs->pc))
+- goto no_context;
++
+ retry:
+- mmap_read_lock(mm);
+- } else {
+- /*
+- * The above mmap_read_trylock() might have succeeded in which
+- * case, we'll have missed the might_sleep() from down_read().
+- */
+- might_sleep();
+-#ifdef CONFIG_DEBUG_VM
+- if (!user_mode(regs) && !search_exception_tables(regs->pc)) {
+- mmap_read_unlock(mm);
+- goto no_context;
+- }
+-#endif
++ vma = lock_mm_and_find_vma(mm, addr, regs);
++ if (unlikely(!vma)) {
++ fault = VM_FAULT_BADMAP;
++ goto done;
+ }
+
+- fault = __do_page_fault(mm, addr, mm_flags, vm_flags, regs);
++ fault = __do_page_fault(mm, vma, addr, mm_flags, vm_flags, regs);
+
+ /* Quick path to respond to signals */
+ if (fault_signal_pending(fault, regs)) {
+@@ -660,9 +631,7 @@ retry:
+ }
+ mmap_read_unlock(mm);
+
+-#ifdef CONFIG_PER_VMA_LOCK
+ done:
+-#endif
+ /*
+ * Handle the "normal" (no error) case first.
+ */
--- /dev/null
+From f313c51d26aa87e69633c9b46efb37a930faca71 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Mon, 19 Jun 2023 11:34:15 -0700
+Subject: execve: expand new process stack manually ahead of time
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit f313c51d26aa87e69633c9b46efb37a930faca71 upstream.
+
+This is a small step towards a model where GUP itself would not expand
+the stack, and any user that needs GUP to not look up existing mappings,
+but actually expand on them, would have to do so manually before-hand,
+and with the mm lock held for writing.
+
+It turns out that execve() already did almost exactly that, except it
+didn't take the mm lock at all (it's single-threaded so no locking
+technically needed, but it could cause lockdep errors). And it only did
+it for the CONFIG_STACK_GROWSUP case, since in that case GUP has
+obviously never expanded the stack downwards.
+
+So just make that CONFIG_STACK_GROWSUP case do the right thing with
+locking, and enable it generally. This will eventually help GUP, and in
+the meantime avoids a special case and the lockdep issue.
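
Condensed, the new get_arg_page() write path becomes (simplified from the diff
below):

    if (write && pos < vma->vm_start) {
        mmap_write_lock(mm);                    /* expansion needs the write lock */
        ret = expand_downwards(vma, pos, true); /* "true": we really hold it */
        if (unlikely(ret < 0)) {
            mmap_write_unlock(mm);
            return NULL;
        }
        mmap_write_downgrade(mm);               /* GUP below only needs read */
    } else
        mmap_read_lock(mm);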
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/exec.c | 37 +++++++++++++++++++++----------------
+ 1 file changed, 21 insertions(+), 16 deletions(-)
+
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -200,34 +200,39 @@ static struct page *get_arg_page(struct
+ int write)
+ {
+ struct page *page;
++ struct vm_area_struct *vma = bprm->vma;
++ struct mm_struct *mm = bprm->mm;
+ int ret;
+- unsigned int gup_flags = 0;
+
+-#ifdef CONFIG_STACK_GROWSUP
+- if (write) {
+- /* We claim to hold the lock - nobody to race with */
+- ret = expand_downwards(bprm->vma, pos, true);
+- if (ret < 0)
++ /*
++ * Avoid relying on expanding the stack down in GUP (which
++ * does not work for STACK_GROWSUP anyway), and just do it
++ * by hand ahead of time.
++ */
++ if (write && pos < vma->vm_start) {
++ mmap_write_lock(mm);
++ ret = expand_downwards(vma, pos, true);
++ if (unlikely(ret < 0)) {
++ mmap_write_unlock(mm);
+ return NULL;
+- }
+-#endif
+-
+- if (write)
+- gup_flags |= FOLL_WRITE;
++ }
++ mmap_write_downgrade(mm);
++ } else
++ mmap_read_lock(mm);
+
+ /*
+ * We are doing an exec(). 'current' is the process
+- * doing the exec and bprm->mm is the new process's mm.
++ * doing the exec and 'mm' is the new process's mm.
+ */
+- mmap_read_lock(bprm->mm);
+- ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags,
++ ret = get_user_pages_remote(mm, pos, 1,
++ write ? FOLL_WRITE : 0,
+ &page, NULL, NULL);
+- mmap_read_unlock(bprm->mm);
++ mmap_read_unlock(mm);
+ if (ret <= 0)
+ return NULL;
+
+ if (write)
+- acct_arg_size(bprm, vma_pages(bprm->vma));
++ acct_arg_size(bprm, vma_pages(vma));
+
+ return page;
+ }
--- /dev/null
+From a425ac5365f6cb3cc47bf83e6bff0213c10445f7 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Sun, 25 Jun 2023 14:02:25 -0700
+Subject: gup: add warning if some caller would seem to want stack expansion
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit a425ac5365f6cb3cc47bf83e6bff0213c10445f7 upstream.
+
+It feels very unlikely that anybody would want to do a GUP in an
+unmapped area under the stack pointer, but real users sometimes do some
+really strange things. So add a (temporary) warning for the case where
+a GUP fails and expanding the stack might have made it work.
+
+It's trivial to do the expansion in the caller as part of getting the mm
+lock in the first place - see __access_remote_vm() for ptrace, for
+example - it's just that it's unnecessarily painful to do it deep in the
+guts of the GUP lookup when we might have to drop and re-take the lock.
+
+I doubt anybody actually does anything quite this strange, but let's be
+proactive: adding these warnings is simple, and will make debugging it
+much easier if they trigger.
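
The warning itself is cheap; in sketch form, GUP's vma lookup now does:

    vma = find_vma(mm, start);
    if (vma && start < vma->vm_start) {
        /* the address sits in the gap below a grows-down vma: expansion
         * might have mapped it, but GUP no longer does that */
        WARN_ON_ONCE(vma->vm_flags & VM_GROWSDOWN);
        vma = NULL;     /* treat the address as unmapped */
    }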
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/gup.c | 12 ++++++++++--
+ 1 file changed, 10 insertions(+), 2 deletions(-)
+
+--- a/mm/gup.c
++++ b/mm/gup.c
+@@ -1096,7 +1096,11 @@ static long __get_user_pages(struct mm_s
+
+ /* first iteration or cross vma bound */
+ if (!vma || start >= vma->vm_end) {
+- vma = vma_lookup(mm, start);
++ vma = find_vma(mm, start);
++ if (vma && (start < vma->vm_start)) {
++ WARN_ON_ONCE(vma->vm_flags & VM_GROWSDOWN);
++ vma = NULL;
++ }
+ if (!vma && in_gate_area(mm, start)) {
+ ret = get_gate_page(mm, start & PAGE_MASK,
+ gup_flags, &vma,
+@@ -1265,9 +1269,13 @@ int fixup_user_fault(struct mm_struct *m
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+
+ retry:
+- vma = vma_lookup(mm, address);
++ vma = find_vma(mm, address);
+ if (!vma)
+ return -EFAULT;
++ if (address < vma->vm_start ) {
++ WARN_ON_ONCE(vma->vm_flags & VM_GROWSDOWN);
++ return -EFAULT;
++ }
+
+ if (!vma_permits_fault(vma, fault_flags))
+ return -EFAULT;
--- /dev/null
+From 4bce37a68ff884e821a02a731897a8119e0c37b7 Mon Sep 17 00:00:00 2001
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Thu, 22 Jun 2023 18:47:40 +0200
+Subject: mips/mm: Convert to using lock_mm_and_find_vma()
+
+From: Ben Hutchings <ben@decadent.org.uk>
+
+commit 4bce37a68ff884e821a02a731897a8119e0c37b7 upstream.
+
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/mips/Kconfig | 1 +
+ arch/mips/mm/fault.c | 12 ++----------
+ 2 files changed, 3 insertions(+), 10 deletions(-)
+
+--- a/arch/mips/Kconfig
++++ b/arch/mips/Kconfig
+@@ -91,6 +91,7 @@ config MIPS
+ select HAVE_VIRT_CPU_ACCOUNTING_GEN if 64BIT || !SMP
+ select IRQ_FORCED_THREADING
+ select ISA if EISA
++ select LOCK_MM_AND_FIND_VMA
+ select MODULES_USE_ELF_REL if MODULES
+ select MODULES_USE_ELF_RELA if MODULES && 64BIT
+ select PERF_USE_VMALLOC
+--- a/arch/mips/mm/fault.c
++++ b/arch/mips/mm/fault.c
+@@ -99,21 +99,13 @@ static void __do_page_fault(struct pt_re
+
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+- mmap_read_lock(mm);
+- vma = find_vma(mm, address);
++ vma = lock_mm_and_find_vma(mm, address, regs);
+ if (!vma)
+- goto bad_area;
+- if (vma->vm_start <= address)
+- goto good_area;
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto bad_area;
+- if (expand_stack(vma, address))
+- goto bad_area;
++ goto bad_area_nosemaphore;
+ /*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+-good_area:
+ si_code = SEGV_ACCERR;
+
+ if (write) {
--- /dev/null
+From 8d7071af890768438c14db6172cc8f9f4d04e184 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Sat, 24 Jun 2023 13:45:51 -0700
+Subject: mm: always expand the stack with the mmap write lock held
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit 8d7071af890768438c14db6172cc8f9f4d04e184 upstream.
+
+This finishes the job of always holding the mmap write lock when
+extending the user stack vma, and removes the 'write_locked' argument
+from the vm helper functions again.
+
+For some cases, we just avoid expanding the stack at all: drivers and
+page pinning really shouldn't be extending any stacks. Let's see if any
+strange users really wanted that.
+
+It's worth noting that architectures that weren't converted to the new
+lock_mm_and_find_vma() helper function are left using the legacy
+"expand_stack()" function, but it has been changed to drop the mmap_lock
+and take it for writing while expanding the vma. This makes it fairly
+straightforward to convert the remaining architectures.
+
+As a result of dropping and re-taking the lock, the calling conventions
+for this function have also changed, since the old vma may no longer be
+valid. So it will now return the new vma if successful, and NULL - and
+the lock dropped - if the area could not be extended.
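
For the architectures still on the legacy path, the call sites change shape
roughly like this (generic labels, not any single architecture's exact code):

    /* old: the caller held mmap_lock for reading throughout */
    if (expand_stack(vma, address))
        goto bad_area;                  /* still holds the read lock */

    /* new: expand_stack() may drop the read lock, take it for writing,
     * expand, and downgrade again; the old vma pointer is stale */
    vma = expand_stack(mm, address);
    if (!vma)
        goto bad_area_nosemaphore;      /* lock already dropped */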
+
+Tested-by: Vegard Nossum <vegard.nossum@oracle.com>
+Tested-by: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de> # ia64
+Tested-by: Frank Scheiner <frank.scheiner@web.de> # ia64
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/ia64/mm/fault.c | 36 ++----------
+ arch/m68k/mm/fault.c | 9 ++-
+ arch/microblaze/mm/fault.c | 5 +
+ arch/openrisc/mm/fault.c | 5 +
+ arch/parisc/mm/fault.c | 23 +++-----
+ arch/s390/mm/fault.c | 5 +
+ arch/sparc/mm/fault_64.c | 8 +-
+ arch/um/kernel/trap.c | 11 ++-
+ drivers/iommu/amd/iommu_v2.c | 4 -
+ drivers/iommu/iommu-sva.c | 2
+ fs/binfmt_elf.c | 2
+ fs/exec.c | 4 -
+ include/linux/mm.h | 16 +----
+ mm/gup.c | 6 +-
+ mm/memory.c | 10 +++
+ mm/mmap.c | 121 ++++++++++++++++++++++++++++++++++---------
+ mm/nommu.c | 18 ++----
+ 17 files changed, 169 insertions(+), 116 deletions(-)
+
+--- a/arch/ia64/mm/fault.c
++++ b/arch/ia64/mm/fault.c
+@@ -110,10 +110,12 @@ retry:
+ * register backing store that needs to expand upwards, in
+ * this case vma will be null, but prev_vma will be non-null
+ */
+- if (( !vma && prev_vma ) || (address < vma->vm_start) )
+- goto check_expansion;
++ if (( !vma && prev_vma ) || (address < vma->vm_start) ) {
++ vma = expand_stack(mm, address);
++ if (!vma)
++ goto bad_area_nosemaphore;
++ }
+
+- good_area:
+ code = SEGV_ACCERR;
+
+ /* OK, we've got a good vm_area for this memory area. Check the access permissions: */
+@@ -177,35 +179,9 @@ retry:
+ mmap_read_unlock(mm);
+ return;
+
+- check_expansion:
+- if (!(prev_vma && (prev_vma->vm_flags & VM_GROWSUP) && (address == prev_vma->vm_end))) {
+- if (!vma)
+- goto bad_area;
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto bad_area;
+- if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start)
+- || REGION_OFFSET(address) >= RGN_MAP_LIMIT)
+- goto bad_area;
+- if (expand_stack(vma, address))
+- goto bad_area;
+- } else {
+- vma = prev_vma;
+- if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start)
+- || REGION_OFFSET(address) >= RGN_MAP_LIMIT)
+- goto bad_area;
+- /*
+- * Since the register backing store is accessed sequentially,
+- * we disallow growing it by more than a page at a time.
+- */
+- if (address > vma->vm_end + PAGE_SIZE - sizeof(long))
+- goto bad_area;
+- if (expand_upwards(vma, address))
+- goto bad_area;
+- }
+- goto good_area;
+-
+ bad_area:
+ mmap_read_unlock(mm);
++ bad_area_nosemaphore:
+ if ((isr & IA64_ISR_SP)
+ || ((isr & IA64_ISR_NA) && (isr & IA64_ISR_CODE_MASK) == IA64_ISR_CODE_LFETCH))
+ {
+--- a/arch/m68k/mm/fault.c
++++ b/arch/m68k/mm/fault.c
+@@ -105,8 +105,9 @@ retry:
+ if (address + 256 < rdusp())
+ goto map_err;
+ }
+- if (expand_stack(vma, address))
+- goto map_err;
++ vma = expand_stack(mm, address);
++ if (!vma)
++ goto map_err_nosemaphore;
+
+ /*
+ * Ok, we have a good vm_area for this memory access, so
+@@ -196,10 +197,12 @@ bus_err:
+ goto send_sig;
+
+ map_err:
++ mmap_read_unlock(mm);
++map_err_nosemaphore:
+ current->thread.signo = SIGSEGV;
+ current->thread.code = SEGV_MAPERR;
+ current->thread.faddr = address;
+- goto send_sig;
++ return send_fault_sig(regs);
+
+ acc_err:
+ current->thread.signo = SIGSEGV;
+--- a/arch/microblaze/mm/fault.c
++++ b/arch/microblaze/mm/fault.c
+@@ -192,8 +192,9 @@ retry:
+ && (kernel_mode(regs) || !store_updates_sp(regs)))
+ goto bad_area;
+ }
+- if (expand_stack(vma, address))
+- goto bad_area;
++ vma = expand_stack(mm, address);
++ if (!vma)
++ goto bad_area_nosemaphore;
+
+ good_area:
+ code = SEGV_ACCERR;
+--- a/arch/openrisc/mm/fault.c
++++ b/arch/openrisc/mm/fault.c
+@@ -127,8 +127,9 @@ retry:
+ if (address + PAGE_SIZE < regs->sp)
+ goto bad_area;
+ }
+- if (expand_stack(vma, address))
+- goto bad_area;
++ vma = expand_stack(mm, address);
++ if (!vma)
++ goto bad_area_nosemaphore;
+
+ /*
+ * Ok, we have a good vm_area for this memory access, so
+--- a/arch/parisc/mm/fault.c
++++ b/arch/parisc/mm/fault.c
+@@ -288,15 +288,19 @@ void do_page_fault(struct pt_regs *regs,
+ retry:
+ mmap_read_lock(mm);
+ vma = find_vma_prev(mm, address, &prev_vma);
+- if (!vma || address < vma->vm_start)
+- goto check_expansion;
++ if (!vma || address < vma->vm_start) {
++ if (!prev_vma || !(prev_vma->vm_flags & VM_GROWSUP))
++ goto bad_area;
++ vma = expand_stack(mm, address);
++ if (!vma)
++ goto bad_area_nosemaphore;
++ }
++
+ /*
+ * Ok, we have a good vm_area for this memory access. We still need to
+ * check the access permissions.
+ */
+
+-good_area:
+-
+ if ((vma->vm_flags & acc_type) != acc_type)
+ goto bad_area;
+
+@@ -347,17 +351,13 @@ good_area:
+ mmap_read_unlock(mm);
+ return;
+
+-check_expansion:
+- vma = prev_vma;
+- if (vma && (expand_stack(vma, address) == 0))
+- goto good_area;
+-
+ /*
+ * Something tried to access memory that isn't in our memory map..
+ */
+ bad_area:
+ mmap_read_unlock(mm);
+
++bad_area_nosemaphore:
+ if (user_mode(regs)) {
+ int signo, si_code;
+
+@@ -449,7 +449,7 @@ handle_nadtlb_fault(struct pt_regs *regs
+ {
+ unsigned long insn = regs->iir;
+ int breg, treg, xreg, val = 0;
+- struct vm_area_struct *vma, *prev_vma;
++ struct vm_area_struct *vma;
+ struct task_struct *tsk;
+ struct mm_struct *mm;
+ unsigned long address;
+@@ -485,7 +485,7 @@ handle_nadtlb_fault(struct pt_regs *regs
+ /* Search for VMA */
+ address = regs->ior;
+ mmap_read_lock(mm);
+- vma = find_vma_prev(mm, address, &prev_vma);
++ vma = vma_lookup(mm, address);
+ mmap_read_unlock(mm);
+
+ /*
+@@ -494,7 +494,6 @@ handle_nadtlb_fault(struct pt_regs *regs
+ */
+ acc_type = (insn & 0x40) ? VM_WRITE : VM_READ;
+ if (vma
+- && address >= vma->vm_start
+ && (vma->vm_flags & acc_type) == acc_type)
+ val = 1;
+ }
+--- a/arch/s390/mm/fault.c
++++ b/arch/s390/mm/fault.c
+@@ -457,8 +457,9 @@ retry:
+ if (unlikely(vma->vm_start > address)) {
+ if (!(vma->vm_flags & VM_GROWSDOWN))
+ goto out_up;
+- if (expand_stack(vma, address))
+- goto out_up;
++ vma = expand_stack(mm, address);
++ if (!vma)
++ goto out;
+ }
+
+ /*
+--- a/arch/sparc/mm/fault_64.c
++++ b/arch/sparc/mm/fault_64.c
+@@ -383,8 +383,9 @@ continue_fault:
+ goto bad_area;
+ }
+ }
+- if (expand_stack(vma, address))
+- goto bad_area;
++ vma = expand_stack(mm, address);
++ if (!vma)
++ goto bad_area_nosemaphore;
+ /*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+@@ -487,8 +488,9 @@ exit_exception:
+ * Fix it, but check if it's kernel or user first..
+ */
+ bad_area:
+- insn = get_fault_insn(regs, insn);
+ mmap_read_unlock(mm);
++bad_area_nosemaphore:
++ insn = get_fault_insn(regs, insn);
+
+ handle_kernel_fault:
+ do_kernel_fault(regs, si_code, fault_code, insn, address);
+--- a/arch/um/kernel/trap.c
++++ b/arch/um/kernel/trap.c
+@@ -47,14 +47,15 @@ retry:
+ vma = find_vma(mm, address);
+ if (!vma)
+ goto out;
+- else if (vma->vm_start <= address)
++ if (vma->vm_start <= address)
+ goto good_area;
+- else if (!(vma->vm_flags & VM_GROWSDOWN))
++ if (!(vma->vm_flags & VM_GROWSDOWN))
+ goto out;
+- else if (is_user && !ARCH_IS_STACKGROW(address))
+- goto out;
+- else if (expand_stack(vma, address))
++ if (is_user && !ARCH_IS_STACKGROW(address))
+ goto out;
++ vma = expand_stack(mm, address);
++ if (!vma)
++ goto out_nosemaphore;
+
+ good_area:
+ *code_out = SEGV_ACCERR;
+--- a/drivers/iommu/amd/iommu_v2.c
++++ b/drivers/iommu/amd/iommu_v2.c
+@@ -485,8 +485,8 @@ static void do_fault(struct work_struct
+ flags |= FAULT_FLAG_REMOTE;
+
+ mmap_read_lock(mm);
+- vma = find_extend_vma(mm, address);
+- if (!vma || address < vma->vm_start)
++ vma = vma_lookup(mm, address);
++ if (!vma)
+ /* failed to get a vma in the right range */
+ goto out;
+
+--- a/drivers/iommu/iommu-sva.c
++++ b/drivers/iommu/iommu-sva.c
+@@ -175,7 +175,7 @@ iommu_sva_handle_iopf(struct iommu_fault
+
+ mmap_read_lock(mm);
+
+- vma = find_extend_vma(mm, prm->addr);
++ vma = vma_lookup(mm, prm->addr);
+ if (!vma)
+ /* Unmapped area */
+ goto out_put_mm;
+--- a/fs/binfmt_elf.c
++++ b/fs/binfmt_elf.c
+@@ -322,7 +322,7 @@ create_elf_tables(struct linux_binprm *b
+ */
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+- vma = find_extend_vma_locked(mm, bprm->p, true);
++ vma = find_extend_vma_locked(mm, bprm->p);
+ mmap_write_unlock(mm);
+ if (!vma)
+ return -EFAULT;
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -211,7 +211,7 @@ static struct page *get_arg_page(struct
+ */
+ if (write && pos < vma->vm_start) {
+ mmap_write_lock(mm);
+- ret = expand_downwards(vma, pos, true);
++ ret = expand_downwards(vma, pos);
+ if (unlikely(ret < 0)) {
+ mmap_write_unlock(mm);
+ return NULL;
+@@ -859,7 +859,7 @@ int setup_arg_pages(struct linux_binprm
+ stack_base = vma->vm_end - stack_expand;
+ #endif
+ current->mm->start_stack = bprm->p;
+- ret = expand_stack_locked(vma, stack_base, true);
++ ret = expand_stack_locked(vma, stack_base);
+ if (ret)
+ ret = -EFAULT;
+
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -3192,18 +3192,11 @@ extern vm_fault_t filemap_page_mkwrite(s
+
+ extern unsigned long stack_guard_gap;
+ /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
+-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
+- bool write_locked);
+-#define expand_stack(vma,addr) expand_stack_locked(vma,addr,false)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address);
++struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr);
+
+ /* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */
+-int expand_downwards(struct vm_area_struct *vma, unsigned long address,
+- bool write_locked);
+-#if VM_GROWSUP
+-extern int expand_upwards(struct vm_area_struct *vma, unsigned long address);
+-#else
+- #define expand_upwards(vma, address) (0)
+-#endif
++int expand_downwards(struct vm_area_struct *vma, unsigned long address);
+
+ /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
+ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
+@@ -3298,9 +3291,8 @@ unsigned long change_prot_numa(struct vm
+ unsigned long start, unsigned long end);
+ #endif
+
+-struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
+ struct vm_area_struct *find_extend_vma_locked(struct mm_struct *,
+- unsigned long addr, bool write_locked);
++ unsigned long addr);
+ int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t);
+ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+--- a/mm/gup.c
++++ b/mm/gup.c
+@@ -1096,7 +1096,7 @@ static long __get_user_pages(struct mm_s
+
+ /* first iteration or cross vma bound */
+ if (!vma || start >= vma->vm_end) {
+- vma = find_extend_vma(mm, start);
++ vma = vma_lookup(mm, start);
+ if (!vma && in_gate_area(mm, start)) {
+ ret = get_gate_page(mm, start & PAGE_MASK,
+ gup_flags, &vma,
+@@ -1265,8 +1265,8 @@ int fixup_user_fault(struct mm_struct *m
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+
+ retry:
+- vma = find_extend_vma(mm, address);
+- if (!vma || address < vma->vm_start)
++ vma = vma_lookup(mm, address);
++ if (!vma)
+ return -EFAULT;
+
+ if (!vma_permits_fault(vma, fault_flags))
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -5368,7 +5368,7 @@ struct vm_area_struct *lock_mm_and_find_
+ goto fail;
+ }
+
+- if (expand_stack_locked(vma, addr, true))
++ if (expand_stack_locked(vma, addr))
+ goto fail;
+
+ success:
+@@ -5713,6 +5713,14 @@ int __access_remote_vm(struct mm_struct
+ if (mmap_read_lock_killable(mm))
+ return 0;
+
++ /* We might need to expand the stack to access it */
++ vma = vma_lookup(mm, addr);
++ if (!vma) {
++ vma = expand_stack(mm, addr);
++ if (!vma)
++ return 0;
++ }
++
+ /* ignore errors, just check how much was successfully transferred */
+ while (len) {
+ int bytes, ret, offset;
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -1935,8 +1935,7 @@ static int acct_stack_growth(struct vm_a
+ * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
+ * vma is the last one with address > vma->vm_end. Have to extend vma.
+ */
+-int expand_upwards(struct vm_area_struct *vma, unsigned long address,
+- bool write_locked)
++static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
+ {
+ struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *next;
+@@ -1960,8 +1959,6 @@ int expand_upwards(struct vm_area_struct
+ if (gap_addr < address || gap_addr > TASK_SIZE)
+ gap_addr = TASK_SIZE;
+
+- if (!write_locked)
+- return -EAGAIN;
+ next = find_vma_intersection(mm, vma->vm_end, gap_addr);
+ if (next && vma_is_accessible(next)) {
+ if (!(next->vm_flags & VM_GROWSUP))
+@@ -2030,15 +2027,18 @@ int expand_upwards(struct vm_area_struct
+
+ /*
+ * vma is the first one with address < vma->vm_start. Have to extend vma.
++ * mmap_lock held for writing.
+ */
+-int expand_downwards(struct vm_area_struct *vma, unsigned long address,
+- bool write_locked)
++int expand_downwards(struct vm_area_struct *vma, unsigned long address)
+ {
+ struct mm_struct *mm = vma->vm_mm;
+ MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start);
+ struct vm_area_struct *prev;
+ int error = 0;
+
++ if (!(vma->vm_flags & VM_GROWSDOWN))
++ return -EFAULT;
++
+ address &= PAGE_MASK;
+ if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
+ return -EPERM;
+@@ -2051,8 +2051,6 @@ int expand_downwards(struct vm_area_stru
+ vma_is_accessible(prev) &&
+ (address - prev->vm_end < stack_guard_gap))
+ return -ENOMEM;
+- if (!write_locked && (prev->vm_end == address))
+- return -EAGAIN;
+ }
+
+ if (mas_preallocate(&mas, GFP_KERNEL))
+@@ -2131,14 +2129,12 @@ static int __init cmdline_parse_stack_gu
+ __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
+
+ #ifdef CONFIG_STACK_GROWSUP
+-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
+- bool write_locked)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
+ {
+- return expand_upwards(vma, address, write_locked);
++ return expand_upwards(vma, address);
+ }
+
+-struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm,
+- unsigned long addr, bool write_locked)
++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
+ {
+ struct vm_area_struct *vma, *prev;
+
+@@ -2148,23 +2144,21 @@ struct vm_area_struct *find_extend_vma_l
+ return vma;
+ if (!prev)
+ return NULL;
+- if (expand_stack_locked(prev, addr, write_locked))
++ if (expand_stack_locked(prev, addr))
+ return NULL;
+ if (prev->vm_flags & VM_LOCKED)
+ populate_vma_page_range(prev, addr, prev->vm_end, NULL);
+ return prev;
+ }
+ #else
+-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
+- bool write_locked)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
+ {
+ if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
+ return -EINVAL;
+- return expand_downwards(vma, address, write_locked);
++ return expand_downwards(vma, address);
+ }
+
+-struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm,
+- unsigned long addr, bool write_locked)
++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
+ {
+ struct vm_area_struct *vma;
+ unsigned long start;
+@@ -2176,7 +2170,7 @@ struct vm_area_struct *find_extend_vma_l
+ if (vma->vm_start <= addr)
+ return vma;
+ start = vma->vm_start;
+- if (expand_stack_locked(vma, addr, write_locked))
++ if (expand_stack_locked(vma, addr))
+ return NULL;
+ if (vma->vm_flags & VM_LOCKED)
+ populate_vma_page_range(vma, addr, start, NULL);
+@@ -2184,12 +2178,91 @@ struct vm_area_struct *find_extend_vma_l
+ }
+ #endif
+
+-struct vm_area_struct *find_extend_vma(struct mm_struct *mm,
+- unsigned long addr)
++/*
++ * IA64 has some horrid mapping rules: it can expand both up and down,
++ * but with various special rules.
++ *
++ * We'll get rid of this architecture eventually, so the ugliness is
++ * temporary.
++ */
++#ifdef CONFIG_IA64
++static inline bool vma_expand_ok(struct vm_area_struct *vma, unsigned long addr)
++{
++ return REGION_NUMBER(addr) == REGION_NUMBER(vma->vm_start) &&
++ REGION_OFFSET(addr) < RGN_MAP_LIMIT;
++}
++
++/*
++ * IA64 stacks grow down, but there's a special register backing store
++ * that can grow up. Only sequentially, though, so the new address must
++ * match vm_end.
++ */
++static inline int vma_expand_up(struct vm_area_struct *vma, unsigned long addr)
++{
++ if (!vma_expand_ok(vma, addr))
++ return -EFAULT;
++ if (vma->vm_end != (addr & PAGE_MASK))
++ return -EFAULT;
++ return expand_upwards(vma, addr);
++}
++
++static inline bool vma_expand_down(struct vm_area_struct *vma, unsigned long addr)
++{
++ if (!vma_expand_ok(vma, addr))
++ return -EFAULT;
++ return expand_downwards(vma, addr);
++}
++
++#elif defined(CONFIG_STACK_GROWSUP)
++
++#define vma_expand_up(vma,addr) expand_upwards(vma, addr)
++#define vma_expand_down(vma, addr) (-EFAULT)
++
++#else
++
++#define vma_expand_up(vma,addr) (-EFAULT)
++#define vma_expand_down(vma, addr) expand_downwards(vma, addr)
++
++#endif
++
++/*
++ * expand_stack(): legacy interface for page faulting. Don't use unless
++ * you have to.
++ *
++ * This is called with the mm locked for reading, drops the lock, takes
++ * the lock for writing, tries to look up a vma again, expands it if
++ * necessary, and downgrades the lock to reading again.
++ *
++ * If no vma is found or it can't be expanded, it returns NULL and has
++ * dropped the lock.
++ */
++struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr)
+ {
+- return find_extend_vma_locked(mm, addr, false);
++ struct vm_area_struct *vma, *prev;
++
++ mmap_read_unlock(mm);
++ if (mmap_write_lock_killable(mm))
++ return NULL;
++
++ vma = find_vma_prev(mm, addr, &prev);
++ if (vma && vma->vm_start <= addr)
++ goto success;
++
++ if (prev && !vma_expand_up(prev, addr)) {
++ vma = prev;
++ goto success;
++ }
++
++ if (vma && !vma_expand_down(vma, addr))
++ goto success;
++
++ mmap_write_unlock(mm);
++ return NULL;
++
++success:
++ mmap_write_downgrade(mm);
++ return vma;
+ }
+-EXPORT_SYMBOL_GPL(find_extend_vma);
+
+ /*
+ * Ok - we have the memory areas we should free on a maple tree so release them,
+--- a/mm/nommu.c
++++ b/mm/nommu.c
+@@ -631,24 +631,20 @@ struct vm_area_struct *find_vma(struct m
+ EXPORT_SYMBOL(find_vma);
+
+ /*
+- * find a VMA
+- * - we don't extend stack VMAs under NOMMU conditions
+- */
+-struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
+-{
+- return find_vma(mm, addr);
+-}
+-
+-/*
+ * expand a stack to a given address
+ * - not supported under NOMMU conditions
+ */
+-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
+- bool write_locked)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long addr)
+ {
+ return -ENOMEM;
+ }
+
++struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr)
++{
++ mmap_read_unlock(mm);
++ return NULL;
++}
++
+ /*
+ * look up the first VMA exactly that exactly matches addr
+ * - should be called with mm->mmap_lock at least held readlocked
--- /dev/null
+From a050ba1e7422f2cc60ff8bfde3f96d34d00cb585 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Sat, 24 Jun 2023 10:55:38 -0700
+Subject: mm/fault: convert remaining simple cases to lock_mm_and_find_vma()
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit a050ba1e7422f2cc60ff8bfde3f96d34d00cb585 upstream.
+
+This does the simple pattern conversion of alpha, arc, csky, hexagon,
+loongarch, nios2, sh, sparc32, and xtensa to the lock_mm_and_find_vma()
+helper. They all have the regular fault handling pattern without odd
+special cases.
+
+The remaining architectures all have something that keeps us from a
+straightforward conversion: ia64 and parisc have stacks that can grow
+both up as well as down (and ia64 has special address region checks).
+
+And m68k, microblaze, openrisc, sparc64, and um end up having extra
+rules about only expanding the stack down a limited amount below the
+user space stack pointer. That is something that x86 used to do too
+(long long ago), and it probably could just be skipped, but it still
+makes the conversion less than trivial.
+
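+The m68k check is typical of what is meant here: a user-mode access is
+only allowed to grow the stack if it lands in a small window below the
+saved user stack pointer, along the lines of
+
+    if (address + 256 < rdusp())        /* too far below the user sp */
+        goto map_err;
+
+so those architectures keep an extra test in front of the stack
+expansion instead of using the common helper.
+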
+Note that this conversion was done manually and with the exception of
+alpha without any build testing, because I have a fairly limited cross-
+building environment. The cases are all simple, and I went through the
+changes several times, but...
+
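+With that caveat, the converted hunks below all follow the same shape,
+modulo the local label and variable names:
+
+    retry:
+        vma = lock_mm_and_find_vma(mm, address, regs);
+        if (unlikely(!vma))
+            goto bad_area_nosemaphore;  /* mmap_lock already dropped */
+
+The pre-existing bad_area path, which unlocks the mmap lock itself,
+stays around for failures detected after the vma has been found.
+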
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/alpha/Kconfig | 1 +
+ arch/alpha/mm/fault.c | 13 +++----------
+ arch/arc/Kconfig | 1 +
+ arch/arc/mm/fault.c | 11 +++--------
+ arch/csky/Kconfig | 1 +
+ arch/csky/mm/fault.c | 22 +++++-----------------
+ arch/hexagon/Kconfig | 1 +
+ arch/hexagon/mm/vm_fault.c | 18 ++++--------------
+ arch/loongarch/Kconfig | 1 +
+ arch/loongarch/mm/fault.c | 16 ++++++----------
+ arch/nios2/Kconfig | 1 +
+ arch/nios2/mm/fault.c | 17 ++---------------
+ arch/sh/Kconfig | 1 +
+ arch/sh/mm/fault.c | 17 ++---------------
+ arch/sparc/Kconfig | 1 +
+ arch/sparc/mm/fault_32.c | 32 ++++++++------------------------
+ arch/xtensa/Kconfig | 1 +
+ arch/xtensa/mm/fault.c | 14 +++-----------
+ 18 files changed, 45 insertions(+), 124 deletions(-)
+
+--- a/arch/alpha/Kconfig
++++ b/arch/alpha/Kconfig
+@@ -30,6 +30,7 @@ config ALPHA
+ select HAS_IOPORT
+ select HAVE_ARCH_AUDITSYSCALL
+ select HAVE_MOD_ARCH_SPECIFIC
++ select LOCK_MM_AND_FIND_VMA
+ select MODULES_USE_ELF_RELA
+ select ODD_RT_SIGACTION
+ select OLD_SIGSUSPEND
+--- a/arch/alpha/mm/fault.c
++++ b/arch/alpha/mm/fault.c
+@@ -119,20 +119,12 @@ do_page_fault(unsigned long address, uns
+ flags |= FAULT_FLAG_USER;
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+- mmap_read_lock(mm);
+- vma = find_vma(mm, address);
++ vma = lock_mm_and_find_vma(mm, address, regs);
+ if (!vma)
+- goto bad_area;
+- if (vma->vm_start <= address)
+- goto good_area;
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto bad_area;
+- if (expand_stack(vma, address))
+- goto bad_area;
++ goto bad_area_nosemaphore;
+
+ /* Ok, we have a good vm_area for this memory access, so
+ we can handle it. */
+- good_area:
+ si_code = SEGV_ACCERR;
+ if (cause < 0) {
+ if (!(vma->vm_flags & VM_EXEC))
+@@ -192,6 +184,7 @@ retry:
+ bad_area:
+ mmap_read_unlock(mm);
+
++ bad_area_nosemaphore:
+ if (user_mode(regs))
+ goto do_sigsegv;
+
+--- a/arch/arc/Kconfig
++++ b/arch/arc/Kconfig
+@@ -41,6 +41,7 @@ config ARC
+ select HAVE_PERF_EVENTS
+ select HAVE_SYSCALL_TRACEPOINTS
+ select IRQ_DOMAIN
++ select LOCK_MM_AND_FIND_VMA
+ select MODULES_USE_ELF_RELA
+ select OF
+ select OF_EARLY_FLATTREE
+--- a/arch/arc/mm/fault.c
++++ b/arch/arc/mm/fault.c
+@@ -113,15 +113,9 @@ void do_page_fault(unsigned long address
+
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+- mmap_read_lock(mm);
+-
+- vma = find_vma(mm, address);
++ vma = lock_mm_and_find_vma(mm, address, regs);
+ if (!vma)
+- goto bad_area;
+- if (unlikely(address < vma->vm_start)) {
+- if (!(vma->vm_flags & VM_GROWSDOWN) || expand_stack(vma, address))
+- goto bad_area;
+- }
++ goto bad_area_nosemaphore;
+
+ /*
+ * vm_area is good, now check permissions for this memory access
+@@ -161,6 +155,7 @@ retry:
+ bad_area:
+ mmap_read_unlock(mm);
+
++bad_area_nosemaphore:
+ /*
+ * Major/minor page fault accounting
+ * (in case of retry we only land here once)
+--- a/arch/csky/Kconfig
++++ b/arch/csky/Kconfig
+@@ -96,6 +96,7 @@ config CSKY
+ select HAVE_REGS_AND_STACK_ACCESS_API
+ select HAVE_STACKPROTECTOR
+ select HAVE_SYSCALL_TRACEPOINTS
++ select LOCK_MM_AND_FIND_VMA
+ select MAY_HAVE_SPARSE_IRQ
+ select MODULES_USE_ELF_RELA if MODULES
+ select OF
+--- a/arch/csky/mm/fault.c
++++ b/arch/csky/mm/fault.c
+@@ -97,13 +97,12 @@ static inline void mm_fault_error(struct
+ BUG();
+ }
+
+-static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr)
++static inline void bad_area_nosemaphore(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr)
+ {
+ /*
+ * Something tried to access memory that isn't in our memory map.
+ * Fix it, but check if it's kernel or user first.
+ */
+- mmap_read_unlock(mm);
+ /* User mode accesses just cause a SIGSEGV */
+ if (user_mode(regs)) {
+ do_trap(regs, SIGSEGV, code, addr);
+@@ -238,20 +237,9 @@ asmlinkage void do_page_fault(struct pt_
+ if (is_write(regs))
+ flags |= FAULT_FLAG_WRITE;
+ retry:
+- mmap_read_lock(mm);
+- vma = find_vma(mm, addr);
++ vma = lock_mm_and_find_vma(mm, addr, regs);
+ if (unlikely(!vma)) {
+- bad_area(regs, mm, code, addr);
+- return;
+- }
+- if (likely(vma->vm_start <= addr))
+- goto good_area;
+- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+- bad_area(regs, mm, code, addr);
+- return;
+- }
+- if (unlikely(expand_stack(vma, addr))) {
+- bad_area(regs, mm, code, addr);
++ bad_area_nosemaphore(regs, mm, code, addr);
+ return;
+ }
+
+@@ -259,11 +247,11 @@ retry:
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it.
+ */
+-good_area:
+ code = SEGV_ACCERR;
+
+ if (unlikely(access_error(regs, vma))) {
+- bad_area(regs, mm, code, addr);
++ mmap_read_unlock(mm);
++ bad_area_nosemaphore(regs, mm, code, addr);
+ return;
+ }
+
+--- a/arch/hexagon/Kconfig
++++ b/arch/hexagon/Kconfig
+@@ -28,6 +28,7 @@ config HEXAGON
+ select GENERIC_SMP_IDLE_THREAD
+ select STACKTRACE_SUPPORT
+ select GENERIC_CLOCKEVENTS_BROADCAST
++ select LOCK_MM_AND_FIND_VMA
+ select MODULES_USE_ELF_RELA
+ select GENERIC_CPU_DEVICES
+ select ARCH_WANT_LD_ORPHAN_WARN
+--- a/arch/hexagon/mm/vm_fault.c
++++ b/arch/hexagon/mm/vm_fault.c
+@@ -57,21 +57,10 @@ void do_page_fault(unsigned long address
+
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+- mmap_read_lock(mm);
+- vma = find_vma(mm, address);
+- if (!vma)
+- goto bad_area;
++ vma = lock_mm_and_find_vma(mm, address, regs);
++ if (unlikely(!vma))
++ goto bad_area_nosemaphore;
+
+- if (vma->vm_start <= address)
+- goto good_area;
+-
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto bad_area;
+-
+- if (expand_stack(vma, address))
+- goto bad_area;
+-
+-good_area:
+ /* Address space is OK. Now check access rights. */
+ si_code = SEGV_ACCERR;
+
+@@ -143,6 +132,7 @@ good_area:
+ bad_area:
+ mmap_read_unlock(mm);
+
++bad_area_nosemaphore:
+ if (user_mode(regs)) {
+ force_sig_fault(SIGSEGV, si_code, (void __user *)address);
+ return;
+--- a/arch/loongarch/Kconfig
++++ b/arch/loongarch/Kconfig
+@@ -130,6 +130,7 @@ config LOONGARCH
+ select HAVE_VIRT_CPU_ACCOUNTING_GEN if !SMP
+ select IRQ_FORCED_THREADING
+ select IRQ_LOONGARCH_CPU
++ select LOCK_MM_AND_FIND_VMA
+ select MMU_GATHER_MERGE_VMAS if MMU
+ select MODULES_USE_ELF_RELA if MODULES
+ select NEED_PER_CPU_EMBED_FIRST_CHUNK
+--- a/arch/loongarch/mm/fault.c
++++ b/arch/loongarch/mm/fault.c
+@@ -169,22 +169,18 @@ static void __kprobes __do_page_fault(st
+
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+- mmap_read_lock(mm);
+- vma = find_vma(mm, address);
+- if (!vma)
+- goto bad_area;
+- if (vma->vm_start <= address)
+- goto good_area;
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto bad_area;
+- if (!expand_stack(vma, address))
+- goto good_area;
++ vma = lock_mm_and_find_vma(mm, address, regs);
++ if (unlikely(!vma))
++ goto bad_area_nosemaphore;
++ goto good_area;
++
+ /*
+ * Something tried to access memory that isn't in our memory map..
+ * Fix it, but check if it's kernel or user first..
+ */
+ bad_area:
+ mmap_read_unlock(mm);
++bad_area_nosemaphore:
+ do_sigsegv(regs, write, address, si_code);
+ return;
+
+--- a/arch/nios2/Kconfig
++++ b/arch/nios2/Kconfig
+@@ -16,6 +16,7 @@ config NIOS2
+ select HAVE_ARCH_TRACEHOOK
+ select HAVE_ARCH_KGDB
+ select IRQ_DOMAIN
++ select LOCK_MM_AND_FIND_VMA
+ select MODULES_USE_ELF_RELA
+ select OF
+ select OF_EARLY_FLATTREE
+--- a/arch/nios2/mm/fault.c
++++ b/arch/nios2/mm/fault.c
+@@ -86,27 +86,14 @@ asmlinkage void do_page_fault(struct pt_
+
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+
+- if (!mmap_read_trylock(mm)) {
+- if (!user_mode(regs) && !search_exception_tables(regs->ea))
+- goto bad_area_nosemaphore;
+ retry:
+- mmap_read_lock(mm);
+- }
+-
+- vma = find_vma(mm, address);
++ vma = lock_mm_and_find_vma(mm, address, regs);
+ if (!vma)
+- goto bad_area;
+- if (vma->vm_start <= address)
+- goto good_area;
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto bad_area;
+- if (expand_stack(vma, address))
+- goto bad_area;
++ goto bad_area_nosemaphore;
+ /*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+-good_area:
+ code = SEGV_ACCERR;
+
+ switch (cause) {
+--- a/arch/sh/Kconfig
++++ b/arch/sh/Kconfig
+@@ -59,6 +59,7 @@ config SUPERH
+ select HAVE_STACKPROTECTOR
+ select HAVE_SYSCALL_TRACEPOINTS
+ select IRQ_FORCED_THREADING
++ select LOCK_MM_AND_FIND_VMA
+ select MODULES_USE_ELF_RELA
+ select NEED_SG_DMA_LENGTH
+ select NO_DMA if !MMU && !DMA_COHERENT
+--- a/arch/sh/mm/fault.c
++++ b/arch/sh/mm/fault.c
+@@ -439,21 +439,9 @@ asmlinkage void __kprobes do_page_fault(
+ }
+
+ retry:
+- mmap_read_lock(mm);
+-
+- vma = find_vma(mm, address);
++ vma = lock_mm_and_find_vma(mm, address, regs);
+ if (unlikely(!vma)) {
+- bad_area(regs, error_code, address);
+- return;
+- }
+- if (likely(vma->vm_start <= address))
+- goto good_area;
+- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+- bad_area(regs, error_code, address);
+- return;
+- }
+- if (unlikely(expand_stack(vma, address))) {
+- bad_area(regs, error_code, address);
++ bad_area_nosemaphore(regs, error_code, address);
+ return;
+ }
+
+@@ -461,7 +449,6 @@ retry:
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+-good_area:
+ if (unlikely(access_error(error_code, vma))) {
+ bad_area_access_error(regs, error_code, address);
+ return;
+--- a/arch/sparc/Kconfig
++++ b/arch/sparc/Kconfig
+@@ -57,6 +57,7 @@ config SPARC32
+ select DMA_DIRECT_REMAP
+ select GENERIC_ATOMIC64
+ select HAVE_UID16
++ select LOCK_MM_AND_FIND_VMA
+ select OLD_SIGACTION
+ select ZONE_DMA
+
+--- a/arch/sparc/mm/fault_32.c
++++ b/arch/sparc/mm/fault_32.c
+@@ -143,28 +143,19 @@ asmlinkage void do_sparc_fault(struct pt
+ if (pagefault_disabled() || !mm)
+ goto no_context;
+
++ if (!from_user && address >= PAGE_OFFSET)
++ goto no_context;
++
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+
+ retry:
+- mmap_read_lock(mm);
+-
+- if (!from_user && address >= PAGE_OFFSET)
+- goto bad_area;
+-
+- vma = find_vma(mm, address);
++ vma = lock_mm_and_find_vma(mm, address, regs);
+ if (!vma)
+- goto bad_area;
+- if (vma->vm_start <= address)
+- goto good_area;
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto bad_area;
+- if (expand_stack(vma, address))
+- goto bad_area;
++ goto bad_area_nosemaphore;
+ /*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+-good_area:
+ code = SEGV_ACCERR;
+ if (write) {
+ if (!(vma->vm_flags & VM_WRITE))
+@@ -321,17 +312,9 @@ static void force_user_fault(unsigned lo
+
+ code = SEGV_MAPERR;
+
+- mmap_read_lock(mm);
+- vma = find_vma(mm, address);
++ vma = lock_mm_and_find_vma(mm, address, NULL);
+ if (!vma)
+- goto bad_area;
+- if (vma->vm_start <= address)
+- goto good_area;
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto bad_area;
+- if (expand_stack(vma, address))
+- goto bad_area;
+-good_area:
++ goto bad_area_nosemaphore;
+ code = SEGV_ACCERR;
+ if (write) {
+ if (!(vma->vm_flags & VM_WRITE))
+@@ -350,6 +333,7 @@ good_area:
+ return;
+ bad_area:
+ mmap_read_unlock(mm);
++bad_area_nosemaphore:
+ __do_fault_siginfo(code, SIGSEGV, tsk->thread.kregs, address);
+ return;
+
+--- a/arch/xtensa/Kconfig
++++ b/arch/xtensa/Kconfig
+@@ -49,6 +49,7 @@ config XTENSA
+ select HAVE_SYSCALL_TRACEPOINTS
+ select HAVE_VIRT_CPU_ACCOUNTING_GEN
+ select IRQ_DOMAIN
++ select LOCK_MM_AND_FIND_VMA
+ select MODULES_USE_ELF_RELA
+ select PERF_USE_VMALLOC
+ select TRACE_IRQFLAGS_SUPPORT
+--- a/arch/xtensa/mm/fault.c
++++ b/arch/xtensa/mm/fault.c
+@@ -130,23 +130,14 @@ void do_page_fault(struct pt_regs *regs)
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+
+ retry:
+- mmap_read_lock(mm);
+- vma = find_vma(mm, address);
+-
++ vma = lock_mm_and_find_vma(mm, address, regs);
+ if (!vma)
+- goto bad_area;
+- if (vma->vm_start <= address)
+- goto good_area;
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto bad_area;
+- if (expand_stack(vma, address))
+- goto bad_area;
++ goto bad_area_nosemaphore;
+
+ /* Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+
+-good_area:
+ code = SEGV_ACCERR;
+
+ if (is_write) {
+@@ -205,6 +196,7 @@ good_area:
+ */
+ bad_area:
+ mmap_read_unlock(mm);
++bad_area_nosemaphore:
+ if (user_mode(regs)) {
+ force_sig_fault(SIGSEGV, code, (void *) address);
+ return;
--- /dev/null
+From c2508ec5a58db67093f4fb8bf89a9a7c53a109e9 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Thu, 15 Jun 2023 15:17:36 -0700
+Subject: mm: introduce new 'lock_mm_and_find_vma()' page fault helper
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit c2508ec5a58db67093f4fb8bf89a9a7c53a109e9 upstream.
+
+.. and make x86 use it.
+
+This basically extracts the existing x86 "find and expand faulting vma"
+code, but extends it to also take the mmap lock for writing in case we
+actually do need to expand the vma.
+
+We've historically short-circuited that case, and have some rather ugly
+special logic to serialize the stack segment expansion (since we only
+hold the mmap lock for reading) that doesn't match the normal VM
+locking.
+
+That slight violation of locking worked well, right up until it didn't:
+the maple tree code really does want proper locking even for simple
+extension of an existing vma.
+
+So extract the code for "look up the vma of the fault" from x86, fix it
+up to do the necessary write locking, and make it available as a helper
+function for other architectures that can use the common helper.
+
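+Condensed, the flow of the helper added below looks like this (a sketch;
+the real code also re-checks the exception tables before upgrading and
+keeps a skeleton for a future trylock-upgrade fast path):
+
+    if (!mmap_read_trylock(mm)) {
+        /* kernel faults must come from a known exception-table site */
+        if (regs && !user_mode(regs) &&
+            !search_exception_tables(instruction_pointer(regs)))
+            return NULL;
+        mmap_read_lock(mm);
+    }
+    vma = find_vma(mm, addr);
+    if (vma && vma->vm_start <= addr)
+        return vma;                     /* common case, read lock held */
+    if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
+        mmap_read_unlock(mm);
+        return NULL;
+    }
+
+    /* need to expand the stack: redo the lookup under the write lock */
+    mmap_read_unlock(mm);
+    mmap_write_lock(mm);
+    vma = find_vma(mm, addr);
+    if (!vma || !(vma->vm_flags & VM_GROWSDOWN) || expand_stack(vma, addr)) {
+        mmap_write_unlock(mm);
+        return NULL;
+    }
+    mmap_write_downgrade(mm);           /* hand the caller a read lock */
+    return vma;
+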
+Note: I say "common helper", but it really only handles the normal
+stack-grows-down case. Which is all architectures except for PA-RISC
+and IA64. So some rare architectures can't use the helper, but if they
+care they'll just need to open-code this logic.
+
+It's also worth pointing out that this code really would like to have an
+optimistic "mmap_upgrade_trylock()" to make it quicker to go from a
+read-lock (for the common case) to taking the write lock (for having to
+extend the vma) in the normal single-threaded situation where there is
+no other locking activity.
+
+But that _is_ all the very uncommon special case, so while it would be
+nice to have such an operation, it probably doesn't matter in reality.
+I did put in the skeleton code for such a possible future expansion,
+even if it only acts as pseudo-documentation for what we're doing.
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/Kconfig | 1
+ arch/x86/mm/fault.c | 52 ----------------------
+ include/linux/mm.h | 2
+ mm/Kconfig | 4 +
+ mm/memory.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++++++++
+ 5 files changed, 130 insertions(+), 50 deletions(-)
+
+--- a/arch/x86/Kconfig
++++ b/arch/x86/Kconfig
+@@ -276,6 +276,7 @@ config X86
+ select HAVE_GENERIC_VDSO
+ select HOTPLUG_SMT if SMP
+ select IRQ_FORCED_THREADING
++ select LOCK_MM_AND_FIND_VMA
+ select NEED_PER_CPU_EMBED_FIRST_CHUNK
+ select NEED_PER_CPU_PAGE_FIRST_CHUNK
+ select NEED_SG_DMA_LENGTH
+--- a/arch/x86/mm/fault.c
++++ b/arch/x86/mm/fault.c
+@@ -880,12 +880,6 @@ __bad_area(struct pt_regs *regs, unsigne
+ __bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
+ }
+
+-static noinline void
+-bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+-{
+- __bad_area(regs, error_code, address, 0, SEGV_MAPERR);
+-}
+-
+ static inline bool bad_area_access_from_pkeys(unsigned long error_code,
+ struct vm_area_struct *vma)
+ {
+@@ -1366,51 +1360,10 @@ void do_user_addr_fault(struct pt_regs *
+ lock_mmap:
+ #endif /* CONFIG_PER_VMA_LOCK */
+
+- /*
+- * Kernel-mode access to the user address space should only occur
+- * on well-defined single instructions listed in the exception
+- * tables. But, an erroneous kernel fault occurring outside one of
+- * those areas which also holds mmap_lock might deadlock attempting
+- * to validate the fault against the address space.
+- *
+- * Only do the expensive exception table search when we might be at
+- * risk of a deadlock. This happens if we
+- * 1. Failed to acquire mmap_lock, and
+- * 2. The access did not originate in userspace.
+- */
+- if (unlikely(!mmap_read_trylock(mm))) {
+- if (!user_mode(regs) && !search_exception_tables(regs->ip)) {
+- /*
+- * Fault from code in kernel from
+- * which we do not expect faults.
+- */
+- bad_area_nosemaphore(regs, error_code, address);
+- return;
+- }
+ retry:
+- mmap_read_lock(mm);
+- } else {
+- /*
+- * The above down_read_trylock() might have succeeded in
+- * which case we'll have missed the might_sleep() from
+- * down_read():
+- */
+- might_sleep();
+- }
+-
+- vma = find_vma(mm, address);
++ vma = lock_mm_and_find_vma(mm, address, regs);
+ if (unlikely(!vma)) {
+- bad_area(regs, error_code, address);
+- return;
+- }
+- if (likely(vma->vm_start <= address))
+- goto good_area;
+- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+- bad_area(regs, error_code, address);
+- return;
+- }
+- if (unlikely(expand_stack(vma, address))) {
+- bad_area(regs, error_code, address);
++ bad_area_nosemaphore(regs, error_code, address);
+ return;
+ }
+
+@@ -1418,7 +1371,6 @@ retry:
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+-good_area:
+ if (unlikely(access_error(error_code, vma))) {
+ bad_area_access_error(regs, error_code, address, vma);
+ return;
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -2325,6 +2325,8 @@ void unmap_mapping_pages(struct address_
+ pgoff_t start, pgoff_t nr, bool even_cows);
+ void unmap_mapping_range(struct address_space *mapping,
+ loff_t const holebegin, loff_t const holelen, int even_cows);
++struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
++ unsigned long address, struct pt_regs *regs);
+ #else
+ static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags,
+--- a/mm/Kconfig
++++ b/mm/Kconfig
+@@ -1206,6 +1206,10 @@ config PER_VMA_LOCK
+ This feature allows locking each virtual memory area separately when
+ handling page faults instead of taking mmap_lock.
+
++config LOCK_MM_AND_FIND_VMA
++ bool
++ depends on !STACK_GROWSUP
++
+ source "mm/damon/Kconfig"
+
+ endmenu
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -5262,6 +5262,127 @@ out:
+ }
+ EXPORT_SYMBOL_GPL(handle_mm_fault);
+
++#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
++#include <linux/extable.h>
++
++static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
++{
++ /* Even if this succeeds, make it clear we *might* have slept */
++ if (likely(mmap_read_trylock(mm))) {
++ might_sleep();
++ return true;
++ }
++
++ if (regs && !user_mode(regs)) {
++ unsigned long ip = instruction_pointer(regs);
++ if (!search_exception_tables(ip))
++ return false;
++ }
++
++ mmap_read_lock(mm);
++ return true;
++}
++
++static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
++{
++ /*
++ * We don't have this operation yet.
++ *
++ * It should be easy enough to do: it's basically a
++ * atomic_long_try_cmpxchg_acquire()
++ * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
++ * it also needs the proper lockdep magic etc.
++ */
++ return false;
++}
++
++static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
++{
++ mmap_read_unlock(mm);
++ if (regs && !user_mode(regs)) {
++ unsigned long ip = instruction_pointer(regs);
++ if (!search_exception_tables(ip))
++ return false;
++ }
++ mmap_write_lock(mm);
++ return true;
++}
++
++/*
++ * Helper for page fault handling.
++ *
++ * This is kind of equivalent to "mmap_read_lock()" followed
++ * by "find_extend_vma()", except it's a lot more careful about
++ * the locking (and will drop the lock on failure).
++ *
++ * For example, if we have a kernel bug that causes a page
++ * fault, we don't want to just use mmap_read_lock() to get
++ * the mm lock, because that would deadlock if the bug were
++ * to happen while we're holding the mm lock for writing.
++ *
++ * So this checks the exception tables on kernel faults in
++ * order to only do this all for instructions that are actually
++ * expected to fault.
++ *
++ * We can also actually take the mm lock for writing if we
++ * need to extend the vma, which helps the VM layer a lot.
++ */
++struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
++ unsigned long addr, struct pt_regs *regs)
++{
++ struct vm_area_struct *vma;
++
++ if (!get_mmap_lock_carefully(mm, regs))
++ return NULL;
++
++ vma = find_vma(mm, addr);
++ if (likely(vma && (vma->vm_start <= addr)))
++ return vma;
++
++ /*
++ * Well, dang. We might still be successful, but only
++ * if we can extend a vma to do so.
++ */
++ if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
++ mmap_read_unlock(mm);
++ return NULL;
++ }
++
++ /*
++ * We can try to upgrade the mmap lock atomically,
++ * in which case we can continue to use the vma
++ * we already looked up.
++ *
++ * Otherwise we'll have to drop the mmap lock and
++ * re-take it, and also look up the vma again,
++ * re-checking it.
++ */
++ if (!mmap_upgrade_trylock(mm)) {
++ if (!upgrade_mmap_lock_carefully(mm, regs))
++ return NULL;
++
++ vma = find_vma(mm, addr);
++ if (!vma)
++ goto fail;
++ if (vma->vm_start <= addr)
++ goto success;
++ if (!(vma->vm_flags & VM_GROWSDOWN))
++ goto fail;
++ }
++
++ if (expand_stack(vma, addr))
++ goto fail;
++
++success:
++ mmap_write_downgrade(mm);
++ return vma;
++
++fail:
++ mmap_write_unlock(mm);
++ return NULL;
++}
++#endif
++
+ #ifdef CONFIG_PER_VMA_LOCK
+ /*
+ * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
--- /dev/null
+From f440fa1ac955e2898893f9301568435eb5cdfc4b Mon Sep 17 00:00:00 2001
+From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
+Date: Fri, 16 Jun 2023 15:58:54 -0700
+Subject: mm: make find_extend_vma() fail if write lock not held
+
+From: Liam R. Howlett <Liam.Howlett@oracle.com>
+
+commit f440fa1ac955e2898893f9301568435eb5cdfc4b upstream.
+
+Make calls to extend_vma() and find_extend_vma() fail if the write lock
+is required.
+
+To avoid making this a flag-day event, this still allows the old
+read-locking case for the trivial situations, and passes in a flag to
+say "is it write-locked". That way write-lockers can say "yes, I'm
+being careful", and legacy users will continue to work in all the common
+cases until they have been fully converted to the new world order.
+
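+Concretely, write-locked callers now say so explicitly, while the old
+spelling keeps the read-locked behaviour:
+
+    /* caller already holds mmap_lock for writing, e.g. setup_arg_pages() */
+    ret = expand_stack_locked(vma, stack_base, true);
+
+    /* legacy read-locked callers are unchanged for now */
+    #define expand_stack(vma,addr) expand_stack_locked(vma,addr,false)
+
+In the cases where expansion really does need the write lock but
+write_locked is false, expand_upwards()/expand_downwards() now bail out
+with -EAGAIN instead of racing.
+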
+Co-Developed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
+Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
+Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/binfmt_elf.c | 6 +++---
+ fs/exec.c | 5 +++--
+ include/linux/mm.h | 10 +++++++---
+ mm/memory.c | 2 +-
+ mm/mmap.c | 50 +++++++++++++++++++++++++++++++++-----------------
+ mm/nommu.c | 3 ++-
+ 6 files changed, 49 insertions(+), 27 deletions(-)
+
+--- a/fs/binfmt_elf.c
++++ b/fs/binfmt_elf.c
+@@ -320,10 +320,10 @@ create_elf_tables(struct linux_binprm *b
+ * Grow the stack manually; some architectures have a limit on how
+ * far ahead a user-space access may be in order to grow the stack.
+ */
+- if (mmap_read_lock_killable(mm))
++ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+- vma = find_extend_vma(mm, bprm->p);
+- mmap_read_unlock(mm);
++ vma = find_extend_vma_locked(mm, bprm->p, true);
++ mmap_write_unlock(mm);
+ if (!vma)
+ return -EFAULT;
+
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -205,7 +205,8 @@ static struct page *get_arg_page(struct
+
+ #ifdef CONFIG_STACK_GROWSUP
+ if (write) {
+- ret = expand_downwards(bprm->vma, pos);
++ /* We claim to hold the lock - nobody to race with */
++ ret = expand_downwards(bprm->vma, pos, true);
+ if (ret < 0)
+ return NULL;
+ }
+@@ -853,7 +854,7 @@ int setup_arg_pages(struct linux_binprm
+ stack_base = vma->vm_end - stack_expand;
+ #endif
+ current->mm->start_stack = bprm->p;
+- ret = expand_stack(vma, stack_base);
++ ret = expand_stack_locked(vma, stack_base, true);
+ if (ret)
+ ret = -EFAULT;
+
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -3192,11 +3192,13 @@ extern vm_fault_t filemap_page_mkwrite(s
+
+ extern unsigned long stack_guard_gap;
+ /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
+-extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
++ bool write_locked);
++#define expand_stack(vma,addr) expand_stack_locked(vma,addr,false)
+
+ /* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */
+-extern int expand_downwards(struct vm_area_struct *vma,
+- unsigned long address);
++int expand_downwards(struct vm_area_struct *vma, unsigned long address,
++ bool write_locked);
+ #if VM_GROWSUP
+ extern int expand_upwards(struct vm_area_struct *vma, unsigned long address);
+ #else
+@@ -3297,6 +3299,8 @@ unsigned long change_prot_numa(struct vm
+ #endif
+
+ struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *,
++ unsigned long addr, bool write_locked);
+ int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t);
+ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -5368,7 +5368,7 @@ struct vm_area_struct *lock_mm_and_find_
+ goto fail;
+ }
+
+- if (expand_stack(vma, addr))
++ if (expand_stack_locked(vma, addr, true))
+ goto fail;
+
+ success:
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -1935,7 +1935,8 @@ static int acct_stack_growth(struct vm_a
+ * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
+ * vma is the last one with address > vma->vm_end. Have to extend vma.
+ */
+-int expand_upwards(struct vm_area_struct *vma, unsigned long address)
++int expand_upwards(struct vm_area_struct *vma, unsigned long address,
++ bool write_locked)
+ {
+ struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *next;
+@@ -1959,6 +1960,8 @@ int expand_upwards(struct vm_area_struct
+ if (gap_addr < address || gap_addr > TASK_SIZE)
+ gap_addr = TASK_SIZE;
+
++ if (!write_locked)
++ return -EAGAIN;
+ next = find_vma_intersection(mm, vma->vm_end, gap_addr);
+ if (next && vma_is_accessible(next)) {
+ if (!(next->vm_flags & VM_GROWSUP))
+@@ -2028,7 +2031,8 @@ int expand_upwards(struct vm_area_struct
+ /*
+ * vma is the first one with address < vma->vm_start. Have to extend vma.
+ */
+-int expand_downwards(struct vm_area_struct *vma, unsigned long address)
++int expand_downwards(struct vm_area_struct *vma, unsigned long address,
++ bool write_locked)
+ {
+ struct mm_struct *mm = vma->vm_mm;
+ MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start);
+@@ -2042,10 +2046,13 @@ int expand_downwards(struct vm_area_stru
+ /* Enforce stack_guard_gap */
+ prev = mas_prev(&mas, 0);
+ /* Check that both stack segments have the same anon_vma? */
+- if (prev && !(prev->vm_flags & VM_GROWSDOWN) &&
+- vma_is_accessible(prev)) {
+- if (address - prev->vm_end < stack_guard_gap)
++ if (prev) {
++ if (!(prev->vm_flags & VM_GROWSDOWN) &&
++ vma_is_accessible(prev) &&
++ (address - prev->vm_end < stack_guard_gap))
+ return -ENOMEM;
++ if (!write_locked && (prev->vm_end == address))
++ return -EAGAIN;
+ }
+
+ if (mas_preallocate(&mas, GFP_KERNEL))
+@@ -2124,13 +2131,14 @@ static int __init cmdline_parse_stack_gu
+ __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
+
+ #ifdef CONFIG_STACK_GROWSUP
+-int expand_stack(struct vm_area_struct *vma, unsigned long address)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
++ bool write_locked)
+ {
+- return expand_upwards(vma, address);
++ return expand_upwards(vma, address, write_locked);
+ }
+
+-struct vm_area_struct *
+-find_extend_vma(struct mm_struct *mm, unsigned long addr)
++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm,
++ unsigned long addr, bool write_locked)
+ {
+ struct vm_area_struct *vma, *prev;
+
+@@ -2138,20 +2146,25 @@ find_extend_vma(struct mm_struct *mm, un
+ vma = find_vma_prev(mm, addr, &prev);
+ if (vma && (vma->vm_start <= addr))
+ return vma;
+- if (!prev || expand_stack(prev, addr))
++ if (!prev)
++ return NULL;
++ if (expand_stack_locked(prev, addr, write_locked))
+ return NULL;
+ if (prev->vm_flags & VM_LOCKED)
+ populate_vma_page_range(prev, addr, prev->vm_end, NULL);
+ return prev;
+ }
+ #else
+-int expand_stack(struct vm_area_struct *vma, unsigned long address)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
++ bool write_locked)
+ {
+- return expand_downwards(vma, address);
++ if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
++ return -EINVAL;
++ return expand_downwards(vma, address, write_locked);
+ }
+
+-struct vm_area_struct *
+-find_extend_vma(struct mm_struct *mm, unsigned long addr)
++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm,
++ unsigned long addr, bool write_locked)
+ {
+ struct vm_area_struct *vma;
+ unsigned long start;
+@@ -2162,10 +2175,8 @@ find_extend_vma(struct mm_struct *mm, un
+ return NULL;
+ if (vma->vm_start <= addr)
+ return vma;
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- return NULL;
+ start = vma->vm_start;
+- if (expand_stack(vma, addr))
++ if (expand_stack_locked(vma, addr, write_locked))
+ return NULL;
+ if (vma->vm_flags & VM_LOCKED)
+ populate_vma_page_range(vma, addr, start, NULL);
+@@ -2173,6 +2184,11 @@ find_extend_vma(struct mm_struct *mm, un
+ }
+ #endif
+
++struct vm_area_struct *find_extend_vma(struct mm_struct *mm,
++ unsigned long addr)
++{
++ return find_extend_vma_locked(mm, addr, false);
++}
+ EXPORT_SYMBOL_GPL(find_extend_vma);
+
+ /*
+--- a/mm/nommu.c
++++ b/mm/nommu.c
+@@ -643,7 +643,8 @@ struct vm_area_struct *find_extend_vma(s
+ * expand a stack to a given address
+ * - not supported under NOMMU conditions
+ */
+-int expand_stack(struct vm_area_struct *vma, unsigned long address)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
++ bool write_locked)
+ {
+ return -ENOMEM;
+ }
--- /dev/null
+From eda0047296a16d65a7f2bc60a408f70d178b2014 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Thu, 15 Jun 2023 16:17:48 -0700
+Subject: mm: make the page fault mmap locking killable
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit eda0047296a16d65a7f2bc60a408f70d178b2014 upstream.
+
+This is done as a separate patch from introducing the new
+lock_mm_and_find_vma() helper, because while it's an obvious change,
+it's not what x86 used to do in this area.
+
+We already abort the page fault on fatal signals anyway, so why should
+we wait for the mmap lock only to then abort later? With the new helper
+function that returns without the lock held on failure anyway, this is
+particularly easy and straightforward.
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/memory.c | 6 ++----
+ 1 file changed, 2 insertions(+), 4 deletions(-)
+
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -5279,8 +5279,7 @@ static inline bool get_mmap_lock_careful
+ return false;
+ }
+
+- mmap_read_lock(mm);
+- return true;
++ return !mmap_read_lock_killable(mm);
+ }
+
+ static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
+@@ -5304,8 +5303,7 @@ static inline bool upgrade_mmap_lock_car
+ if (!search_exception_tables(ip))
+ return false;
+ }
+- mmap_write_lock(mm);
+- return true;
++ return !mmap_write_lock_killable(mm);
+ }
+
+ /*
--- /dev/null
+From 2cd76c50d0b41cec5c87abfcdf25b236a2793fb6 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Sat, 24 Jun 2023 11:17:05 -0700
+Subject: powerpc/mm: convert coprocessor fault to lock_mm_and_find_vma()
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit 2cd76c50d0b41cec5c87abfcdf25b236a2793fb6 upstream.
+
+This is one of the simple cases, except there's no pt_regs pointer.
+Which is fine, as lock_mm_and_find_vma() is set up to work fine with a
+NULL pt_regs.
+
+Powerpc already enabled LOCK_MM_AND_FIND_VMA for the main CPU faulting,
+so we can just use the helper without any extra work.
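+Since there is no pt_regs here, the call site simply passes NULL, which
+makes the helper skip its kernel exception-table check:
+
+    vma = lock_mm_and_find_vma(mm, ea, NULL);
+    if (!vma)
+        return -EFAULT;     /* mmap_lock already released by the helper */
+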
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/mm/copro_fault.c | 14 +++-----------
+ 1 file changed, 3 insertions(+), 11 deletions(-)
+
+--- a/arch/powerpc/mm/copro_fault.c
++++ b/arch/powerpc/mm/copro_fault.c
+@@ -33,19 +33,11 @@ int copro_handle_mm_fault(struct mm_stru
+ if (mm->pgd == NULL)
+ return -EFAULT;
+
+- mmap_read_lock(mm);
+- ret = -EFAULT;
+- vma = find_vma(mm, ea);
++ vma = lock_mm_and_find_vma(mm, ea, NULL);
+ if (!vma)
+- goto out_unlock;
+-
+- if (ea < vma->vm_start) {
+- if (!(vma->vm_flags & VM_GROWSDOWN))
+- goto out_unlock;
+- if (expand_stack(vma, ea))
+- goto out_unlock;
+- }
++ return -EFAULT;
+
++ ret = -EFAULT;
+ is_write = dsisr & DSISR_ISSTORE;
+ if (is_write) {
+ if (!(vma->vm_flags & VM_WRITE))
--- /dev/null
+From e6fe228c4ffafdfc970cf6d46883a1f481baf7ea Mon Sep 17 00:00:00 2001
+From: Michael Ellerman <mpe@ellerman.id.au>
+Date: Fri, 16 Jun 2023 15:51:29 +1000
+Subject: powerpc/mm: Convert to using lock_mm_and_find_vma()
+
+From: Michael Ellerman <mpe@ellerman.id.au>
+
+commit e6fe228c4ffafdfc970cf6d46883a1f481baf7ea upstream.
+
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/Kconfig | 1 +
+ arch/powerpc/mm/fault.c | 41 ++++-------------------------------------
+ 2 files changed, 5 insertions(+), 37 deletions(-)
+
+--- a/arch/powerpc/Kconfig
++++ b/arch/powerpc/Kconfig
+@@ -278,6 +278,7 @@ config PPC
+ select IRQ_DOMAIN
+ select IRQ_FORCED_THREADING
+ select KASAN_VMALLOC if KASAN && MODULES
++ select LOCK_MM_AND_FIND_VMA
+ select MMU_GATHER_PAGE_SIZE
+ select MMU_GATHER_RCU_TABLE_FREE
+ select MMU_GATHER_MERGE_VMAS
+--- a/arch/powerpc/mm/fault.c
++++ b/arch/powerpc/mm/fault.c
+@@ -84,11 +84,6 @@ static int __bad_area(struct pt_regs *re
+ return __bad_area_nosemaphore(regs, address, si_code);
+ }
+
+-static noinline int bad_area(struct pt_regs *regs, unsigned long address)
+-{
+- return __bad_area(regs, address, SEGV_MAPERR);
+-}
+-
+ static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address,
+ struct vm_area_struct *vma)
+ {
+@@ -515,40 +510,12 @@ lock_mmap:
+ * we will deadlock attempting to validate the fault against the
+ * address space. Luckily the kernel only validly references user
+ * space from well defined areas of code, which are listed in the
+- * exceptions table.
+- *
+- * As the vast majority of faults will be valid we will only perform
+- * the source reference check when there is a possibility of a deadlock.
+- * Attempt to lock the address space, if we cannot we then validate the
+- * source. If this is invalid we can skip the address space check,
+- * thus avoiding the deadlock.
+- */
+- if (unlikely(!mmap_read_trylock(mm))) {
+- if (!is_user && !search_exception_tables(regs->nip))
+- return bad_area_nosemaphore(regs, address);
+-
++ * exceptions table. lock_mm_and_find_vma() handles that logic.
++ */
+ retry:
+- mmap_read_lock(mm);
+- } else {
+- /*
+- * The above down_read_trylock() might have succeeded in
+- * which case we'll have missed the might_sleep() from
+- * down_read():
+- */
+- might_sleep();
+- }
+-
+- vma = find_vma(mm, address);
++ vma = lock_mm_and_find_vma(mm, address, regs);
+ if (unlikely(!vma))
+- return bad_area(regs, address);
+-
+- if (unlikely(vma->vm_start > address)) {
+- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
+- return bad_area(regs, address);
+-
+- if (unlikely(expand_stack(vma, address)))
+- return bad_area(regs, address);
+- }
++ return bad_area_nosemaphore(regs, address);
+
+ if (unlikely(access_pkey_error(is_write, is_exec,
+ (error_code & DSISR_KEYFAULT), vma)))
--- /dev/null
+From 7267ef7b0b77f4ed23b7b3c87d8eca7bd9c2d007 Mon Sep 17 00:00:00 2001
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Thu, 22 Jun 2023 20:18:18 +0200
+Subject: riscv/mm: Convert to using lock_mm_and_find_vma()
+
+From: Ben Hutchings <ben@decadent.org.uk>
+
+commit 7267ef7b0b77f4ed23b7b3c87d8eca7bd9c2d007 upstream.
+
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/riscv/Kconfig | 1 +
+ arch/riscv/mm/fault.c | 31 +++++++++++++------------------
+ 2 files changed, 14 insertions(+), 18 deletions(-)
+
+--- a/arch/riscv/Kconfig
++++ b/arch/riscv/Kconfig
+@@ -126,6 +126,7 @@ config RISCV
+ select IRQ_DOMAIN
+ select IRQ_FORCED_THREADING
+ select KASAN_VMALLOC if KASAN
++ select LOCK_MM_AND_FIND_VMA
+ select MODULES_USE_ELF_RELA if MODULES
+ select MODULE_SECTIONS if MODULES
+ select OF
+--- a/arch/riscv/mm/fault.c
++++ b/arch/riscv/mm/fault.c
+@@ -84,13 +84,13 @@ static inline void mm_fault_error(struct
+ BUG();
+ }
+
+-static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr)
++static inline void
++bad_area_nosemaphore(struct pt_regs *regs, int code, unsigned long addr)
+ {
+ /*
+ * Something tried to access memory that isn't in our memory map.
+ * Fix it, but check if it's kernel or user first.
+ */
+- mmap_read_unlock(mm);
+ /* User mode accesses just cause a SIGSEGV */
+ if (user_mode(regs)) {
+ do_trap(regs, SIGSEGV, code, addr);
+@@ -100,6 +100,15 @@ static inline void bad_area(struct pt_re
+ no_context(regs, addr);
+ }
+
++static inline void
++bad_area(struct pt_regs *regs, struct mm_struct *mm, int code,
++ unsigned long addr)
++{
++ mmap_read_unlock(mm);
++
++ bad_area_nosemaphore(regs, code, addr);
++}
++
+ static inline void vmalloc_fault(struct pt_regs *regs, int code, unsigned long addr)
+ {
+ pgd_t *pgd, *pgd_k;
+@@ -287,23 +296,10 @@ void handle_page_fault(struct pt_regs *r
+ else if (cause == EXC_INST_PAGE_FAULT)
+ flags |= FAULT_FLAG_INSTRUCTION;
+ retry:
+- mmap_read_lock(mm);
+- vma = find_vma(mm, addr);
++ vma = lock_mm_and_find_vma(mm, addr, regs);
+ if (unlikely(!vma)) {
+ tsk->thread.bad_cause = cause;
+- bad_area(regs, mm, code, addr);
+- return;
+- }
+- if (likely(vma->vm_start <= addr))
+- goto good_area;
+- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+- tsk->thread.bad_cause = cause;
+- bad_area(regs, mm, code, addr);
+- return;
+- }
+- if (unlikely(expand_stack(vma, addr))) {
+- tsk->thread.bad_cause = cause;
+- bad_area(regs, mm, code, addr);
++ bad_area_nosemaphore(regs, code, addr);
+ return;
+ }
+
+@@ -311,7 +307,6 @@ retry:
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it.
+ */
+-good_area:
+ code = SEGV_ACCERR;
+
+ if (unlikely(access_error(cause, vma))) {
cpufreq-amd-pstate-make-amd-pstate-epp-driver-name-hyphenated.patch
can-isotp-isotp_sendmsg-fix-return-error-fix-on-tx-path.patch
maple_tree-fix-potential-out-of-bounds-access-in-mas_wr_end_piv.patch
+mm-introduce-new-lock_mm_and_find_vma-page-fault-helper.patch
+mm-make-the-page-fault-mmap-locking-killable.patch
+arm64-mm-convert-to-using-lock_mm_and_find_vma.patch
+powerpc-mm-convert-to-using-lock_mm_and_find_vma.patch
+mips-mm-convert-to-using-lock_mm_and_find_vma.patch
+riscv-mm-convert-to-using-lock_mm_and_find_vma.patch
+arm-mm-convert-to-using-lock_mm_and_find_vma.patch
+mm-fault-convert-remaining-simple-cases-to-lock_mm_and_find_vma.patch
+powerpc-mm-convert-coprocessor-fault-to-lock_mm_and_find_vma.patch
+mm-make-find_extend_vma-fail-if-write-lock-not-held.patch
+execve-expand-new-process-stack-manually-ahead-of-time.patch
+mm-always-expand-the-stack-with-the-mmap-write-lock-held.patch
+gup-add-warning-if-some-caller-would-seem-to-want-stack-expansion.patch