git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
some mm patches for 6.1, 6.3, and 6.4
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 27 Jun 2023 19:25:07 +0000 (21:25 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 29 Jun 2023 15:37:49 +0000 (17:37 +0200)
41 files changed:
queue-6.1/arm-mm-convert-to-using-lock_mm_and_find_vma.patch [new file with mode: 0644]
queue-6.1/arm64-mm-convert-to-using-lock_mm_and_find_vma.patch [new file with mode: 0644]
queue-6.1/execve-expand-new-process-stack-manually-ahead-of-time.patch [new file with mode: 0644]
queue-6.1/mips-mm-convert-to-using-lock_mm_and_find_vma.patch [new file with mode: 0644]
queue-6.1/mm-always-expand-the-stack-with-the-mmap-write-lock-held.patch [new file with mode: 0644]
queue-6.1/mm-fault-convert-remaining-simple-cases-to-lock_mm_and_find_vma.patch [new file with mode: 0644]
queue-6.1/mm-introduce-new-lock_mm_and_find_vma-page-fault-helper.patch [new file with mode: 0644]
queue-6.1/mm-make-find_extend_vma-fail-if-write-lock-not-held.patch [new file with mode: 0644]
queue-6.1/mm-make-the-page-fault-mmap-locking-killable.patch [new file with mode: 0644]
queue-6.1/powerpc-mm-convert-coprocessor-fault-to-lock_mm_and_find_vma.patch [new file with mode: 0644]
queue-6.1/powerpc-mm-convert-to-using-lock_mm_and_find_vma.patch [new file with mode: 0644]
queue-6.1/riscv-mm-convert-to-using-lock_mm_and_find_vma.patch [new file with mode: 0644]
queue-6.1/series
queue-6.3/arm-mm-convert-to-using-lock_mm_and_find_vma.patch [new file with mode: 0644]
queue-6.3/arm64-mm-convert-to-using-lock_mm_and_find_vma.patch [new file with mode: 0644]
queue-6.3/execve-expand-new-process-stack-manually-ahead-of-time.patch [new file with mode: 0644]
queue-6.3/gup-add-warning-if-some-caller-would-seem-to-want-stack-expansion.patch [new file with mode: 0644]
queue-6.3/mips-mm-convert-to-using-lock_mm_and_find_vma.patch [new file with mode: 0644]
queue-6.3/mm-always-expand-the-stack-with-the-mmap-write-lock-held.patch [new file with mode: 0644]
queue-6.3/mm-fault-convert-remaining-simple-cases-to-lock_mm_and_find_vma.patch [new file with mode: 0644]
queue-6.3/mm-introduce-new-lock_mm_and_find_vma-page-fault-helper.patch [new file with mode: 0644]
queue-6.3/mm-make-find_extend_vma-fail-if-write-lock-not-held.patch [new file with mode: 0644]
queue-6.3/mm-make-the-page-fault-mmap-locking-killable.patch [new file with mode: 0644]
queue-6.3/powerpc-mm-convert-coprocessor-fault-to-lock_mm_and_find_vma.patch [new file with mode: 0644]
queue-6.3/powerpc-mm-convert-to-using-lock_mm_and_find_vma.patch [new file with mode: 0644]
queue-6.3/riscv-mm-convert-to-using-lock_mm_and_find_vma.patch [new file with mode: 0644]
queue-6.3/series
queue-6.4/arm-mm-convert-to-using-lock_mm_and_find_vma.patch [new file with mode: 0644]
queue-6.4/arm64-mm-convert-to-using-lock_mm_and_find_vma.patch [new file with mode: 0644]
queue-6.4/execve-expand-new-process-stack-manually-ahead-of-time.patch [new file with mode: 0644]
queue-6.4/gup-add-warning-if-some-caller-would-seem-to-want-stack-expansion.patch [new file with mode: 0644]
queue-6.4/mips-mm-convert-to-using-lock_mm_and_find_vma.patch [new file with mode: 0644]
queue-6.4/mm-always-expand-the-stack-with-the-mmap-write-lock-held.patch [new file with mode: 0644]
queue-6.4/mm-fault-convert-remaining-simple-cases-to-lock_mm_and_find_vma.patch [new file with mode: 0644]
queue-6.4/mm-introduce-new-lock_mm_and_find_vma-page-fault-helper.patch [new file with mode: 0644]
queue-6.4/mm-make-find_extend_vma-fail-if-write-lock-not-held.patch [new file with mode: 0644]
queue-6.4/mm-make-the-page-fault-mmap-locking-killable.patch [new file with mode: 0644]
queue-6.4/powerpc-mm-convert-coprocessor-fault-to-lock_mm_and_find_vma.patch [new file with mode: 0644]
queue-6.4/powerpc-mm-convert-to-using-lock_mm_and_find_vma.patch [new file with mode: 0644]
queue-6.4/riscv-mm-convert-to-using-lock_mm_and_find_vma.patch [new file with mode: 0644]
queue-6.4/series

diff --git a/queue-6.1/arm-mm-convert-to-using-lock_mm_and_find_vma.patch b/queue-6.1/arm-mm-convert-to-using-lock_mm_and_find_vma.patch
new file mode 100644 (file)
index 0000000..d5583bb
--- /dev/null
@@ -0,0 +1,138 @@
+From f8c4e35d716b886d05595706af1be757fede502d Mon Sep 17 00:00:00 2001
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Thu, 22 Jun 2023 21:24:30 +0200
+Subject: arm/mm: Convert to using lock_mm_and_find_vma()
+
+From: Ben Hutchings <ben@decadent.org.uk>
+
+commit 8b35ca3e45e35a26a21427f35d4093606e93ad0a upstream.
+
+arm has an additional check for address < FIRST_USER_ADDRESS before
+expanding the stack.  Since FIRST_USER_ADDRESS is defined everywhere
+(generally as 0), move that check to the generic expand_downwards().
+
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Samuel Mendoza-Jonas <samjonas@amazon.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm/Kconfig    |    1 
+ arch/arm/mm/fault.c |   63 +++++++++++-----------------------------------------
+ mm/mmap.c           |    2 -
+ 3 files changed, 16 insertions(+), 50 deletions(-)
+
+--- a/arch/arm/Kconfig
++++ b/arch/arm/Kconfig
+@@ -122,6 +122,7 @@ config ARM
+       select HAVE_UID16
+       select HAVE_VIRT_CPU_ACCOUNTING_GEN
+       select IRQ_FORCED_THREADING
++      select LOCK_MM_AND_FIND_VMA
+       select MODULES_USE_ELF_REL
+       select NEED_DMA_MAP_STATE
+       select OF_EARLY_FLATTREE if OF
+--- a/arch/arm/mm/fault.c
++++ b/arch/arm/mm/fault.c
+@@ -231,37 +231,11 @@ static inline bool is_permission_fault(u
+       return false;
+ }
+-static vm_fault_t __kprobes
+-__do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int flags,
+-              unsigned long vma_flags, struct pt_regs *regs)
+-{
+-      struct vm_area_struct *vma = find_vma(mm, addr);
+-      if (unlikely(!vma))
+-              return VM_FAULT_BADMAP;
+-
+-      if (unlikely(vma->vm_start > addr)) {
+-              if (!(vma->vm_flags & VM_GROWSDOWN))
+-                      return VM_FAULT_BADMAP;
+-              if (addr < FIRST_USER_ADDRESS)
+-                      return VM_FAULT_BADMAP;
+-              if (expand_stack(vma, addr))
+-                      return VM_FAULT_BADMAP;
+-      }
+-
+-      /*
+-       * ok, we have a good vm_area for this memory access, check the
+-       * permissions on the VMA allow for the fault which occurred.
+-       */
+-      if (!(vma->vm_flags & vma_flags))
+-              return VM_FAULT_BADACCESS;
+-
+-      return handle_mm_fault(vma, addr & PAGE_MASK, flags, regs);
+-}
+-
+ static int __kprobes
+ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
+ {
+       struct mm_struct *mm = current->mm;
++      struct vm_area_struct *vma;
+       int sig, code;
+       vm_fault_t fault;
+       unsigned int flags = FAULT_FLAG_DEFAULT;
+@@ -300,31 +274,21 @@ do_page_fault(unsigned long addr, unsign
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
+-      /*
+-       * As per x86, we may deadlock here.  However, since the kernel only
+-       * validly references user space from well defined areas of the code,
+-       * we can bug out early if this is from code which shouldn't.
+-       */
+-      if (!mmap_read_trylock(mm)) {
+-              if (!user_mode(regs) && !search_exception_tables(regs->ARM_pc))
+-                      goto no_context;
+ retry:
+-              mmap_read_lock(mm);
+-      } else {
+-              /*
+-               * The above down_read_trylock() might have succeeded in
+-               * which case, we'll have missed the might_sleep() from
+-               * down_read()
+-               */
+-              might_sleep();
+-#ifdef CONFIG_DEBUG_VM
+-              if (!user_mode(regs) &&
+-                  !search_exception_tables(regs->ARM_pc))
+-                      goto no_context;
+-#endif
++      vma = lock_mm_and_find_vma(mm, addr, regs);
++      if (unlikely(!vma)) {
++              fault = VM_FAULT_BADMAP;
++              goto bad_area;
+       }
+-      fault = __do_page_fault(mm, addr, flags, vm_flags, regs);
++      /*
++       * ok, we have a good vm_area for this memory access, check the
++       * permissions on the VMA allow for the fault which occurred.
++       */
++      if (!(vma->vm_flags & vm_flags))
++              fault = VM_FAULT_BADACCESS;
++      else
++              fault = handle_mm_fault(vma, addr & PAGE_MASK, flags, regs);
+       /* If we need to retry but a fatal signal is pending, handle the
+        * signal first. We do not need to release the mmap_lock because
+@@ -355,6 +319,7 @@ retry:
+       if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS))))
+               return 0;
++bad_area:
+       /*
+        * If we are in kernel mode at this point, we
+        * have no context to handle this fault with.
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -2045,7 +2045,7 @@ int expand_downwards(struct vm_area_stru
+       int error = 0;
+       address &= PAGE_MASK;
+-      if (address < mmap_min_addr)
++      if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
+               return -EPERM;
+       /* Enforce stack_guard_gap */
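
For reference, a minimal sketch (not part of any patch in this queue, using illustrative function names) of the fault-handler shape that the lock_mm_and_find_vma() conversions in this series converge on. The helper itself is added by queue-6.1/mm-introduce-new-lock_mm_and_find_vma-page-fault-helper.patch (listed above, body not shown here): it takes the mmap read lock, looks up the VMA and, for user-mode faults, expands the stack when the address permits it; when it returns NULL the lock is no longer held, which is why the converted handlers jump to a bad_area-style label placed after the unlock.

#include <linux/mm.h>
#include <linux/ptrace.h>

/* Sketch only: retry and pending-signal handling omitted for brevity. */
static vm_fault_t sketch_handle_user_fault(struct mm_struct *mm,
					   unsigned long addr,
					   struct pt_regs *regs)
{
	struct vm_area_struct *vma;
	vm_fault_t fault;

	vma = lock_mm_and_find_vma(mm, addr, regs);
	if (unlikely(!vma))
		return VM_FAULT_SIGSEGV;	/* mmap lock already dropped */

	fault = handle_mm_fault(vma, addr & PAGE_MASK, FAULT_FLAG_DEFAULT, regs);
	if (!(fault & VM_FAULT_RETRY))
		mmap_read_unlock(mm);		/* on retry the lock was dropped */
	return fault;
}

The real handlers loop back to their retry: label, placed just before the helper call in the arm hunk above, when handle_mm_fault() asks for a retry.
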
diff --git a/queue-6.1/arm64-mm-convert-to-using-lock_mm_and_find_vma.patch b/queue-6.1/arm64-mm-convert-to-using-lock_mm_and_find_vma.patch
new file mode 100644 (file)
index 0000000..4320d8e
--- /dev/null
@@ -0,0 +1,120 @@
+From a45a8a9f70fe70c7c4479b9256b1eb1b5774df64 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Thu, 15 Jun 2023 17:11:44 -0700
+Subject: arm64/mm: Convert to using lock_mm_and_find_vma()
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit ae870a68b5d13d67cf4f18d47bb01ee3fee40acb upstream.
+
+This converts arm64 to use the new page fault helper.  It was very
+straightforward, but still needed a fix for the "obvious" conversion I
+initially did.  Thanks to Suren for the fix and testing.
+
+Fixed-and-tested-by: Suren Baghdasaryan <surenb@google.com>
+Unnecessary-code-removal-by: Liam R. Howlett <Liam.Howlett@oracle.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+[6.1: Ignore CONFIG_PER_VMA_LOCK context]
+Signed-off-by: Samuel Mendoza-Jonas <samjonas@amazon.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/Kconfig    |    1 +
+ arch/arm64/mm/fault.c |   46 +++++++++-------------------------------------
+ 2 files changed, 10 insertions(+), 37 deletions(-)
+
+--- a/arch/arm64/Kconfig
++++ b/arch/arm64/Kconfig
+@@ -211,6 +211,7 @@ config ARM64
+       select IRQ_DOMAIN
+       select IRQ_FORCED_THREADING
+       select KASAN_VMALLOC if KASAN
++      select LOCK_MM_AND_FIND_VMA
+       select MODULES_USE_ELF_RELA
+       select NEED_DMA_MAP_STATE
+       select NEED_SG_DMA_LENGTH
+--- a/arch/arm64/mm/fault.c
++++ b/arch/arm64/mm/fault.c
+@@ -483,27 +483,14 @@ static void do_bad_area(unsigned long fa
+ #define VM_FAULT_BADMAP               ((__force vm_fault_t)0x010000)
+ #define VM_FAULT_BADACCESS    ((__force vm_fault_t)0x020000)
+-static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr,
++static vm_fault_t __do_page_fault(struct mm_struct *mm,
++                                struct vm_area_struct *vma, unsigned long addr,
+                                 unsigned int mm_flags, unsigned long vm_flags,
+                                 struct pt_regs *regs)
+ {
+-      struct vm_area_struct *vma = find_vma(mm, addr);
+-
+-      if (unlikely(!vma))
+-              return VM_FAULT_BADMAP;
+-
+       /*
+        * Ok, we have a good vm_area for this memory access, so we can handle
+        * it.
+-       */
+-      if (unlikely(vma->vm_start > addr)) {
+-              if (!(vma->vm_flags & VM_GROWSDOWN))
+-                      return VM_FAULT_BADMAP;
+-              if (expand_stack(vma, addr))
+-                      return VM_FAULT_BADMAP;
+-      }
+-
+-      /*
+        * Check that the permissions on the VMA allow for the fault which
+        * occurred.
+        */
+@@ -535,6 +522,7 @@ static int __kprobes do_page_fault(unsig
+       unsigned long vm_flags;
+       unsigned int mm_flags = FAULT_FLAG_DEFAULT;
+       unsigned long addr = untagged_addr(far);
++      struct vm_area_struct *vma;
+       if (kprobe_page_fault(regs, esr))
+               return 0;
+@@ -585,31 +573,14 @@ static int __kprobes do_page_fault(unsig
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
+-      /*
+-       * As per x86, we may deadlock here. However, since the kernel only
+-       * validly references user space from well defined areas of the code,
+-       * we can bug out early if this is from code which shouldn't.
+-       */
+-      if (!mmap_read_trylock(mm)) {
+-              if (!user_mode(regs) && !search_exception_tables(regs->pc))
+-                      goto no_context;
+ retry:
+-              mmap_read_lock(mm);
+-      } else {
+-              /*
+-               * The above mmap_read_trylock() might have succeeded in which
+-               * case, we'll have missed the might_sleep() from down_read().
+-               */
+-              might_sleep();
+-#ifdef CONFIG_DEBUG_VM
+-              if (!user_mode(regs) && !search_exception_tables(regs->pc)) {
+-                      mmap_read_unlock(mm);
+-                      goto no_context;
+-              }
+-#endif
++      vma = lock_mm_and_find_vma(mm, addr, regs);
++      if (unlikely(!vma)) {
++              fault = VM_FAULT_BADMAP;
++              goto done;
+       }
+-      fault = __do_page_fault(mm, addr, mm_flags, vm_flags, regs);
++      fault = __do_page_fault(mm, vma, addr, mm_flags, vm_flags, regs);
+       /* Quick path to respond to signals */
+       if (fault_signal_pending(fault, regs)) {
+@@ -628,6 +599,7 @@ retry:
+       }
+       mmap_read_unlock(mm);
++done:
+       /*
+        * Handle the "normal" (no error) case first.
+        */
diff --git a/queue-6.1/execve-expand-new-process-stack-manually-ahead-of-time.patch b/queue-6.1/execve-expand-new-process-stack-manually-ahead-of-time.patch
new file mode 100644 (file)
index 0000000..833c47b
--- /dev/null
@@ -0,0 +1,91 @@
+From 9e1f3d01ba1f6ffa0ad902d594b1b44619568b74 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Mon, 19 Jun 2023 11:34:15 -0700
+Subject: execve: expand new process stack manually ahead of time
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit f313c51d26aa87e69633c9b46efb37a930faca71 upstream.
+
+This is a small step towards a model where GUP itself would not expand
+the stack, and any user that needs GUP to not look up existing mappings,
+but actually expand on them, would have to do so manually before-hand,
+and with the mm lock held for writing.
+
+It turns out that execve() already did almost exactly that, except it
+didn't take the mm lock at all (it's single-threaded so no locking
+technically needed, but it could cause lockdep errors).  And it only did
+it for the CONFIG_STACK_GROWSUP case, since in that case GUP has
+obviously never expanded the stack downwards.
+
+So just make that CONFIG_STACK_GROWSUP case do the right thing with
+locking, and enable it generally.  This will eventually help GUP, and in
+the meantime avoids a special case and the lockdep issue.
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+[6.1 Minor context from still having FOLL_FORCE flags set]
+Signed-off-by: Samuel Mendoza-Jonas <samjonas@amazon.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/exec.c |   37 +++++++++++++++++++++----------------
+ 1 file changed, 21 insertions(+), 16 deletions(-)
+
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -198,34 +198,39 @@ static struct page *get_arg_page(struct
+               int write)
+ {
+       struct page *page;
++      struct vm_area_struct *vma = bprm->vma;
++      struct mm_struct *mm = bprm->mm;
+       int ret;
+-      unsigned int gup_flags = FOLL_FORCE;
+-#ifdef CONFIG_STACK_GROWSUP
+-      if (write) {
+-              /* We claim to hold the lock - nobody to race with */
+-              ret = expand_downwards(bprm->vma, pos, true);
+-              if (ret < 0)
++      /*
++       * Avoid relying on expanding the stack down in GUP (which
++       * does not work for STACK_GROWSUP anyway), and just do it
++       * by hand ahead of time.
++       */
++      if (write && pos < vma->vm_start) {
++              mmap_write_lock(mm);
++              ret = expand_downwards(vma, pos, true);
++              if (unlikely(ret < 0)) {
++                      mmap_write_unlock(mm);
+                       return NULL;
+-      }
+-#endif
+-
+-      if (write)
+-              gup_flags |= FOLL_WRITE;
++              }
++              mmap_write_downgrade(mm);
++      } else
++              mmap_read_lock(mm);
+       /*
+        * We are doing an exec().  'current' is the process
+-       * doing the exec and bprm->mm is the new process's mm.
++       * doing the exec and 'mm' is the new process's mm.
+        */
+-      mmap_read_lock(bprm->mm);
+-      ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags,
++      ret = get_user_pages_remote(mm, pos, 1,
++                      write ? FOLL_WRITE : 0,
+                       &page, NULL, NULL);
+-      mmap_read_unlock(bprm->mm);
++      mmap_read_unlock(mm);
+       if (ret <= 0)
+               return NULL;
+       if (write)
+-              acct_arg_size(bprm, vma_pages(bprm->vma));
++              acct_arg_size(bprm, vma_pages(vma));
+       return page;
+ }
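
A condensed sketch of the locking pattern the commit message above describes, with the signatures as they appear in this 6.1 backport (expand_downwards() still takes the write_locked flag at this point in the series); the names are illustrative, not taken from fs/exec.c.

#include <linux/mm.h>

/* Sketch: expand the stack by hand under the write lock, then pin the page. */
static struct page *sketch_pin_stack_page(struct mm_struct *mm,
					  struct vm_area_struct *vma,
					  unsigned long pos, bool write)
{
	struct page *page;
	long ret;

	if (write && pos < vma->vm_start) {
		mmap_write_lock(mm);
		if (expand_downwards(vma, pos, true) < 0) {
			mmap_write_unlock(mm);
			return NULL;
		}
		mmap_write_downgrade(mm);	/* GUP below only needs the read lock */
	} else {
		mmap_read_lock(mm);
	}

	ret = get_user_pages_remote(mm, pos, 1, write ? FOLL_WRITE : 0,
				    &page, NULL, NULL);
	mmap_read_unlock(mm);
	return ret <= 0 ? NULL : page;
}
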
diff --git a/queue-6.1/mips-mm-convert-to-using-lock_mm_and_find_vma.patch b/queue-6.1/mips-mm-convert-to-using-lock_mm_and_find_vma.patch
new file mode 100644 (file)
index 0000000..112cae5
--- /dev/null
@@ -0,0 +1,55 @@
+From f9ced2ac8976a6560505cc4bf14ffdf1c076e475 Mon Sep 17 00:00:00 2001
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Thu, 22 Jun 2023 18:47:40 +0200
+Subject: mips/mm: Convert to using lock_mm_and_find_vma()
+
+From: Ben Hutchings <ben@decadent.org.uk>
+
+commit 4bce37a68ff884e821a02a731897a8119e0c37b7 upstream.
+
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Samuel Mendoza-Jonas <samjonas@amazon.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/mips/Kconfig    |    1 +
+ arch/mips/mm/fault.c |   12 ++----------
+ 2 files changed, 3 insertions(+), 10 deletions(-)
+
+--- a/arch/mips/Kconfig
++++ b/arch/mips/Kconfig
+@@ -94,6 +94,7 @@ config MIPS
+       select HAVE_VIRT_CPU_ACCOUNTING_GEN if 64BIT || !SMP
+       select IRQ_FORCED_THREADING
+       select ISA if EISA
++      select LOCK_MM_AND_FIND_VMA
+       select MODULES_USE_ELF_REL if MODULES
+       select MODULES_USE_ELF_RELA if MODULES && 64BIT
+       select PERF_USE_VMALLOC
+--- a/arch/mips/mm/fault.c
++++ b/arch/mips/mm/fault.c
+@@ -99,21 +99,13 @@ static void __do_page_fault(struct pt_re
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+-      mmap_read_lock(mm);
+-      vma = find_vma(mm, address);
++      vma = lock_mm_and_find_vma(mm, address, regs);
+       if (!vma)
+-              goto bad_area;
+-      if (vma->vm_start <= address)
+-              goto good_area;
+-      if (!(vma->vm_flags & VM_GROWSDOWN))
+-              goto bad_area;
+-      if (expand_stack(vma, address))
+-              goto bad_area;
++              goto bad_area_nosemaphore;
+ /*
+  * Ok, we have a good vm_area for this memory access, so
+  * we can handle it..
+  */
+-good_area:
+       si_code = SEGV_ACCERR;
+       if (write) {
diff --git a/queue-6.1/mm-always-expand-the-stack-with-the-mmap-write-lock-held.patch b/queue-6.1/mm-always-expand-the-stack-with-the-mmap-write-lock-held.patch
new file mode 100644 (file)
index 0000000..013f951
--- /dev/null
@@ -0,0 +1,671 @@
+From 2956a81444985ffb601685f3a796e79470b56353 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Sat, 24 Jun 2023 13:45:51 -0700
+Subject: mm: always expand the stack with the mmap write lock held
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit 8d7071af890768438c14db6172cc8f9f4d04e184 upstream
+
+This finishes the job of always holding the mmap write lock when
+extending the user stack vma, and removes the 'write_locked' argument
+from the vm helper functions again.
+
+For some cases, we just avoid expanding the stack at all: drivers and
+page pinning really shouldn't be extending any stacks.  Let's see if any
+strange users really wanted that.
+
+It's worth noting that architectures that weren't converted to the new
+lock_mm_and_find_vma() helper function are left using the legacy
+"expand_stack()" function, but it has been changed to drop the mmap_lock
+and take it for writing while expanding the vma.  This makes it fairly
+straightforward to convert the remaining architectures.
+
+As a result of dropping and re-taking the lock, the calling conventions
+for this function have also changed, since the old vma may no longer be
+valid.  So it will now return the new vma if successful, and NULL - and
+the lock dropped - if the area could not be extended.
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+[6.1: Patch drivers/iommu/io-pgfault.c instead]
+Signed-off-by: Samuel Mendoza-Jonas <samjonas@amazon.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/ia64/mm/fault.c         |   36 ++----------
+ arch/m68k/mm/fault.c         |    9 ++-
+ arch/microblaze/mm/fault.c   |    5 +
+ arch/openrisc/mm/fault.c     |    5 +
+ arch/parisc/mm/fault.c       |   23 +++-----
+ arch/s390/mm/fault.c         |    5 +
+ arch/sparc/mm/fault_64.c     |    8 +-
+ arch/um/kernel/trap.c        |   11 ++-
+ drivers/iommu/amd/iommu_v2.c |    4 -
+ drivers/iommu/io-pgfault.c   |    2 
+ fs/binfmt_elf.c              |    2 
+ fs/exec.c                    |    4 -
+ include/linux/mm.h           |   16 +----
+ mm/gup.c                     |    6 +-
+ mm/memory.c                  |   10 +++
+ mm/mmap.c                    |  121 ++++++++++++++++++++++++++++++++++---------
+ mm/nommu.c                   |   18 ++----
+ 17 files changed, 169 insertions(+), 116 deletions(-)
+
+--- a/arch/ia64/mm/fault.c
++++ b/arch/ia64/mm/fault.c
+@@ -110,10 +110,12 @@ retry:
+          * register backing store that needs to expand upwards, in
+          * this case vma will be null, but prev_vma will ne non-null
+          */
+-        if (( !vma && prev_vma ) || (address < vma->vm_start) )
+-              goto check_expansion;
++        if (( !vma && prev_vma ) || (address < vma->vm_start) ) {
++              vma = expand_stack(mm, address);
++              if (!vma)
++                      goto bad_area_nosemaphore;
++      }
+-  good_area:
+       code = SEGV_ACCERR;
+       /* OK, we've got a good vm_area for this memory area.  Check the access permissions: */
+@@ -174,35 +176,9 @@ retry:
+       mmap_read_unlock(mm);
+       return;
+-  check_expansion:
+-      if (!(prev_vma && (prev_vma->vm_flags & VM_GROWSUP) && (address == prev_vma->vm_end))) {
+-              if (!vma)
+-                      goto bad_area;
+-              if (!(vma->vm_flags & VM_GROWSDOWN))
+-                      goto bad_area;
+-              if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start)
+-                  || REGION_OFFSET(address) >= RGN_MAP_LIMIT)
+-                      goto bad_area;
+-              if (expand_stack(vma, address))
+-                      goto bad_area;
+-      } else {
+-              vma = prev_vma;
+-              if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start)
+-                  || REGION_OFFSET(address) >= RGN_MAP_LIMIT)
+-                      goto bad_area;
+-              /*
+-               * Since the register backing store is accessed sequentially,
+-               * we disallow growing it by more than a page at a time.
+-               */
+-              if (address > vma->vm_end + PAGE_SIZE - sizeof(long))
+-                      goto bad_area;
+-              if (expand_upwards(vma, address))
+-                      goto bad_area;
+-      }
+-      goto good_area;
+-
+   bad_area:
+       mmap_read_unlock(mm);
++  bad_area_nosemaphore:
+       if ((isr & IA64_ISR_SP)
+           || ((isr & IA64_ISR_NA) && (isr & IA64_ISR_CODE_MASK) == IA64_ISR_CODE_LFETCH))
+       {
+--- a/arch/m68k/mm/fault.c
++++ b/arch/m68k/mm/fault.c
+@@ -105,8 +105,9 @@ retry:
+               if (address + 256 < rdusp())
+                       goto map_err;
+       }
+-      if (expand_stack(vma, address))
+-              goto map_err;
++      vma = expand_stack(mm, address);
++      if (!vma)
++              goto map_err_nosemaphore;
+ /*
+  * Ok, we have a good vm_area for this memory access, so
+@@ -193,10 +194,12 @@ bus_err:
+       goto send_sig;
+ map_err:
++      mmap_read_unlock(mm);
++map_err_nosemaphore:
+       current->thread.signo = SIGSEGV;
+       current->thread.code = SEGV_MAPERR;
+       current->thread.faddr = address;
+-      goto send_sig;
++      return send_fault_sig(regs);
+ acc_err:
+       current->thread.signo = SIGSEGV;
+--- a/arch/microblaze/mm/fault.c
++++ b/arch/microblaze/mm/fault.c
+@@ -192,8 +192,9 @@ retry:
+                       && (kernel_mode(regs) || !store_updates_sp(regs)))
+                               goto bad_area;
+       }
+-      if (expand_stack(vma, address))
+-              goto bad_area;
++      vma = expand_stack(mm, address);
++      if (!vma)
++              goto bad_area_nosemaphore;
+ good_area:
+       code = SEGV_ACCERR;
+--- a/arch/openrisc/mm/fault.c
++++ b/arch/openrisc/mm/fault.c
+@@ -127,8 +127,9 @@ retry:
+               if (address + PAGE_SIZE < regs->sp)
+                       goto bad_area;
+       }
+-      if (expand_stack(vma, address))
+-              goto bad_area;
++      vma = expand_stack(mm, address);
++      if (!vma)
++              goto bad_area_nosemaphore;
+       /*
+        * Ok, we have a good vm_area for this memory access, so
+--- a/arch/parisc/mm/fault.c
++++ b/arch/parisc/mm/fault.c
+@@ -288,15 +288,19 @@ void do_page_fault(struct pt_regs *regs,
+ retry:
+       mmap_read_lock(mm);
+       vma = find_vma_prev(mm, address, &prev_vma);
+-      if (!vma || address < vma->vm_start)
+-              goto check_expansion;
++      if (!vma || address < vma->vm_start) {
++              if (!prev || !(prev->vm_flags & VM_GROWSUP))
++                      goto bad_area;
++              vma = expand_stack(mm, address);
++              if (!vma)
++                      goto bad_area_nosemaphore;
++      }
++
+ /*
+  * Ok, we have a good vm_area for this memory access. We still need to
+  * check the access permissions.
+  */
+-good_area:
+-
+       if ((vma->vm_flags & acc_type) != acc_type)
+               goto bad_area;
+@@ -342,17 +346,13 @@ good_area:
+       mmap_read_unlock(mm);
+       return;
+-check_expansion:
+-      vma = prev_vma;
+-      if (vma && (expand_stack(vma, address) == 0))
+-              goto good_area;
+-
+ /*
+  * Something tried to access memory that isn't in our memory map..
+  */
+ bad_area:
+       mmap_read_unlock(mm);
++bad_area_nosemaphore:
+       if (user_mode(regs)) {
+               int signo, si_code;
+@@ -444,7 +444,7 @@ handle_nadtlb_fault(struct pt_regs *regs
+ {
+       unsigned long insn = regs->iir;
+       int breg, treg, xreg, val = 0;
+-      struct vm_area_struct *vma, *prev_vma;
++      struct vm_area_struct *vma;
+       struct task_struct *tsk;
+       struct mm_struct *mm;
+       unsigned long address;
+@@ -480,7 +480,7 @@ handle_nadtlb_fault(struct pt_regs *regs
+                               /* Search for VMA */
+                               address = regs->ior;
+                               mmap_read_lock(mm);
+-                              vma = find_vma_prev(mm, address, &prev_vma);
++                              vma = vma_lookup(mm, address);
+                               mmap_read_unlock(mm);
+                               /*
+@@ -489,7 +489,6 @@ handle_nadtlb_fault(struct pt_regs *regs
+                                */
+                               acc_type = (insn & 0x40) ? VM_WRITE : VM_READ;
+                               if (vma
+-                                  && address >= vma->vm_start
+                                   && (vma->vm_flags & acc_type) == acc_type)
+                                       val = 1;
+                       }
+--- a/arch/s390/mm/fault.c
++++ b/arch/s390/mm/fault.c
+@@ -429,8 +429,9 @@ retry:
+       if (unlikely(vma->vm_start > address)) {
+               if (!(vma->vm_flags & VM_GROWSDOWN))
+                       goto out_up;
+-              if (expand_stack(vma, address))
+-                      goto out_up;
++              vma = expand_stack(mm, address);
++              if (!vma)
++                      goto out;
+       }
+       /*
+--- a/arch/sparc/mm/fault_64.c
++++ b/arch/sparc/mm/fault_64.c
+@@ -383,8 +383,9 @@ continue_fault:
+                               goto bad_area;
+               }
+       }
+-      if (expand_stack(vma, address))
+-              goto bad_area;
++      vma = expand_stack(mm, address);
++      if (!vma)
++              goto bad_area_nosemaphore;
+       /*
+        * Ok, we have a good vm_area for this memory access, so
+        * we can handle it..
+@@ -482,8 +483,9 @@ exit_exception:
+        * Fix it, but check if it's kernel or user first..
+        */
+ bad_area:
+-      insn = get_fault_insn(regs, insn);
+       mmap_read_unlock(mm);
++bad_area_nosemaphore:
++      insn = get_fault_insn(regs, insn);
+ handle_kernel_fault:
+       do_kernel_fault(regs, si_code, fault_code, insn, address);
+--- a/arch/um/kernel/trap.c
++++ b/arch/um/kernel/trap.c
+@@ -47,14 +47,15 @@ retry:
+       vma = find_vma(mm, address);
+       if (!vma)
+               goto out;
+-      else if (vma->vm_start <= address)
++      if (vma->vm_start <= address)
+               goto good_area;
+-      else if (!(vma->vm_flags & VM_GROWSDOWN))
++      if (!(vma->vm_flags & VM_GROWSDOWN))
+               goto out;
+-      else if (is_user && !ARCH_IS_STACKGROW(address))
+-              goto out;
+-      else if (expand_stack(vma, address))
++      if (is_user && !ARCH_IS_STACKGROW(address))
+               goto out;
++      vma = expand_stack(mm, address);
++      if (!vma)
++              goto out_nosemaphore;
+ good_area:
+       *code_out = SEGV_ACCERR;
+--- a/drivers/iommu/amd/iommu_v2.c
++++ b/drivers/iommu/amd/iommu_v2.c
+@@ -485,8 +485,8 @@ static void do_fault(struct work_struct
+       flags |= FAULT_FLAG_REMOTE;
+       mmap_read_lock(mm);
+-      vma = find_extend_vma(mm, address);
+-      if (!vma || address < vma->vm_start)
++      vma = vma_lookup(mm, address);
++      if (!vma)
+               /* failed to get a vma in the right range */
+               goto out;
+--- a/drivers/iommu/io-pgfault.c
++++ b/drivers/iommu/io-pgfault.c
+@@ -89,7 +89,7 @@ iopf_handle_single(struct iopf_fault *io
+       mmap_read_lock(mm);
+-      vma = find_extend_vma(mm, prm->addr);
++      vma = vma_lookup(mm, prm->addr);
+       if (!vma)
+               /* Unmapped area */
+               goto out_put_mm;
+--- a/fs/binfmt_elf.c
++++ b/fs/binfmt_elf.c
+@@ -317,7 +317,7 @@ create_elf_tables(struct linux_binprm *b
+        */
+       if (mmap_write_lock_killable(mm))
+               return -EINTR;
+-      vma = find_extend_vma_locked(mm, bprm->p, true);
++      vma = find_extend_vma_locked(mm, bprm->p);
+       mmap_write_unlock(mm);
+       if (!vma)
+               return -EFAULT;
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -209,7 +209,7 @@ static struct page *get_arg_page(struct
+        */
+       if (write && pos < vma->vm_start) {
+               mmap_write_lock(mm);
+-              ret = expand_downwards(vma, pos, true);
++              ret = expand_downwards(vma, pos);
+               if (unlikely(ret < 0)) {
+                       mmap_write_unlock(mm);
+                       return NULL;
+@@ -860,7 +860,7 @@ int setup_arg_pages(struct linux_binprm
+               stack_base = vma->vm_start - stack_expand;
+ #endif
+       current->mm->start_stack = bprm->p;
+-      ret = expand_stack_locked(vma, stack_base, true);
++      ret = expand_stack_locked(vma, stack_base);
+       if (ret)
+               ret = -EFAULT;
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -2810,18 +2810,11 @@ extern vm_fault_t filemap_page_mkwrite(s
+ extern unsigned long stack_guard_gap;
+ /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
+-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
+-              bool write_locked);
+-#define expand_stack(vma,addr) expand_stack_locked(vma,addr,false)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address);
++struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr);
+ /* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */
+-int expand_downwards(struct vm_area_struct *vma, unsigned long address,
+-              bool write_locked);
+-#if VM_GROWSUP
+-extern int expand_upwards(struct vm_area_struct *vma, unsigned long address);
+-#else
+-  #define expand_upwards(vma, address) (0)
+-#endif
++int expand_downwards(struct vm_area_struct *vma, unsigned long address);
+ /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
+ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
+@@ -2916,9 +2909,8 @@ unsigned long change_prot_numa(struct vm
+                       unsigned long start, unsigned long end);
+ #endif
+-struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
+ struct vm_area_struct *find_extend_vma_locked(struct mm_struct *,
+-              unsigned long addr, bool write_locked);
++              unsigned long addr);
+ int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
+                       unsigned long pfn, unsigned long size, pgprot_t);
+ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+--- a/mm/gup.c
++++ b/mm/gup.c
+@@ -1182,7 +1182,7 @@ static long __get_user_pages(struct mm_s
+               /* first iteration or cross vma bound */
+               if (!vma || start >= vma->vm_end) {
+-                      vma = find_extend_vma(mm, start);
++                      vma = vma_lookup(mm, start);
+                       if (!vma && in_gate_area(mm, start)) {
+                               ret = get_gate_page(mm, start & PAGE_MASK,
+                                               gup_flags, &vma,
+@@ -1351,8 +1351,8 @@ int fixup_user_fault(struct mm_struct *m
+               fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+ retry:
+-      vma = find_extend_vma(mm, address);
+-      if (!vma || address < vma->vm_start)
++      vma = vma_lookup(mm, address);
++      if (!vma)
+               return -EFAULT;
+       if (!vma_permits_fault(vma, fault_flags))
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -5352,7 +5352,7 @@ struct vm_area_struct *lock_mm_and_find_
+                       goto fail;
+       }
+-      if (expand_stack_locked(vma, addr, true))
++      if (expand_stack_locked(vma, addr))
+               goto fail;
+ success:
+@@ -5636,6 +5636,14 @@ int __access_remote_vm(struct mm_struct
+       if (mmap_read_lock_killable(mm))
+               return 0;
++      /* We might need to expand the stack to access it */
++      vma = vma_lookup(mm, addr);
++      if (!vma) {
++              vma = expand_stack(mm, addr);
++              if (!vma)
++                      return 0;
++      }
++
+       /* ignore errors, just check how much was successfully transferred */
+       while (len) {
+               int bytes, ret, offset;
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -1945,8 +1945,7 @@ static int acct_stack_growth(struct vm_a
+  * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
+  * vma is the last one with address > vma->vm_end.  Have to extend vma.
+  */
+-int expand_upwards(struct vm_area_struct *vma, unsigned long address,
+-              bool write_locked)
++static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
+ {
+       struct mm_struct *mm = vma->vm_mm;
+       struct vm_area_struct *next;
+@@ -1970,8 +1969,6 @@ int expand_upwards(struct vm_area_struct
+       if (gap_addr < address || gap_addr > TASK_SIZE)
+               gap_addr = TASK_SIZE;
+-      if (!write_locked)
+-              return -EAGAIN;
+       next = find_vma_intersection(mm, vma->vm_end, gap_addr);
+       if (next && vma_is_accessible(next)) {
+               if (!(next->vm_flags & VM_GROWSUP))
+@@ -2039,15 +2036,18 @@ int expand_upwards(struct vm_area_struct
+ /*
+  * vma is the first one with address < vma->vm_start.  Have to extend vma.
++ * mmap_lock held for writing.
+  */
+-int expand_downwards(struct vm_area_struct *vma, unsigned long address,
+-              bool write_locked)
++int expand_downwards(struct vm_area_struct *vma, unsigned long address)
+ {
+       struct mm_struct *mm = vma->vm_mm;
+       MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start);
+       struct vm_area_struct *prev;
+       int error = 0;
++      if (!(vma->vm_flags & VM_GROWSDOWN))
++              return -EFAULT;
++
+       address &= PAGE_MASK;
+       if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
+               return -EPERM;
+@@ -2060,8 +2060,6 @@ int expand_downwards(struct vm_area_stru
+                   vma_is_accessible(prev) &&
+                   (address - prev->vm_end < stack_guard_gap))
+                       return -ENOMEM;
+-              if (!write_locked && (prev->vm_end == address))
+-                      return -EAGAIN;
+       }
+       if (mas_preallocate(&mas, vma, GFP_KERNEL))
+@@ -2139,14 +2137,12 @@ static int __init cmdline_parse_stack_gu
+ __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
+ #ifdef CONFIG_STACK_GROWSUP
+-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
+-              bool write_locked)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
+ {
+-      return expand_upwards(vma, address, write_locked);
++      return expand_upwards(vma, address);
+ }
+-struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm,
+-              unsigned long addr, bool write_locked)
++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
+ {
+       struct vm_area_struct *vma, *prev;
+@@ -2156,23 +2152,21 @@ struct vm_area_struct *find_extend_vma_l
+               return vma;
+       if (!prev)
+               return NULL;
+-      if (expand_stack_locked(prev, addr, write_locked))
++      if (expand_stack_locked(prev, addr))
+               return NULL;
+       if (prev->vm_flags & VM_LOCKED)
+               populate_vma_page_range(prev, addr, prev->vm_end, NULL);
+       return prev;
+ }
+ #else
+-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
+-              bool write_locked)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
+ {
+       if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
+               return -EINVAL;
+-      return expand_downwards(vma, address, write_locked);
++      return expand_downwards(vma, address);
+ }
+-struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm,
+-              unsigned long addr, bool write_locked)
++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
+ {
+       struct vm_area_struct *vma;
+       unsigned long start;
+@@ -2184,7 +2178,7 @@ struct vm_area_struct *find_extend_vma_l
+       if (vma->vm_start <= addr)
+               return vma;
+       start = vma->vm_start;
+-      if (expand_stack_locked(vma, addr, write_locked))
++      if (expand_stack_locked(vma, addr))
+               return NULL;
+       if (vma->vm_flags & VM_LOCKED)
+               populate_vma_page_range(vma, addr, start, NULL);
+@@ -2192,12 +2186,91 @@ struct vm_area_struct *find_extend_vma_l
+ }
+ #endif
+-struct vm_area_struct *find_extend_vma(struct mm_struct *mm,
+-              unsigned long addr)
++/*
++ * IA64 has some horrid mapping rules: it can expand both up and down,
++ * but with various special rules.
++ *
++ * We'll get rid of this architecture eventually, so the ugliness is
++ * temporary.
++ */
++#ifdef CONFIG_IA64
++static inline bool vma_expand_ok(struct vm_area_struct *vma, unsigned long addr)
++{
++      return REGION_NUMBER(addr) == REGION_NUMBER(vma->vm_start) &&
++              REGION_OFFSET(addr) < RGN_MAP_LIMIT;
++}
++
++/*
++ * IA64 stacks grow down, but there's a special register backing store
++ * that can grow up. Only sequentially, though, so the new address must
++ * match vm_end.
++ */
++static inline int vma_expand_up(struct vm_area_struct *vma, unsigned long addr)
++{
++      if (!vma_expand_ok(vma, addr))
++              return -EFAULT;
++      if (vma->vm_end != (addr & PAGE_MASK))
++              return -EFAULT;
++      return expand_upwards(vma, addr);
++}
++
++static inline bool vma_expand_down(struct vm_area_struct *vma, unsigned long addr)
++{
++      if (!vma_expand_ok(vma, addr))
++              return -EFAULT;
++      return expand_downwards(vma, addr);
++}
++
++#elif defined(CONFIG_STACK_GROWSUP)
++
++#define vma_expand_up(vma,addr) expand_upwards(vma, addr)
++#define vma_expand_down(vma, addr) (-EFAULT)
++
++#else
++
++#define vma_expand_up(vma,addr) (-EFAULT)
++#define vma_expand_down(vma, addr) expand_downwards(vma, addr)
++
++#endif
++
++/*
++ * expand_stack(): legacy interface for page faulting. Don't use unless
++ * you have to.
++ *
++ * This is called with the mm locked for reading, drops the lock, takes
++ * the lock for writing, tries to look up a vma again, expands it if
++ * necessary, and downgrades the lock to reading again.
++ *
++ * If no vma is found or it can't be expanded, it returns NULL and has
++ * dropped the lock.
++ */
++struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr)
+ {
+-      return find_extend_vma_locked(mm, addr, false);
++      struct vm_area_struct *vma, *prev;
++
++      mmap_read_unlock(mm);
++      if (mmap_write_lock_killable(mm))
++              return NULL;
++
++      vma = find_vma_prev(mm, addr, &prev);
++      if (vma && vma->vm_start <= addr)
++              goto success;
++
++      if (prev && !vma_expand_up(prev, addr)) {
++              vma = prev;
++              goto success;
++      }
++
++      if (vma && !vma_expand_down(vma, addr))
++              goto success;
++
++      mmap_write_unlock(mm);
++      return NULL;
++
++success:
++      mmap_write_downgrade(mm);
++      return vma;
+ }
+-EXPORT_SYMBOL_GPL(find_extend_vma);
+ /*
+  * Ok - we have the memory areas we should free on a maple tree so release them,
+--- a/mm/nommu.c
++++ b/mm/nommu.c
+@@ -682,24 +682,20 @@ struct vm_area_struct *find_vma(struct m
+ EXPORT_SYMBOL(find_vma);
+ /*
+- * find a VMA
+- * - we don't extend stack VMAs under NOMMU conditions
+- */
+-struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
+-{
+-      return find_vma(mm, addr);
+-}
+-
+-/*
+  * expand a stack to a given address
+  * - not supported under NOMMU conditions
+  */
+-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
+-              bool write_locked)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long addr)
+ {
+       return -ENOMEM;
+ }
++struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr)
++{
++      mmap_read_unlock(mm);
++      return NULL;
++}
++
+ /*
+  * look up the first VMA exactly that exactly matches addr
+  * - should be called with mm->mmap_lock at least held readlocked
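
A sketch (illustrative names only) of the legacy fault path that the changed calling convention above leaves on the not-yet-converted architectures: the caller looks the VMA up under the mmap read lock and, if the address falls below it, hands off to the new expand_stack(mm, addr), which drops the read lock, retakes it for writing, expands if possible, and downgrades again, returning NULL with the lock dropped on failure.

#include <linux/mm.h>

/*
 * Sketch: returns a VMA covering addr with the mmap read lock held,
 * or NULL with the lock dropped, mirroring the per-architecture hunks above.
 */
static struct vm_area_struct *sketch_find_fault_vma(struct mm_struct *mm,
						    unsigned long addr)
{
	struct vm_area_struct *vma;

	mmap_read_lock(mm);
	vma = find_vma(mm, addr);
	if (!vma) {
		mmap_read_unlock(mm);
		return NULL;
	}
	if (vma->vm_start <= addr)
		return vma;			/* read lock still held */

	/* expand_stack() handles the lock itself; do not unlock on failure. */
	return expand_stack(mm, addr);
}
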
diff --git a/queue-6.1/mm-fault-convert-remaining-simple-cases-to-lock_mm_and_find_vma.patch b/queue-6.1/mm-fault-convert-remaining-simple-cases-to-lock_mm_and_find_vma.patch
new file mode 100644 (file)
index 0000000..9120b2f
--- /dev/null
@@ -0,0 +1,491 @@
+From f128a1b1b5a6b39471d62f1398196631160a24a2 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Sat, 24 Jun 2023 10:55:38 -0700
+Subject: mm/fault: convert remaining simple cases to lock_mm_and_find_vma()
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit a050ba1e7422f2cc60ff8bfde3f96d34d00cb585 upstream.
+
+This does the simple pattern conversion of alpha, arc, csky, hexagon,
+loongarch, nios2, sh, sparc32, and xtensa to the lock_mm_and_find_vma()
+helper.  They all have the regular fault handling pattern without odd
+special cases.
+
+The remaining architectures all have something that keeps us from a
+straightforward conversion: ia64 and parisc have stacks that can grow
+both up as well as down (and ia64 has special address region checks).
+
+And m68k, microblaze, openrisc, sparc64, and um end up having extra
+rules about only expanding the stack down a limited amount below the
+user space stack pointer.  That is something that x86 used to do too
+(long long ago), and it probably could just be skipped, but it still
+makes the conversion less than trivial.
+
+Note that this conversion was done manually and with the exception of
+alpha without any build testing, because I have a fairly limited cross-
+building environment.  The cases are all simple, and I went through the
+changes several times, but...
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Samuel Mendoza-Jonas <samjonas@amazon.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/alpha/Kconfig         |    1 +
+ arch/alpha/mm/fault.c      |   13 +++----------
+ arch/arc/Kconfig           |    1 +
+ arch/arc/mm/fault.c        |   11 +++--------
+ arch/csky/Kconfig          |    1 +
+ arch/csky/mm/fault.c       |   22 +++++-----------------
+ arch/hexagon/Kconfig       |    1 +
+ arch/hexagon/mm/vm_fault.c |   18 ++++--------------
+ arch/loongarch/Kconfig     |    1 +
+ arch/loongarch/mm/fault.c  |   16 ++++++----------
+ arch/nios2/Kconfig         |    1 +
+ arch/nios2/mm/fault.c      |   17 ++---------------
+ arch/sh/Kconfig            |    1 +
+ arch/sh/mm/fault.c         |   17 ++---------------
+ arch/sparc/Kconfig         |    1 +
+ arch/sparc/mm/fault_32.c   |   32 ++++++++------------------------
+ arch/xtensa/Kconfig        |    1 +
+ arch/xtensa/mm/fault.c     |   14 +++-----------
+ 18 files changed, 45 insertions(+), 124 deletions(-)
+
+--- a/arch/alpha/Kconfig
++++ b/arch/alpha/Kconfig
+@@ -28,6 +28,7 @@ config ALPHA
+       select GENERIC_SMP_IDLE_THREAD
+       select HAVE_ARCH_AUDITSYSCALL
+       select HAVE_MOD_ARCH_SPECIFIC
++      select LOCK_MM_AND_FIND_VMA
+       select MODULES_USE_ELF_RELA
+       select ODD_RT_SIGACTION
+       select OLD_SIGSUSPEND
+--- a/arch/alpha/mm/fault.c
++++ b/arch/alpha/mm/fault.c
+@@ -119,20 +119,12 @@ do_page_fault(unsigned long address, uns
+               flags |= FAULT_FLAG_USER;
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+-      mmap_read_lock(mm);
+-      vma = find_vma(mm, address);
++      vma = lock_mm_and_find_vma(mm, address, regs);
+       if (!vma)
+-              goto bad_area;
+-      if (vma->vm_start <= address)
+-              goto good_area;
+-      if (!(vma->vm_flags & VM_GROWSDOWN))
+-              goto bad_area;
+-      if (expand_stack(vma, address))
+-              goto bad_area;
++              goto bad_area_nosemaphore;
+       /* Ok, we have a good vm_area for this memory access, so
+          we can handle it.  */
+- good_area:
+       si_code = SEGV_ACCERR;
+       if (cause < 0) {
+               if (!(vma->vm_flags & VM_EXEC))
+@@ -189,6 +181,7 @@ retry:
+  bad_area:
+       mmap_read_unlock(mm);
++ bad_area_nosemaphore:
+       if (user_mode(regs))
+               goto do_sigsegv;
+--- a/arch/arc/Kconfig
++++ b/arch/arc/Kconfig
+@@ -41,6 +41,7 @@ config ARC
+       select HAVE_PERF_EVENTS
+       select HAVE_SYSCALL_TRACEPOINTS
+       select IRQ_DOMAIN
++      select LOCK_MM_AND_FIND_VMA
+       select MODULES_USE_ELF_RELA
+       select OF
+       select OF_EARLY_FLATTREE
+--- a/arch/arc/mm/fault.c
++++ b/arch/arc/mm/fault.c
+@@ -113,15 +113,9 @@ void do_page_fault(unsigned long address
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+-      mmap_read_lock(mm);
+-
+-      vma = find_vma(mm, address);
++      vma = lock_mm_and_find_vma(mm, address, regs);
+       if (!vma)
+-              goto bad_area;
+-      if (unlikely(address < vma->vm_start)) {
+-              if (!(vma->vm_flags & VM_GROWSDOWN) || expand_stack(vma, address))
+-                      goto bad_area;
+-      }
++              goto bad_area_nosemaphore;
+       /*
+        * vm_area is good, now check permissions for this memory access
+@@ -161,6 +155,7 @@ retry:
+ bad_area:
+       mmap_read_unlock(mm);
++bad_area_nosemaphore:
+       /*
+        * Major/minor page fault accounting
+        * (in case of retry we only land here once)
+--- a/arch/csky/Kconfig
++++ b/arch/csky/Kconfig
+@@ -96,6 +96,7 @@ config CSKY
+       select HAVE_RSEQ
+       select HAVE_STACKPROTECTOR
+       select HAVE_SYSCALL_TRACEPOINTS
++      select LOCK_MM_AND_FIND_VMA
+       select MAY_HAVE_SPARSE_IRQ
+       select MODULES_USE_ELF_RELA if MODULES
+       select OF
+--- a/arch/csky/mm/fault.c
++++ b/arch/csky/mm/fault.c
+@@ -97,13 +97,12 @@ static inline void mm_fault_error(struct
+       BUG();
+ }
+-static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr)
++static inline void bad_area_nosemaphore(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr)
+ {
+       /*
+        * Something tried to access memory that isn't in our memory map.
+        * Fix it, but check if it's kernel or user first.
+        */
+-      mmap_read_unlock(mm);
+       /* User mode accesses just cause a SIGSEGV */
+       if (user_mode(regs)) {
+               do_trap(regs, SIGSEGV, code, addr);
+@@ -238,20 +237,9 @@ asmlinkage void do_page_fault(struct pt_
+       if (is_write(regs))
+               flags |= FAULT_FLAG_WRITE;
+ retry:
+-      mmap_read_lock(mm);
+-      vma = find_vma(mm, addr);
++      vma = lock_mm_and_find_vma(mm, address, regs);
+       if (unlikely(!vma)) {
+-              bad_area(regs, mm, code, addr);
+-              return;
+-      }
+-      if (likely(vma->vm_start <= addr))
+-              goto good_area;
+-      if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+-              bad_area(regs, mm, code, addr);
+-              return;
+-      }
+-      if (unlikely(expand_stack(vma, addr))) {
+-              bad_area(regs, mm, code, addr);
++              bad_area_nosemaphore(regs, mm, code, addr);
+               return;
+       }
+@@ -259,11 +247,11 @@ retry:
+        * Ok, we have a good vm_area for this memory access, so
+        * we can handle it.
+        */
+-good_area:
+       code = SEGV_ACCERR;
+       if (unlikely(access_error(regs, vma))) {
+-              bad_area(regs, mm, code, addr);
++              mmap_read_unlock(mm);
++              bad_area_nosemaphore(regs, mm, code, addr);
+               return;
+       }
+--- a/arch/hexagon/Kconfig
++++ b/arch/hexagon/Kconfig
+@@ -28,6 +28,7 @@ config HEXAGON
+       select GENERIC_SMP_IDLE_THREAD
+       select STACKTRACE_SUPPORT
+       select GENERIC_CLOCKEVENTS_BROADCAST
++      select LOCK_MM_AND_FIND_VMA
+       select MODULES_USE_ELF_RELA
+       select GENERIC_CPU_DEVICES
+       select ARCH_WANT_LD_ORPHAN_WARN
+--- a/arch/hexagon/mm/vm_fault.c
++++ b/arch/hexagon/mm/vm_fault.c
+@@ -57,21 +57,10 @@ void do_page_fault(unsigned long address
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+-      mmap_read_lock(mm);
+-      vma = find_vma(mm, address);
+-      if (!vma)
+-              goto bad_area;
++      vma = lock_mm_and_find_vma(mm, address, regs);
++      if (unlikely(!vma))
++              goto bad_area_nosemaphore;
+-      if (vma->vm_start <= address)
+-              goto good_area;
+-
+-      if (!(vma->vm_flags & VM_GROWSDOWN))
+-              goto bad_area;
+-
+-      if (expand_stack(vma, address))
+-              goto bad_area;
+-
+-good_area:
+       /* Address space is OK.  Now check access rights. */
+       si_code = SEGV_ACCERR;
+@@ -140,6 +129,7 @@ good_area:
+ bad_area:
+       mmap_read_unlock(mm);
++bad_area_nosemaphore:
+       if (user_mode(regs)) {
+               force_sig_fault(SIGSEGV, si_code, (void __user *)address);
+               return;
+--- a/arch/loongarch/Kconfig
++++ b/arch/loongarch/Kconfig
+@@ -107,6 +107,7 @@ config LOONGARCH
+       select HAVE_VIRT_CPU_ACCOUNTING_GEN if !SMP
+       select IRQ_FORCED_THREADING
+       select IRQ_LOONGARCH_CPU
++      select LOCK_MM_AND_FIND_VMA
+       select MMU_GATHER_MERGE_VMAS if MMU
+       select MODULES_USE_ELF_RELA if MODULES
+       select NEED_PER_CPU_EMBED_FIRST_CHUNK
+--- a/arch/loongarch/mm/fault.c
++++ b/arch/loongarch/mm/fault.c
+@@ -166,22 +166,18 @@ static void __kprobes __do_page_fault(st
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+-      mmap_read_lock(mm);
+-      vma = find_vma(mm, address);
+-      if (!vma)
+-              goto bad_area;
+-      if (vma->vm_start <= address)
+-              goto good_area;
+-      if (!(vma->vm_flags & VM_GROWSDOWN))
+-              goto bad_area;
+-      if (!expand_stack(vma, address))
+-              goto good_area;
++      vma = lock_mm_and_find_vma(mm, address, regs);
++      if (unlikely(!vma))
++              goto bad_area_nosemaphore;
++      goto good_area;
++
+ /*
+  * Something tried to access memory that isn't in our memory map..
+  * Fix it, but check if it's kernel or user first..
+  */
+ bad_area:
+       mmap_read_unlock(mm);
++bad_area_nosemaphore:
+       do_sigsegv(regs, write, address, si_code);
+       return;
+--- a/arch/nios2/Kconfig
++++ b/arch/nios2/Kconfig
+@@ -16,6 +16,7 @@ config NIOS2
+       select HAVE_ARCH_TRACEHOOK
+       select HAVE_ARCH_KGDB
+       select IRQ_DOMAIN
++      select LOCK_MM_AND_FIND_VMA
+       select MODULES_USE_ELF_RELA
+       select OF
+       select OF_EARLY_FLATTREE
+--- a/arch/nios2/mm/fault.c
++++ b/arch/nios2/mm/fault.c
+@@ -86,27 +86,14 @@ asmlinkage void do_page_fault(struct pt_
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+-      if (!mmap_read_trylock(mm)) {
+-              if (!user_mode(regs) && !search_exception_tables(regs->ea))
+-                      goto bad_area_nosemaphore;
+ retry:
+-              mmap_read_lock(mm);
+-      }
+-
+-      vma = find_vma(mm, address);
++      vma = lock_mm_and_find_vma(mm, address, regs);
+       if (!vma)
+-              goto bad_area;
+-      if (vma->vm_start <= address)
+-              goto good_area;
+-      if (!(vma->vm_flags & VM_GROWSDOWN))
+-              goto bad_area;
+-      if (expand_stack(vma, address))
+-              goto bad_area;
++              goto bad_area_nosemaphore;
+ /*
+  * Ok, we have a good vm_area for this memory access, so
+  * we can handle it..
+  */
+-good_area:
+       code = SEGV_ACCERR;
+       switch (cause) {
+--- a/arch/sh/Kconfig
++++ b/arch/sh/Kconfig
+@@ -56,6 +56,7 @@ config SUPERH
+       select HAVE_STACKPROTECTOR
+       select HAVE_SYSCALL_TRACEPOINTS
+       select IRQ_FORCED_THREADING
++      select LOCK_MM_AND_FIND_VMA
+       select MODULES_USE_ELF_RELA
+       select NEED_SG_DMA_LENGTH
+       select NO_DMA if !MMU && !DMA_COHERENT
+--- a/arch/sh/mm/fault.c
++++ b/arch/sh/mm/fault.c
+@@ -439,21 +439,9 @@ asmlinkage void __kprobes do_page_fault(
+       }
+ retry:
+-      mmap_read_lock(mm);
+-
+-      vma = find_vma(mm, address);
++      vma = lock_mm_and_find_vma(mm, address, regs);
+       if (unlikely(!vma)) {
+-              bad_area(regs, error_code, address);
+-              return;
+-      }
+-      if (likely(vma->vm_start <= address))
+-              goto good_area;
+-      if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+-              bad_area(regs, error_code, address);
+-              return;
+-      }
+-      if (unlikely(expand_stack(vma, address))) {
+-              bad_area(regs, error_code, address);
++              bad_area_nosemaphore(regs, error_code, address);
+               return;
+       }
+@@ -461,7 +449,6 @@ retry:
+        * Ok, we have a good vm_area for this memory access, so
+        * we can handle it..
+        */
+-good_area:
+       if (unlikely(access_error(error_code, vma))) {
+               bad_area_access_error(regs, error_code, address);
+               return;
+--- a/arch/sparc/Kconfig
++++ b/arch/sparc/Kconfig
+@@ -56,6 +56,7 @@ config SPARC32
+       select DMA_DIRECT_REMAP
+       select GENERIC_ATOMIC64
+       select HAVE_UID16
++      select LOCK_MM_AND_FIND_VMA
+       select OLD_SIGACTION
+       select ZONE_DMA
+--- a/arch/sparc/mm/fault_32.c
++++ b/arch/sparc/mm/fault_32.c
+@@ -143,28 +143,19 @@ asmlinkage void do_sparc_fault(struct pt
+       if (pagefault_disabled() || !mm)
+               goto no_context;
++      if (!from_user && address >= PAGE_OFFSET)
++              goto no_context;
++
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+-      mmap_read_lock(mm);
+-
+-      if (!from_user && address >= PAGE_OFFSET)
+-              goto bad_area;
+-
+-      vma = find_vma(mm, address);
++      vma = lock_mm_and_find_vma(mm, address, regs);
+       if (!vma)
+-              goto bad_area;
+-      if (vma->vm_start <= address)
+-              goto good_area;
+-      if (!(vma->vm_flags & VM_GROWSDOWN))
+-              goto bad_area;
+-      if (expand_stack(vma, address))
+-              goto bad_area;
++              goto bad_area_nosemaphore;
+       /*
+        * Ok, we have a good vm_area for this memory access, so
+        * we can handle it..
+        */
+-good_area:
+       code = SEGV_ACCERR;
+       if (write) {
+               if (!(vma->vm_flags & VM_WRITE))
+@@ -318,17 +309,9 @@ static void force_user_fault(unsigned lo
+       code = SEGV_MAPERR;
+-      mmap_read_lock(mm);
+-      vma = find_vma(mm, address);
++      vma = lock_mm_and_find_vma(mm, address, regs);
+       if (!vma)
+-              goto bad_area;
+-      if (vma->vm_start <= address)
+-              goto good_area;
+-      if (!(vma->vm_flags & VM_GROWSDOWN))
+-              goto bad_area;
+-      if (expand_stack(vma, address))
+-              goto bad_area;
+-good_area:
++              goto bad_area_nosemaphore;
+       code = SEGV_ACCERR;
+       if (write) {
+               if (!(vma->vm_flags & VM_WRITE))
+@@ -347,6 +330,7 @@ good_area:
+       return;
+ bad_area:
+       mmap_read_unlock(mm);
++bad_area_nosemaphore:
+       __do_fault_siginfo(code, SIGSEGV, tsk->thread.kregs, address);
+       return;
+--- a/arch/xtensa/Kconfig
++++ b/arch/xtensa/Kconfig
+@@ -49,6 +49,7 @@ config XTENSA
+       select HAVE_SYSCALL_TRACEPOINTS
+       select HAVE_VIRT_CPU_ACCOUNTING_GEN
+       select IRQ_DOMAIN
++      select LOCK_MM_AND_FIND_VMA
+       select MODULES_USE_ELF_RELA
+       select PERF_USE_VMALLOC
+       select TRACE_IRQFLAGS_SUPPORT
+--- a/arch/xtensa/mm/fault.c
++++ b/arch/xtensa/mm/fault.c
+@@ -130,23 +130,14 @@ void do_page_fault(struct pt_regs *regs)
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+-      mmap_read_lock(mm);
+-      vma = find_vma(mm, address);
+-
++      vma = lock_mm_and_find_vma(mm, address, regs);
+       if (!vma)
+-              goto bad_area;
+-      if (vma->vm_start <= address)
+-              goto good_area;
+-      if (!(vma->vm_flags & VM_GROWSDOWN))
+-              goto bad_area;
+-      if (expand_stack(vma, address))
+-              goto bad_area;
++              goto bad_area_nosemaphore;
+       /* Ok, we have a good vm_area for this memory access, so
+        * we can handle it..
+        */
+-good_area:
+       code = SEGV_ACCERR;
+       if (is_write) {
+@@ -205,6 +196,7 @@ good_area:
+        */
+ bad_area:
+       mmap_read_unlock(mm);
++bad_area_nosemaphore:
+       if (user_mode(regs)) {
+               current->thread.bad_vaddr = address;
+               current->thread.error_code = is_write;
diff --git a/queue-6.1/mm-introduce-new-lock_mm_and_find_vma-page-fault-helper.patch b/queue-6.1/mm-introduce-new-lock_mm_and_find_vma-page-fault-helper.patch
new file mode 100644 (file)
index 0000000..a290458
--- /dev/null
@@ -0,0 +1,298 @@
+From 088826669e9cadc96824a9523a799bd6854a31ec Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Thu, 15 Jun 2023 15:17:36 -0700
+Subject: mm: introduce new 'lock_mm_and_find_vma()' page fault helper
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit c2508ec5a58db67093f4fb8bf89a9a7c53a109e9 upstream.
+
+.. and make x86 use it.
+
+This basically extracts the existing x86 "find and expand faulting vma"
+code, but extends it to also take the mmap lock for writing in case we
+actually do need to expand the vma.
+
+We've historically short-circuited that case, and have some rather ugly
+special logic to serialize the stack segment expansion (since we only
+hold the mmap lock for reading) that doesn't match the normal VM
+locking.
+
+That slight violation of locking worked well, right up until it didn't:
+the maple tree code really does want proper locking even for simple
+extension of an existing vma.
+
+So extract the code for "look up the vma of the fault" from x86, fix it
+up to do the necessary write locking, and make it available as a helper
+function for other architectures that can use the common helper.
+
+Note: I say "common helper", but it really only handles the normal
+stack-grows-down case.  Which is all architectures except for PA-RISC
+and IA64.  So some rare architectures can't use the helper, but if they
+care they'll just need to open-code this logic.
+
+It's also worth pointing out that this code really would like to have an
+optimistic "mmap_upgrade_trylock()" to make it quicker to go from a
+read-lock (for the common case) to taking the write lock (for having to
+extend the vma) in the normal single-threaded situation where there is
+no other locking activity.
+
+But that _is_ all the very uncommon special case, so while it would be
+nice to have such an operation, it probably doesn't matter in reality.
+I did put in the skeleton code for such a possible future expansion,
+even if it only acts as pseudo-documentation for what we're doing.
+
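For illustration only (not part of the patch): a minimal sketch of the shape an architecture fault handler takes once it is built on the new helper. The example_* name is hypothetical, and signal/error reporting is elided.

/* Illustrative sketch, not from the patch: generic handler shape. */
#include <linux/mm.h>
#include <linux/ptrace.h>
#include <linux/sched/signal.h>

static void example_do_page_fault(struct pt_regs *regs, unsigned long address)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned int flags = FAULT_FLAG_DEFAULT;
        vm_fault_t fault;

        if (user_mode(regs))
                flags |= FAULT_FLAG_USER;

retry:
        /*
         * Takes the mmap lock for reading (upgrading to the write lock
         * only if the stack has to grow, then downgrading again) and
         * returns the vma, or NULL with no lock held at all.
         */
        vma = lock_mm_and_find_vma(mm, address, regs);
        if (unlikely(!vma))
                return;         /* no lock held: arch SEGV/oops path goes here */

        fault = handle_mm_fault(vma, address, flags, regs);

        if (fault_signal_pending(fault, regs))
                return;         /* lock already released by the fault path */

        if (fault & VM_FAULT_RETRY) {
                /* handle_mm_fault() dropped the lock before asking to retry. */
                flags |= FAULT_FLAG_TRIED;
                goto retry;
        }

        mmap_read_unlock(mm);
        /* VM_FAULT_ERROR handling elided. */
}

The contract to note, visible in the x86 conversion below, is that a NULL return means the lock has already been dropped, so callers must use their *_nosemaphore error paths.
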
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+[6.1: Ignore CONFIG_PER_VMA_LOCK context]
+Signed-off-by: Samuel Mendoza-Jonas <samjonas@amazon.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/Kconfig    |    1 
+ arch/x86/mm/fault.c |   52 ----------------------
+ include/linux/mm.h  |    2 
+ mm/Kconfig          |    4 +
+ mm/memory.c         |  121 ++++++++++++++++++++++++++++++++++++++++++++++++++++
+ 5 files changed, 130 insertions(+), 50 deletions(-)
+
+--- a/arch/x86/Kconfig
++++ b/arch/x86/Kconfig
+@@ -271,6 +271,7 @@ config X86
+       select HAVE_GENERIC_VDSO
+       select HOTPLUG_SMT                      if SMP
+       select IRQ_FORCED_THREADING
++      select LOCK_MM_AND_FIND_VMA
+       select NEED_PER_CPU_EMBED_FIRST_CHUNK
+       select NEED_PER_CPU_PAGE_FIRST_CHUNK
+       select NEED_SG_DMA_LENGTH
+--- a/arch/x86/mm/fault.c
++++ b/arch/x86/mm/fault.c
+@@ -900,12 +900,6 @@ __bad_area(struct pt_regs *regs, unsigne
+       __bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
+ }
+-static noinline void
+-bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+-{
+-      __bad_area(regs, error_code, address, 0, SEGV_MAPERR);
+-}
+-
+ static inline bool bad_area_access_from_pkeys(unsigned long error_code,
+               struct vm_area_struct *vma)
+ {
+@@ -1354,51 +1348,10 @@ void do_user_addr_fault(struct pt_regs *
+       }
+ #endif
+-      /*
+-       * Kernel-mode access to the user address space should only occur
+-       * on well-defined single instructions listed in the exception
+-       * tables.  But, an erroneous kernel fault occurring outside one of
+-       * those areas which also holds mmap_lock might deadlock attempting
+-       * to validate the fault against the address space.
+-       *
+-       * Only do the expensive exception table search when we might be at
+-       * risk of a deadlock.  This happens if we
+-       * 1. Failed to acquire mmap_lock, and
+-       * 2. The access did not originate in userspace.
+-       */
+-      if (unlikely(!mmap_read_trylock(mm))) {
+-              if (!user_mode(regs) && !search_exception_tables(regs->ip)) {
+-                      /*
+-                       * Fault from code in kernel from
+-                       * which we do not expect faults.
+-                       */
+-                      bad_area_nosemaphore(regs, error_code, address);
+-                      return;
+-              }
+ retry:
+-              mmap_read_lock(mm);
+-      } else {
+-              /*
+-               * The above down_read_trylock() might have succeeded in
+-               * which case we'll have missed the might_sleep() from
+-               * down_read():
+-               */
+-              might_sleep();
+-      }
+-
+-      vma = find_vma(mm, address);
++      vma = lock_mm_and_find_vma(mm, address, regs);
+       if (unlikely(!vma)) {
+-              bad_area(regs, error_code, address);
+-              return;
+-      }
+-      if (likely(vma->vm_start <= address))
+-              goto good_area;
+-      if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+-              bad_area(regs, error_code, address);
+-              return;
+-      }
+-      if (unlikely(expand_stack(vma, address))) {
+-              bad_area(regs, error_code, address);
++              bad_area_nosemaphore(regs, error_code, address);
+               return;
+       }
+@@ -1406,7 +1359,6 @@ retry:
+        * Ok, we have a good vm_area for this memory access, so
+        * we can handle it..
+        */
+-good_area:
+       if (unlikely(access_error(error_code, vma))) {
+               bad_area_access_error(regs, error_code, address, vma);
+               return;
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -1932,6 +1932,8 @@ void unmap_mapping_pages(struct address_
+               pgoff_t start, pgoff_t nr, bool even_cows);
+ void unmap_mapping_range(struct address_space *mapping,
+               loff_t const holebegin, loff_t const holelen, int even_cows);
++struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
++              unsigned long address, struct pt_regs *regs);
+ #else
+ static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
+                                        unsigned long address, unsigned int flags,
+--- a/mm/Kconfig
++++ b/mm/Kconfig
+@@ -1150,6 +1150,10 @@ config LRU_GEN_STATS
+         This option has a per-memcg and per-node memory overhead.
+ # }
++config LOCK_MM_AND_FIND_VMA
++      bool
++      depends on !STACK_GROWSUP
++
+ source "mm/damon/Kconfig"
+ endmenu
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -5246,6 +5246,127 @@ vm_fault_t handle_mm_fault(struct vm_are
+ }
+ EXPORT_SYMBOL_GPL(handle_mm_fault);
++#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
++#include <linux/extable.h>
++
++static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
++{
++      /* Even if this succeeds, make it clear we *might* have slept */
++      if (likely(mmap_read_trylock(mm))) {
++              might_sleep();
++              return true;
++      }
++
++      if (regs && !user_mode(regs)) {
++              unsigned long ip = instruction_pointer(regs);
++              if (!search_exception_tables(ip))
++                      return false;
++      }
++
++      mmap_read_lock(mm);
++      return true;
++}
++
++static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
++{
++      /*
++       * We don't have this operation yet.
++       *
++       * It should be easy enough to do: it's basically a
++       *    atomic_long_try_cmpxchg_acquire()
++       * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
++       * it also needs the proper lockdep magic etc.
++       */
++      return false;
++}
++
++static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
++{
++      mmap_read_unlock(mm);
++      if (regs && !user_mode(regs)) {
++              unsigned long ip = instruction_pointer(regs);
++              if (!search_exception_tables(ip))
++                      return false;
++      }
++      mmap_write_lock(mm);
++      return true;
++}
++
++/*
++ * Helper for page fault handling.
++ *
++ * This is kind of equivalent to "mmap_read_lock()" followed
++ * by "find_extend_vma()", except it's a lot more careful about
++ * the locking (and will drop the lock on failure).
++ *
++ * For example, if we have a kernel bug that causes a page
++ * fault, we don't want to just use mmap_read_lock() to get
++ * the mm lock, because that would deadlock if the bug were
++ * to happen while we're holding the mm lock for writing.
++ *
++ * So this checks the exception tables on kernel faults in
++ * order to only do this all for instructions that are actually
++ * expected to fault.
++ *
++ * We can also actually take the mm lock for writing if we
++ * need to extend the vma, which helps the VM layer a lot.
++ */
++struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
++                      unsigned long addr, struct pt_regs *regs)
++{
++      struct vm_area_struct *vma;
++
++      if (!get_mmap_lock_carefully(mm, regs))
++              return NULL;
++
++      vma = find_vma(mm, addr);
++      if (likely(vma && (vma->vm_start <= addr)))
++              return vma;
++
++      /*
++       * Well, dang. We might still be successful, but only
++       * if we can extend a vma to do so.
++       */
++      if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
++              mmap_read_unlock(mm);
++              return NULL;
++      }
++
++      /*
++       * We can try to upgrade the mmap lock atomically,
++       * in which case we can continue to use the vma
++       * we already looked up.
++       *
++       * Otherwise we'll have to drop the mmap lock and
++       * re-take it, and also look up the vma again,
++       * re-checking it.
++       */
++      if (!mmap_upgrade_trylock(mm)) {
++              if (!upgrade_mmap_lock_carefully(mm, regs))
++                      return NULL;
++
++              vma = find_vma(mm, addr);
++              if (!vma)
++                      goto fail;
++              if (vma->vm_start <= addr)
++                      goto success;
++              if (!(vma->vm_flags & VM_GROWSDOWN))
++                      goto fail;
++      }
++
++      if (expand_stack(vma, addr))
++              goto fail;
++
++success:
++      mmap_write_downgrade(mm);
++      return vma;
++
++fail:
++      mmap_write_unlock(mm);
++      return NULL;
++}
++#endif
++
+ #ifndef __PAGETABLE_P4D_FOLDED
+ /*
+  * Allocate p4d page table.
diff --git a/queue-6.1/mm-make-find_extend_vma-fail-if-write-lock-not-held.patch b/queue-6.1/mm-make-find_extend_vma-fail-if-write-lock-not-held.patch
new file mode 100644 (file)
index 0000000..37abf97
--- /dev/null
@@ -0,0 +1,244 @@
+From 37a9a30aeabe9fd620bcda2bb333f28a1593820d Mon Sep 17 00:00:00 2001
+From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
+Date: Fri, 16 Jun 2023 15:58:54 -0700
+Subject: mm: make find_extend_vma() fail if write lock not held
+
+From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
+
+commit f440fa1ac955e2898893f9301568435eb5cdfc4b upstream.
+
+Make calls to extend_vma() and find_extend_vma() fail if the write lock
+is required.
+
+To avoid making this a flag-day event, this still allows the old
+read-locking case for the trivial situations, and passes in a flag to
+say "is it write-locked".  That way write-lockers can say "yes, I'm
+being careful", and legacy users will continue to work in all the common
+cases until they have been fully converted to the new world order.
+
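For illustration only (not part of the patch): what a converted, careful caller looks like under this transitional API. The example_* name is hypothetical.

/* Illustrative sketch, not from the patch. */
#include <linux/mm.h>

static struct vm_area_struct *example_extend_under_write_lock(struct mm_struct *mm,
                                                              unsigned long addr)
{
        struct vm_area_struct *vma;

        if (mmap_write_lock_killable(mm))
                return NULL;

        /*
         * write_locked=true tells the expansion code it really may modify
         * the vma tree.  Legacy callers keep passing false through the old
         * find_extend_vma()/expand_stack() wrappers and get -EAGAIN from
         * expand_upwards()/expand_downwards() in the cases that now need
         * the write lock.
         */
        vma = find_extend_vma_locked(mm, addr, true);
        mmap_write_unlock(mm);
        return vma;
}
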
+Co-Developed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
+Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
+Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Samuel Mendoza-Jonas <samjonas@amazon.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/binfmt_elf.c    |    6 +++---
+ fs/exec.c          |    5 +++--
+ include/linux/mm.h |   10 +++++++---
+ mm/memory.c        |    2 +-
+ mm/mmap.c          |   50 +++++++++++++++++++++++++++++++++-----------------
+ mm/nommu.c         |    3 ++-
+ 6 files changed, 49 insertions(+), 27 deletions(-)
+
+--- a/fs/binfmt_elf.c
++++ b/fs/binfmt_elf.c
+@@ -315,10 +315,10 @@ create_elf_tables(struct linux_binprm *b
+        * Grow the stack manually; some architectures have a limit on how
+        * far ahead a user-space access may be in order to grow the stack.
+        */
+-      if (mmap_read_lock_killable(mm))
++      if (mmap_write_lock_killable(mm))
+               return -EINTR;
+-      vma = find_extend_vma(mm, bprm->p);
+-      mmap_read_unlock(mm);
++      vma = find_extend_vma_locked(mm, bprm->p, true);
++      mmap_write_unlock(mm);
+       if (!vma)
+               return -EFAULT;
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -203,7 +203,8 @@ static struct page *get_arg_page(struct
+ #ifdef CONFIG_STACK_GROWSUP
+       if (write) {
+-              ret = expand_downwards(bprm->vma, pos);
++              /* We claim to hold the lock - nobody to race with */
++              ret = expand_downwards(bprm->vma, pos, true);
+               if (ret < 0)
+                       return NULL;
+       }
+@@ -854,7 +855,7 @@ int setup_arg_pages(struct linux_binprm
+               stack_base = vma->vm_start - stack_expand;
+ #endif
+       current->mm->start_stack = bprm->p;
+-      ret = expand_stack(vma, stack_base);
++      ret = expand_stack_locked(vma, stack_base, true);
+       if (ret)
+               ret = -EFAULT;
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -2810,11 +2810,13 @@ extern vm_fault_t filemap_page_mkwrite(s
+ extern unsigned long stack_guard_gap;
+ /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
+-extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
++              bool write_locked);
++#define expand_stack(vma,addr) expand_stack_locked(vma,addr,false)
+ /* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */
+-extern int expand_downwards(struct vm_area_struct *vma,
+-              unsigned long address);
++int expand_downwards(struct vm_area_struct *vma, unsigned long address,
++              bool write_locked);
+ #if VM_GROWSUP
+ extern int expand_upwards(struct vm_area_struct *vma, unsigned long address);
+ #else
+@@ -2915,6 +2917,8 @@ unsigned long change_prot_numa(struct vm
+ #endif
+ struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *,
++              unsigned long addr, bool write_locked);
+ int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
+                       unsigned long pfn, unsigned long size, pgprot_t);
+ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -5352,7 +5352,7 @@ struct vm_area_struct *lock_mm_and_find_
+                       goto fail;
+       }
+-      if (expand_stack(vma, addr))
++      if (expand_stack_locked(vma, addr, true))
+               goto fail;
+ success:
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -1945,7 +1945,8 @@ static int acct_stack_growth(struct vm_a
+  * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
+  * vma is the last one with address > vma->vm_end.  Have to extend vma.
+  */
+-int expand_upwards(struct vm_area_struct *vma, unsigned long address)
++int expand_upwards(struct vm_area_struct *vma, unsigned long address,
++              bool write_locked)
+ {
+       struct mm_struct *mm = vma->vm_mm;
+       struct vm_area_struct *next;
+@@ -1969,6 +1970,8 @@ int expand_upwards(struct vm_area_struct
+       if (gap_addr < address || gap_addr > TASK_SIZE)
+               gap_addr = TASK_SIZE;
++      if (!write_locked)
++              return -EAGAIN;
+       next = find_vma_intersection(mm, vma->vm_end, gap_addr);
+       if (next && vma_is_accessible(next)) {
+               if (!(next->vm_flags & VM_GROWSUP))
+@@ -2037,7 +2040,8 @@ int expand_upwards(struct vm_area_struct
+ /*
+  * vma is the first one with address < vma->vm_start.  Have to extend vma.
+  */
+-int expand_downwards(struct vm_area_struct *vma, unsigned long address)
++int expand_downwards(struct vm_area_struct *vma, unsigned long address,
++              bool write_locked)
+ {
+       struct mm_struct *mm = vma->vm_mm;
+       MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start);
+@@ -2051,10 +2055,13 @@ int expand_downwards(struct vm_area_stru
+       /* Enforce stack_guard_gap */
+       prev = mas_prev(&mas, 0);
+       /* Check that both stack segments have the same anon_vma? */
+-      if (prev && !(prev->vm_flags & VM_GROWSDOWN) &&
+-                      vma_is_accessible(prev)) {
+-              if (address - prev->vm_end < stack_guard_gap)
++      if (prev) {
++              if (!(prev->vm_flags & VM_GROWSDOWN) &&
++                  vma_is_accessible(prev) &&
++                  (address - prev->vm_end < stack_guard_gap))
+                       return -ENOMEM;
++              if (!write_locked && (prev->vm_end == address))
++                      return -EAGAIN;
+       }
+       if (mas_preallocate(&mas, vma, GFP_KERNEL))
+@@ -2132,13 +2139,14 @@ static int __init cmdline_parse_stack_gu
+ __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
+ #ifdef CONFIG_STACK_GROWSUP
+-int expand_stack(struct vm_area_struct *vma, unsigned long address)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
++              bool write_locked)
+ {
+-      return expand_upwards(vma, address);
++      return expand_upwards(vma, address, write_locked);
+ }
+-struct vm_area_struct *
+-find_extend_vma(struct mm_struct *mm, unsigned long addr)
++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm,
++              unsigned long addr, bool write_locked)
+ {
+       struct vm_area_struct *vma, *prev;
+@@ -2146,20 +2154,25 @@ find_extend_vma(struct mm_struct *mm, un
+       vma = find_vma_prev(mm, addr, &prev);
+       if (vma && (vma->vm_start <= addr))
+               return vma;
+-      if (!prev || expand_stack(prev, addr))
++      if (!prev)
++              return NULL;
++      if (expand_stack_locked(prev, addr, write_locked))
+               return NULL;
+       if (prev->vm_flags & VM_LOCKED)
+               populate_vma_page_range(prev, addr, prev->vm_end, NULL);
+       return prev;
+ }
+ #else
+-int expand_stack(struct vm_area_struct *vma, unsigned long address)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
++              bool write_locked)
+ {
+-      return expand_downwards(vma, address);
++      if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
++              return -EINVAL;
++      return expand_downwards(vma, address, write_locked);
+ }
+-struct vm_area_struct *
+-find_extend_vma(struct mm_struct *mm, unsigned long addr)
++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm,
++              unsigned long addr, bool write_locked)
+ {
+       struct vm_area_struct *vma;
+       unsigned long start;
+@@ -2170,10 +2183,8 @@ find_extend_vma(struct mm_struct *mm, un
+               return NULL;
+       if (vma->vm_start <= addr)
+               return vma;
+-      if (!(vma->vm_flags & VM_GROWSDOWN))
+-              return NULL;
+       start = vma->vm_start;
+-      if (expand_stack(vma, addr))
++      if (expand_stack_locked(vma, addr, write_locked))
+               return NULL;
+       if (vma->vm_flags & VM_LOCKED)
+               populate_vma_page_range(vma, addr, start, NULL);
+@@ -2181,6 +2192,11 @@ find_extend_vma(struct mm_struct *mm, un
+ }
+ #endif
++struct vm_area_struct *find_extend_vma(struct mm_struct *mm,
++              unsigned long addr)
++{
++      return find_extend_vma_locked(mm, addr, false);
++}
+ EXPORT_SYMBOL_GPL(find_extend_vma);
+ /*
+--- a/mm/nommu.c
++++ b/mm/nommu.c
+@@ -694,7 +694,8 @@ struct vm_area_struct *find_extend_vma(s
+  * expand a stack to a given address
+  * - not supported under NOMMU conditions
+  */
+-int expand_stack(struct vm_area_struct *vma, unsigned long address)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
++              bool write_locked)
+ {
+       return -ENOMEM;
+ }
diff --git a/queue-6.1/mm-make-the-page-fault-mmap-locking-killable.patch b/queue-6.1/mm-make-the-page-fault-mmap-locking-killable.patch
new file mode 100644 (file)
index 0000000..a786150
--- /dev/null
@@ -0,0 +1,48 @@
+From 92a6879f1c3fc7fdf6660b10be045c457ec697c6 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Thu, 15 Jun 2023 16:17:48 -0700
+Subject: mm: make the page fault mmap locking killable
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit eda0047296a16d65a7f2bc60a408f70d178b2014 upstream.
+
+This is done as a separate patch from introducing the new
+lock_mm_and_find_vma() helper, because while it's an obvious change,
+it's not what x86 used to do in this area.
+
+We already abort the page fault on fatal signals anyway, so why should
+we wait for the mmap lock only to then abort later? With the new helper
+function that returns without the lock held on failure anyway, this is
+particularly easy and straightforward.
+
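For illustration only (not part of the patch): the killable-lock pattern being adopted here, wrapped in a hypothetical helper name.

/* Illustrative sketch, not from the patch. */
#include <linux/mm.h>

static bool example_lock_mm_killable(struct mm_struct *mm, bool write)
{
        /*
         * The *_killable() variants return -EINTR if a fatal signal is
         * delivered while sleeping on mmap_lock, so a page fault from a
         * task that is already being killed bails out immediately rather
         * than acquiring the lock only to throw the work away.
         */
        if (write)
                return !mmap_write_lock_killable(mm);
        return !mmap_read_lock_killable(mm);
}
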
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Samuel Mendoza-Jonas <samjonas@amazon.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/memory.c |    6 ++----
+ 1 file changed, 2 insertions(+), 4 deletions(-)
+
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -5263,8 +5263,7 @@ static inline bool get_mmap_lock_careful
+                       return false;
+       }
+-      mmap_read_lock(mm);
+-      return true;
++      return !mmap_read_lock_killable(mm);
+ }
+ static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
+@@ -5288,8 +5287,7 @@ static inline bool upgrade_mmap_lock_car
+               if (!search_exception_tables(ip))
+                       return false;
+       }
+-      mmap_write_lock(mm);
+-      return true;
++      return !mmap_write_lock_killable(mm);
+ }
+ /*
diff --git a/queue-6.1/powerpc-mm-convert-coprocessor-fault-to-lock_mm_and_find_vma.patch b/queue-6.1/powerpc-mm-convert-coprocessor-fault-to-lock_mm_and_find_vma.patch
new file mode 100644 (file)
index 0000000..04d3249
--- /dev/null
@@ -0,0 +1,49 @@
+From d47a1e567e9744a2a097ae2a39a2b028619d1f15 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Sat, 24 Jun 2023 11:17:05 -0700
+Subject: powerpc/mm: convert coprocessor fault to lock_mm_and_find_vma()
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit 2cd76c50d0b41cec5c87abfcdf25b236a2793fb6 upstream.
+
+This is one of the simple cases, except there's no pt_regs pointer.
+Which is fine, as lock_mm_and_find_vma() is set up to work fine with a
+NULL pt_regs.
+
+Powerpc already enabled LOCK_MM_AND_FIND_VMA for the main CPU faulting,
+so we can just use the helper without any extra work.
+
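For illustration only (not part of the patch): the pattern for callers that have no pt_regs, such as this coprocessor path. The example_* name is hypothetical.

/* Illustrative sketch, not from the patch. */
#include <linux/mm.h>

static int example_external_fault(struct mm_struct *mm, unsigned long ea,
                                  bool is_write)
{
        struct vm_area_struct *vma;
        vm_fault_t fault;

        /*
         * A NULL pt_regs just skips the "is this kernel text expected to
         * fault?" exception-table check inside the helper; taking the lock
         * (and dropping it on failure) works exactly the same.
         */
        vma = lock_mm_and_find_vma(mm, ea, NULL);
        if (!vma)
                return -EFAULT;

        if (is_write && !(vma->vm_flags & VM_WRITE)) {
                mmap_read_unlock(mm);
                return -EFAULT;
        }

        fault = handle_mm_fault(vma, ea, is_write ? FAULT_FLAG_WRITE : 0, NULL);
        mmap_read_unlock(mm);
        return (fault & VM_FAULT_ERROR) ? -EFAULT : 0;
}
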
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Samuel Mendoza-Jonas <samjonas@amazon.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/mm/copro_fault.c |   14 +++-----------
+ 1 file changed, 3 insertions(+), 11 deletions(-)
+
+--- a/arch/powerpc/mm/copro_fault.c
++++ b/arch/powerpc/mm/copro_fault.c
+@@ -33,19 +33,11 @@ int copro_handle_mm_fault(struct mm_stru
+       if (mm->pgd == NULL)
+               return -EFAULT;
+-      mmap_read_lock(mm);
+-      ret = -EFAULT;
+-      vma = find_vma(mm, ea);
++      vma = lock_mm_and_find_vma(mm, ea, NULL);
+       if (!vma)
+-              goto out_unlock;
+-
+-      if (ea < vma->vm_start) {
+-              if (!(vma->vm_flags & VM_GROWSDOWN))
+-                      goto out_unlock;
+-              if (expand_stack(vma, ea))
+-                      goto out_unlock;
+-      }
++              return -EFAULT;
++      ret = -EFAULT;
+       is_write = dsisr & DSISR_ISSTORE;
+       if (is_write) {
+               if (!(vma->vm_flags & VM_WRITE))
diff --git a/queue-6.1/powerpc-mm-convert-to-using-lock_mm_and_find_vma.patch b/queue-6.1/powerpc-mm-convert-to-using-lock_mm_and_find_vma.patch
new file mode 100644 (file)
index 0000000..38a8974
--- /dev/null
@@ -0,0 +1,88 @@
+From 689298e7d498f2c6d3e8116bce0a7c769e5369dc Mon Sep 17 00:00:00 2001
+From: Michael Ellerman <mpe@ellerman.id.au>
+Date: Fri, 16 Jun 2023 15:51:29 +1000
+Subject: powerpc/mm: Convert to using lock_mm_and_find_vma()
+
+From: Michael Ellerman <mpe@ellerman.id.au>
+
+commit e6fe228c4ffafdfc970cf6d46883a1f481baf7ea upstream.
+
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Samuel Mendoza-Jonas <samjonas@amazon.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/Kconfig    |    1 +
+ arch/powerpc/mm/fault.c |   41 ++++-------------------------------------
+ 2 files changed, 5 insertions(+), 37 deletions(-)
+
+--- a/arch/powerpc/Kconfig
++++ b/arch/powerpc/Kconfig
+@@ -257,6 +257,7 @@ config PPC
+       select IRQ_DOMAIN
+       select IRQ_FORCED_THREADING
+       select KASAN_VMALLOC                    if KASAN && MODULES
++      select LOCK_MM_AND_FIND_VMA
+       select MMU_GATHER_PAGE_SIZE
+       select MMU_GATHER_RCU_TABLE_FREE
+       select MMU_GATHER_MERGE_VMAS
+--- a/arch/powerpc/mm/fault.c
++++ b/arch/powerpc/mm/fault.c
+@@ -84,11 +84,6 @@ static int __bad_area(struct pt_regs *re
+       return __bad_area_nosemaphore(regs, address, si_code);
+ }
+-static noinline int bad_area(struct pt_regs *regs, unsigned long address)
+-{
+-      return __bad_area(regs, address, SEGV_MAPERR);
+-}
+-
+ static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address,
+                                   struct vm_area_struct *vma)
+ {
+@@ -481,40 +476,12 @@ static int ___do_page_fault(struct pt_re
+        * we will deadlock attempting to validate the fault against the
+        * address space.  Luckily the kernel only validly references user
+        * space from well defined areas of code, which are listed in the
+-       * exceptions table.
+-       *
+-       * As the vast majority of faults will be valid we will only perform
+-       * the source reference check when there is a possibility of a deadlock.
+-       * Attempt to lock the address space, if we cannot we then validate the
+-       * source.  If this is invalid we can skip the address space check,
+-       * thus avoiding the deadlock.
+-       */
+-      if (unlikely(!mmap_read_trylock(mm))) {
+-              if (!is_user && !search_exception_tables(regs->nip))
+-                      return bad_area_nosemaphore(regs, address);
+-
++       * exceptions table. lock_mm_and_find_vma() handles that logic.
++       */
+ retry:
+-              mmap_read_lock(mm);
+-      } else {
+-              /*
+-               * The above down_read_trylock() might have succeeded in
+-               * which case we'll have missed the might_sleep() from
+-               * down_read():
+-               */
+-              might_sleep();
+-      }
+-
+-      vma = find_vma(mm, address);
++      vma = lock_mm_and_find_vma(mm, address, regs);
+       if (unlikely(!vma))
+-              return bad_area(regs, address);
+-
+-      if (unlikely(vma->vm_start > address)) {
+-              if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
+-                      return bad_area(regs, address);
+-
+-              if (unlikely(expand_stack(vma, address)))
+-                      return bad_area(regs, address);
+-      }
++              return bad_area_nosemaphore(regs, address);
+       if (unlikely(access_pkey_error(is_write, is_exec,
+                                      (error_code & DSISR_KEYFAULT), vma)))
diff --git a/queue-6.1/riscv-mm-convert-to-using-lock_mm_and_find_vma.patch b/queue-6.1/riscv-mm-convert-to-using-lock_mm_and_find_vma.patch
new file mode 100644 (file)
index 0000000..aa14f59
--- /dev/null
@@ -0,0 +1,98 @@
+From a907c689b4e7014c73c7c34fb1520431b75c787c Mon Sep 17 00:00:00 2001
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Thu, 22 Jun 2023 20:18:18 +0200
+Subject: riscv/mm: Convert to using lock_mm_and_find_vma()
+
+From: Ben Hutchings <ben@decadent.org.uk>
+
+commit 7267ef7b0b77f4ed23b7b3c87d8eca7bd9c2d007 upstream.
+
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+[6.1: Kconfig context]
+Signed-off-by: Samuel Mendoza-Jonas <samjonas@amazon.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/riscv/Kconfig    |    1 +
+ arch/riscv/mm/fault.c |   31 +++++++++++++------------------
+ 2 files changed, 14 insertions(+), 18 deletions(-)
+
+--- a/arch/riscv/Kconfig
++++ b/arch/riscv/Kconfig
+@@ -114,6 +114,7 @@ config RISCV
+       select HAVE_RSEQ
+       select IRQ_DOMAIN
+       select IRQ_FORCED_THREADING
++      select LOCK_MM_AND_FIND_VMA
+       select MODULES_USE_ELF_RELA if MODULES
+       select MODULE_SECTIONS if MODULES
+       select OF
+--- a/arch/riscv/mm/fault.c
++++ b/arch/riscv/mm/fault.c
+@@ -83,13 +83,13 @@ static inline void mm_fault_error(struct
+       BUG();
+ }
+-static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr)
++static inline void
++bad_area_nosemaphore(struct pt_regs *regs, int code, unsigned long addr)
+ {
+       /*
+        * Something tried to access memory that isn't in our memory map.
+        * Fix it, but check if it's kernel or user first.
+        */
+-      mmap_read_unlock(mm);
+       /* User mode accesses just cause a SIGSEGV */
+       if (user_mode(regs)) {
+               do_trap(regs, SIGSEGV, code, addr);
+@@ -99,6 +99,15 @@ static inline void bad_area(struct pt_re
+       no_context(regs, addr);
+ }
++static inline void
++bad_area(struct pt_regs *regs, struct mm_struct *mm, int code,
++       unsigned long addr)
++{
++      mmap_read_unlock(mm);
++
++      bad_area_nosemaphore(regs, code, addr);
++}
++
+ static inline void vmalloc_fault(struct pt_regs *regs, int code, unsigned long addr)
+ {
+       pgd_t *pgd, *pgd_k;
+@@ -281,23 +290,10 @@ asmlinkage void do_page_fault(struct pt_
+       else if (cause == EXC_INST_PAGE_FAULT)
+               flags |= FAULT_FLAG_INSTRUCTION;
+ retry:
+-      mmap_read_lock(mm);
+-      vma = find_vma(mm, addr);
++      vma = lock_mm_and_find_vma(mm, addr, regs);
+       if (unlikely(!vma)) {
+               tsk->thread.bad_cause = cause;
+-              bad_area(regs, mm, code, addr);
+-              return;
+-      }
+-      if (likely(vma->vm_start <= addr))
+-              goto good_area;
+-      if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+-              tsk->thread.bad_cause = cause;
+-              bad_area(regs, mm, code, addr);
+-              return;
+-      }
+-      if (unlikely(expand_stack(vma, addr))) {
+-              tsk->thread.bad_cause = cause;
+-              bad_area(regs, mm, code, addr);
++              bad_area_nosemaphore(regs, code, addr);
+               return;
+       }
+@@ -305,7 +301,6 @@ retry:
+        * Ok, we have a good vm_area for this memory access, so
+        * we can handle it.
+        */
+-good_area:
+       code = SEGV_ACCERR;
+       if (unlikely(access_error(cause, vma))) {
index eebea1092ff960e00027296a2c5a3eb5d95ebaff..34905d9a593becc9869f79e35786f50c331e797a 100644 (file)
@@ -11,3 +11,16 @@ x86-smp-use-dedicated-cache-line-for-mwait_play_dead.patch
 x86-smp-cure-kexec-vs.-mwait_play_dead-breakage.patch
 can-isotp-isotp_sendmsg-fix-return-error-fix-on-tx-path.patch
 maple_tree-fix-potential-out-of-bounds-access-in-mas_wr_end_piv.patch
+
+mm-introduce-new-lock_mm_and_find_vma-page-fault-helper.patch
+mm-make-the-page-fault-mmap-locking-killable.patch
+arm64-mm-convert-to-using-lock_mm_and_find_vma.patch
+powerpc-mm-convert-to-using-lock_mm_and_find_vma.patch
+mips-mm-convert-to-using-lock_mm_and_find_vma.patch
+riscv-mm-convert-to-using-lock_mm_and_find_vma.patch
+arm-mm-convert-to-using-lock_mm_and_find_vma.patch
+mm-fault-convert-remaining-simple-cases-to-lock_mm_and_find_vma.patch
+powerpc-mm-convert-coprocessor-fault-to-lock_mm_and_find_vma.patch
+mm-make-find_extend_vma-fail-if-write-lock-not-held.patch
+execve-expand-new-process-stack-manually-ahead-of-time.patch
+mm-always-expand-the-stack-with-the-mmap-write-lock-held.patch
diff --git a/queue-6.3/arm-mm-convert-to-using-lock_mm_and_find_vma.patch b/queue-6.3/arm-mm-convert-to-using-lock_mm_and_find_vma.patch
new file mode 100644 (file)
index 0000000..708b2a7
--- /dev/null
@@ -0,0 +1,136 @@
+From 8b35ca3e45e35a26a21427f35d4093606e93ad0a Mon Sep 17 00:00:00 2001
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Thu, 22 Jun 2023 21:24:30 +0200
+Subject: arm/mm: Convert to using lock_mm_and_find_vma()
+
+From: Ben Hutchings <ben@decadent.org.uk>
+
+commit 8b35ca3e45e35a26a21427f35d4093606e93ad0a upstream.
+
+arm has an additional check for address < FIRST_USER_ADDRESS before
+expanding the stack.  Since FIRST_USER_ADDRESS is defined everywhere
+(generally as 0), move that check to the generic expand_downwards().
+
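For illustration only (not part of the patch): the combined floor check as it now reads in the generic path; FIRST_USER_ADDRESS is architecture-defined (0 on most architectures), and the example_* name is hypothetical.

/* Illustrative sketch, not from the patch. */
#include <linux/mm.h>
#include <linux/security.h>     /* mmap_min_addr */

static int example_may_grow_stack_down_to(unsigned long address)
{
        address &= PAGE_MASK;
        /*
         * mmap_min_addr is the sysctl-controlled floor for user mappings;
         * FIRST_USER_ADDRESS is the architecture's lowest valid user
         * address.  Refusing both here removes the need for the
         * arm-specific check in the fault handler.
         */
        if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
                return -EPERM;
        return 0;
}
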
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm/Kconfig    |    1 
+ arch/arm/mm/fault.c |   63 +++++++++++-----------------------------------------
+ mm/mmap.c           |    2 -
+ 3 files changed, 16 insertions(+), 50 deletions(-)
+
+--- a/arch/arm/Kconfig
++++ b/arch/arm/Kconfig
+@@ -125,6 +125,7 @@ config ARM
+       select HAVE_UID16
+       select HAVE_VIRT_CPU_ACCOUNTING_GEN
+       select IRQ_FORCED_THREADING
++      select LOCK_MM_AND_FIND_VMA
+       select MODULES_USE_ELF_REL
+       select NEED_DMA_MAP_STATE
+       select OF_EARLY_FLATTREE if OF
+--- a/arch/arm/mm/fault.c
++++ b/arch/arm/mm/fault.c
+@@ -232,37 +232,11 @@ static inline bool is_permission_fault(u
+       return false;
+ }
+-static vm_fault_t __kprobes
+-__do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int flags,
+-              unsigned long vma_flags, struct pt_regs *regs)
+-{
+-      struct vm_area_struct *vma = find_vma(mm, addr);
+-      if (unlikely(!vma))
+-              return VM_FAULT_BADMAP;
+-
+-      if (unlikely(vma->vm_start > addr)) {
+-              if (!(vma->vm_flags & VM_GROWSDOWN))
+-                      return VM_FAULT_BADMAP;
+-              if (addr < FIRST_USER_ADDRESS)
+-                      return VM_FAULT_BADMAP;
+-              if (expand_stack(vma, addr))
+-                      return VM_FAULT_BADMAP;
+-      }
+-
+-      /*
+-       * ok, we have a good vm_area for this memory access, check the
+-       * permissions on the VMA allow for the fault which occurred.
+-       */
+-      if (!(vma->vm_flags & vma_flags))
+-              return VM_FAULT_BADACCESS;
+-
+-      return handle_mm_fault(vma, addr & PAGE_MASK, flags, regs);
+-}
+-
+ static int __kprobes
+ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
+ {
+       struct mm_struct *mm = current->mm;
++      struct vm_area_struct *vma;
+       int sig, code;
+       vm_fault_t fault;
+       unsigned int flags = FAULT_FLAG_DEFAULT;
+@@ -301,31 +275,21 @@ do_page_fault(unsigned long addr, unsign
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
+-      /*
+-       * As per x86, we may deadlock here.  However, since the kernel only
+-       * validly references user space from well defined areas of the code,
+-       * we can bug out early if this is from code which shouldn't.
+-       */
+-      if (!mmap_read_trylock(mm)) {
+-              if (!user_mode(regs) && !search_exception_tables(regs->ARM_pc))
+-                      goto no_context;
+ retry:
+-              mmap_read_lock(mm);
+-      } else {
+-              /*
+-               * The above down_read_trylock() might have succeeded in
+-               * which case, we'll have missed the might_sleep() from
+-               * down_read()
+-               */
+-              might_sleep();
+-#ifdef CONFIG_DEBUG_VM
+-              if (!user_mode(regs) &&
+-                  !search_exception_tables(regs->ARM_pc))
+-                      goto no_context;
+-#endif
++      vma = lock_mm_and_find_vma(mm, addr, regs);
++      if (unlikely(!vma)) {
++              fault = VM_FAULT_BADMAP;
++              goto bad_area;
+       }
+-      fault = __do_page_fault(mm, addr, flags, vm_flags, regs);
++      /*
++       * ok, we have a good vm_area for this memory access, check the
++       * permissions on the VMA allow for the fault which occurred.
++       */
++      if (!(vma->vm_flags & vm_flags))
++              fault = VM_FAULT_BADACCESS;
++      else
++              fault = handle_mm_fault(vma, addr & PAGE_MASK, flags, regs);
+       /* If we need to retry but a fatal signal is pending, handle the
+        * signal first. We do not need to release the mmap_lock because
+@@ -356,6 +320,7 @@ retry:
+       if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS))))
+               return 0;
++bad_area:
+       /*
+        * If we are in kernel mode at this point, we
+        * have no context to handle this fault with.
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -1999,7 +1999,7 @@ int expand_downwards(struct vm_area_stru
+       int error = 0;
+       address &= PAGE_MASK;
+-      if (address < mmap_min_addr)
++      if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
+               return -EPERM;
+       /* Enforce stack_guard_gap */
diff --git a/queue-6.3/arm64-mm-convert-to-using-lock_mm_and_find_vma.patch b/queue-6.3/arm64-mm-convert-to-using-lock_mm_and_find_vma.patch
new file mode 100644 (file)
index 0000000..4d94745
--- /dev/null
@@ -0,0 +1,101 @@
+From ae870a68b5d13d67cf4f18d47bb01ee3fee40acb Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Thu, 15 Jun 2023 17:11:44 -0700
+Subject: arm64/mm: Convert to using lock_mm_and_find_vma()
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit ae870a68b5d13d67cf4f18d47bb01ee3fee40acb upstream.
+
+This converts arm64 to use the new page fault helper.  It was very
+straightforward, but still needed a fix for the "obvious" conversion I
+initially did.  Thanks to Suren for the fix and testing.
+
+Fixed-and-tested-by: Suren Baghdasaryan <surenb@google.com>
+Unnecessary-code-removal-by: Liam R. Howlett <Liam.Howlett@oracle.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/Kconfig    |    1 +
+ arch/arm64/mm/fault.c |   44 +++++++-------------------------------------
+ 2 files changed, 8 insertions(+), 37 deletions(-)
+
+--- a/arch/arm64/Kconfig
++++ b/arch/arm64/Kconfig
+@@ -219,6 +219,7 @@ config ARM64
+       select IRQ_DOMAIN
+       select IRQ_FORCED_THREADING
+       select KASAN_VMALLOC if KASAN
++      select LOCK_MM_AND_FIND_VMA
+       select MODULES_USE_ELF_RELA
+       select NEED_DMA_MAP_STATE
+       select NEED_SG_DMA_LENGTH
+--- a/arch/arm64/mm/fault.c
++++ b/arch/arm64/mm/fault.c
+@@ -483,27 +483,14 @@ static void do_bad_area(unsigned long fa
+ #define VM_FAULT_BADMAP               ((__force vm_fault_t)0x010000)
+ #define VM_FAULT_BADACCESS    ((__force vm_fault_t)0x020000)
+-static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr,
++static vm_fault_t __do_page_fault(struct mm_struct *mm,
++                                struct vm_area_struct *vma, unsigned long addr,
+                                 unsigned int mm_flags, unsigned long vm_flags,
+                                 struct pt_regs *regs)
+ {
+-      struct vm_area_struct *vma = find_vma(mm, addr);
+-
+-      if (unlikely(!vma))
+-              return VM_FAULT_BADMAP;
+-
+       /*
+        * Ok, we have a good vm_area for this memory access, so we can handle
+        * it.
+-       */
+-      if (unlikely(vma->vm_start > addr)) {
+-              if (!(vma->vm_flags & VM_GROWSDOWN))
+-                      return VM_FAULT_BADMAP;
+-              if (expand_stack(vma, addr))
+-                      return VM_FAULT_BADMAP;
+-      }
+-
+-      /*
+        * Check that the permissions on the VMA allow for the fault which
+        * occurred.
+        */
+@@ -585,31 +572,14 @@ static int __kprobes do_page_fault(unsig
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
+-      /*
+-       * As per x86, we may deadlock here. However, since the kernel only
+-       * validly references user space from well defined areas of the code,
+-       * we can bug out early if this is from code which shouldn't.
+-       */
+-      if (!mmap_read_trylock(mm)) {
+-              if (!user_mode(regs) && !search_exception_tables(regs->pc))
+-                      goto no_context;
+ retry:
+-              mmap_read_lock(mm);
+-      } else {
+-              /*
+-               * The above mmap_read_trylock() might have succeeded in which
+-               * case, we'll have missed the might_sleep() from down_read().
+-               */
+-              might_sleep();
+-#ifdef CONFIG_DEBUG_VM
+-              if (!user_mode(regs) && !search_exception_tables(regs->pc)) {
+-                      mmap_read_unlock(mm);
+-                      goto no_context;
+-              }
+-#endif
++      vma = lock_mm_and_find_vma(mm, addr, regs);
++      if (unlikely(!vma)) {
++              fault = VM_FAULT_BADMAP;
++              goto done;
+       }
+-      fault = __do_page_fault(mm, addr, mm_flags, vm_flags, regs);
++      fault = __do_page_fault(mm, vma, addr, mm_flags, vm_flags, regs);
+       /* Quick path to respond to signals */
+       if (fault_signal_pending(fault, regs)) {
diff --git a/queue-6.3/execve-expand-new-process-stack-manually-ahead-of-time.patch b/queue-6.3/execve-expand-new-process-stack-manually-ahead-of-time.patch
new file mode 100644 (file)
index 0000000..75b8e5a
--- /dev/null
@@ -0,0 +1,88 @@
+From f313c51d26aa87e69633c9b46efb37a930faca71 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Mon, 19 Jun 2023 11:34:15 -0700
+Subject: execve: expand new process stack manually ahead of time
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit f313c51d26aa87e69633c9b46efb37a930faca71 upstream.
+
+This is a small step towards a model where GUP itself would not expand
+the stack, and any user that needs GUP to not look up existing mappings,
+but actually expand on them, would have to do so manually before-hand,
+and with the mm lock held for writing.
+
+It turns out that execve() already did almost exactly that, except it
+didn't take the mm lock at all (it's single-threaded so no locking
+technically needed, but it could cause lockdep errors).  And it only did
+it for the CONFIG_STACK_GROWSUP case, since in that case GUP has
+obviously never expanded the stack downwards.
+
+So just make that CONFIG_STACK_GROWSUP case do the right thing with
+locking, and enable it generally.  This will eventually help GUP, and in
+the meantime avoids a special case and the lockdep issue.
+
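For illustration only (not part of the patch): the "grow it yourself under the write lock, then downgrade" pattern this change establishes; it mirrors what get_arg_page() does below, with hypothetical names.

/* Illustrative sketch, not from the patch. */
#include <linux/mm.h>

static struct page *example_pin_stack_page(struct mm_struct *mm,
                                           struct vm_area_struct *vma,
                                           unsigned long pos)
{
        struct page *page;
        int ret;

        if (pos < vma->vm_start) {
                /* Expand the stack ourselves, under the mmap write lock... */
                mmap_write_lock(mm);
                ret = expand_downwards(vma, pos, true);
                if (ret < 0) {
                        mmap_write_unlock(mm);
                        return NULL;
                }
                /* ...then drop back to a read lock for the page walk. */
                mmap_write_downgrade(mm);
        } else {
                mmap_read_lock(mm);
        }

        /* GUP now only ever sees an already-existing mapping. */
        ret = get_user_pages_remote(mm, pos, 1, FOLL_WRITE, &page, NULL, NULL);
        mmap_read_unlock(mm);
        return ret <= 0 ? NULL : page;
}
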
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/exec.c |   37 +++++++++++++++++++++----------------
+ 1 file changed, 21 insertions(+), 16 deletions(-)
+
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -199,34 +199,39 @@ static struct page *get_arg_page(struct
+               int write)
+ {
+       struct page *page;
++      struct vm_area_struct *vma = bprm->vma;
++      struct mm_struct *mm = bprm->mm;
+       int ret;
+-      unsigned int gup_flags = 0;
+-#ifdef CONFIG_STACK_GROWSUP
+-      if (write) {
+-              /* We claim to hold the lock - nobody to race with */
+-              ret = expand_downwards(bprm->vma, pos, true);
+-              if (ret < 0)
++      /*
++       * Avoid relying on expanding the stack down in GUP (which
++       * does not work for STACK_GROWSUP anyway), and just do it
++       * by hand ahead of time.
++       */
++      if (write && pos < vma->vm_start) {
++              mmap_write_lock(mm);
++              ret = expand_downwards(vma, pos, true);
++              if (unlikely(ret < 0)) {
++                      mmap_write_unlock(mm);
+                       return NULL;
+-      }
+-#endif
+-
+-      if (write)
+-              gup_flags |= FOLL_WRITE;
++              }
++              mmap_write_downgrade(mm);
++      } else
++              mmap_read_lock(mm);
+       /*
+        * We are doing an exec().  'current' is the process
+-       * doing the exec and bprm->mm is the new process's mm.
++       * doing the exec and 'mm' is the new process's mm.
+        */
+-      mmap_read_lock(bprm->mm);
+-      ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags,
++      ret = get_user_pages_remote(mm, pos, 1,
++                      write ? FOLL_WRITE : 0,
+                       &page, NULL, NULL);
+-      mmap_read_unlock(bprm->mm);
++      mmap_read_unlock(mm);
+       if (ret <= 0)
+               return NULL;
+       if (write)
+-              acct_arg_size(bprm, vma_pages(bprm->vma));
++              acct_arg_size(bprm, vma_pages(vma));
+       return page;
+ }
diff --git a/queue-6.3/gup-add-warning-if-some-caller-would-seem-to-want-stack-expansion.patch b/queue-6.3/gup-add-warning-if-some-caller-would-seem-to-want-stack-expansion.patch
new file mode 100644 (file)
index 0000000..cc1b1ef
--- /dev/null
@@ -0,0 +1,59 @@
+From a425ac5365f6cb3cc47bf83e6bff0213c10445f7 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Sun, 25 Jun 2023 14:02:25 -0700
+Subject: gup: add warning if some caller would seem to want stack expansion
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit a425ac5365f6cb3cc47bf83e6bff0213c10445f7 upstream.
+
+It feels very unlikely that anybody would want to do a GUP in an
+unmapped area under the stack pointer, but real users sometimes do some
+really strange things.  So add a (temporary) warning for the case where
+a GUP fails and expanding the stack might have made it work.
+
+It's trivial to do the expansion in the caller as part of getting the mm
+lock in the first place - see __access_remote_vm() for ptrace, for
+example - it's just that it's unnecessarily painful to do it deep in the
+guts of the GUP lookup when we might have to drop and re-take the lock.
+
+I doubt anybody actually does anything quite this strange, but let's be
+proactive: adding these warnings is simple, and will make debugging it
+much easier if they trigger.
+
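For illustration only (not part of the patch): what a caller that really does want expansion should do instead, assuming CONFIG_LOCK_MM_AND_FIND_VMA is available - resolve (and, if needed, grow) the vma while taking the mmap lock, so GUP itself never has to and the new warning never fires. The example_* name is hypothetical.

/* Illustrative sketch, not from the patch. */
#include <linux/mm.h>

static long example_remote_pin(struct mm_struct *mm, unsigned long addr,
                               unsigned int gup_flags, struct page **pages)
{
        struct vm_area_struct *vma;
        long ret;

        /* Takes the mmap lock and expands the stack up front if needed. */
        vma = lock_mm_and_find_vma(mm, addr, NULL);
        if (!vma)
                return -EFAULT;

        ret = get_user_pages_remote(mm, addr, 1, gup_flags, pages, NULL, NULL);
        mmap_read_unlock(mm);
        return ret;
}
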
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/gup.c |   12 ++++++++++--
+ 1 file changed, 10 insertions(+), 2 deletions(-)
+
+--- a/mm/gup.c
++++ b/mm/gup.c
+@@ -1096,7 +1096,11 @@ static long __get_user_pages(struct mm_s
+               /* first iteration or cross vma bound */
+               if (!vma || start >= vma->vm_end) {
+-                      vma = vma_lookup(mm, start);
++                      vma = find_vma(mm, start);
++                      if (vma && (start < vma->vm_start)) {
++                              WARN_ON_ONCE(vma->vm_flags & VM_GROWSDOWN);
++                              vma = NULL;
++                      }
+                       if (!vma && in_gate_area(mm, start)) {
+                               ret = get_gate_page(mm, start & PAGE_MASK,
+                                               gup_flags, &vma,
+@@ -1265,9 +1269,13 @@ int fixup_user_fault(struct mm_struct *m
+               fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+ retry:
+-      vma = vma_lookup(mm, address);
++      vma = find_vma(mm, address);
+       if (!vma)
+               return -EFAULT;
++      if (address < vma->vm_start ) {
++              WARN_ON_ONCE(vma->vm_flags & VM_GROWSDOWN);
++              return -EFAULT;
++      }
+       if (!vma_permits_fault(vma, fault_flags))
+               return -EFAULT;
diff --git a/queue-6.3/mips-mm-convert-to-using-lock_mm_and_find_vma.patch b/queue-6.3/mips-mm-convert-to-using-lock_mm_and_find_vma.patch
new file mode 100644 (file)
index 0000000..58efcc6
--- /dev/null
@@ -0,0 +1,53 @@
+From 4bce37a68ff884e821a02a731897a8119e0c37b7 Mon Sep 17 00:00:00 2001
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Thu, 22 Jun 2023 18:47:40 +0200
+Subject: mips/mm: Convert to using lock_mm_and_find_vma()
+
+From: Ben Hutchings <ben@decadent.org.uk>
+
+commit 4bce37a68ff884e821a02a731897a8119e0c37b7 upstream.
+
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/mips/Kconfig    |    1 +
+ arch/mips/mm/fault.c |   12 ++----------
+ 2 files changed, 3 insertions(+), 10 deletions(-)
+
+--- a/arch/mips/Kconfig
++++ b/arch/mips/Kconfig
+@@ -94,6 +94,7 @@ config MIPS
+       select HAVE_VIRT_CPU_ACCOUNTING_GEN if 64BIT || !SMP
+       select IRQ_FORCED_THREADING
+       select ISA if EISA
++      select LOCK_MM_AND_FIND_VMA
+       select MODULES_USE_ELF_REL if MODULES
+       select MODULES_USE_ELF_RELA if MODULES && 64BIT
+       select PERF_USE_VMALLOC
+--- a/arch/mips/mm/fault.c
++++ b/arch/mips/mm/fault.c
+@@ -99,21 +99,13 @@ static void __do_page_fault(struct pt_re
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+-      mmap_read_lock(mm);
+-      vma = find_vma(mm, address);
++      vma = lock_mm_and_find_vma(mm, address, regs);
+       if (!vma)
+-              goto bad_area;
+-      if (vma->vm_start <= address)
+-              goto good_area;
+-      if (!(vma->vm_flags & VM_GROWSDOWN))
+-              goto bad_area;
+-      if (expand_stack(vma, address))
+-              goto bad_area;
++              goto bad_area_nosemaphore;
+ /*
+  * Ok, we have a good vm_area for this memory access, so
+  * we can handle it..
+  */
+-good_area:
+       si_code = SEGV_ACCERR;
+       if (write) {
diff --git a/queue-6.3/mm-always-expand-the-stack-with-the-mmap-write-lock-held.patch b/queue-6.3/mm-always-expand-the-stack-with-the-mmap-write-lock-held.patch
new file mode 100644 (file)
index 0000000..e8d7126
--- /dev/null
@@ -0,0 +1,671 @@
+From 8d7071af890768438c14db6172cc8f9f4d04e184 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Sat, 24 Jun 2023 13:45:51 -0700
+Subject: mm: always expand the stack with the mmap write lock held
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit 8d7071af890768438c14db6172cc8f9f4d04e184 upstream.
+
+This finishes the job of always holding the mmap write lock when
+extending the user stack vma, and removes the 'write_locked' argument
+from the vm helper functions again.
+
+For some cases, we just avoid expanding the stack at all: drivers and
+page pinning really shouldn't be extending any stacks.  Let's see if any
+strange users really wanted that.
+
+It's worth noting that architectures that weren't converted to the new
+lock_mm_and_find_vma() helper function are left using the legacy
+"expand_stack()" function, but it has been changed to drop the mmap_lock
+and take it for writing while expanding the vma.  This makes it fairly
+straightforward to convert the remaining architectures.
+
+As a result of dropping and re-taking the lock, the calling conventions
+for this function have also changed, since the old vma may no longer be
+valid.  So it will now return the new vma if successful, and NULL - and
+the lock dropped - if the area could not be extended.
+
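For illustration only (not part of the patch): the new calling convention as seen from one of the legacy, unconverted fault handlers. The example_* name is hypothetical.

/* Illustrative sketch, not from the patch. */
#include <linux/mm.h>

static struct vm_area_struct *example_find_fault_vma(struct mm_struct *mm,
                                                     unsigned long address)
{
        struct vm_area_struct *vma;

        mmap_read_lock(mm);
        vma = find_vma(mm, address);
        if (likely(vma && vma->vm_start <= address))
                return vma;             /* read lock still held */

        /*
         * expand_stack() now drops the read lock, takes the write lock to
         * do the expansion, and downgrades back to a read lock on success.
         * On failure it returns NULL with no lock held, so the old vma
         * pointer must not be touched afterwards.
         */
        vma = expand_stack(mm, address);
        return vma;                     /* NULL => no lock held */
}

This is why the converted handlers below gain a *_nosemaphore error label: a NULL from expand_stack() must not be followed by mmap_read_unlock().
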
+Tested-by: Vegard Nossum <vegard.nossum@oracle.com>
+Tested-by: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de> # ia64
+Tested-by: Frank Scheiner <frank.scheiner@web.de> # ia64
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/ia64/mm/fault.c         |   36 ++----------
+ arch/m68k/mm/fault.c         |    9 ++-
+ arch/microblaze/mm/fault.c   |    5 +
+ arch/openrisc/mm/fault.c     |    5 +
+ arch/parisc/mm/fault.c       |   23 +++-----
+ arch/s390/mm/fault.c         |    5 +
+ arch/sparc/mm/fault_64.c     |    8 +-
+ arch/um/kernel/trap.c        |   11 ++-
+ drivers/iommu/amd/iommu_v2.c |    4 -
+ drivers/iommu/iommu-sva.c    |    2 
+ fs/binfmt_elf.c              |    2 
+ fs/exec.c                    |    4 -
+ include/linux/mm.h           |   16 +----
+ mm/gup.c                     |    6 +-
+ mm/memory.c                  |   10 +++
+ mm/mmap.c                    |  121 ++++++++++++++++++++++++++++++++++---------
+ mm/nommu.c                   |   18 ++----
+ 17 files changed, 169 insertions(+), 116 deletions(-)
+
+--- a/arch/ia64/mm/fault.c
++++ b/arch/ia64/mm/fault.c
+@@ -110,10 +110,12 @@ retry:
+          * register backing store that needs to expand upwards, in
+          * this case vma will be null, but prev_vma will ne non-null
+          */
+-        if (( !vma && prev_vma ) || (address < vma->vm_start) )
+-              goto check_expansion;
++        if (( !vma && prev_vma ) || (address < vma->vm_start) ) {
++              vma = expand_stack(mm, address);
++              if (!vma)
++                      goto bad_area_nosemaphore;
++      }
+-  good_area:
+       code = SEGV_ACCERR;
+       /* OK, we've got a good vm_area for this memory area.  Check the access permissions: */
+@@ -177,35 +179,9 @@ retry:
+       mmap_read_unlock(mm);
+       return;
+-  check_expansion:
+-      if (!(prev_vma && (prev_vma->vm_flags & VM_GROWSUP) && (address == prev_vma->vm_end))) {
+-              if (!vma)
+-                      goto bad_area;
+-              if (!(vma->vm_flags & VM_GROWSDOWN))
+-                      goto bad_area;
+-              if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start)
+-                  || REGION_OFFSET(address) >= RGN_MAP_LIMIT)
+-                      goto bad_area;
+-              if (expand_stack(vma, address))
+-                      goto bad_area;
+-      } else {
+-              vma = prev_vma;
+-              if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start)
+-                  || REGION_OFFSET(address) >= RGN_MAP_LIMIT)
+-                      goto bad_area;
+-              /*
+-               * Since the register backing store is accessed sequentially,
+-               * we disallow growing it by more than a page at a time.
+-               */
+-              if (address > vma->vm_end + PAGE_SIZE - sizeof(long))
+-                      goto bad_area;
+-              if (expand_upwards(vma, address))
+-                      goto bad_area;
+-      }
+-      goto good_area;
+-
+   bad_area:
+       mmap_read_unlock(mm);
++  bad_area_nosemaphore:
+       if ((isr & IA64_ISR_SP)
+           || ((isr & IA64_ISR_NA) && (isr & IA64_ISR_CODE_MASK) == IA64_ISR_CODE_LFETCH))
+       {
+--- a/arch/m68k/mm/fault.c
++++ b/arch/m68k/mm/fault.c
+@@ -105,8 +105,9 @@ retry:
+               if (address + 256 < rdusp())
+                       goto map_err;
+       }
+-      if (expand_stack(vma, address))
+-              goto map_err;
++      vma = expand_stack(mm, address);
++      if (!vma)
++              goto map_err_nosemaphore;
+ /*
+  * Ok, we have a good vm_area for this memory access, so
+@@ -196,10 +197,12 @@ bus_err:
+       goto send_sig;
+ map_err:
++      mmap_read_unlock(mm);
++map_err_nosemaphore:
+       current->thread.signo = SIGSEGV;
+       current->thread.code = SEGV_MAPERR;
+       current->thread.faddr = address;
+-      goto send_sig;
++      return send_fault_sig(regs);
+ acc_err:
+       current->thread.signo = SIGSEGV;
+--- a/arch/microblaze/mm/fault.c
++++ b/arch/microblaze/mm/fault.c
+@@ -192,8 +192,9 @@ retry:
+                       && (kernel_mode(regs) || !store_updates_sp(regs)))
+                               goto bad_area;
+       }
+-      if (expand_stack(vma, address))
+-              goto bad_area;
++      vma = expand_stack(mm, address);
++      if (!vma)
++              goto bad_area_nosemaphore;
+ good_area:
+       code = SEGV_ACCERR;
+--- a/arch/openrisc/mm/fault.c
++++ b/arch/openrisc/mm/fault.c
+@@ -127,8 +127,9 @@ retry:
+               if (address + PAGE_SIZE < regs->sp)
+                       goto bad_area;
+       }
+-      if (expand_stack(vma, address))
+-              goto bad_area;
++      vma = expand_stack(mm, address);
++      if (!vma)
++              goto bad_area_nosemaphore;
+       /*
+        * Ok, we have a good vm_area for this memory access, so
+--- a/arch/parisc/mm/fault.c
++++ b/arch/parisc/mm/fault.c
+@@ -288,15 +288,19 @@ void do_page_fault(struct pt_regs *regs,
+ retry:
+       mmap_read_lock(mm);
+       vma = find_vma_prev(mm, address, &prev_vma);
+-      if (!vma || address < vma->vm_start)
+-              goto check_expansion;
++      if (!vma || address < vma->vm_start) {
++              if (!prev_vma || !(prev_vma->vm_flags & VM_GROWSUP))
++                      goto bad_area;
++              vma = expand_stack(mm, address);
++              if (!vma)
++                      goto bad_area_nosemaphore;
++      }
++
+ /*
+  * Ok, we have a good vm_area for this memory access. We still need to
+  * check the access permissions.
+  */
+-good_area:
+-
+       if ((vma->vm_flags & acc_type) != acc_type)
+               goto bad_area;
+@@ -347,17 +351,13 @@ good_area:
+       mmap_read_unlock(mm);
+       return;
+-check_expansion:
+-      vma = prev_vma;
+-      if (vma && (expand_stack(vma, address) == 0))
+-              goto good_area;
+-
+ /*
+  * Something tried to access memory that isn't in our memory map..
+  */
+ bad_area:
+       mmap_read_unlock(mm);
++bad_area_nosemaphore:
+       if (user_mode(regs)) {
+               int signo, si_code;
+@@ -449,7 +449,7 @@ handle_nadtlb_fault(struct pt_regs *regs
+ {
+       unsigned long insn = regs->iir;
+       int breg, treg, xreg, val = 0;
+-      struct vm_area_struct *vma, *prev_vma;
++      struct vm_area_struct *vma;
+       struct task_struct *tsk;
+       struct mm_struct *mm;
+       unsigned long address;
+@@ -485,7 +485,7 @@ handle_nadtlb_fault(struct pt_regs *regs
+                               /* Search for VMA */
+                               address = regs->ior;
+                               mmap_read_lock(mm);
+-                              vma = find_vma_prev(mm, address, &prev_vma);
++                              vma = vma_lookup(mm, address);
+                               mmap_read_unlock(mm);
+                               /*
+@@ -494,7 +494,6 @@ handle_nadtlb_fault(struct pt_regs *regs
+                                */
+                               acc_type = (insn & 0x40) ? VM_WRITE : VM_READ;
+                               if (vma
+-                                  && address >= vma->vm_start
+                                   && (vma->vm_flags & acc_type) == acc_type)
+                                       val = 1;
+                       }
+--- a/arch/s390/mm/fault.c
++++ b/arch/s390/mm/fault.c
+@@ -433,8 +433,9 @@ retry:
+       if (unlikely(vma->vm_start > address)) {
+               if (!(vma->vm_flags & VM_GROWSDOWN))
+                       goto out_up;
+-              if (expand_stack(vma, address))
+-                      goto out_up;
++              vma = expand_stack(mm, address);
++              if (!vma)
++                      goto out;
+       }
+       /*
+--- a/arch/sparc/mm/fault_64.c
++++ b/arch/sparc/mm/fault_64.c
+@@ -383,8 +383,9 @@ continue_fault:
+                               goto bad_area;
+               }
+       }
+-      if (expand_stack(vma, address))
+-              goto bad_area;
++      vma = expand_stack(mm, address);
++      if (!vma)
++              goto bad_area_nosemaphore;
+       /*
+        * Ok, we have a good vm_area for this memory access, so
+        * we can handle it..
+@@ -487,8 +488,9 @@ exit_exception:
+        * Fix it, but check if it's kernel or user first..
+        */
+ bad_area:
+-      insn = get_fault_insn(regs, insn);
+       mmap_read_unlock(mm);
++bad_area_nosemaphore:
++      insn = get_fault_insn(regs, insn);
+ handle_kernel_fault:
+       do_kernel_fault(regs, si_code, fault_code, insn, address);
+--- a/arch/um/kernel/trap.c
++++ b/arch/um/kernel/trap.c
+@@ -47,14 +47,15 @@ retry:
+       vma = find_vma(mm, address);
+       if (!vma)
+               goto out;
+-      else if (vma->vm_start <= address)
++      if (vma->vm_start <= address)
+               goto good_area;
+-      else if (!(vma->vm_flags & VM_GROWSDOWN))
++      if (!(vma->vm_flags & VM_GROWSDOWN))
+               goto out;
+-      else if (is_user && !ARCH_IS_STACKGROW(address))
+-              goto out;
+-      else if (expand_stack(vma, address))
++      if (is_user && !ARCH_IS_STACKGROW(address))
+               goto out;
++      vma = expand_stack(mm, address);
++      if (!vma)
++              goto out_nosemaphore;
+ good_area:
+       *code_out = SEGV_ACCERR;
+--- a/drivers/iommu/amd/iommu_v2.c
++++ b/drivers/iommu/amd/iommu_v2.c
+@@ -485,8 +485,8 @@ static void do_fault(struct work_struct
+       flags |= FAULT_FLAG_REMOTE;
+       mmap_read_lock(mm);
+-      vma = find_extend_vma(mm, address);
+-      if (!vma || address < vma->vm_start)
++      vma = vma_lookup(mm, address);
++      if (!vma)
+               /* failed to get a vma in the right range */
+               goto out;
+--- a/drivers/iommu/iommu-sva.c
++++ b/drivers/iommu/iommu-sva.c
+@@ -203,7 +203,7 @@ iommu_sva_handle_iopf(struct iommu_fault
+       mmap_read_lock(mm);
+-      vma = find_extend_vma(mm, prm->addr);
++      vma = vma_lookup(mm, prm->addr);
+       if (!vma)
+               /* Unmapped area */
+               goto out_put_mm;
+--- a/fs/binfmt_elf.c
++++ b/fs/binfmt_elf.c
+@@ -322,7 +322,7 @@ create_elf_tables(struct linux_binprm *b
+        */
+       if (mmap_write_lock_killable(mm))
+               return -EINTR;
+-      vma = find_extend_vma_locked(mm, bprm->p, true);
++      vma = find_extend_vma_locked(mm, bprm->p);
+       mmap_write_unlock(mm);
+       if (!vma)
+               return -EFAULT;
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -210,7 +210,7 @@ static struct page *get_arg_page(struct
+        */
+       if (write && pos < vma->vm_start) {
+               mmap_write_lock(mm);
+-              ret = expand_downwards(vma, pos, true);
++              ret = expand_downwards(vma, pos);
+               if (unlikely(ret < 0)) {
+                       mmap_write_unlock(mm);
+                       return NULL;
+@@ -858,7 +858,7 @@ int setup_arg_pages(struct linux_binprm
+       stack_base = vma->vm_end - stack_expand;
+ #endif
+       current->mm->start_stack = bprm->p;
+-      ret = expand_stack_locked(vma, stack_base, true);
++      ret = expand_stack_locked(vma, stack_base);
+       if (ret)
+               ret = -EFAULT;
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -3065,18 +3065,11 @@ extern vm_fault_t filemap_page_mkwrite(s
+ extern unsigned long stack_guard_gap;
+ /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
+-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
+-              bool write_locked);
+-#define expand_stack(vma,addr) expand_stack_locked(vma,addr,false)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address);
++struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr);
+ /* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */
+-int expand_downwards(struct vm_area_struct *vma, unsigned long address,
+-              bool write_locked);
+-#if VM_GROWSUP
+-extern int expand_upwards(struct vm_area_struct *vma, unsigned long address);
+-#else
+-  #define expand_upwards(vma, address) (0)
+-#endif
++int expand_downwards(struct vm_area_struct *vma, unsigned long address);
+ /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
+ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
+@@ -3171,9 +3164,8 @@ unsigned long change_prot_numa(struct vm
+                       unsigned long start, unsigned long end);
+ #endif
+-struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
+ struct vm_area_struct *find_extend_vma_locked(struct mm_struct *,
+-              unsigned long addr, bool write_locked);
++              unsigned long addr);
+ int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
+                       unsigned long pfn, unsigned long size, pgprot_t);
+ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+--- a/mm/gup.c
++++ b/mm/gup.c
+@@ -1096,7 +1096,7 @@ static long __get_user_pages(struct mm_s
+               /* first iteration or cross vma bound */
+               if (!vma || start >= vma->vm_end) {
+-                      vma = find_extend_vma(mm, start);
++                      vma = vma_lookup(mm, start);
+                       if (!vma && in_gate_area(mm, start)) {
+                               ret = get_gate_page(mm, start & PAGE_MASK,
+                                               gup_flags, &vma,
+@@ -1265,8 +1265,8 @@ int fixup_user_fault(struct mm_struct *m
+               fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+ retry:
+-      vma = find_extend_vma(mm, address);
+-      if (!vma || address < vma->vm_start)
++      vma = vma_lookup(mm, address);
++      if (!vma)
+               return -EFAULT;
+       if (!vma_permits_fault(vma, fault_flags))
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -5336,7 +5336,7 @@ struct vm_area_struct *lock_mm_and_find_
+                       goto fail;
+       }
+-      if (expand_stack_locked(vma, addr, true))
++      if (expand_stack_locked(vma, addr))
+               goto fail;
+ success:
+@@ -5620,6 +5620,14 @@ int __access_remote_vm(struct mm_struct
+       if (mmap_read_lock_killable(mm))
+               return 0;
++      /* We might need to expand the stack to access it */
++      vma = vma_lookup(mm, addr);
++      if (!vma) {
++              vma = expand_stack(mm, addr);
++              if (!vma)
++                      return 0;
++      }
++
+       /* ignore errors, just check how much was successfully transferred */
+       while (len) {
+               int bytes, ret, offset;
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -1898,8 +1898,7 @@ static int acct_stack_growth(struct vm_a
+  * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
+  * vma is the last one with address > vma->vm_end.  Have to extend vma.
+  */
+-int expand_upwards(struct vm_area_struct *vma, unsigned long address,
+-              bool write_locked)
++static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
+ {
+       struct mm_struct *mm = vma->vm_mm;
+       struct vm_area_struct *next;
+@@ -1923,8 +1922,6 @@ int expand_upwards(struct vm_area_struct
+       if (gap_addr < address || gap_addr > TASK_SIZE)
+               gap_addr = TASK_SIZE;
+-      if (!write_locked)
+-              return -EAGAIN;
+       next = find_vma_intersection(mm, vma->vm_end, gap_addr);
+       if (next && vma_is_accessible(next)) {
+               if (!(next->vm_flags & VM_GROWSUP))
+@@ -1993,15 +1990,18 @@ int expand_upwards(struct vm_area_struct
+ /*
+  * vma is the first one with address < vma->vm_start.  Have to extend vma.
++ * mmap_lock held for writing.
+  */
+-int expand_downwards(struct vm_area_struct *vma, unsigned long address,
+-              bool write_locked)
++int expand_downwards(struct vm_area_struct *vma, unsigned long address)
+ {
+       struct mm_struct *mm = vma->vm_mm;
+       MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start);
+       struct vm_area_struct *prev;
+       int error = 0;
++      if (!(vma->vm_flags & VM_GROWSDOWN))
++              return -EFAULT;
++
+       address &= PAGE_MASK;
+       if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
+               return -EPERM;
+@@ -2014,8 +2014,6 @@ int expand_downwards(struct vm_area_stru
+                   vma_is_accessible(prev) &&
+                   (address - prev->vm_end < stack_guard_gap))
+                       return -ENOMEM;
+-              if (!write_locked && (prev->vm_end == address))
+-                      return -EAGAIN;
+       }
+       if (mas_preallocate(&mas, GFP_KERNEL))
+@@ -2094,14 +2092,12 @@ static int __init cmdline_parse_stack_gu
+ __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
+ #ifdef CONFIG_STACK_GROWSUP
+-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
+-              bool write_locked)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
+ {
+-      return expand_upwards(vma, address, write_locked);
++      return expand_upwards(vma, address);
+ }
+-struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm,
+-              unsigned long addr, bool write_locked)
++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
+ {
+       struct vm_area_struct *vma, *prev;
+@@ -2111,23 +2107,21 @@ struct vm_area_struct *find_extend_vma_l
+               return vma;
+       if (!prev)
+               return NULL;
+-      if (expand_stack_locked(prev, addr, write_locked))
++      if (expand_stack_locked(prev, addr))
+               return NULL;
+       if (prev->vm_flags & VM_LOCKED)
+               populate_vma_page_range(prev, addr, prev->vm_end, NULL);
+       return prev;
+ }
+ #else
+-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
+-              bool write_locked)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
+ {
+       if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
+               return -EINVAL;
+-      return expand_downwards(vma, address, write_locked);
++      return expand_downwards(vma, address);
+ }
+-struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm,
+-              unsigned long addr, bool write_locked)
++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
+ {
+       struct vm_area_struct *vma;
+       unsigned long start;
+@@ -2139,7 +2133,7 @@ struct vm_area_struct *find_extend_vma_l
+       if (vma->vm_start <= addr)
+               return vma;
+       start = vma->vm_start;
+-      if (expand_stack_locked(vma, addr, write_locked))
++      if (expand_stack_locked(vma, addr))
+               return NULL;
+       if (vma->vm_flags & VM_LOCKED)
+               populate_vma_page_range(vma, addr, start, NULL);
+@@ -2147,12 +2141,91 @@ struct vm_area_struct *find_extend_vma_l
+ }
+ #endif
+-struct vm_area_struct *find_extend_vma(struct mm_struct *mm,
+-              unsigned long addr)
++/*
++ * IA64 has some horrid mapping rules: it can expand both up and down,
++ * but with various special rules.
++ *
++ * We'll get rid of this architecture eventually, so the ugliness is
++ * temporary.
++ */
++#ifdef CONFIG_IA64
++static inline bool vma_expand_ok(struct vm_area_struct *vma, unsigned long addr)
++{
++      return REGION_NUMBER(addr) == REGION_NUMBER(vma->vm_start) &&
++              REGION_OFFSET(addr) < RGN_MAP_LIMIT;
++}
++
++/*
++ * IA64 stacks grow down, but there's a special register backing store
++ * that can grow up. Only sequentially, though, so the new address must
++ * match vm_end.
++ */
++static inline int vma_expand_up(struct vm_area_struct *vma, unsigned long addr)
++{
++      if (!vma_expand_ok(vma, addr))
++              return -EFAULT;
++      if (vma->vm_end != (addr & PAGE_MASK))
++              return -EFAULT;
++      return expand_upwards(vma, addr);
++}
++
++static inline bool vma_expand_down(struct vm_area_struct *vma, unsigned long addr)
++{
++      if (!vma_expand_ok(vma, addr))
++              return -EFAULT;
++      return expand_downwards(vma, addr);
++}
++
++#elif defined(CONFIG_STACK_GROWSUP)
++
++#define vma_expand_up(vma,addr) expand_upwards(vma, addr)
++#define vma_expand_down(vma, addr) (-EFAULT)
++
++#else
++
++#define vma_expand_up(vma,addr) (-EFAULT)
++#define vma_expand_down(vma, addr) expand_downwards(vma, addr)
++
++#endif
++
++/*
++ * expand_stack(): legacy interface for page faulting. Don't use unless
++ * you have to.
++ *
++ * This is called with the mm locked for reading, drops the lock, takes
++ * the lock for writing, tries to look up a vma again, expands it if
++ * necessary, and downgrades the lock to reading again.
++ *
++ * If no vma is found or it can't be expanded, it returns NULL and has
++ * dropped the lock.
++ */
++struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr)
+ {
+-      return find_extend_vma_locked(mm, addr, false);
++      struct vm_area_struct *vma, *prev;
++
++      mmap_read_unlock(mm);
++      if (mmap_write_lock_killable(mm))
++              return NULL;
++
++      vma = find_vma_prev(mm, addr, &prev);
++      if (vma && vma->vm_start <= addr)
++              goto success;
++
++      if (prev && !vma_expand_up(prev, addr)) {
++              vma = prev;
++              goto success;
++      }
++
++      if (vma && !vma_expand_down(vma, addr))
++              goto success;
++
++      mmap_write_unlock(mm);
++      return NULL;
++
++success:
++      mmap_write_downgrade(mm);
++      return vma;
+ }
+-EXPORT_SYMBOL_GPL(find_extend_vma);
+ /*
+  * Ok - we have the memory areas we should free on a maple tree so release them,
+--- a/mm/nommu.c
++++ b/mm/nommu.c
+@@ -631,24 +631,20 @@ struct vm_area_struct *find_vma(struct m
+ EXPORT_SYMBOL(find_vma);
+ /*
+- * find a VMA
+- * - we don't extend stack VMAs under NOMMU conditions
+- */
+-struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
+-{
+-      return find_vma(mm, addr);
+-}
+-
+-/*
+  * expand a stack to a given address
+  * - not supported under NOMMU conditions
+  */
+-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
+-              bool write_locked)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long addr)
+ {
+       return -ENOMEM;
+ }
++struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr)
++{
++      mmap_read_unlock(mm);
++      return NULL;
++}
++
+ /*
+  * look up the first VMA exactly that exactly matches addr
+  * - should be called with mm->mmap_lock at least held readlocked
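
For the architectures that this patch leaves on the legacy path, expand_stack() now takes the mm instead of a vma, may drop and re-take mmap_lock internally, and returns the expanded vma with the lock downgraded to read mode, or NULL with the lock dropped. A minimal sketch of the caller pattern the hunks above converge on (the per-architecture sanity checks are elided):

        mmap_read_lock(mm);
        vma = find_vma(mm, address);
        if (!vma || address < vma->vm_start) {
                /* arch-specific "does this look like a stack access?" checks */
                vma = expand_stack(mm, address);        /* may drop/re-take mmap_lock */
                if (!vma)
                        goto bad_area_nosemaphore;      /* lock already dropped */
        }
        /* mmap_lock is held for reading again; any old vma pointer is stale */
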
diff --git a/queue-6.3/mm-fault-convert-remaining-simple-cases-to-lock_mm_and_find_vma.patch b/queue-6.3/mm-fault-convert-remaining-simple-cases-to-lock_mm_and_find_vma.patch
new file mode 100644 (file)
index 0000000..63b4467
--- /dev/null
@@ -0,0 +1,489 @@
+From a050ba1e7422f2cc60ff8bfde3f96d34d00cb585 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Sat, 24 Jun 2023 10:55:38 -0700
+Subject: mm/fault: convert remaining simple cases to lock_mm_and_find_vma()
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit a050ba1e7422f2cc60ff8bfde3f96d34d00cb585 upstream.
+
+This does the simple pattern conversion of alpha, arc, csky, hexagon,
+loongarch, nios2, sh, sparc32, and xtensa to the lock_mm_and_find_vma()
+helper.  They all have the regular fault handling pattern without odd
+special cases.
+
+The remaining architectures all have something that keeps us from a
+straightforward conversion: ia64 and parisc have stacks that can grow
+both up as well as down (and ia64 has special address region checks).
+
+And m68k, microblaze, openrisc, sparc64, and um end up having extra
+rules about only expanding the stack down a limited amount below the
+user space stack pointer.  That is something that x86 used to do too
+(long long ago), and it probably could just be skipped, but it still
+makes the conversion less than trivial.
+
+Note that this conversion was done manually and with the exception of
+alpha without any build testing, because I have a fairly limited cross-
+building environment.  The cases are all simple, and I went through the
+changes several times, but...
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/alpha/Kconfig         |    1 +
+ arch/alpha/mm/fault.c      |   13 +++----------
+ arch/arc/Kconfig           |    1 +
+ arch/arc/mm/fault.c        |   11 +++--------
+ arch/csky/Kconfig          |    1 +
+ arch/csky/mm/fault.c       |   22 +++++-----------------
+ arch/hexagon/Kconfig       |    1 +
+ arch/hexagon/mm/vm_fault.c |   18 ++++--------------
+ arch/loongarch/Kconfig     |    1 +
+ arch/loongarch/mm/fault.c  |   16 ++++++----------
+ arch/nios2/Kconfig         |    1 +
+ arch/nios2/mm/fault.c      |   17 ++---------------
+ arch/sh/Kconfig            |    1 +
+ arch/sh/mm/fault.c         |   17 ++---------------
+ arch/sparc/Kconfig         |    1 +
+ arch/sparc/mm/fault_32.c   |   32 ++++++++------------------------
+ arch/xtensa/Kconfig        |    1 +
+ arch/xtensa/mm/fault.c     |   14 +++-----------
+ 18 files changed, 45 insertions(+), 124 deletions(-)
+
+--- a/arch/alpha/Kconfig
++++ b/arch/alpha/Kconfig
+@@ -29,6 +29,7 @@ config ALPHA
+       select GENERIC_SMP_IDLE_THREAD
+       select HAVE_ARCH_AUDITSYSCALL
+       select HAVE_MOD_ARCH_SPECIFIC
++      select LOCK_MM_AND_FIND_VMA
+       select MODULES_USE_ELF_RELA
+       select ODD_RT_SIGACTION
+       select OLD_SIGSUSPEND
+--- a/arch/alpha/mm/fault.c
++++ b/arch/alpha/mm/fault.c
+@@ -119,20 +119,12 @@ do_page_fault(unsigned long address, uns
+               flags |= FAULT_FLAG_USER;
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+-      mmap_read_lock(mm);
+-      vma = find_vma(mm, address);
++      vma = lock_mm_and_find_vma(mm, address, regs);
+       if (!vma)
+-              goto bad_area;
+-      if (vma->vm_start <= address)
+-              goto good_area;
+-      if (!(vma->vm_flags & VM_GROWSDOWN))
+-              goto bad_area;
+-      if (expand_stack(vma, address))
+-              goto bad_area;
++              goto bad_area_nosemaphore;
+       /* Ok, we have a good vm_area for this memory access, so
+          we can handle it.  */
+- good_area:
+       si_code = SEGV_ACCERR;
+       if (cause < 0) {
+               if (!(vma->vm_flags & VM_EXEC))
+@@ -192,6 +184,7 @@ retry:
+  bad_area:
+       mmap_read_unlock(mm);
++ bad_area_nosemaphore:
+       if (user_mode(regs))
+               goto do_sigsegv;
+--- a/arch/arc/Kconfig
++++ b/arch/arc/Kconfig
+@@ -41,6 +41,7 @@ config ARC
+       select HAVE_PERF_EVENTS
+       select HAVE_SYSCALL_TRACEPOINTS
+       select IRQ_DOMAIN
++      select LOCK_MM_AND_FIND_VMA
+       select MODULES_USE_ELF_RELA
+       select OF
+       select OF_EARLY_FLATTREE
+--- a/arch/arc/mm/fault.c
++++ b/arch/arc/mm/fault.c
+@@ -113,15 +113,9 @@ void do_page_fault(unsigned long address
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+-      mmap_read_lock(mm);
+-
+-      vma = find_vma(mm, address);
++      vma = lock_mm_and_find_vma(mm, address, regs);
+       if (!vma)
+-              goto bad_area;
+-      if (unlikely(address < vma->vm_start)) {
+-              if (!(vma->vm_flags & VM_GROWSDOWN) || expand_stack(vma, address))
+-                      goto bad_area;
+-      }
++              goto bad_area_nosemaphore;
+       /*
+        * vm_area is good, now check permissions for this memory access
+@@ -161,6 +155,7 @@ retry:
+ bad_area:
+       mmap_read_unlock(mm);
++bad_area_nosemaphore:
+       /*
+        * Major/minor page fault accounting
+        * (in case of retry we only land here once)
+--- a/arch/csky/Kconfig
++++ b/arch/csky/Kconfig
+@@ -96,6 +96,7 @@ config CSKY
+       select HAVE_REGS_AND_STACK_ACCESS_API
+       select HAVE_STACKPROTECTOR
+       select HAVE_SYSCALL_TRACEPOINTS
++      select LOCK_MM_AND_FIND_VMA
+       select MAY_HAVE_SPARSE_IRQ
+       select MODULES_USE_ELF_RELA if MODULES
+       select OF
+--- a/arch/csky/mm/fault.c
++++ b/arch/csky/mm/fault.c
+@@ -97,13 +97,12 @@ static inline void mm_fault_error(struct
+       BUG();
+ }
+-static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr)
++static inline void bad_area_nosemaphore(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr)
+ {
+       /*
+        * Something tried to access memory that isn't in our memory map.
+        * Fix it, but check if it's kernel or user first.
+        */
+-      mmap_read_unlock(mm);
+       /* User mode accesses just cause a SIGSEGV */
+       if (user_mode(regs)) {
+               do_trap(regs, SIGSEGV, code, addr);
+@@ -238,20 +237,9 @@ asmlinkage void do_page_fault(struct pt_
+       if (is_write(regs))
+               flags |= FAULT_FLAG_WRITE;
+ retry:
+-      mmap_read_lock(mm);
+-      vma = find_vma(mm, addr);
++      vma = lock_mm_and_find_vma(mm, addr, regs);
+       if (unlikely(!vma)) {
+-              bad_area(regs, mm, code, addr);
+-              return;
+-      }
+-      if (likely(vma->vm_start <= addr))
+-              goto good_area;
+-      if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+-              bad_area(regs, mm, code, addr);
+-              return;
+-      }
+-      if (unlikely(expand_stack(vma, addr))) {
+-              bad_area(regs, mm, code, addr);
++              bad_area_nosemaphore(regs, mm, code, addr);
+               return;
+       }
+@@ -259,11 +247,11 @@ retry:
+        * Ok, we have a good vm_area for this memory access, so
+        * we can handle it.
+        */
+-good_area:
+       code = SEGV_ACCERR;
+       if (unlikely(access_error(regs, vma))) {
+-              bad_area(regs, mm, code, addr);
++              mmap_read_unlock(mm);
++              bad_area_nosemaphore(regs, mm, code, addr);
+               return;
+       }
+--- a/arch/hexagon/Kconfig
++++ b/arch/hexagon/Kconfig
+@@ -28,6 +28,7 @@ config HEXAGON
+       select GENERIC_SMP_IDLE_THREAD
+       select STACKTRACE_SUPPORT
+       select GENERIC_CLOCKEVENTS_BROADCAST
++      select LOCK_MM_AND_FIND_VMA
+       select MODULES_USE_ELF_RELA
+       select GENERIC_CPU_DEVICES
+       select ARCH_WANT_LD_ORPHAN_WARN
+--- a/arch/hexagon/mm/vm_fault.c
++++ b/arch/hexagon/mm/vm_fault.c
+@@ -57,21 +57,10 @@ void do_page_fault(unsigned long address
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+-      mmap_read_lock(mm);
+-      vma = find_vma(mm, address);
+-      if (!vma)
+-              goto bad_area;
++      vma = lock_mm_and_find_vma(mm, address, regs);
++      if (unlikely(!vma))
++              goto bad_area_nosemaphore;
+-      if (vma->vm_start <= address)
+-              goto good_area;
+-
+-      if (!(vma->vm_flags & VM_GROWSDOWN))
+-              goto bad_area;
+-
+-      if (expand_stack(vma, address))
+-              goto bad_area;
+-
+-good_area:
+       /* Address space is OK.  Now check access rights. */
+       si_code = SEGV_ACCERR;
+@@ -143,6 +132,7 @@ good_area:
+ bad_area:
+       mmap_read_unlock(mm);
++bad_area_nosemaphore:
+       if (user_mode(regs)) {
+               force_sig_fault(SIGSEGV, si_code, (void __user *)address);
+               return;
+--- a/arch/loongarch/Kconfig
++++ b/arch/loongarch/Kconfig
+@@ -125,6 +125,7 @@ config LOONGARCH
+       select HAVE_VIRT_CPU_ACCOUNTING_GEN if !SMP
+       select IRQ_FORCED_THREADING
+       select IRQ_LOONGARCH_CPU
++      select LOCK_MM_AND_FIND_VMA
+       select MMU_GATHER_MERGE_VMAS if MMU
+       select MODULES_USE_ELF_RELA if MODULES
+       select NEED_PER_CPU_EMBED_FIRST_CHUNK
+--- a/arch/loongarch/mm/fault.c
++++ b/arch/loongarch/mm/fault.c
+@@ -169,22 +169,18 @@ static void __kprobes __do_page_fault(st
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+-      mmap_read_lock(mm);
+-      vma = find_vma(mm, address);
+-      if (!vma)
+-              goto bad_area;
+-      if (vma->vm_start <= address)
+-              goto good_area;
+-      if (!(vma->vm_flags & VM_GROWSDOWN))
+-              goto bad_area;
+-      if (!expand_stack(vma, address))
+-              goto good_area;
++      vma = lock_mm_and_find_vma(mm, address, regs);
++      if (unlikely(!vma))
++              goto bad_area_nosemaphore;
++      goto good_area;
++
+ /*
+  * Something tried to access memory that isn't in our memory map..
+  * Fix it, but check if it's kernel or user first..
+  */
+ bad_area:
+       mmap_read_unlock(mm);
++bad_area_nosemaphore:
+       do_sigsegv(regs, write, address, si_code);
+       return;
+--- a/arch/nios2/Kconfig
++++ b/arch/nios2/Kconfig
+@@ -16,6 +16,7 @@ config NIOS2
+       select HAVE_ARCH_TRACEHOOK
+       select HAVE_ARCH_KGDB
+       select IRQ_DOMAIN
++      select LOCK_MM_AND_FIND_VMA
+       select MODULES_USE_ELF_RELA
+       select OF
+       select OF_EARLY_FLATTREE
+--- a/arch/nios2/mm/fault.c
++++ b/arch/nios2/mm/fault.c
+@@ -86,27 +86,14 @@ asmlinkage void do_page_fault(struct pt_
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+-      if (!mmap_read_trylock(mm)) {
+-              if (!user_mode(regs) && !search_exception_tables(regs->ea))
+-                      goto bad_area_nosemaphore;
+ retry:
+-              mmap_read_lock(mm);
+-      }
+-
+-      vma = find_vma(mm, address);
++      vma = lock_mm_and_find_vma(mm, address, regs);
+       if (!vma)
+-              goto bad_area;
+-      if (vma->vm_start <= address)
+-              goto good_area;
+-      if (!(vma->vm_flags & VM_GROWSDOWN))
+-              goto bad_area;
+-      if (expand_stack(vma, address))
+-              goto bad_area;
++              goto bad_area_nosemaphore;
+ /*
+  * Ok, we have a good vm_area for this memory access, so
+  * we can handle it..
+  */
+-good_area:
+       code = SEGV_ACCERR;
+       switch (cause) {
+--- a/arch/sh/Kconfig
++++ b/arch/sh/Kconfig
+@@ -56,6 +56,7 @@ config SUPERH
+       select HAVE_STACKPROTECTOR
+       select HAVE_SYSCALL_TRACEPOINTS
+       select IRQ_FORCED_THREADING
++      select LOCK_MM_AND_FIND_VMA
+       select MODULES_USE_ELF_RELA
+       select NEED_SG_DMA_LENGTH
+       select NO_DMA if !MMU && !DMA_COHERENT
+--- a/arch/sh/mm/fault.c
++++ b/arch/sh/mm/fault.c
+@@ -439,21 +439,9 @@ asmlinkage void __kprobes do_page_fault(
+       }
+ retry:
+-      mmap_read_lock(mm);
+-
+-      vma = find_vma(mm, address);
++      vma = lock_mm_and_find_vma(mm, address, regs);
+       if (unlikely(!vma)) {
+-              bad_area(regs, error_code, address);
+-              return;
+-      }
+-      if (likely(vma->vm_start <= address))
+-              goto good_area;
+-      if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+-              bad_area(regs, error_code, address);
+-              return;
+-      }
+-      if (unlikely(expand_stack(vma, address))) {
+-              bad_area(regs, error_code, address);
++              bad_area_nosemaphore(regs, error_code, address);
+               return;
+       }
+@@ -461,7 +449,6 @@ retry:
+        * Ok, we have a good vm_area for this memory access, so
+        * we can handle it..
+        */
+-good_area:
+       if (unlikely(access_error(error_code, vma))) {
+               bad_area_access_error(regs, error_code, address);
+               return;
+--- a/arch/sparc/Kconfig
++++ b/arch/sparc/Kconfig
+@@ -56,6 +56,7 @@ config SPARC32
+       select DMA_DIRECT_REMAP
+       select GENERIC_ATOMIC64
+       select HAVE_UID16
++      select LOCK_MM_AND_FIND_VMA
+       select OLD_SIGACTION
+       select ZONE_DMA
+--- a/arch/sparc/mm/fault_32.c
++++ b/arch/sparc/mm/fault_32.c
+@@ -143,28 +143,19 @@ asmlinkage void do_sparc_fault(struct pt
+       if (pagefault_disabled() || !mm)
+               goto no_context;
++      if (!from_user && address >= PAGE_OFFSET)
++              goto no_context;
++
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+-      mmap_read_lock(mm);
+-
+-      if (!from_user && address >= PAGE_OFFSET)
+-              goto bad_area;
+-
+-      vma = find_vma(mm, address);
++      vma = lock_mm_and_find_vma(mm, address, regs);
+       if (!vma)
+-              goto bad_area;
+-      if (vma->vm_start <= address)
+-              goto good_area;
+-      if (!(vma->vm_flags & VM_GROWSDOWN))
+-              goto bad_area;
+-      if (expand_stack(vma, address))
+-              goto bad_area;
++              goto bad_area_nosemaphore;
+       /*
+        * Ok, we have a good vm_area for this memory access, so
+        * we can handle it..
+        */
+-good_area:
+       code = SEGV_ACCERR;
+       if (write) {
+               if (!(vma->vm_flags & VM_WRITE))
+@@ -321,17 +312,9 @@ static void force_user_fault(unsigned lo
+       code = SEGV_MAPERR;
+-      mmap_read_lock(mm);
+-      vma = find_vma(mm, address);
++      vma = lock_mm_and_find_vma(mm, address, regs);
+       if (!vma)
+-              goto bad_area;
+-      if (vma->vm_start <= address)
+-              goto good_area;
+-      if (!(vma->vm_flags & VM_GROWSDOWN))
+-              goto bad_area;
+-      if (expand_stack(vma, address))
+-              goto bad_area;
+-good_area:
++              goto bad_area_nosemaphore;
+       code = SEGV_ACCERR;
+       if (write) {
+               if (!(vma->vm_flags & VM_WRITE))
+@@ -350,6 +333,7 @@ good_area:
+       return;
+ bad_area:
+       mmap_read_unlock(mm);
++bad_area_nosemaphore:
+       __do_fault_siginfo(code, SIGSEGV, tsk->thread.kregs, address);
+       return;
+--- a/arch/xtensa/Kconfig
++++ b/arch/xtensa/Kconfig
+@@ -49,6 +49,7 @@ config XTENSA
+       select HAVE_SYSCALL_TRACEPOINTS
+       select HAVE_VIRT_CPU_ACCOUNTING_GEN
+       select IRQ_DOMAIN
++      select LOCK_MM_AND_FIND_VMA
+       select MODULES_USE_ELF_RELA
+       select PERF_USE_VMALLOC
+       select TRACE_IRQFLAGS_SUPPORT
+--- a/arch/xtensa/mm/fault.c
++++ b/arch/xtensa/mm/fault.c
+@@ -130,23 +130,14 @@ void do_page_fault(struct pt_regs *regs)
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+-      mmap_read_lock(mm);
+-      vma = find_vma(mm, address);
+-
++      vma = lock_mm_and_find_vma(mm, address, regs);
+       if (!vma)
+-              goto bad_area;
+-      if (vma->vm_start <= address)
+-              goto good_area;
+-      if (!(vma->vm_flags & VM_GROWSDOWN))
+-              goto bad_area;
+-      if (expand_stack(vma, address))
+-              goto bad_area;
++              goto bad_area_nosemaphore;
+       /* Ok, we have a good vm_area for this memory access, so
+        * we can handle it..
+        */
+-good_area:
+       code = SEGV_ACCERR;
+       if (is_write) {
+@@ -205,6 +196,7 @@ good_area:
+        */
+ bad_area:
+       mmap_read_unlock(mm);
++bad_area_nosemaphore:
+       if (user_mode(regs)) {
+               force_sig_fault(SIGSEGV, code, (void *) address);
+               return;
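
A detail these conversions share: when lock_mm_and_find_vma() fails it has already dropped mmap_lock, so error helpers that used to unlock must not do so any more, while later failure paths that still hold the lock unlock explicitly before calling the common handler. A reduced sketch of that split, modelled on the csky hunk above:

        vma = lock_mm_and_find_vma(mm, addr, regs);
        if (unlikely(!vma)) {
                /* the helper already released mmap_lock */
                bad_area_nosemaphore(regs, mm, code, addr);
                return;
        }

        code = SEGV_ACCERR;
        if (unlikely(access_error(regs, vma))) {
                /* still under mmap_read_lock() here, so drop it first */
                mmap_read_unlock(mm);
                bad_area_nosemaphore(regs, mm, code, addr);
                return;
        }
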
diff --git a/queue-6.3/mm-introduce-new-lock_mm_and_find_vma-page-fault-helper.patch b/queue-6.3/mm-introduce-new-lock_mm_and_find_vma-page-fault-helper.patch
new file mode 100644 (file)
index 0000000..aa638f1
--- /dev/null
@@ -0,0 +1,295 @@
+From c2508ec5a58db67093f4fb8bf89a9a7c53a109e9 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Thu, 15 Jun 2023 15:17:36 -0700
+Subject: mm: introduce new 'lock_mm_and_find_vma()' page fault helper
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit c2508ec5a58db67093f4fb8bf89a9a7c53a109e9 upstream.
+
+.. and make x86 use it.
+
+This basically extracts the existing x86 "find and expand faulting vma"
+code, but extends it to also take the mmap lock for writing in case we
+actually do need to expand the vma.
+
+We've historically short-circuited that case, and have some rather ugly
+special logic to serialize the stack segment expansion (since we only
+hold the mmap lock for reading) that doesn't match the normal VM
+locking.
+
+That slight violation of locking worked well, right up until it didn't:
+the maple tree code really does want proper locking even for simple
+extension of an existing vma.
+
+So extract the code for "look up the vma of the fault" from x86, fix it
+up to do the necessary write locking, and make it available as a helper
+function for other architectures that can use the common helper.
+
+Note: I say "common helper", but it really only handles the normal
+stack-grows-down case.  Which is all architectures except for PA-RISC
+and IA64.  So some rare architectures can't use the helper, but if they
+care they'll just need to open-code this logic.
+
+It's also worth pointing out that this code really would like to have an
+optimistic "mmap_upgrade_trylock()" to make it quicker to go from a
+read-lock (for the common case) to taking the write lock (for having to
+extend the vma) in the normal single-threaded situation where there is
+no other locking activity.
+
+But that _is_ all the very uncommon special case, so while it would be
+nice to have such an operation, it probably doesn't matter in reality.
+I did put in the skeleton code for such a possible future expansion,
+even if it only acts as pseudo-documentation for what we're doing.
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/Kconfig    |    1 
+ arch/x86/mm/fault.c |   52 ----------------------
+ include/linux/mm.h  |    2 
+ mm/Kconfig          |    4 +
+ mm/memory.c         |  121 ++++++++++++++++++++++++++++++++++++++++++++++++++++
+ 5 files changed, 130 insertions(+), 50 deletions(-)
+
+--- a/arch/x86/Kconfig
++++ b/arch/x86/Kconfig
+@@ -274,6 +274,7 @@ config X86
+       select HAVE_GENERIC_VDSO
+       select HOTPLUG_SMT                      if SMP
+       select IRQ_FORCED_THREADING
++      select LOCK_MM_AND_FIND_VMA
+       select NEED_PER_CPU_EMBED_FIRST_CHUNK
+       select NEED_PER_CPU_PAGE_FIRST_CHUNK
+       select NEED_SG_DMA_LENGTH
+--- a/arch/x86/mm/fault.c
++++ b/arch/x86/mm/fault.c
+@@ -879,12 +879,6 @@ __bad_area(struct pt_regs *regs, unsigne
+       __bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
+ }
+-static noinline void
+-bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+-{
+-      __bad_area(regs, error_code, address, 0, SEGV_MAPERR);
+-}
+-
+ static inline bool bad_area_access_from_pkeys(unsigned long error_code,
+               struct vm_area_struct *vma)
+ {
+@@ -1333,51 +1327,10 @@ void do_user_addr_fault(struct pt_regs *
+       }
+ #endif
+-      /*
+-       * Kernel-mode access to the user address space should only occur
+-       * on well-defined single instructions listed in the exception
+-       * tables.  But, an erroneous kernel fault occurring outside one of
+-       * those areas which also holds mmap_lock might deadlock attempting
+-       * to validate the fault against the address space.
+-       *
+-       * Only do the expensive exception table search when we might be at
+-       * risk of a deadlock.  This happens if we
+-       * 1. Failed to acquire mmap_lock, and
+-       * 2. The access did not originate in userspace.
+-       */
+-      if (unlikely(!mmap_read_trylock(mm))) {
+-              if (!user_mode(regs) && !search_exception_tables(regs->ip)) {
+-                      /*
+-                       * Fault from code in kernel from
+-                       * which we do not expect faults.
+-                       */
+-                      bad_area_nosemaphore(regs, error_code, address);
+-                      return;
+-              }
+ retry:
+-              mmap_read_lock(mm);
+-      } else {
+-              /*
+-               * The above down_read_trylock() might have succeeded in
+-               * which case we'll have missed the might_sleep() from
+-               * down_read():
+-               */
+-              might_sleep();
+-      }
+-
+-      vma = find_vma(mm, address);
++      vma = lock_mm_and_find_vma(mm, address, regs);
+       if (unlikely(!vma)) {
+-              bad_area(regs, error_code, address);
+-              return;
+-      }
+-      if (likely(vma->vm_start <= address))
+-              goto good_area;
+-      if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+-              bad_area(regs, error_code, address);
+-              return;
+-      }
+-      if (unlikely(expand_stack(vma, address))) {
+-              bad_area(regs, error_code, address);
++              bad_area_nosemaphore(regs, error_code, address);
+               return;
+       }
+@@ -1385,7 +1338,6 @@ retry:
+        * Ok, we have a good vm_area for this memory access, so
+        * we can handle it..
+        */
+-good_area:
+       if (unlikely(access_error(error_code, vma))) {
+               bad_area_access_error(regs, error_code, address, vma);
+               return;
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -2190,6 +2190,8 @@ void unmap_mapping_pages(struct address_
+               pgoff_t start, pgoff_t nr, bool even_cows);
+ void unmap_mapping_range(struct address_space *mapping,
+               loff_t const holebegin, loff_t const holelen, int even_cows);
++struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
++              unsigned long address, struct pt_regs *regs);
+ #else
+ static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
+                                        unsigned long address, unsigned int flags,
+--- a/mm/Kconfig
++++ b/mm/Kconfig
+@@ -1202,6 +1202,10 @@ config LRU_GEN_STATS
+         This option has a per-memcg and per-node memory overhead.
+ # }
++config LOCK_MM_AND_FIND_VMA
++      bool
++      depends on !STACK_GROWSUP
++
+ source "mm/damon/Kconfig"
+ endmenu
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -5230,6 +5230,127 @@ vm_fault_t handle_mm_fault(struct vm_are
+ }
+ EXPORT_SYMBOL_GPL(handle_mm_fault);
++#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
++#include <linux/extable.h>
++
++static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
++{
++      /* Even if this succeeds, make it clear we *might* have slept */
++      if (likely(mmap_read_trylock(mm))) {
++              might_sleep();
++              return true;
++      }
++
++      if (regs && !user_mode(regs)) {
++              unsigned long ip = instruction_pointer(regs);
++              if (!search_exception_tables(ip))
++                      return false;
++      }
++
++      mmap_read_lock(mm);
++      return true;
++}
++
++static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
++{
++      /*
++       * We don't have this operation yet.
++       *
++       * It should be easy enough to do: it's basically a
++       *    atomic_long_try_cmpxchg_acquire()
++       * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
++       * it also needs the proper lockdep magic etc.
++       */
++      return false;
++}
++
++static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
++{
++      mmap_read_unlock(mm);
++      if (regs && !user_mode(regs)) {
++              unsigned long ip = instruction_pointer(regs);
++              if (!search_exception_tables(ip))
++                      return false;
++      }
++      mmap_write_lock(mm);
++      return true;
++}
++
++/*
++ * Helper for page fault handling.
++ *
++ * This is kind of equivalend to "mmap_read_lock()" followed
++ * by "find_extend_vma()", except it's a lot more careful about
++ * the locking (and will drop the lock on failure).
++ *
++ * For example, if we have a kernel bug that causes a page
++ * fault, we don't want to just use mmap_read_lock() to get
++ * the mm lock, because that would deadlock if the bug were
++ * to happen while we're holding the mm lock for writing.
++ *
++ * So this checks the exception tables on kernel faults in
++ * order to only do this all for instructions that are actually
++ * expected to fault.
++ *
++ * We can also actually take the mm lock for writing if we
++ * need to extend the vma, which helps the VM layer a lot.
++ */
++struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
++                      unsigned long addr, struct pt_regs *regs)
++{
++      struct vm_area_struct *vma;
++
++      if (!get_mmap_lock_carefully(mm, regs))
++              return NULL;
++
++      vma = find_vma(mm, addr);
++      if (likely(vma && (vma->vm_start <= addr)))
++              return vma;
++
++      /*
++       * Well, dang. We might still be successful, but only
++       * if we can extend a vma to do so.
++       */
++      if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
++              mmap_read_unlock(mm);
++              return NULL;
++      }
++
++      /*
++       * We can try to upgrade the mmap lock atomically,
++       * in which case we can continue to use the vma
++       * we already looked up.
++       *
++       * Otherwise we'll have to drop the mmap lock and
++       * re-take it, and also look up the vma again,
++       * re-checking it.
++       */
++      if (!mmap_upgrade_trylock(mm)) {
++              if (!upgrade_mmap_lock_carefully(mm, regs))
++                      return NULL;
++
++              vma = find_vma(mm, addr);
++              if (!vma)
++                      goto fail;
++              if (vma->vm_start <= addr)
++                      goto success;
++              if (!(vma->vm_flags & VM_GROWSDOWN))
++                      goto fail;
++      }
++
++      if (expand_stack(vma, addr))
++              goto fail;
++
++success:
++      mmap_write_downgrade(mm);
++      return vma;
++
++fail:
++      mmap_write_unlock(mm);
++      return NULL;
++}
++#endif
++
+ #ifndef __PAGETABLE_P4D_FOLDED
+ /*
+  * Allocate p4d page table.
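
Stripped of the (always-failing) trylock-upgrade stub and the long comments, the helper added in the mm/memory.c hunk above reduces to the control flow below; this is only a reading aid for that hunk, not separate code:

        if (!get_mmap_lock_carefully(mm, regs))
                return NULL;                    /* unexpected kernel-mode fault */

        vma = find_vma(mm, addr);
        if (likely(vma && vma->vm_start <= addr))
                return vma;                     /* fast path: read lock held */

        if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
                mmap_read_unlock(mm);
                return NULL;
        }

        /* slow path: re-take the lock for writing, re-validate, expand */
        if (!upgrade_mmap_lock_carefully(mm, regs))
                return NULL;                    /* lock already dropped */
        vma = find_vma(mm, addr);
        if (!vma || (vma->vm_start > addr &&
                     (!(vma->vm_flags & VM_GROWSDOWN) || expand_stack(vma, addr)))) {
                mmap_write_unlock(mm);
                return NULL;
        }
        mmap_write_downgrade(mm);
        return vma;                             /* read lock held again */
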
diff --git a/queue-6.3/mm-make-find_extend_vma-fail-if-write-lock-not-held.patch b/queue-6.3/mm-make-find_extend_vma-fail-if-write-lock-not-held.patch
new file mode 100644 (file)
index 0000000..6cdca99
--- /dev/null
@@ -0,0 +1,242 @@
+From f440fa1ac955e2898893f9301568435eb5cdfc4b Mon Sep 17 00:00:00 2001
+From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
+Date: Fri, 16 Jun 2023 15:58:54 -0700
+Subject: mm: make find_extend_vma() fail if write lock not held
+
+From: Liam R. Howlett <Liam.Howlett@oracle.com>
+
+commit f440fa1ac955e2898893f9301568435eb5cdfc4b upstream.
+
+Make calls to extend_vma() and find_extend_vma() fail if the write lock
+is required.
+
+To avoid making this a flag-day event, this still allows the old
+read-locking case for the trivial situations, and passes in a flag to
+say "is it write-locked".  That way write-lockers can say "yes, I'm
+being careful", and legacy users will continue to work in all the common
+cases until they have been fully converted to the new world order.
+
+Co-Developed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
+Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
+Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/binfmt_elf.c    |    6 +++---
+ fs/exec.c          |    5 +++--
+ include/linux/mm.h |   10 +++++++---
+ mm/memory.c        |    2 +-
+ mm/mmap.c          |   50 +++++++++++++++++++++++++++++++++-----------------
+ mm/nommu.c         |    3 ++-
+ 6 files changed, 49 insertions(+), 27 deletions(-)
+
+--- a/fs/binfmt_elf.c
++++ b/fs/binfmt_elf.c
+@@ -320,10 +320,10 @@ create_elf_tables(struct linux_binprm *b
+        * Grow the stack manually; some architectures have a limit on how
+        * far ahead a user-space access may be in order to grow the stack.
+        */
+-      if (mmap_read_lock_killable(mm))
++      if (mmap_write_lock_killable(mm))
+               return -EINTR;
+-      vma = find_extend_vma(mm, bprm->p);
+-      mmap_read_unlock(mm);
++      vma = find_extend_vma_locked(mm, bprm->p, true);
++      mmap_write_unlock(mm);
+       if (!vma)
+               return -EFAULT;
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -204,7 +204,8 @@ static struct page *get_arg_page(struct
+ #ifdef CONFIG_STACK_GROWSUP
+       if (write) {
+-              ret = expand_downwards(bprm->vma, pos);
++              /* We claim to hold the lock - nobody to race with */
++              ret = expand_downwards(bprm->vma, pos, true);
+               if (ret < 0)
+                       return NULL;
+       }
+@@ -852,7 +853,7 @@ int setup_arg_pages(struct linux_binprm
+       stack_base = vma->vm_end - stack_expand;
+ #endif
+       current->mm->start_stack = bprm->p;
+-      ret = expand_stack(vma, stack_base);
++      ret = expand_stack_locked(vma, stack_base, true);
+       if (ret)
+               ret = -EFAULT;
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -3065,11 +3065,13 @@ extern vm_fault_t filemap_page_mkwrite(s
+ extern unsigned long stack_guard_gap;
+ /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
+-extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
++              bool write_locked);
++#define expand_stack(vma,addr) expand_stack_locked(vma,addr,false)
+ /* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */
+-extern int expand_downwards(struct vm_area_struct *vma,
+-              unsigned long address);
++int expand_downwards(struct vm_area_struct *vma, unsigned long address,
++              bool write_locked);
+ #if VM_GROWSUP
+ extern int expand_upwards(struct vm_area_struct *vma, unsigned long address);
+ #else
+@@ -3170,6 +3172,8 @@ unsigned long change_prot_numa(struct vm
+ #endif
+ struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *,
++              unsigned long addr, bool write_locked);
+ int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
+                       unsigned long pfn, unsigned long size, pgprot_t);
+ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -5336,7 +5336,7 @@ struct vm_area_struct *lock_mm_and_find_
+                       goto fail;
+       }
+-      if (expand_stack(vma, addr))
++      if (expand_stack_locked(vma, addr, true))
+               goto fail;
+ success:
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -1898,7 +1898,8 @@ static int acct_stack_growth(struct vm_a
+  * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
+  * vma is the last one with address > vma->vm_end.  Have to extend vma.
+  */
+-int expand_upwards(struct vm_area_struct *vma, unsigned long address)
++int expand_upwards(struct vm_area_struct *vma, unsigned long address,
++              bool write_locked)
+ {
+       struct mm_struct *mm = vma->vm_mm;
+       struct vm_area_struct *next;
+@@ -1922,6 +1923,8 @@ int expand_upwards(struct vm_area_struct
+       if (gap_addr < address || gap_addr > TASK_SIZE)
+               gap_addr = TASK_SIZE;
++      if (!write_locked)
++              return -EAGAIN;
+       next = find_vma_intersection(mm, vma->vm_end, gap_addr);
+       if (next && vma_is_accessible(next)) {
+               if (!(next->vm_flags & VM_GROWSUP))
+@@ -1991,7 +1994,8 @@ int expand_upwards(struct vm_area_struct
+ /*
+  * vma is the first one with address < vma->vm_start.  Have to extend vma.
+  */
+-int expand_downwards(struct vm_area_struct *vma, unsigned long address)
++int expand_downwards(struct vm_area_struct *vma, unsigned long address,
++              bool write_locked)
+ {
+       struct mm_struct *mm = vma->vm_mm;
+       MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start);
+@@ -2005,10 +2009,13 @@ int expand_downwards(struct vm_area_stru
+       /* Enforce stack_guard_gap */
+       prev = mas_prev(&mas, 0);
+       /* Check that both stack segments have the same anon_vma? */
+-      if (prev && !(prev->vm_flags & VM_GROWSDOWN) &&
+-                      vma_is_accessible(prev)) {
+-              if (address - prev->vm_end < stack_guard_gap)
++      if (prev) {
++              if (!(prev->vm_flags & VM_GROWSDOWN) &&
++                  vma_is_accessible(prev) &&
++                  (address - prev->vm_end < stack_guard_gap))
+                       return -ENOMEM;
++              if (!write_locked && (prev->vm_end == address))
++                      return -EAGAIN;
+       }
+       if (mas_preallocate(&mas, GFP_KERNEL))
+@@ -2087,13 +2094,14 @@ static int __init cmdline_parse_stack_gu
+ __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
+ #ifdef CONFIG_STACK_GROWSUP
+-int expand_stack(struct vm_area_struct *vma, unsigned long address)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
++              bool write_locked)
+ {
+-      return expand_upwards(vma, address);
++      return expand_upwards(vma, address, write_locked);
+ }
+-struct vm_area_struct *
+-find_extend_vma(struct mm_struct *mm, unsigned long addr)
++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm,
++              unsigned long addr, bool write_locked)
+ {
+       struct vm_area_struct *vma, *prev;
+@@ -2101,20 +2109,25 @@ find_extend_vma(struct mm_struct *mm, un
+       vma = find_vma_prev(mm, addr, &prev);
+       if (vma && (vma->vm_start <= addr))
+               return vma;
+-      if (!prev || expand_stack(prev, addr))
++      if (!prev)
++              return NULL;
++      if (expand_stack_locked(prev, addr, write_locked))
+               return NULL;
+       if (prev->vm_flags & VM_LOCKED)
+               populate_vma_page_range(prev, addr, prev->vm_end, NULL);
+       return prev;
+ }
+ #else
+-int expand_stack(struct vm_area_struct *vma, unsigned long address)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
++              bool write_locked)
+ {
+-      return expand_downwards(vma, address);
++      if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
++              return -EINVAL;
++      return expand_downwards(vma, address, write_locked);
+ }
+-struct vm_area_struct *
+-find_extend_vma(struct mm_struct *mm, unsigned long addr)
++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm,
++              unsigned long addr, bool write_locked)
+ {
+       struct vm_area_struct *vma;
+       unsigned long start;
+@@ -2125,10 +2138,8 @@ find_extend_vma(struct mm_struct *mm, un
+               return NULL;
+       if (vma->vm_start <= addr)
+               return vma;
+-      if (!(vma->vm_flags & VM_GROWSDOWN))
+-              return NULL;
+       start = vma->vm_start;
+-      if (expand_stack(vma, addr))
++      if (expand_stack_locked(vma, addr, write_locked))
+               return NULL;
+       if (vma->vm_flags & VM_LOCKED)
+               populate_vma_page_range(vma, addr, start, NULL);
+@@ -2136,6 +2147,11 @@ find_extend_vma(struct mm_struct *mm, un
+ }
+ #endif
++struct vm_area_struct *find_extend_vma(struct mm_struct *mm,
++              unsigned long addr)
++{
++      return find_extend_vma_locked(mm, addr, false);
++}
+ EXPORT_SYMBOL_GPL(find_extend_vma);
+ /*
+--- a/mm/nommu.c
++++ b/mm/nommu.c
+@@ -643,7 +643,8 @@ struct vm_area_struct *find_extend_vma(s
+  * expand a stack to a given address
+  * - not supported under NOMMU conditions
+  */
+-int expand_stack(struct vm_area_struct *vma, unsigned long address)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
++              bool write_locked)
+ {
+       return -ENOMEM;
+ }
diff --git a/queue-6.3/mm-make-the-page-fault-mmap-locking-killable.patch b/queue-6.3/mm-make-the-page-fault-mmap-locking-killable.patch
new file mode 100644 (file)
index 0000000..b5337ee
--- /dev/null
@@ -0,0 +1,46 @@
+From eda0047296a16d65a7f2bc60a408f70d178b2014 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Thu, 15 Jun 2023 16:17:48 -0700
+Subject: mm: make the page fault mmap locking killable
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit eda0047296a16d65a7f2bc60a408f70d178b2014 upstream.
+
+This is done as a separate patch from introducing the new
+lock_mm_and_find_vma() helper, because while it's an obvious change,
+it's not what x86 used to do in this area.
+
+We already abort the page fault on fatal signals anyway, so why should
+we wait for the mmap lock only to then abort later? With the new helper
+function that returns without the lock held on failure anyway, this is
+particularly easy and straightforward.
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/memory.c |    6 ++----
+ 1 file changed, 2 insertions(+), 4 deletions(-)
+
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -5247,8 +5247,7 @@ static inline bool get_mmap_lock_careful
+                       return false;
+       }
+-      mmap_read_lock(mm);
+-      return true;
++      return !mmap_read_lock_killable(mm);
+ }
+ static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
+@@ -5272,8 +5271,7 @@ static inline bool upgrade_mmap_lock_car
+               if (!search_exception_tables(ip))
+                       return false;
+       }
+-      mmap_write_lock(mm);
+-      return true;
++      return !mmap_write_lock_killable(mm);
+ }
+ /*
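
For illustration only, and not part of anything queued here: the killable lock variants return an int rather than void (0 when the lock was taken, -EINTR if a fatal signal arrives while sleeping), which is why the helpers above collapse into a single negation. A minimal sketch of the resulting pattern follows; the function names are made up for the sketch, and only mmap_read_lock_killable() and mmap_read_unlock() are real kernel interfaces.

    #include <linux/mmap_lock.h>

    /* Sketch: how a fault path consumes the killable helper. */
    static bool get_mmap_lock_sketch(struct mm_struct *mm)
    {
            /* 0 on success, -EINTR on a fatal signal, so "!" yields "got the lock" */
            return !mmap_read_lock_killable(mm);
    }

    static void fault_path_sketch(struct mm_struct *mm)
    {
            if (!get_mmap_lock_sketch(mm))
                    return;         /* fatal signal pending: abort the fault, no lock held */
            /* ... find the vma and call handle_mm_fault() ... */
            mmap_read_unlock(mm);
    }
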
diff --git a/queue-6.3/powerpc-mm-convert-coprocessor-fault-to-lock_mm_and_find_vma.patch b/queue-6.3/powerpc-mm-convert-coprocessor-fault-to-lock_mm_and_find_vma.patch
new file mode 100644 (file)
index 0000000..f297869
--- /dev/null
@@ -0,0 +1,47 @@
+From 2cd76c50d0b41cec5c87abfcdf25b236a2793fb6 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Sat, 24 Jun 2023 11:17:05 -0700
+Subject: powerpc/mm: convert coprocessor fault to lock_mm_and_find_vma()
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit 2cd76c50d0b41cec5c87abfcdf25b236a2793fb6 upstream.
+
+This is one of the simple cases, except there's no pt_regs pointer.
+Which is fine, as lock_mm_and_find_vma() is set up to work fine with a
+NULL pt_regs.
+
+Powerpc already enabled LOCK_MM_AND_FIND_VMA for the main CPU faulting,
+so we can just use the helper without any extra work.
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/mm/copro_fault.c |   14 +++-----------
+ 1 file changed, 3 insertions(+), 11 deletions(-)
+
+--- a/arch/powerpc/mm/copro_fault.c
++++ b/arch/powerpc/mm/copro_fault.c
+@@ -33,19 +33,11 @@ int copro_handle_mm_fault(struct mm_stru
+       if (mm->pgd == NULL)
+               return -EFAULT;
+-      mmap_read_lock(mm);
+-      ret = -EFAULT;
+-      vma = find_vma(mm, ea);
++      vma = lock_mm_and_find_vma(mm, ea, NULL);
+       if (!vma)
+-              goto out_unlock;
+-
+-      if (ea < vma->vm_start) {
+-              if (!(vma->vm_flags & VM_GROWSDOWN))
+-                      goto out_unlock;
+-              if (expand_stack(vma, ea))
+-                      goto out_unlock;
+-      }
++              return -EFAULT;
++      ret = -EFAULT;
+       is_write = dsisr & DSISR_ISSTORE;
+       if (is_write) {
+               if (!(vma->vm_flags & VM_WRITE))
diff --git a/queue-6.3/powerpc-mm-convert-to-using-lock_mm_and_find_vma.patch b/queue-6.3/powerpc-mm-convert-to-using-lock_mm_and_find_vma.patch
new file mode 100644 (file)
index 0000000..95fca6c
--- /dev/null
@@ -0,0 +1,86 @@
+From e6fe228c4ffafdfc970cf6d46883a1f481baf7ea Mon Sep 17 00:00:00 2001
+From: Michael Ellerman <mpe@ellerman.id.au>
+Date: Fri, 16 Jun 2023 15:51:29 +1000
+Subject: powerpc/mm: Convert to using lock_mm_and_find_vma()
+
+From: Michael Ellerman <mpe@ellerman.id.au>
+
+commit e6fe228c4ffafdfc970cf6d46883a1f481baf7ea upstream.
+
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/Kconfig    |    1 +
+ arch/powerpc/mm/fault.c |   41 ++++-------------------------------------
+ 2 files changed, 5 insertions(+), 37 deletions(-)
+
+--- a/arch/powerpc/Kconfig
++++ b/arch/powerpc/Kconfig
+@@ -263,6 +263,7 @@ config PPC
+       select IRQ_DOMAIN
+       select IRQ_FORCED_THREADING
+       select KASAN_VMALLOC                    if KASAN && MODULES
++      select LOCK_MM_AND_FIND_VMA
+       select MMU_GATHER_PAGE_SIZE
+       select MMU_GATHER_RCU_TABLE_FREE
+       select MMU_GATHER_MERGE_VMAS
+--- a/arch/powerpc/mm/fault.c
++++ b/arch/powerpc/mm/fault.c
+@@ -84,11 +84,6 @@ static int __bad_area(struct pt_regs *re
+       return __bad_area_nosemaphore(regs, address, si_code);
+ }
+-static noinline int bad_area(struct pt_regs *regs, unsigned long address)
+-{
+-      return __bad_area(regs, address, SEGV_MAPERR);
+-}
+-
+ static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address,
+                                   struct vm_area_struct *vma)
+ {
+@@ -481,40 +476,12 @@ static int ___do_page_fault(struct pt_re
+        * we will deadlock attempting to validate the fault against the
+        * address space.  Luckily the kernel only validly references user
+        * space from well defined areas of code, which are listed in the
+-       * exceptions table.
+-       *
+-       * As the vast majority of faults will be valid we will only perform
+-       * the source reference check when there is a possibility of a deadlock.
+-       * Attempt to lock the address space, if we cannot we then validate the
+-       * source.  If this is invalid we can skip the address space check,
+-       * thus avoiding the deadlock.
+-       */
+-      if (unlikely(!mmap_read_trylock(mm))) {
+-              if (!is_user && !search_exception_tables(regs->nip))
+-                      return bad_area_nosemaphore(regs, address);
+-
++       * exceptions table. lock_mm_and_find_vma() handles that logic.
++       */
+ retry:
+-              mmap_read_lock(mm);
+-      } else {
+-              /*
+-               * The above down_read_trylock() might have succeeded in
+-               * which case we'll have missed the might_sleep() from
+-               * down_read():
+-               */
+-              might_sleep();
+-      }
+-
+-      vma = find_vma(mm, address);
++      vma = lock_mm_and_find_vma(mm, address, regs);
+       if (unlikely(!vma))
+-              return bad_area(regs, address);
+-
+-      if (unlikely(vma->vm_start > address)) {
+-              if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
+-                      return bad_area(regs, address);
+-
+-              if (unlikely(expand_stack(vma, address)))
+-                      return bad_area(regs, address);
+-      }
++              return bad_area_nosemaphore(regs, address);
+       if (unlikely(access_pkey_error(is_write, is_exec,
+                                      (error_code & DSISR_KEYFAULT), vma)))
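
For illustration only, and not queued code: every conversion in this series reduces the architecture fault handler to roughly the shape below. The sketch assumes only what the patches themselves state, namely that lock_mm_and_find_vma() takes the mmap read lock, performs the kernel-mode exception-table check, finds the vma and expands the stack when permitted, and returns NULL with the lock already released on failure (so the error path must not unlock); regs may be NULL, as in the coprocessor conversion above. The function name and the choice of VM_FAULT_SIGSEGV as the error value are illustrative, and access-permission checks are elided.

    #include <linux/mm.h>
    #include <linux/mmap_lock.h>

    static vm_fault_t arch_fault_sketch(struct mm_struct *mm, unsigned long address,
                                        struct pt_regs *regs, unsigned int flags)
    {
            struct vm_area_struct *vma;
            vm_fault_t fault;

            vma = lock_mm_and_find_vma(mm, address, regs);
            if (unlikely(!vma))
                    return VM_FAULT_SIGSEGV;        /* "bad_area_nosemaphore": lock already gone */

            fault = handle_mm_fault(vma, address & PAGE_MASK, flags, regs);
            if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
                    mmap_read_unlock(mm);           /* retry/completed paths drop the lock themselves */
            return fault;
    }
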
diff --git a/queue-6.3/riscv-mm-convert-to-using-lock_mm_and_find_vma.patch b/queue-6.3/riscv-mm-convert-to-using-lock_mm_and_find_vma.patch
new file mode 100644 (file)
index 0000000..2999c21
--- /dev/null
@@ -0,0 +1,95 @@
+From 7267ef7b0b77f4ed23b7b3c87d8eca7bd9c2d007 Mon Sep 17 00:00:00 2001
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Thu, 22 Jun 2023 20:18:18 +0200
+Subject: riscv/mm: Convert to using lock_mm_and_find_vma()
+
+From: Ben Hutchings <ben@decadent.org.uk>
+
+commit 7267ef7b0b77f4ed23b7b3c87d8eca7bd9c2d007 upstream.
+
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/riscv/Kconfig    |    1 +
+ arch/riscv/mm/fault.c |   31 +++++++++++++------------------
+ 2 files changed, 14 insertions(+), 18 deletions(-)
+
+--- a/arch/riscv/Kconfig
++++ b/arch/riscv/Kconfig
+@@ -119,6 +119,7 @@ config RISCV
+       select HAVE_SYSCALL_TRACEPOINTS
+       select IRQ_DOMAIN
+       select IRQ_FORCED_THREADING
++      select LOCK_MM_AND_FIND_VMA
+       select MODULES_USE_ELF_RELA if MODULES
+       select MODULE_SECTIONS if MODULES
+       select OF
+--- a/arch/riscv/mm/fault.c
++++ b/arch/riscv/mm/fault.c
+@@ -83,13 +83,13 @@ static inline void mm_fault_error(struct
+       BUG();
+ }
+-static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr)
++static inline void
++bad_area_nosemaphore(struct pt_regs *regs, int code, unsigned long addr)
+ {
+       /*
+        * Something tried to access memory that isn't in our memory map.
+        * Fix it, but check if it's kernel or user first.
+        */
+-      mmap_read_unlock(mm);
+       /* User mode accesses just cause a SIGSEGV */
+       if (user_mode(regs)) {
+               do_trap(regs, SIGSEGV, code, addr);
+@@ -99,6 +99,15 @@ static inline void bad_area(struct pt_re
+       no_context(regs, addr);
+ }
++static inline void
++bad_area(struct pt_regs *regs, struct mm_struct *mm, int code,
++       unsigned long addr)
++{
++      mmap_read_unlock(mm);
++
++      bad_area_nosemaphore(regs, code, addr);
++}
++
+ static inline void vmalloc_fault(struct pt_regs *regs, int code, unsigned long addr)
+ {
+       pgd_t *pgd, *pgd_k;
+@@ -286,23 +295,10 @@ asmlinkage void do_page_fault(struct pt_
+       else if (cause == EXC_INST_PAGE_FAULT)
+               flags |= FAULT_FLAG_INSTRUCTION;
+ retry:
+-      mmap_read_lock(mm);
+-      vma = find_vma(mm, addr);
++      vma = lock_mm_and_find_vma(mm, addr, regs);
+       if (unlikely(!vma)) {
+               tsk->thread.bad_cause = cause;
+-              bad_area(regs, mm, code, addr);
+-              return;
+-      }
+-      if (likely(vma->vm_start <= addr))
+-              goto good_area;
+-      if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+-              tsk->thread.bad_cause = cause;
+-              bad_area(regs, mm, code, addr);
+-              return;
+-      }
+-      if (unlikely(expand_stack(vma, addr))) {
+-              tsk->thread.bad_cause = cause;
+-              bad_area(regs, mm, code, addr);
++              bad_area_nosemaphore(regs, code, addr);
+               return;
+       }
+@@ -310,7 +306,6 @@ retry:
+        * Ok, we have a good vm_area for this memory access, so
+        * we can handle it.
+        */
+-good_area:
+       code = SEGV_ACCERR;
+       if (unlikely(access_error(cause, vma))) {
diff --git a/queue-6.3/series b/queue-6.3/series
index 44c48b4009cc33ba6717cf0f5c8fe4c2c7b5ed0e..fcec13792a182788b55e1b836fde1ea2993d1926 100644 (file)
@@ -9,3 +9,17 @@ x86-smp-cure-kexec-vs.-mwait_play_dead-breakage.patch
 cpufreq-amd-pstate-make-amd-pstate-epp-driver-name-hyphenated.patch
 can-isotp-isotp_sendmsg-fix-return-error-fix-on-tx-path.patch
 maple_tree-fix-potential-out-of-bounds-access-in-mas_wr_end_piv.patch
+
+mm-introduce-new-lock_mm_and_find_vma-page-fault-helper.patch
+mm-make-the-page-fault-mmap-locking-killable.patch
+arm64-mm-convert-to-using-lock_mm_and_find_vma.patch
+powerpc-mm-convert-to-using-lock_mm_and_find_vma.patch
+mips-mm-convert-to-using-lock_mm_and_find_vma.patch
+riscv-mm-convert-to-using-lock_mm_and_find_vma.patch
+arm-mm-convert-to-using-lock_mm_and_find_vma.patch
+mm-fault-convert-remaining-simple-cases-to-lock_mm_and_find_vma.patch
+powerpc-mm-convert-coprocessor-fault-to-lock_mm_and_find_vma.patch
+mm-make-find_extend_vma-fail-if-write-lock-not-held.patch
+execve-expand-new-process-stack-manually-ahead-of-time.patch
+mm-always-expand-the-stack-with-the-mmap-write-lock-held.patch
+gup-add-warning-if-some-caller-would-seem-to-want-stack-expansion.patch
diff --git a/queue-6.4/arm-mm-convert-to-using-lock_mm_and_find_vma.patch b/queue-6.4/arm-mm-convert-to-using-lock_mm_and_find_vma.patch
new file mode 100644 (file)
index 0000000..3be848b
--- /dev/null
@@ -0,0 +1,136 @@
+From 8b35ca3e45e35a26a21427f35d4093606e93ad0a Mon Sep 17 00:00:00 2001
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Thu, 22 Jun 2023 21:24:30 +0200
+Subject: arm/mm: Convert to using lock_mm_and_find_vma()
+
+From: Ben Hutchings <ben@decadent.org.uk>
+
+commit 8b35ca3e45e35a26a21427f35d4093606e93ad0a upstream.
+
+arm has an additional check for address < FIRST_USER_ADDRESS before
+expanding the stack.  Since FIRST_USER_ADDRESS is defined everywhere
+(generally as 0), move that check to the generic expand_downwards().
+
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm/Kconfig    |    1 
+ arch/arm/mm/fault.c |   63 +++++++++++-----------------------------------------
+ mm/mmap.c           |    2 -
+ 3 files changed, 16 insertions(+), 50 deletions(-)
+
+--- a/arch/arm/Kconfig
++++ b/arch/arm/Kconfig
+@@ -125,6 +125,7 @@ config ARM
+       select HAVE_UID16
+       select HAVE_VIRT_CPU_ACCOUNTING_GEN
+       select IRQ_FORCED_THREADING
++      select LOCK_MM_AND_FIND_VMA
+       select MODULES_USE_ELF_REL
+       select NEED_DMA_MAP_STATE
+       select OF_EARLY_FLATTREE if OF
+--- a/arch/arm/mm/fault.c
++++ b/arch/arm/mm/fault.c
+@@ -232,37 +232,11 @@ static inline bool is_permission_fault(u
+       return false;
+ }
+-static vm_fault_t __kprobes
+-__do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int flags,
+-              unsigned long vma_flags, struct pt_regs *regs)
+-{
+-      struct vm_area_struct *vma = find_vma(mm, addr);
+-      if (unlikely(!vma))
+-              return VM_FAULT_BADMAP;
+-
+-      if (unlikely(vma->vm_start > addr)) {
+-              if (!(vma->vm_flags & VM_GROWSDOWN))
+-                      return VM_FAULT_BADMAP;
+-              if (addr < FIRST_USER_ADDRESS)
+-                      return VM_FAULT_BADMAP;
+-              if (expand_stack(vma, addr))
+-                      return VM_FAULT_BADMAP;
+-      }
+-
+-      /*
+-       * ok, we have a good vm_area for this memory access, check the
+-       * permissions on the VMA allow for the fault which occurred.
+-       */
+-      if (!(vma->vm_flags & vma_flags))
+-              return VM_FAULT_BADACCESS;
+-
+-      return handle_mm_fault(vma, addr & PAGE_MASK, flags, regs);
+-}
+-
+ static int __kprobes
+ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
+ {
+       struct mm_struct *mm = current->mm;
++      struct vm_area_struct *vma;
+       int sig, code;
+       vm_fault_t fault;
+       unsigned int flags = FAULT_FLAG_DEFAULT;
+@@ -301,31 +275,21 @@ do_page_fault(unsigned long addr, unsign
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
+-      /*
+-       * As per x86, we may deadlock here.  However, since the kernel only
+-       * validly references user space from well defined areas of the code,
+-       * we can bug out early if this is from code which shouldn't.
+-       */
+-      if (!mmap_read_trylock(mm)) {
+-              if (!user_mode(regs) && !search_exception_tables(regs->ARM_pc))
+-                      goto no_context;
+ retry:
+-              mmap_read_lock(mm);
+-      } else {
+-              /*
+-               * The above down_read_trylock() might have succeeded in
+-               * which case, we'll have missed the might_sleep() from
+-               * down_read()
+-               */
+-              might_sleep();
+-#ifdef CONFIG_DEBUG_VM
+-              if (!user_mode(regs) &&
+-                  !search_exception_tables(regs->ARM_pc))
+-                      goto no_context;
+-#endif
++      vma = lock_mm_and_find_vma(mm, addr, regs);
++      if (unlikely(!vma)) {
++              fault = VM_FAULT_BADMAP;
++              goto bad_area;
+       }
+-      fault = __do_page_fault(mm, addr, flags, vm_flags, regs);
++      /*
++       * ok, we have a good vm_area for this memory access, check the
++       * permissions on the VMA allow for the fault which occurred.
++       */
++      if (!(vma->vm_flags & vm_flags))
++              fault = VM_FAULT_BADACCESS;
++      else
++              fault = handle_mm_fault(vma, addr & PAGE_MASK, flags, regs);
+       /* If we need to retry but a fatal signal is pending, handle the
+        * signal first. We do not need to release the mmap_lock because
+@@ -356,6 +320,7 @@ retry:
+       if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS))))
+               return 0;
++bad_area:
+       /*
+        * If we are in kernel mode at this point, we
+        * have no context to handle this fault with.
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -2036,7 +2036,7 @@ int expand_downwards(struct vm_area_stru
+       int error = 0;
+       address &= PAGE_MASK;
+-      if (address < mmap_min_addr)
++      if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
+               return -EPERM;
+       /* Enforce stack_guard_gap */
diff --git a/queue-6.4/arm64-mm-convert-to-using-lock_mm_and_find_vma.patch b/queue-6.4/arm64-mm-convert-to-using-lock_mm_and_find_vma.patch
new file mode 100644 (file)
index 0000000..dc6485a
--- /dev/null
@@ -0,0 +1,112 @@
+From ae870a68b5d13d67cf4f18d47bb01ee3fee40acb Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Thu, 15 Jun 2023 17:11:44 -0700
+Subject: arm64/mm: Convert to using lock_mm_and_find_vma()
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit ae870a68b5d13d67cf4f18d47bb01ee3fee40acb upstream.
+
+This converts arm64 to use the new page fault helper.  It was very
+straightforward, but still needed a fix for the "obvious" conversion I
+initially did.  Thanks to Suren for the fix and testing.
+
+Fixed-and-tested-by: Suren Baghdasaryan <surenb@google.com>
+Unnecessary-code-removal-by: Liam R. Howlett <Liam.Howlett@oracle.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/Kconfig    |    1 +
+ arch/arm64/mm/fault.c |   47 ++++++++---------------------------------------
+ 2 files changed, 9 insertions(+), 39 deletions(-)
+
+--- a/arch/arm64/Kconfig
++++ b/arch/arm64/Kconfig
+@@ -225,6 +225,7 @@ config ARM64
+       select IRQ_DOMAIN
+       select IRQ_FORCED_THREADING
+       select KASAN_VMALLOC if KASAN
++      select LOCK_MM_AND_FIND_VMA
+       select MODULES_USE_ELF_RELA
+       select NEED_DMA_MAP_STATE
+       select NEED_SG_DMA_LENGTH
+--- a/arch/arm64/mm/fault.c
++++ b/arch/arm64/mm/fault.c
+@@ -483,27 +483,14 @@ static void do_bad_area(unsigned long fa
+ #define VM_FAULT_BADMAP               ((__force vm_fault_t)0x010000)
+ #define VM_FAULT_BADACCESS    ((__force vm_fault_t)0x020000)
+-static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr,
++static vm_fault_t __do_page_fault(struct mm_struct *mm,
++                                struct vm_area_struct *vma, unsigned long addr,
+                                 unsigned int mm_flags, unsigned long vm_flags,
+                                 struct pt_regs *regs)
+ {
+-      struct vm_area_struct *vma = find_vma(mm, addr);
+-
+-      if (unlikely(!vma))
+-              return VM_FAULT_BADMAP;
+-
+       /*
+        * Ok, we have a good vm_area for this memory access, so we can handle
+        * it.
+-       */
+-      if (unlikely(vma->vm_start > addr)) {
+-              if (!(vma->vm_flags & VM_GROWSDOWN))
+-                      return VM_FAULT_BADMAP;
+-              if (expand_stack(vma, addr))
+-                      return VM_FAULT_BADMAP;
+-      }
+-
+-      /*
+        * Check that the permissions on the VMA allow for the fault which
+        * occurred.
+        */
+@@ -617,31 +604,15 @@ static int __kprobes do_page_fault(unsig
+       }
+ lock_mmap:
+ #endif /* CONFIG_PER_VMA_LOCK */
+-      /*
+-       * As per x86, we may deadlock here. However, since the kernel only
+-       * validly references user space from well defined areas of the code,
+-       * we can bug out early if this is from code which shouldn't.
+-       */
+-      if (!mmap_read_trylock(mm)) {
+-              if (!user_mode(regs) && !search_exception_tables(regs->pc))
+-                      goto no_context;
++
+ retry:
+-              mmap_read_lock(mm);
+-      } else {
+-              /*
+-               * The above mmap_read_trylock() might have succeeded in which
+-               * case, we'll have missed the might_sleep() from down_read().
+-               */
+-              might_sleep();
+-#ifdef CONFIG_DEBUG_VM
+-              if (!user_mode(regs) && !search_exception_tables(regs->pc)) {
+-                      mmap_read_unlock(mm);
+-                      goto no_context;
+-              }
+-#endif
++      vma = lock_mm_and_find_vma(mm, addr, regs);
++      if (unlikely(!vma)) {
++              fault = VM_FAULT_BADMAP;
++              goto done;
+       }
+-      fault = __do_page_fault(mm, addr, mm_flags, vm_flags, regs);
++      fault = __do_page_fault(mm, vma, addr, mm_flags, vm_flags, regs);
+       /* Quick path to respond to signals */
+       if (fault_signal_pending(fault, regs)) {
+@@ -660,9 +631,7 @@ retry:
+       }
+       mmap_read_unlock(mm);
+-#ifdef CONFIG_PER_VMA_LOCK
+ done:
+-#endif
+       /*
+        * Handle the "normal" (no error) case first.
+        */
diff --git a/queue-6.4/execve-expand-new-process-stack-manually-ahead-of-time.patch b/queue-6.4/execve-expand-new-process-stack-manually-ahead-of-time.patch
new file mode 100644 (file)
index 0000000..82044d2
--- /dev/null
@@ -0,0 +1,88 @@
+From f313c51d26aa87e69633c9b46efb37a930faca71 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Mon, 19 Jun 2023 11:34:15 -0700
+Subject: execve: expand new process stack manually ahead of time
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit f313c51d26aa87e69633c9b46efb37a930faca71 upstream.
+
+This is a small step towards a model where GUP itself would not expand
+the stack, and any user that needs GUP to not look up existing mappings,
+but actually expand on them, would have to do so manually before-hand,
+and with the mm lock held for writing.
+
+It turns out that execve() already did almost exactly that, except it
+didn't take the mm lock at all (it's single-threaded so no locking
+technically needed, but it could cause lockdep errors).  And it only did
+it for the CONFIG_STACK_GROWSUP case, since in that case GUP has
+obviously never expanded the stack downwards.
+
+So just make that CONFIG_STACK_GROWSUP case do the right thing with
+locking, and enable it generally.  This will eventually help GUP, and in
+the meantime avoids a special case and the lockdep issue.
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/exec.c |   37 +++++++++++++++++++++----------------
+ 1 file changed, 21 insertions(+), 16 deletions(-)
+
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -200,34 +200,39 @@ static struct page *get_arg_page(struct
+               int write)
+ {
+       struct page *page;
++      struct vm_area_struct *vma = bprm->vma;
++      struct mm_struct *mm = bprm->mm;
+       int ret;
+-      unsigned int gup_flags = 0;
+-#ifdef CONFIG_STACK_GROWSUP
+-      if (write) {
+-              /* We claim to hold the lock - nobody to race with */
+-              ret = expand_downwards(bprm->vma, pos, true);
+-              if (ret < 0)
++      /*
++       * Avoid relying on expanding the stack down in GUP (which
++       * does not work for STACK_GROWSUP anyway), and just do it
++       * by hand ahead of time.
++       */
++      if (write && pos < vma->vm_start) {
++              mmap_write_lock(mm);
++              ret = expand_downwards(vma, pos, true);
++              if (unlikely(ret < 0)) {
++                      mmap_write_unlock(mm);
+                       return NULL;
+-      }
+-#endif
+-
+-      if (write)
+-              gup_flags |= FOLL_WRITE;
++              }
++              mmap_write_downgrade(mm);
++      } else
++              mmap_read_lock(mm);
+       /*
+        * We are doing an exec().  'current' is the process
+-       * doing the exec and bprm->mm is the new process's mm.
++       * doing the exec and 'mm' is the new process's mm.
+        */
+-      mmap_read_lock(bprm->mm);
+-      ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags,
++      ret = get_user_pages_remote(mm, pos, 1,
++                      write ? FOLL_WRITE : 0,
+                       &page, NULL, NULL);
+-      mmap_read_unlock(bprm->mm);
++      mmap_read_unlock(mm);
+       if (ret <= 0)
+               return NULL;
+       if (write)
+-              acct_arg_size(bprm, vma_pages(bprm->vma));
++              acct_arg_size(bprm, vma_pages(vma));
+       return page;
+ }
diff --git a/queue-6.4/gup-add-warning-if-some-caller-would-seem-to-want-stack-expansion.patch b/queue-6.4/gup-add-warning-if-some-caller-would-seem-to-want-stack-expansion.patch
new file mode 100644 (file)
index 0000000..cc1b1ef
--- /dev/null
@@ -0,0 +1,59 @@
+From a425ac5365f6cb3cc47bf83e6bff0213c10445f7 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Sun, 25 Jun 2023 14:02:25 -0700
+Subject: gup: add warning if some caller would seem to want stack expansion
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit a425ac5365f6cb3cc47bf83e6bff0213c10445f7 upstream.
+
+It feels very unlikely that anybody would want to do a GUP in an
+unmapped area under the stack pointer, but real users sometimes do some
+really strange things.  So add a (temporary) warning for the case where
+a GUP fails and expanding the stack might have made it work.
+
+It's trivial to do the expansion in the caller as part of getting the mm
+lock in the first place - see __access_remote_vm() for ptrace, for
+example - it's just that it's unnecessarily painful to do it deep in the
+guts of the GUP lookup when we might have to drop and re-take the lock.
+
+I doubt anybody actually does anything quite this strange, but let's be
+proactive: adding these warnings is simple, and will make debugging it
+much easier if they trigger.
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/gup.c |   12 ++++++++++--
+ 1 file changed, 10 insertions(+), 2 deletions(-)
+
+--- a/mm/gup.c
++++ b/mm/gup.c
+@@ -1096,7 +1096,11 @@ static long __get_user_pages(struct mm_s
+               /* first iteration or cross vma bound */
+               if (!vma || start >= vma->vm_end) {
+-                      vma = vma_lookup(mm, start);
++                      vma = find_vma(mm, start);
++                      if (vma && (start < vma->vm_start)) {
++                              WARN_ON_ONCE(vma->vm_flags & VM_GROWSDOWN);
++                              vma = NULL;
++                      }
+                       if (!vma && in_gate_area(mm, start)) {
+                               ret = get_gate_page(mm, start & PAGE_MASK,
+                                               gup_flags, &vma,
+@@ -1265,9 +1269,13 @@ int fixup_user_fault(struct mm_struct *m
+               fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+ retry:
+-      vma = vma_lookup(mm, address);
++      vma = find_vma(mm, address);
+       if (!vma)
+               return -EFAULT;
++      if (address < vma->vm_start ) {
++              WARN_ON_ONCE(vma->vm_flags & VM_GROWSDOWN);
++              return -EFAULT;
++      }
+       if (!vma_permits_fault(vma, fault_flags))
+               return -EFAULT;
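
For illustration only: the caller-side expansion the message points at is the __access_remote_vm() hunk in the "mm: always expand the stack with the mmap write lock held" patch queued below, which also introduces the expand_stack(mm, addr) form used in this sketch (it returns the possibly expanded vma with the mmap read lock held, or NULL with the lock already dropped). The function name below is made up; vma_lookup(), mmap_read_lock_killable() and expand_stack() are the real interfaces.

    #include <linux/mm.h>

    /* Sketch: expand the stack before doing GUP, since GUP itself no longer will. */
    static struct vm_area_struct *lookup_for_gup_sketch(struct mm_struct *mm,
                                                        unsigned long addr)
    {
            struct vm_area_struct *vma;

            if (mmap_read_lock_killable(mm))
                    return NULL;

            vma = vma_lookup(mm, addr);
            if (!vma)
                    vma = expand_stack(mm, addr);   /* NULL: lock already dropped */
            return vma;                             /* non-NULL: read lock still held */
    }
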
diff --git a/queue-6.4/mips-mm-convert-to-using-lock_mm_and_find_vma.patch b/queue-6.4/mips-mm-convert-to-using-lock_mm_and_find_vma.patch
new file mode 100644 (file)
index 0000000..915e050
--- /dev/null
@@ -0,0 +1,53 @@
+From 4bce37a68ff884e821a02a731897a8119e0c37b7 Mon Sep 17 00:00:00 2001
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Thu, 22 Jun 2023 18:47:40 +0200
+Subject: mips/mm: Convert to using lock_mm_and_find_vma()
+
+From: Ben Hutchings <ben@decadent.org.uk>
+
+commit 4bce37a68ff884e821a02a731897a8119e0c37b7 upstream.
+
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/mips/Kconfig    |    1 +
+ arch/mips/mm/fault.c |   12 ++----------
+ 2 files changed, 3 insertions(+), 10 deletions(-)
+
+--- a/arch/mips/Kconfig
++++ b/arch/mips/Kconfig
+@@ -91,6 +91,7 @@ config MIPS
+       select HAVE_VIRT_CPU_ACCOUNTING_GEN if 64BIT || !SMP
+       select IRQ_FORCED_THREADING
+       select ISA if EISA
++      select LOCK_MM_AND_FIND_VMA
+       select MODULES_USE_ELF_REL if MODULES
+       select MODULES_USE_ELF_RELA if MODULES && 64BIT
+       select PERF_USE_VMALLOC
+--- a/arch/mips/mm/fault.c
++++ b/arch/mips/mm/fault.c
+@@ -99,21 +99,13 @@ static void __do_page_fault(struct pt_re
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+-      mmap_read_lock(mm);
+-      vma = find_vma(mm, address);
++      vma = lock_mm_and_find_vma(mm, address, regs);
+       if (!vma)
+-              goto bad_area;
+-      if (vma->vm_start <= address)
+-              goto good_area;
+-      if (!(vma->vm_flags & VM_GROWSDOWN))
+-              goto bad_area;
+-      if (expand_stack(vma, address))
+-              goto bad_area;
++              goto bad_area_nosemaphore;
+ /*
+  * Ok, we have a good vm_area for this memory access, so
+  * we can handle it..
+  */
+-good_area:
+       si_code = SEGV_ACCERR;
+       if (write) {
diff --git a/queue-6.4/mm-always-expand-the-stack-with-the-mmap-write-lock-held.patch b/queue-6.4/mm-always-expand-the-stack-with-the-mmap-write-lock-held.patch
new file mode 100644 (file)
index 0000000..5965c6d
--- /dev/null
@@ -0,0 +1,671 @@
+From 8d7071af890768438c14db6172cc8f9f4d04e184 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Sat, 24 Jun 2023 13:45:51 -0700
+Subject: mm: always expand the stack with the mmap write lock held
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit 8d7071af890768438c14db6172cc8f9f4d04e184 upstream.
+
+This finishes the job of always holding the mmap write lock when
+extending the user stack vma, and removes the 'write_locked' argument
+from the vm helper functions again.
+
+For some cases, we just avoid expanding the stack at all: drivers and
+page pinning really shouldn't be extending any stacks.  Let's see if any
+strange users really wanted that.
+
+It's worth noting that architectures that weren't converted to the new
+lock_mm_and_find_vma() helper function are left using the legacy
+"expand_stack()" function, but it has been changed to drop the mmap_lock
+and take it for writing while expanding the vma.  This makes it fairly
+straightforward to convert the remaining architectures.
+
+As a result of dropping and re-taking the lock, the calling conventions
+for this function have also changed, since the old vma may no longer be
+valid.  So it will now return the new vma if successful, and NULL - and
+the lock dropped - if the area could not be extended.
+
+Tested-by: Vegard Nossum <vegard.nossum@oracle.com>
+Tested-by: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de> # ia64
+Tested-by: Frank Scheiner <frank.scheiner@web.de> # ia64
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/ia64/mm/fault.c         |   36 ++----------
+ arch/m68k/mm/fault.c         |    9 ++-
+ arch/microblaze/mm/fault.c   |    5 +
+ arch/openrisc/mm/fault.c     |    5 +
+ arch/parisc/mm/fault.c       |   23 +++-----
+ arch/s390/mm/fault.c         |    5 +
+ arch/sparc/mm/fault_64.c     |    8 +-
+ arch/um/kernel/trap.c        |   11 ++-
+ drivers/iommu/amd/iommu_v2.c |    4 -
+ drivers/iommu/iommu-sva.c    |    2 
+ fs/binfmt_elf.c              |    2 
+ fs/exec.c                    |    4 -
+ include/linux/mm.h           |   16 +----
+ mm/gup.c                     |    6 +-
+ mm/memory.c                  |   10 +++
+ mm/mmap.c                    |  121 ++++++++++++++++++++++++++++++++++---------
+ mm/nommu.c                   |   18 ++----
+ 17 files changed, 169 insertions(+), 116 deletions(-)
+
+--- a/arch/ia64/mm/fault.c
++++ b/arch/ia64/mm/fault.c
+@@ -110,10 +110,12 @@ retry:
+          * register backing store that needs to expand upwards, in
+          * this case vma will be null, but prev_vma will ne non-null
+          */
+-        if (( !vma && prev_vma ) || (address < vma->vm_start) )
+-              goto check_expansion;
++        if (( !vma && prev_vma ) || (address < vma->vm_start) ) {
++              vma = expand_stack(mm, address);
++              if (!vma)
++                      goto bad_area_nosemaphore;
++      }
+-  good_area:
+       code = SEGV_ACCERR;
+       /* OK, we've got a good vm_area for this memory area.  Check the access permissions: */
+@@ -177,35 +179,9 @@ retry:
+       mmap_read_unlock(mm);
+       return;
+-  check_expansion:
+-      if (!(prev_vma && (prev_vma->vm_flags & VM_GROWSUP) && (address == prev_vma->vm_end))) {
+-              if (!vma)
+-                      goto bad_area;
+-              if (!(vma->vm_flags & VM_GROWSDOWN))
+-                      goto bad_area;
+-              if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start)
+-                  || REGION_OFFSET(address) >= RGN_MAP_LIMIT)
+-                      goto bad_area;
+-              if (expand_stack(vma, address))
+-                      goto bad_area;
+-      } else {
+-              vma = prev_vma;
+-              if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start)
+-                  || REGION_OFFSET(address) >= RGN_MAP_LIMIT)
+-                      goto bad_area;
+-              /*
+-               * Since the register backing store is accessed sequentially,
+-               * we disallow growing it by more than a page at a time.
+-               */
+-              if (address > vma->vm_end + PAGE_SIZE - sizeof(long))
+-                      goto bad_area;
+-              if (expand_upwards(vma, address))
+-                      goto bad_area;
+-      }
+-      goto good_area;
+-
+   bad_area:
+       mmap_read_unlock(mm);
++  bad_area_nosemaphore:
+       if ((isr & IA64_ISR_SP)
+           || ((isr & IA64_ISR_NA) && (isr & IA64_ISR_CODE_MASK) == IA64_ISR_CODE_LFETCH))
+       {
+--- a/arch/m68k/mm/fault.c
++++ b/arch/m68k/mm/fault.c
+@@ -105,8 +105,9 @@ retry:
+               if (address + 256 < rdusp())
+                       goto map_err;
+       }
+-      if (expand_stack(vma, address))
+-              goto map_err;
++      vma = expand_stack(mm, address);
++      if (!vma)
++              goto map_err_nosemaphore;
+ /*
+  * Ok, we have a good vm_area for this memory access, so
+@@ -196,10 +197,12 @@ bus_err:
+       goto send_sig;
+ map_err:
++      mmap_read_unlock(mm);
++map_err_nosemaphore:
+       current->thread.signo = SIGSEGV;
+       current->thread.code = SEGV_MAPERR;
+       current->thread.faddr = address;
+-      goto send_sig;
++      return send_fault_sig(regs);
+ acc_err:
+       current->thread.signo = SIGSEGV;
+--- a/arch/microblaze/mm/fault.c
++++ b/arch/microblaze/mm/fault.c
+@@ -192,8 +192,9 @@ retry:
+                       && (kernel_mode(regs) || !store_updates_sp(regs)))
+                               goto bad_area;
+       }
+-      if (expand_stack(vma, address))
+-              goto bad_area;
++      vma = expand_stack(mm, address);
++      if (!vma)
++              goto bad_area_nosemaphore;
+ good_area:
+       code = SEGV_ACCERR;
+--- a/arch/openrisc/mm/fault.c
++++ b/arch/openrisc/mm/fault.c
+@@ -127,8 +127,9 @@ retry:
+               if (address + PAGE_SIZE < regs->sp)
+                       goto bad_area;
+       }
+-      if (expand_stack(vma, address))
+-              goto bad_area;
++      vma = expand_stack(mm, address);
++      if (!vma)
++              goto bad_area_nosemaphore;
+       /*
+        * Ok, we have a good vm_area for this memory access, so
+--- a/arch/parisc/mm/fault.c
++++ b/arch/parisc/mm/fault.c
+@@ -288,15 +288,19 @@ void do_page_fault(struct pt_regs *regs,
+ retry:
+       mmap_read_lock(mm);
+       vma = find_vma_prev(mm, address, &prev_vma);
+-      if (!vma || address < vma->vm_start)
+-              goto check_expansion;
++      if (!vma || address < vma->vm_start) {
++              if (!prev || !(prev->vm_flags & VM_GROWSUP))
++                      goto bad_area;
++              vma = expand_stack(mm, address);
++              if (!vma)
++                      goto bad_area_nosemaphore;
++      }
++
+ /*
+  * Ok, we have a good vm_area for this memory access. We still need to
+  * check the access permissions.
+  */
+-good_area:
+-
+       if ((vma->vm_flags & acc_type) != acc_type)
+               goto bad_area;
+@@ -347,17 +351,13 @@ good_area:
+       mmap_read_unlock(mm);
+       return;
+-check_expansion:
+-      vma = prev_vma;
+-      if (vma && (expand_stack(vma, address) == 0))
+-              goto good_area;
+-
+ /*
+  * Something tried to access memory that isn't in our memory map..
+  */
+ bad_area:
+       mmap_read_unlock(mm);
++bad_area_nosemaphore:
+       if (user_mode(regs)) {
+               int signo, si_code;
+@@ -449,7 +449,7 @@ handle_nadtlb_fault(struct pt_regs *regs
+ {
+       unsigned long insn = regs->iir;
+       int breg, treg, xreg, val = 0;
+-      struct vm_area_struct *vma, *prev_vma;
++      struct vm_area_struct *vma;
+       struct task_struct *tsk;
+       struct mm_struct *mm;
+       unsigned long address;
+@@ -485,7 +485,7 @@ handle_nadtlb_fault(struct pt_regs *regs
+                               /* Search for VMA */
+                               address = regs->ior;
+                               mmap_read_lock(mm);
+-                              vma = find_vma_prev(mm, address, &prev_vma);
++                              vma = vma_lookup(mm, address);
+                               mmap_read_unlock(mm);
+                               /*
+@@ -494,7 +494,6 @@ handle_nadtlb_fault(struct pt_regs *regs
+                                */
+                               acc_type = (insn & 0x40) ? VM_WRITE : VM_READ;
+                               if (vma
+-                                  && address >= vma->vm_start
+                                   && (vma->vm_flags & acc_type) == acc_type)
+                                       val = 1;
+                       }
+--- a/arch/s390/mm/fault.c
++++ b/arch/s390/mm/fault.c
+@@ -457,8 +457,9 @@ retry:
+       if (unlikely(vma->vm_start > address)) {
+               if (!(vma->vm_flags & VM_GROWSDOWN))
+                       goto out_up;
+-              if (expand_stack(vma, address))
+-                      goto out_up;
++              vma = expand_stack(mm, address);
++              if (!vma)
++                      goto out;
+       }
+       /*
+--- a/arch/sparc/mm/fault_64.c
++++ b/arch/sparc/mm/fault_64.c
+@@ -383,8 +383,9 @@ continue_fault:
+                               goto bad_area;
+               }
+       }
+-      if (expand_stack(vma, address))
+-              goto bad_area;
++      vma = expand_stack(mm, address);
++      if (!vma)
++              goto bad_area_nosemaphore;
+       /*
+        * Ok, we have a good vm_area for this memory access, so
+        * we can handle it..
+@@ -487,8 +488,9 @@ exit_exception:
+        * Fix it, but check if it's kernel or user first..
+        */
+ bad_area:
+-      insn = get_fault_insn(regs, insn);
+       mmap_read_unlock(mm);
++bad_area_nosemaphore:
++      insn = get_fault_insn(regs, insn);
+ handle_kernel_fault:
+       do_kernel_fault(regs, si_code, fault_code, insn, address);
+--- a/arch/um/kernel/trap.c
++++ b/arch/um/kernel/trap.c
+@@ -47,14 +47,15 @@ retry:
+       vma = find_vma(mm, address);
+       if (!vma)
+               goto out;
+-      else if (vma->vm_start <= address)
++      if (vma->vm_start <= address)
+               goto good_area;
+-      else if (!(vma->vm_flags & VM_GROWSDOWN))
++      if (!(vma->vm_flags & VM_GROWSDOWN))
+               goto out;
+-      else if (is_user && !ARCH_IS_STACKGROW(address))
+-              goto out;
+-      else if (expand_stack(vma, address))
++      if (is_user && !ARCH_IS_STACKGROW(address))
+               goto out;
++      vma = expand_stack(mm, address);
++      if (!vma)
++              goto out_nosemaphore;
+ good_area:
+       *code_out = SEGV_ACCERR;
+--- a/drivers/iommu/amd/iommu_v2.c
++++ b/drivers/iommu/amd/iommu_v2.c
+@@ -485,8 +485,8 @@ static void do_fault(struct work_struct
+       flags |= FAULT_FLAG_REMOTE;
+       mmap_read_lock(mm);
+-      vma = find_extend_vma(mm, address);
+-      if (!vma || address < vma->vm_start)
++      vma = vma_lookup(mm, address);
++      if (!vma)
+               /* failed to get a vma in the right range */
+               goto out;
+--- a/drivers/iommu/iommu-sva.c
++++ b/drivers/iommu/iommu-sva.c
+@@ -175,7 +175,7 @@ iommu_sva_handle_iopf(struct iommu_fault
+       mmap_read_lock(mm);
+-      vma = find_extend_vma(mm, prm->addr);
++      vma = vma_lookup(mm, prm->addr);
+       if (!vma)
+               /* Unmapped area */
+               goto out_put_mm;
+--- a/fs/binfmt_elf.c
++++ b/fs/binfmt_elf.c
+@@ -322,7 +322,7 @@ create_elf_tables(struct linux_binprm *b
+        */
+       if (mmap_write_lock_killable(mm))
+               return -EINTR;
+-      vma = find_extend_vma_locked(mm, bprm->p, true);
++      vma = find_extend_vma_locked(mm, bprm->p);
+       mmap_write_unlock(mm);
+       if (!vma)
+               return -EFAULT;
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -211,7 +211,7 @@ static struct page *get_arg_page(struct
+        */
+       if (write && pos < vma->vm_start) {
+               mmap_write_lock(mm);
+-              ret = expand_downwards(vma, pos, true);
++              ret = expand_downwards(vma, pos);
+               if (unlikely(ret < 0)) {
+                       mmap_write_unlock(mm);
+                       return NULL;
+@@ -859,7 +859,7 @@ int setup_arg_pages(struct linux_binprm
+       stack_base = vma->vm_end - stack_expand;
+ #endif
+       current->mm->start_stack = bprm->p;
+-      ret = expand_stack_locked(vma, stack_base, true);
++      ret = expand_stack_locked(vma, stack_base);
+       if (ret)
+               ret = -EFAULT;
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -3192,18 +3192,11 @@ extern vm_fault_t filemap_page_mkwrite(s
+ extern unsigned long stack_guard_gap;
+ /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
+-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
+-              bool write_locked);
+-#define expand_stack(vma,addr) expand_stack_locked(vma,addr,false)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address);
++struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr);
+ /* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */
+-int expand_downwards(struct vm_area_struct *vma, unsigned long address,
+-              bool write_locked);
+-#if VM_GROWSUP
+-extern int expand_upwards(struct vm_area_struct *vma, unsigned long address);
+-#else
+-  #define expand_upwards(vma, address) (0)
+-#endif
++int expand_downwards(struct vm_area_struct *vma, unsigned long address);
+ /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
+ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
+@@ -3298,9 +3291,8 @@ unsigned long change_prot_numa(struct vm
+                       unsigned long start, unsigned long end);
+ #endif
+-struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
+ struct vm_area_struct *find_extend_vma_locked(struct mm_struct *,
+-              unsigned long addr, bool write_locked);
++              unsigned long addr);
+ int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
+                       unsigned long pfn, unsigned long size, pgprot_t);
+ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+--- a/mm/gup.c
++++ b/mm/gup.c
+@@ -1096,7 +1096,7 @@ static long __get_user_pages(struct mm_s
+               /* first iteration or cross vma bound */
+               if (!vma || start >= vma->vm_end) {
+-                      vma = find_extend_vma(mm, start);
++                      vma = vma_lookup(mm, start);
+                       if (!vma && in_gate_area(mm, start)) {
+                               ret = get_gate_page(mm, start & PAGE_MASK,
+                                               gup_flags, &vma,
+@@ -1265,8 +1265,8 @@ int fixup_user_fault(struct mm_struct *m
+               fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+ retry:
+-      vma = find_extend_vma(mm, address);
+-      if (!vma || address < vma->vm_start)
++      vma = vma_lookup(mm, address);
++      if (!vma)
+               return -EFAULT;
+       if (!vma_permits_fault(vma, fault_flags))
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -5368,7 +5368,7 @@ struct vm_area_struct *lock_mm_and_find_
+                       goto fail;
+       }
+-      if (expand_stack_locked(vma, addr, true))
++      if (expand_stack_locked(vma, addr))
+               goto fail;
+ success:
+@@ -5713,6 +5713,14 @@ int __access_remote_vm(struct mm_struct
+       if (mmap_read_lock_killable(mm))
+               return 0;
++      /* We might need to expand the stack to access it */
++      vma = vma_lookup(mm, addr);
++      if (!vma) {
++              vma = expand_stack(mm, addr);
++              if (!vma)
++                      return 0;
++      }
++
+       /* ignore errors, just check how much was successfully transferred */
+       while (len) {
+               int bytes, ret, offset;
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -1935,8 +1935,7 @@ static int acct_stack_growth(struct vm_a
+  * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
+  * vma is the last one with address > vma->vm_end.  Have to extend vma.
+  */
+-int expand_upwards(struct vm_area_struct *vma, unsigned long address,
+-              bool write_locked)
++static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
+ {
+       struct mm_struct *mm = vma->vm_mm;
+       struct vm_area_struct *next;
+@@ -1960,8 +1959,6 @@ int expand_upwards(struct vm_area_struct
+       if (gap_addr < address || gap_addr > TASK_SIZE)
+               gap_addr = TASK_SIZE;
+-      if (!write_locked)
+-              return -EAGAIN;
+       next = find_vma_intersection(mm, vma->vm_end, gap_addr);
+       if (next && vma_is_accessible(next)) {
+               if (!(next->vm_flags & VM_GROWSUP))
+@@ -2030,15 +2027,18 @@ int expand_upwards(struct vm_area_struct
+ /*
+  * vma is the first one with address < vma->vm_start.  Have to extend vma.
++ * mmap_lock held for writing.
+  */
+-int expand_downwards(struct vm_area_struct *vma, unsigned long address,
+-              bool write_locked)
++int expand_downwards(struct vm_area_struct *vma, unsigned long address)
+ {
+       struct mm_struct *mm = vma->vm_mm;
+       MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start);
+       struct vm_area_struct *prev;
+       int error = 0;
++      if (!(vma->vm_flags & VM_GROWSDOWN))
++              return -EFAULT;
++
+       address &= PAGE_MASK;
+       if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
+               return -EPERM;
+@@ -2051,8 +2051,6 @@ int expand_downwards(struct vm_area_stru
+                   vma_is_accessible(prev) &&
+                   (address - prev->vm_end < stack_guard_gap))
+                       return -ENOMEM;
+-              if (!write_locked && (prev->vm_end == address))
+-                      return -EAGAIN;
+       }
+       if (mas_preallocate(&mas, GFP_KERNEL))
+@@ -2131,14 +2129,12 @@ static int __init cmdline_parse_stack_gu
+ __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
+ #ifdef CONFIG_STACK_GROWSUP
+-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
+-              bool write_locked)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
+ {
+-      return expand_upwards(vma, address, write_locked);
++      return expand_upwards(vma, address);
+ }
+-struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm,
+-              unsigned long addr, bool write_locked)
++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
+ {
+       struct vm_area_struct *vma, *prev;
+@@ -2148,23 +2144,21 @@ struct vm_area_struct *find_extend_vma_l
+               return vma;
+       if (!prev)
+               return NULL;
+-      if (expand_stack_locked(prev, addr, write_locked))
++      if (expand_stack_locked(prev, addr))
+               return NULL;
+       if (prev->vm_flags & VM_LOCKED)
+               populate_vma_page_range(prev, addr, prev->vm_end, NULL);
+       return prev;
+ }
+ #else
+-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
+-              bool write_locked)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
+ {
+       if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
+               return -EINVAL;
+-      return expand_downwards(vma, address, write_locked);
++      return expand_downwards(vma, address);
+ }
+-struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm,
+-              unsigned long addr, bool write_locked)
++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
+ {
+       struct vm_area_struct *vma;
+       unsigned long start;
+@@ -2176,7 +2170,7 @@ struct vm_area_struct *find_extend_vma_l
+       if (vma->vm_start <= addr)
+               return vma;
+       start = vma->vm_start;
+-      if (expand_stack_locked(vma, addr, write_locked))
++      if (expand_stack_locked(vma, addr))
+               return NULL;
+       if (vma->vm_flags & VM_LOCKED)
+               populate_vma_page_range(vma, addr, start, NULL);
+@@ -2184,12 +2178,91 @@ struct vm_area_struct *find_extend_vma_l
+ }
+ #endif
+-struct vm_area_struct *find_extend_vma(struct mm_struct *mm,
+-              unsigned long addr)
++/*
++ * IA64 has some horrid mapping rules: it can expand both up and down,
++ * but with various special rules.
++ *
++ * We'll get rid of this architecture eventually, so the ugliness is
++ * temporary.
++ */
++#ifdef CONFIG_IA64
++static inline bool vma_expand_ok(struct vm_area_struct *vma, unsigned long addr)
++{
++      return REGION_NUMBER(addr) == REGION_NUMBER(vma->vm_start) &&
++              REGION_OFFSET(addr) < RGN_MAP_LIMIT;
++}
++
++/*
++ * IA64 stacks grow down, but there's a special register backing store
++ * that can grow up. Only sequentially, though, so the new address must
++ * match vm_end.
++ */
++static inline int vma_expand_up(struct vm_area_struct *vma, unsigned long addr)
++{
++      if (!vma_expand_ok(vma, addr))
++              return -EFAULT;
++      if (vma->vm_end != (addr & PAGE_MASK))
++              return -EFAULT;
++      return expand_upwards(vma, addr);
++}
++
++static inline bool vma_expand_down(struct vm_area_struct *vma, unsigned long addr)
++{
++      if (!vma_expand_ok(vma, addr))
++              return -EFAULT;
++      return expand_downwards(vma, addr);
++}
++
++#elif defined(CONFIG_STACK_GROWSUP)
++
++#define vma_expand_up(vma,addr) expand_upwards(vma, addr)
++#define vma_expand_down(vma, addr) (-EFAULT)
++
++#else
++
++#define vma_expand_up(vma,addr) (-EFAULT)
++#define vma_expand_down(vma, addr) expand_downwards(vma, addr)
++
++#endif
++
++/*
++ * expand_stack(): legacy interface for page faulting. Don't use unless
++ * you have to.
++ *
++ * This is called with the mm locked for reading, drops the lock, takes
++ * the lock for writing, tries to look up a vma again, expands it if
++ * necessary, and downgrades the lock to reading again.
++ *
++ * If no vma is found or it can't be expanded, it returns NULL and has
++ * dropped the lock.
++ */
++struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr)
+ {
+-      return find_extend_vma_locked(mm, addr, false);
++      struct vm_area_struct *vma, *prev;
++
++      mmap_read_unlock(mm);
++      if (mmap_write_lock_killable(mm))
++              return NULL;
++
++      vma = find_vma_prev(mm, addr, &prev);
++      if (vma && vma->vm_start <= addr)
++              goto success;
++
++      if (prev && !vma_expand_up(prev, addr)) {
++              vma = prev;
++              goto success;
++      }
++
++      if (vma && !vma_expand_down(vma, addr))
++              goto success;
++
++      mmap_write_unlock(mm);
++      return NULL;
++
++success:
++      mmap_write_downgrade(mm);
++      return vma;
+ }
+-EXPORT_SYMBOL_GPL(find_extend_vma);
+ /*
+  * Ok - we have the memory areas we should free on a maple tree so release them,
+--- a/mm/nommu.c
++++ b/mm/nommu.c
+@@ -631,24 +631,20 @@ struct vm_area_struct *find_vma(struct m
+ EXPORT_SYMBOL(find_vma);
+ /*
+- * find a VMA
+- * - we don't extend stack VMAs under NOMMU conditions
+- */
+-struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
+-{
+-      return find_vma(mm, addr);
+-}
+-
+-/*
+  * expand a stack to a given address
+  * - not supported under NOMMU conditions
+  */
+-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
+-              bool write_locked)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long addr)
+ {
+       return -ENOMEM;
+ }
++struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr)
++{
++      mmap_read_unlock(mm);
++      return NULL;
++}
++
+ /*
+  * look up the first VMA exactly that exactly matches addr
+  * - should be called with mm->mmap_lock at least held readlocked
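
For illustration only, and not queued code: for an architecture left on the legacy interface, the calling convention described above works out to roughly the sketch below. It assumes only what the commit message states: expand_stack() is entered with the mmap read lock held, drops it, retakes the lock for writing, expands if possible and downgrades back to a read lock, and the caller must use the returned vma (or bail out without unlocking when it gets NULL, since the old vma may no longer be valid). The function name and the VM_FAULT_SIGSEGV return are illustrative; permission checks are elided.

    #include <linux/mm.h>

    static vm_fault_t legacy_arch_fault_sketch(struct mm_struct *mm, unsigned long address,
                                               struct pt_regs *regs, unsigned int flags)
    {
            struct vm_area_struct *vma;
            vm_fault_t fault;

            mmap_read_lock(mm);
            vma = find_vma(mm, address);
            if (!vma || address < vma->vm_start) {
                    vma = expand_stack(mm, address);        /* may drop and retake the lock */
                    if (!vma)
                            return VM_FAULT_SIGSEGV;        /* lock already dropped */
            }

            fault = handle_mm_fault(vma, address & PAGE_MASK, flags, regs);
            if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
                    mmap_read_unlock(mm);
            return fault;
    }
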
diff --git a/queue-6.4/mm-fault-convert-remaining-simple-cases-to-lock_mm_and_find_vma.patch b/queue-6.4/mm-fault-convert-remaining-simple-cases-to-lock_mm_and_find_vma.patch
new file mode 100644 (file)
index 0000000..77c9efb
--- /dev/null
@@ -0,0 +1,489 @@
+From a050ba1e7422f2cc60ff8bfde3f96d34d00cb585 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Sat, 24 Jun 2023 10:55:38 -0700
+Subject: mm/fault: convert remaining simple cases to lock_mm_and_find_vma()
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit a050ba1e7422f2cc60ff8bfde3f96d34d00cb585 upstream.
+
+This does the simple pattern conversion of alpha, arc, csky, hexagon,
+loongarch, nios2, sh, sparc32, and xtensa to the lock_mm_and_find_vma()
+helper.  They all have the regular fault handling pattern without odd
+special cases.
+
+The remaining architectures all have something that keeps us from a
+straightforward conversion: ia64 and parisc have stacks that can grow
+both up as well as down (and ia64 has special address region checks).
+
+And m68k, microblaze, openrisc, sparc64, and um end up having extra
+rules about only expanding the stack down a limited amount below the
+user space stack pointer.  That is something that x86 used to do too
+(long long ago), and it probably could just be skipped, but it still
+makes the conversion less than trivial.
+
+Note that this conversion was done manually and with the exception of
+alpha without any build testing, because I have a fairly limited cross-
+building environment.  The cases are all simple, and I went through the
+changes several times, but...
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/alpha/Kconfig         |    1 +
+ arch/alpha/mm/fault.c      |   13 +++----------
+ arch/arc/Kconfig           |    1 +
+ arch/arc/mm/fault.c        |   11 +++--------
+ arch/csky/Kconfig          |    1 +
+ arch/csky/mm/fault.c       |   22 +++++-----------------
+ arch/hexagon/Kconfig       |    1 +
+ arch/hexagon/mm/vm_fault.c |   18 ++++--------------
+ arch/loongarch/Kconfig     |    1 +
+ arch/loongarch/mm/fault.c  |   16 ++++++----------
+ arch/nios2/Kconfig         |    1 +
+ arch/nios2/mm/fault.c      |   17 ++---------------
+ arch/sh/Kconfig            |    1 +
+ arch/sh/mm/fault.c         |   17 ++---------------
+ arch/sparc/Kconfig         |    1 +
+ arch/sparc/mm/fault_32.c   |   32 ++++++++------------------------
+ arch/xtensa/Kconfig        |    1 +
+ arch/xtensa/mm/fault.c     |   14 +++-----------
+ 18 files changed, 45 insertions(+), 124 deletions(-)
+
+--- a/arch/alpha/Kconfig
++++ b/arch/alpha/Kconfig
+@@ -30,6 +30,7 @@ config ALPHA
+       select HAS_IOPORT
+       select HAVE_ARCH_AUDITSYSCALL
+       select HAVE_MOD_ARCH_SPECIFIC
++      select LOCK_MM_AND_FIND_VMA
+       select MODULES_USE_ELF_RELA
+       select ODD_RT_SIGACTION
+       select OLD_SIGSUSPEND
+--- a/arch/alpha/mm/fault.c
++++ b/arch/alpha/mm/fault.c
+@@ -119,20 +119,12 @@ do_page_fault(unsigned long address, uns
+               flags |= FAULT_FLAG_USER;
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+-      mmap_read_lock(mm);
+-      vma = find_vma(mm, address);
++      vma = lock_mm_and_find_vma(mm, address, regs);
+       if (!vma)
+-              goto bad_area;
+-      if (vma->vm_start <= address)
+-              goto good_area;
+-      if (!(vma->vm_flags & VM_GROWSDOWN))
+-              goto bad_area;
+-      if (expand_stack(vma, address))
+-              goto bad_area;
++              goto bad_area_nosemaphore;
+       /* Ok, we have a good vm_area for this memory access, so
+          we can handle it.  */
+- good_area:
+       si_code = SEGV_ACCERR;
+       if (cause < 0) {
+               if (!(vma->vm_flags & VM_EXEC))
+@@ -192,6 +184,7 @@ retry:
+  bad_area:
+       mmap_read_unlock(mm);
++ bad_area_nosemaphore:
+       if (user_mode(regs))
+               goto do_sigsegv;
+--- a/arch/arc/Kconfig
++++ b/arch/arc/Kconfig
+@@ -41,6 +41,7 @@ config ARC
+       select HAVE_PERF_EVENTS
+       select HAVE_SYSCALL_TRACEPOINTS
+       select IRQ_DOMAIN
++      select LOCK_MM_AND_FIND_VMA
+       select MODULES_USE_ELF_RELA
+       select OF
+       select OF_EARLY_FLATTREE
+--- a/arch/arc/mm/fault.c
++++ b/arch/arc/mm/fault.c
+@@ -113,15 +113,9 @@ void do_page_fault(unsigned long address
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+-      mmap_read_lock(mm);
+-
+-      vma = find_vma(mm, address);
++      vma = lock_mm_and_find_vma(mm, address, regs);
+       if (!vma)
+-              goto bad_area;
+-      if (unlikely(address < vma->vm_start)) {
+-              if (!(vma->vm_flags & VM_GROWSDOWN) || expand_stack(vma, address))
+-                      goto bad_area;
+-      }
++              goto bad_area_nosemaphore;
+       /*
+        * vm_area is good, now check permissions for this memory access
+@@ -161,6 +155,7 @@ retry:
+ bad_area:
+       mmap_read_unlock(mm);
++bad_area_nosemaphore:
+       /*
+        * Major/minor page fault accounting
+        * (in case of retry we only land here once)
+--- a/arch/csky/Kconfig
++++ b/arch/csky/Kconfig
+@@ -96,6 +96,7 @@ config CSKY
+       select HAVE_REGS_AND_STACK_ACCESS_API
+       select HAVE_STACKPROTECTOR
+       select HAVE_SYSCALL_TRACEPOINTS
++      select LOCK_MM_AND_FIND_VMA
+       select MAY_HAVE_SPARSE_IRQ
+       select MODULES_USE_ELF_RELA if MODULES
+       select OF
+--- a/arch/csky/mm/fault.c
++++ b/arch/csky/mm/fault.c
+@@ -97,13 +97,12 @@ static inline void mm_fault_error(struct
+       BUG();
+ }
+-static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr)
++static inline void bad_area_nosemaphore(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr)
+ {
+       /*
+        * Something tried to access memory that isn't in our memory map.
+        * Fix it, but check if it's kernel or user first.
+        */
+-      mmap_read_unlock(mm);
+       /* User mode accesses just cause a SIGSEGV */
+       if (user_mode(regs)) {
+               do_trap(regs, SIGSEGV, code, addr);
+@@ -238,20 +237,9 @@ asmlinkage void do_page_fault(struct pt_
+       if (is_write(regs))
+               flags |= FAULT_FLAG_WRITE;
+ retry:
+-      mmap_read_lock(mm);
+-      vma = find_vma(mm, addr);
++      vma = lock_mm_and_find_vma(mm, addr, regs);
+       if (unlikely(!vma)) {
+-              bad_area(regs, mm, code, addr);
+-              return;
+-      }
+-      if (likely(vma->vm_start <= addr))
+-              goto good_area;
+-      if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+-              bad_area(regs, mm, code, addr);
+-              return;
+-      }
+-      if (unlikely(expand_stack(vma, addr))) {
+-              bad_area(regs, mm, code, addr);
++              bad_area_nosemaphore(regs, mm, code, addr);
+               return;
+       }
+@@ -259,11 +247,11 @@ retry:
+        * Ok, we have a good vm_area for this memory access, so
+        * we can handle it.
+        */
+-good_area:
+       code = SEGV_ACCERR;
+       if (unlikely(access_error(regs, vma))) {
+-              bad_area(regs, mm, code, addr);
++              mmap_read_unlock(mm);
++              bad_area_nosemaphore(regs, mm, code, addr);
+               return;
+       }
+--- a/arch/hexagon/Kconfig
++++ b/arch/hexagon/Kconfig
+@@ -28,6 +28,7 @@ config HEXAGON
+       select GENERIC_SMP_IDLE_THREAD
+       select STACKTRACE_SUPPORT
+       select GENERIC_CLOCKEVENTS_BROADCAST
++      select LOCK_MM_AND_FIND_VMA
+       select MODULES_USE_ELF_RELA
+       select GENERIC_CPU_DEVICES
+       select ARCH_WANT_LD_ORPHAN_WARN
+--- a/arch/hexagon/mm/vm_fault.c
++++ b/arch/hexagon/mm/vm_fault.c
+@@ -57,21 +57,10 @@ void do_page_fault(unsigned long address
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+-      mmap_read_lock(mm);
+-      vma = find_vma(mm, address);
+-      if (!vma)
+-              goto bad_area;
++      vma = lock_mm_and_find_vma(mm, address, regs);
++      if (unlikely(!vma))
++              goto bad_area_nosemaphore;
+-      if (vma->vm_start <= address)
+-              goto good_area;
+-
+-      if (!(vma->vm_flags & VM_GROWSDOWN))
+-              goto bad_area;
+-
+-      if (expand_stack(vma, address))
+-              goto bad_area;
+-
+-good_area:
+       /* Address space is OK.  Now check access rights. */
+       si_code = SEGV_ACCERR;
+@@ -143,6 +132,7 @@ good_area:
+ bad_area:
+       mmap_read_unlock(mm);
++bad_area_nosemaphore:
+       if (user_mode(regs)) {
+               force_sig_fault(SIGSEGV, si_code, (void __user *)address);
+               return;
+--- a/arch/loongarch/Kconfig
++++ b/arch/loongarch/Kconfig
+@@ -130,6 +130,7 @@ config LOONGARCH
+       select HAVE_VIRT_CPU_ACCOUNTING_GEN if !SMP
+       select IRQ_FORCED_THREADING
+       select IRQ_LOONGARCH_CPU
++      select LOCK_MM_AND_FIND_VMA
+       select MMU_GATHER_MERGE_VMAS if MMU
+       select MODULES_USE_ELF_RELA if MODULES
+       select NEED_PER_CPU_EMBED_FIRST_CHUNK
+--- a/arch/loongarch/mm/fault.c
++++ b/arch/loongarch/mm/fault.c
+@@ -169,22 +169,18 @@ static void __kprobes __do_page_fault(st
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+-      mmap_read_lock(mm);
+-      vma = find_vma(mm, address);
+-      if (!vma)
+-              goto bad_area;
+-      if (vma->vm_start <= address)
+-              goto good_area;
+-      if (!(vma->vm_flags & VM_GROWSDOWN))
+-              goto bad_area;
+-      if (!expand_stack(vma, address))
+-              goto good_area;
++      vma = lock_mm_and_find_vma(mm, address, regs);
++      if (unlikely(!vma))
++              goto bad_area_nosemaphore;
++      goto good_area;
++
+ /*
+  * Something tried to access memory that isn't in our memory map..
+  * Fix it, but check if it's kernel or user first..
+  */
+ bad_area:
+       mmap_read_unlock(mm);
++bad_area_nosemaphore:
+       do_sigsegv(regs, write, address, si_code);
+       return;
+--- a/arch/nios2/Kconfig
++++ b/arch/nios2/Kconfig
+@@ -16,6 +16,7 @@ config NIOS2
+       select HAVE_ARCH_TRACEHOOK
+       select HAVE_ARCH_KGDB
+       select IRQ_DOMAIN
++      select LOCK_MM_AND_FIND_VMA
+       select MODULES_USE_ELF_RELA
+       select OF
+       select OF_EARLY_FLATTREE
+--- a/arch/nios2/mm/fault.c
++++ b/arch/nios2/mm/fault.c
+@@ -86,27 +86,14 @@ asmlinkage void do_page_fault(struct pt_
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+-      if (!mmap_read_trylock(mm)) {
+-              if (!user_mode(regs) && !search_exception_tables(regs->ea))
+-                      goto bad_area_nosemaphore;
+ retry:
+-              mmap_read_lock(mm);
+-      }
+-
+-      vma = find_vma(mm, address);
++      vma = lock_mm_and_find_vma(mm, address, regs);
+       if (!vma)
+-              goto bad_area;
+-      if (vma->vm_start <= address)
+-              goto good_area;
+-      if (!(vma->vm_flags & VM_GROWSDOWN))
+-              goto bad_area;
+-      if (expand_stack(vma, address))
+-              goto bad_area;
++              goto bad_area_nosemaphore;
+ /*
+  * Ok, we have a good vm_area for this memory access, so
+  * we can handle it..
+  */
+-good_area:
+       code = SEGV_ACCERR;
+       switch (cause) {
+--- a/arch/sh/Kconfig
++++ b/arch/sh/Kconfig
+@@ -59,6 +59,7 @@ config SUPERH
+       select HAVE_STACKPROTECTOR
+       select HAVE_SYSCALL_TRACEPOINTS
+       select IRQ_FORCED_THREADING
++      select LOCK_MM_AND_FIND_VMA
+       select MODULES_USE_ELF_RELA
+       select NEED_SG_DMA_LENGTH
+       select NO_DMA if !MMU && !DMA_COHERENT
+--- a/arch/sh/mm/fault.c
++++ b/arch/sh/mm/fault.c
+@@ -439,21 +439,9 @@ asmlinkage void __kprobes do_page_fault(
+       }
+ retry:
+-      mmap_read_lock(mm);
+-
+-      vma = find_vma(mm, address);
++      vma = lock_mm_and_find_vma(mm, address, regs);
+       if (unlikely(!vma)) {
+-              bad_area(regs, error_code, address);
+-              return;
+-      }
+-      if (likely(vma->vm_start <= address))
+-              goto good_area;
+-      if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+-              bad_area(regs, error_code, address);
+-              return;
+-      }
+-      if (unlikely(expand_stack(vma, address))) {
+-              bad_area(regs, error_code, address);
++              bad_area_nosemaphore(regs, error_code, address);
+               return;
+       }
+@@ -461,7 +449,6 @@ retry:
+        * Ok, we have a good vm_area for this memory access, so
+        * we can handle it..
+        */
+-good_area:
+       if (unlikely(access_error(error_code, vma))) {
+               bad_area_access_error(regs, error_code, address);
+               return;
+--- a/arch/sparc/Kconfig
++++ b/arch/sparc/Kconfig
+@@ -57,6 +57,7 @@ config SPARC32
+       select DMA_DIRECT_REMAP
+       select GENERIC_ATOMIC64
+       select HAVE_UID16
++      select LOCK_MM_AND_FIND_VMA
+       select OLD_SIGACTION
+       select ZONE_DMA
+--- a/arch/sparc/mm/fault_32.c
++++ b/arch/sparc/mm/fault_32.c
+@@ -143,28 +143,19 @@ asmlinkage void do_sparc_fault(struct pt
+       if (pagefault_disabled() || !mm)
+               goto no_context;
++      if (!from_user && address >= PAGE_OFFSET)
++              goto no_context;
++
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+-      mmap_read_lock(mm);
+-
+-      if (!from_user && address >= PAGE_OFFSET)
+-              goto bad_area;
+-
+-      vma = find_vma(mm, address);
++      vma = lock_mm_and_find_vma(mm, address, regs);
+       if (!vma)
+-              goto bad_area;
+-      if (vma->vm_start <= address)
+-              goto good_area;
+-      if (!(vma->vm_flags & VM_GROWSDOWN))
+-              goto bad_area;
+-      if (expand_stack(vma, address))
+-              goto bad_area;
++              goto bad_area_nosemaphore;
+       /*
+        * Ok, we have a good vm_area for this memory access, so
+        * we can handle it..
+        */
+-good_area:
+       code = SEGV_ACCERR;
+       if (write) {
+               if (!(vma->vm_flags & VM_WRITE))
+@@ -321,17 +312,9 @@ static void force_user_fault(unsigned lo
+       code = SEGV_MAPERR;
+-      mmap_read_lock(mm);
+-      vma = find_vma(mm, address);
++      vma = lock_mm_and_find_vma(mm, address, regs);
+       if (!vma)
+-              goto bad_area;
+-      if (vma->vm_start <= address)
+-              goto good_area;
+-      if (!(vma->vm_flags & VM_GROWSDOWN))
+-              goto bad_area;
+-      if (expand_stack(vma, address))
+-              goto bad_area;
+-good_area:
++              goto bad_area_nosemaphore;
+       code = SEGV_ACCERR;
+       if (write) {
+               if (!(vma->vm_flags & VM_WRITE))
+@@ -350,6 +333,7 @@ good_area:
+       return;
+ bad_area:
+       mmap_read_unlock(mm);
++bad_area_nosemaphore:
+       __do_fault_siginfo(code, SIGSEGV, tsk->thread.kregs, address);
+       return;
+--- a/arch/xtensa/Kconfig
++++ b/arch/xtensa/Kconfig
+@@ -49,6 +49,7 @@ config XTENSA
+       select HAVE_SYSCALL_TRACEPOINTS
+       select HAVE_VIRT_CPU_ACCOUNTING_GEN
+       select IRQ_DOMAIN
++      select LOCK_MM_AND_FIND_VMA
+       select MODULES_USE_ELF_RELA
+       select PERF_USE_VMALLOC
+       select TRACE_IRQFLAGS_SUPPORT
+--- a/arch/xtensa/mm/fault.c
++++ b/arch/xtensa/mm/fault.c
+@@ -130,23 +130,14 @@ void do_page_fault(struct pt_regs *regs)
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ retry:
+-      mmap_read_lock(mm);
+-      vma = find_vma(mm, address);
+-
++      vma = lock_mm_and_find_vma(mm, address, regs);
+       if (!vma)
+-              goto bad_area;
+-      if (vma->vm_start <= address)
+-              goto good_area;
+-      if (!(vma->vm_flags & VM_GROWSDOWN))
+-              goto bad_area;
+-      if (expand_stack(vma, address))
+-              goto bad_area;
++              goto bad_area_nosemaphore;
+       /* Ok, we have a good vm_area for this memory access, so
+        * we can handle it..
+        */
+-good_area:
+       code = SEGV_ACCERR;
+       if (is_write) {
+@@ -205,6 +196,7 @@ good_area:
+        */
+ bad_area:
+       mmap_read_unlock(mm);
++bad_area_nosemaphore:
+       if (user_mode(regs)) {
+               force_sig_fault(SIGSEGV, code, (void *) address);
+               return;
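Every hunk in this patch is the same substitution; distilled once here, since the one subtle point is that the failure path must no longer drop the mmap lock itself (variable and label names are the generic ones most of these handlers share, not any single architecture's):

    /* Before: the handler owned the locking, so every failure had to unlock. */
    mmap_read_lock(mm);
    vma = find_vma(mm, address);
    if (!vma)
            goto bad_area;                  /* bad_area does mmap_read_unlock() */
    if (vma->vm_start > address) {
            if (!(vma->vm_flags & VM_GROWSDOWN) || expand_stack(vma, address))
                    goto bad_area;
    }

    /* After: the helper owns the locking; on failure the lock is already gone. */
    vma = lock_mm_and_find_vma(mm, address, regs);
    if (unlikely(!vma))
            goto bad_area_nosemaphore;      /* must NOT mmap_read_unlock() here */

This is also why the csky hunk renames bad_area() to a no-unlock bad_area_nosemaphore(): its later access_error() failure path still holds the read lock and now has to release it explicitly before taking that exit.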
diff --git a/queue-6.4/mm-introduce-new-lock_mm_and_find_vma-page-fault-helper.patch b/queue-6.4/mm-introduce-new-lock_mm_and_find_vma-page-fault-helper.patch
new file mode 100644 (file)
index 0000000..88491d4
--- /dev/null
@@ -0,0 +1,295 @@
+From c2508ec5a58db67093f4fb8bf89a9a7c53a109e9 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Thu, 15 Jun 2023 15:17:36 -0700
+Subject: mm: introduce new 'lock_mm_and_find_vma()' page fault helper
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit c2508ec5a58db67093f4fb8bf89a9a7c53a109e9 upstream.
+
+.. and make x86 use it.
+
+This basically extracts the existing x86 "find and expand faulting vma"
+code, but extends it to also take the mmap lock for writing in case we
+actually do need to expand the vma.
+
+We've historically short-circuited that case, and have some rather ugly
+special logic to serialize the stack segment expansion (since we only
+hold the mmap lock for reading) that doesn't match the normal VM
+locking.
+
+That slight violation of locking worked well, right up until it didn't:
+the maple tree code really does want proper locking even for simple
+extension of an existing vma.
+
+So extract the code for "look up the vma of the fault" from x86, fix it
+up to do the necessary write locking, and make it available as a helper
+function for other architectures that can use the common helper.
+
+Note: I say "common helper", but it really only handles the normal
+stack-grows-down case.  Which is all architectures except for PA-RISC
+and IA64.  So some rare architectures can't use the helper, but if they
+care they'll just need to open-code this logic.
+
+It's also worth pointing out that this code really would like to have an
+optimistic "mmap_upgrade_trylock()" to make it quicker to go from a
+read-lock (for the common case) to taking the write lock (for having to
+extend the vma) in the normal single-threaded situation where there is
+no other locking activity.
+
+But that _is_ all the very uncommon special case, so while it would be
+nice to have such an operation, it probably doesn't matter in reality.
+I did put in the skeleton code for such a possible future expansion,
+even if it only acts as pseudo-documentation for what we're doing.
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/Kconfig    |    1 
+ arch/x86/mm/fault.c |   52 ----------------------
+ include/linux/mm.h  |    2 
+ mm/Kconfig          |    4 +
+ mm/memory.c         |  121 ++++++++++++++++++++++++++++++++++++++++++++++++++++
+ 5 files changed, 130 insertions(+), 50 deletions(-)
+
+--- a/arch/x86/Kconfig
++++ b/arch/x86/Kconfig
+@@ -276,6 +276,7 @@ config X86
+       select HAVE_GENERIC_VDSO
+       select HOTPLUG_SMT                      if SMP
+       select IRQ_FORCED_THREADING
++      select LOCK_MM_AND_FIND_VMA
+       select NEED_PER_CPU_EMBED_FIRST_CHUNK
+       select NEED_PER_CPU_PAGE_FIRST_CHUNK
+       select NEED_SG_DMA_LENGTH
+--- a/arch/x86/mm/fault.c
++++ b/arch/x86/mm/fault.c
+@@ -880,12 +880,6 @@ __bad_area(struct pt_regs *regs, unsigne
+       __bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
+ }
+-static noinline void
+-bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+-{
+-      __bad_area(regs, error_code, address, 0, SEGV_MAPERR);
+-}
+-
+ static inline bool bad_area_access_from_pkeys(unsigned long error_code,
+               struct vm_area_struct *vma)
+ {
+@@ -1366,51 +1360,10 @@ void do_user_addr_fault(struct pt_regs *
+ lock_mmap:
+ #endif /* CONFIG_PER_VMA_LOCK */
+-      /*
+-       * Kernel-mode access to the user address space should only occur
+-       * on well-defined single instructions listed in the exception
+-       * tables.  But, an erroneous kernel fault occurring outside one of
+-       * those areas which also holds mmap_lock might deadlock attempting
+-       * to validate the fault against the address space.
+-       *
+-       * Only do the expensive exception table search when we might be at
+-       * risk of a deadlock.  This happens if we
+-       * 1. Failed to acquire mmap_lock, and
+-       * 2. The access did not originate in userspace.
+-       */
+-      if (unlikely(!mmap_read_trylock(mm))) {
+-              if (!user_mode(regs) && !search_exception_tables(regs->ip)) {
+-                      /*
+-                       * Fault from code in kernel from
+-                       * which we do not expect faults.
+-                       */
+-                      bad_area_nosemaphore(regs, error_code, address);
+-                      return;
+-              }
+ retry:
+-              mmap_read_lock(mm);
+-      } else {
+-              /*
+-               * The above down_read_trylock() might have succeeded in
+-               * which case we'll have missed the might_sleep() from
+-               * down_read():
+-               */
+-              might_sleep();
+-      }
+-
+-      vma = find_vma(mm, address);
++      vma = lock_mm_and_find_vma(mm, address, regs);
+       if (unlikely(!vma)) {
+-              bad_area(regs, error_code, address);
+-              return;
+-      }
+-      if (likely(vma->vm_start <= address))
+-              goto good_area;
+-      if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+-              bad_area(regs, error_code, address);
+-              return;
+-      }
+-      if (unlikely(expand_stack(vma, address))) {
+-              bad_area(regs, error_code, address);
++              bad_area_nosemaphore(regs, error_code, address);
+               return;
+       }
+@@ -1418,7 +1371,6 @@ retry:
+        * Ok, we have a good vm_area for this memory access, so
+        * we can handle it..
+        */
+-good_area:
+       if (unlikely(access_error(error_code, vma))) {
+               bad_area_access_error(regs, error_code, address, vma);
+               return;
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -2325,6 +2325,8 @@ void unmap_mapping_pages(struct address_
+               pgoff_t start, pgoff_t nr, bool even_cows);
+ void unmap_mapping_range(struct address_space *mapping,
+               loff_t const holebegin, loff_t const holelen, int even_cows);
++struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
++              unsigned long address, struct pt_regs *regs);
+ #else
+ static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
+                                        unsigned long address, unsigned int flags,
+--- a/mm/Kconfig
++++ b/mm/Kconfig
+@@ -1206,6 +1206,10 @@ config PER_VMA_LOCK
+         This feature allows locking each virtual memory area separately when
+         handling page faults instead of taking mmap_lock.
++config LOCK_MM_AND_FIND_VMA
++      bool
++      depends on !STACK_GROWSUP
++
+ source "mm/damon/Kconfig"
+ endmenu
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -5262,6 +5262,127 @@ out:
+ }
+ EXPORT_SYMBOL_GPL(handle_mm_fault);
++#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
++#include <linux/extable.h>
++
++static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
++{
++      /* Even if this succeeds, make it clear we *might* have slept */
++      if (likely(mmap_read_trylock(mm))) {
++              might_sleep();
++              return true;
++      }
++
++      if (regs && !user_mode(regs)) {
++              unsigned long ip = instruction_pointer(regs);
++              if (!search_exception_tables(ip))
++                      return false;
++      }
++
++      mmap_read_lock(mm);
++      return true;
++}
++
++static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
++{
++      /*
++       * We don't have this operation yet.
++       *
++       * It should be easy enough to do: it's basically a
++       *    atomic_long_try_cmpxchg_acquire()
++       * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
++       * it also needs the proper lockdep magic etc.
++       */
++      return false;
++}
++
++static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
++{
++      mmap_read_unlock(mm);
++      if (regs && !user_mode(regs)) {
++              unsigned long ip = instruction_pointer(regs);
++              if (!search_exception_tables(ip))
++                      return false;
++      }
++      mmap_write_lock(mm);
++      return true;
++}
++
++/*
++ * Helper for page fault handling.
++ *
++ * This is kind of equivalent to "mmap_read_lock()" followed
++ * by "find_extend_vma()", except it's a lot more careful about
++ * the locking (and will drop the lock on failure).
++ *
++ * For example, if we have a kernel bug that causes a page
++ * fault, we don't want to just use mmap_read_lock() to get
++ * the mm lock, because that would deadlock if the bug were
++ * to happen while we're holding the mm lock for writing.
++ *
++ * So this checks the exception tables on kernel faults in
++ * order to only do this all for instructions that are actually
++ * expected to fault.
++ *
++ * We can also actually take the mm lock for writing if we
++ * need to extend the vma, which helps the VM layer a lot.
++ */
++struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
++                      unsigned long addr, struct pt_regs *regs)
++{
++      struct vm_area_struct *vma;
++
++      if (!get_mmap_lock_carefully(mm, regs))
++              return NULL;
++
++      vma = find_vma(mm, addr);
++      if (likely(vma && (vma->vm_start <= addr)))
++              return vma;
++
++      /*
++       * Well, dang. We might still be successful, but only
++       * if we can extend a vma to do so.
++       */
++      if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
++              mmap_read_unlock(mm);
++              return NULL;
++      }
++
++      /*
++       * We can try to upgrade the mmap lock atomically,
++       * in which case we can continue to use the vma
++       * we already looked up.
++       *
++       * Otherwise we'll have to drop the mmap lock and
++       * re-take it, and also look up the vma again,
++       * re-checking it.
++       */
++      if (!mmap_upgrade_trylock(mm)) {
++              if (!upgrade_mmap_lock_carefully(mm, regs))
++                      return NULL;
++
++              vma = find_vma(mm, addr);
++              if (!vma)
++                      goto fail;
++              if (vma->vm_start <= addr)
++                      goto success;
++              if (!(vma->vm_flags & VM_GROWSDOWN))
++                      goto fail;
++      }
++
++      if (expand_stack(vma, addr))
++              goto fail;
++
++success:
++      mmap_write_downgrade(mm);
++      return vma;
++
++fail:
++      mmap_write_unlock(mm);
++      return NULL;
++}
++#endif
++
+ #ifdef CONFIG_PER_VMA_LOCK
+ /*
+  * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
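Taken together with the x86 conversion above, the intended use of the new helper from an architecture fault handler looks roughly like the sketch below. It is a condensed, hypothetical handler (sketch_do_user_fault(), the flags value and the bare force_sig_fault() error path are all illustrative; kernel-mode fixup and fault accounting are omitted), shown only to make the locking contract explicit: on success the mmap lock is held for reading, on failure it has already been dropped.

    #include <linux/mm.h>
    #include <linux/mmap_lock.h>
    #include <linux/ptrace.h>
    #include <linux/sched/signal.h>

    static void sketch_do_user_fault(struct pt_regs *regs, struct mm_struct *mm,
                                     unsigned long address)
    {
            unsigned int flags = FAULT_FLAG_DEFAULT | FAULT_FLAG_USER;
            struct vm_area_struct *vma;
            vm_fault_t fault;

    retry:
            vma = lock_mm_and_find_vma(mm, address, regs);
            if (unlikely(!vma)) {
                    /* mmap lock is NOT held here; go straight to the signal */
                    force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)address);
                    return;
            }

            /* arch-specific access_error() checks would go here */

            fault = handle_mm_fault(vma, address, flags, regs);

            if (fault_signal_pending(fault, regs))
                    return;                 /* lock already dropped in this case */

            /* fault fully handled, lock already dropped by the core code */
            if (fault & VM_FAULT_COMPLETED)
                    return;

            if (unlikely(fault & VM_FAULT_RETRY)) {
                    flags |= FAULT_FLAG_TRIED;
                    goto retry;             /* handle_mm_fault() dropped the lock */
            }

            mmap_read_unlock(mm);
            /* VM_FAULT_ERROR handling omitted for brevity */
    }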
diff --git a/queue-6.4/mm-make-find_extend_vma-fail-if-write-lock-not-held.patch b/queue-6.4/mm-make-find_extend_vma-fail-if-write-lock-not-held.patch
new file mode 100644 (file)
index 0000000..ee7ca86
--- /dev/null
@@ -0,0 +1,242 @@
+From f440fa1ac955e2898893f9301568435eb5cdfc4b Mon Sep 17 00:00:00 2001
+From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
+Date: Fri, 16 Jun 2023 15:58:54 -0700
+Subject: mm: make find_extend_vma() fail if write lock not held
+
+From: Liam R. Howlett <Liam.Howlett@oracle.com>
+
+commit f440fa1ac955e2898893f9301568435eb5cdfc4b upstream.
+
+Make calls to extend_vma() and find_extend_vma() fail if the write lock
+is required.
+
+To avoid making this a flag-day event, this still allows the old
+read-locking case for the trivial situations, and passes in a flag to
+say "is it write-locked".  That way write-lockers can say "yes, I'm
+being careful", and legacy users will continue to work in all the common
+cases until they have been fully converted to the new world order.
+
+Co-Developed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
+Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
+Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/binfmt_elf.c    |    6 +++---
+ fs/exec.c          |    5 +++--
+ include/linux/mm.h |   10 +++++++---
+ mm/memory.c        |    2 +-
+ mm/mmap.c          |   50 +++++++++++++++++++++++++++++++++-----------------
+ mm/nommu.c         |    3 ++-
+ 6 files changed, 49 insertions(+), 27 deletions(-)
+
+--- a/fs/binfmt_elf.c
++++ b/fs/binfmt_elf.c
+@@ -320,10 +320,10 @@ create_elf_tables(struct linux_binprm *b
+        * Grow the stack manually; some architectures have a limit on how
+        * far ahead a user-space access may be in order to grow the stack.
+        */
+-      if (mmap_read_lock_killable(mm))
++      if (mmap_write_lock_killable(mm))
+               return -EINTR;
+-      vma = find_extend_vma(mm, bprm->p);
+-      mmap_read_unlock(mm);
++      vma = find_extend_vma_locked(mm, bprm->p, true);
++      mmap_write_unlock(mm);
+       if (!vma)
+               return -EFAULT;
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -205,7 +205,8 @@ static struct page *get_arg_page(struct
+ #ifdef CONFIG_STACK_GROWSUP
+       if (write) {
+-              ret = expand_downwards(bprm->vma, pos);
++              /* We claim to hold the lock - nobody to race with */
++              ret = expand_downwards(bprm->vma, pos, true);
+               if (ret < 0)
+                       return NULL;
+       }
+@@ -853,7 +854,7 @@ int setup_arg_pages(struct linux_binprm
+       stack_base = vma->vm_end - stack_expand;
+ #endif
+       current->mm->start_stack = bprm->p;
+-      ret = expand_stack(vma, stack_base);
++      ret = expand_stack_locked(vma, stack_base, true);
+       if (ret)
+               ret = -EFAULT;
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -3192,11 +3192,13 @@ extern vm_fault_t filemap_page_mkwrite(s
+ extern unsigned long stack_guard_gap;
+ /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
+-extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
++              bool write_locked);
++#define expand_stack(vma,addr) expand_stack_locked(vma,addr,false)
+ /* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */
+-extern int expand_downwards(struct vm_area_struct *vma,
+-              unsigned long address);
++int expand_downwards(struct vm_area_struct *vma, unsigned long address,
++              bool write_locked);
+ #if VM_GROWSUP
+ extern int expand_upwards(struct vm_area_struct *vma, unsigned long address);
+ #else
+@@ -3297,6 +3299,8 @@ unsigned long change_prot_numa(struct vm
+ #endif
+ struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *,
++              unsigned long addr, bool write_locked);
+ int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
+                       unsigned long pfn, unsigned long size, pgprot_t);
+ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -5368,7 +5368,7 @@ struct vm_area_struct *lock_mm_and_find_
+                       goto fail;
+       }
+-      if (expand_stack(vma, addr))
++      if (expand_stack_locked(vma, addr, true))
+               goto fail;
+ success:
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -1935,7 +1935,8 @@ static int acct_stack_growth(struct vm_a
+  * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
+  * vma is the last one with address > vma->vm_end.  Have to extend vma.
+  */
+-int expand_upwards(struct vm_area_struct *vma, unsigned long address)
++int expand_upwards(struct vm_area_struct *vma, unsigned long address,
++              bool write_locked)
+ {
+       struct mm_struct *mm = vma->vm_mm;
+       struct vm_area_struct *next;
+@@ -1959,6 +1960,8 @@ int expand_upwards(struct vm_area_struct
+       if (gap_addr < address || gap_addr > TASK_SIZE)
+               gap_addr = TASK_SIZE;
++      if (!write_locked)
++              return -EAGAIN;
+       next = find_vma_intersection(mm, vma->vm_end, gap_addr);
+       if (next && vma_is_accessible(next)) {
+               if (!(next->vm_flags & VM_GROWSUP))
+@@ -2028,7 +2031,8 @@ int expand_upwards(struct vm_area_struct
+ /*
+  * vma is the first one with address < vma->vm_start.  Have to extend vma.
+  */
+-int expand_downwards(struct vm_area_struct *vma, unsigned long address)
++int expand_downwards(struct vm_area_struct *vma, unsigned long address,
++              bool write_locked)
+ {
+       struct mm_struct *mm = vma->vm_mm;
+       MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start);
+@@ -2042,10 +2046,13 @@ int expand_downwards(struct vm_area_stru
+       /* Enforce stack_guard_gap */
+       prev = mas_prev(&mas, 0);
+       /* Check that both stack segments have the same anon_vma? */
+-      if (prev && !(prev->vm_flags & VM_GROWSDOWN) &&
+-                      vma_is_accessible(prev)) {
+-              if (address - prev->vm_end < stack_guard_gap)
++      if (prev) {
++              if (!(prev->vm_flags & VM_GROWSDOWN) &&
++                  vma_is_accessible(prev) &&
++                  (address - prev->vm_end < stack_guard_gap))
+                       return -ENOMEM;
++              if (!write_locked && (prev->vm_end == address))
++                      return -EAGAIN;
+       }
+       if (mas_preallocate(&mas, GFP_KERNEL))
+@@ -2124,13 +2131,14 @@ static int __init cmdline_parse_stack_gu
+ __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
+ #ifdef CONFIG_STACK_GROWSUP
+-int expand_stack(struct vm_area_struct *vma, unsigned long address)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
++              bool write_locked)
+ {
+-      return expand_upwards(vma, address);
++      return expand_upwards(vma, address, write_locked);
+ }
+-struct vm_area_struct *
+-find_extend_vma(struct mm_struct *mm, unsigned long addr)
++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm,
++              unsigned long addr, bool write_locked)
+ {
+       struct vm_area_struct *vma, *prev;
+@@ -2138,20 +2146,25 @@ find_extend_vma(struct mm_struct *mm, un
+       vma = find_vma_prev(mm, addr, &prev);
+       if (vma && (vma->vm_start <= addr))
+               return vma;
+-      if (!prev || expand_stack(prev, addr))
++      if (!prev)
++              return NULL;
++      if (expand_stack_locked(prev, addr, write_locked))
+               return NULL;
+       if (prev->vm_flags & VM_LOCKED)
+               populate_vma_page_range(prev, addr, prev->vm_end, NULL);
+       return prev;
+ }
+ #else
+-int expand_stack(struct vm_area_struct *vma, unsigned long address)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
++              bool write_locked)
+ {
+-      return expand_downwards(vma, address);
++      if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
++              return -EINVAL;
++      return expand_downwards(vma, address, write_locked);
+ }
+-struct vm_area_struct *
+-find_extend_vma(struct mm_struct *mm, unsigned long addr)
++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm,
++              unsigned long addr, bool write_locked)
+ {
+       struct vm_area_struct *vma;
+       unsigned long start;
+@@ -2162,10 +2175,8 @@ find_extend_vma(struct mm_struct *mm, un
+               return NULL;
+       if (vma->vm_start <= addr)
+               return vma;
+-      if (!(vma->vm_flags & VM_GROWSDOWN))
+-              return NULL;
+       start = vma->vm_start;
+-      if (expand_stack(vma, addr))
++      if (expand_stack_locked(vma, addr, write_locked))
+               return NULL;
+       if (vma->vm_flags & VM_LOCKED)
+               populate_vma_page_range(vma, addr, start, NULL);
+@@ -2173,6 +2184,11 @@ find_extend_vma(struct mm_struct *mm, un
+ }
+ #endif
++struct vm_area_struct *find_extend_vma(struct mm_struct *mm,
++              unsigned long addr)
++{
++      return find_extend_vma_locked(mm, addr, false);
++}
+ EXPORT_SYMBOL_GPL(find_extend_vma);
+ /*
+--- a/mm/nommu.c
++++ b/mm/nommu.c
+@@ -643,7 +643,8 @@ struct vm_area_struct *find_extend_vma(s
+  * expand a stack to a given address
+  * - not supported under NOMMU conditions
+  */
+-int expand_stack(struct vm_area_struct *vma, unsigned long address)
++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address,
++              bool write_locked)
+ {
+       return -ENOMEM;
+ }
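For callers that, like fs/binfmt_elf.c above, know they may need to grow the stack and are prepared to take the write lock up front, the new convention reduces to a small pattern; a sketch under that assumption (the wrapper name grow_stack_for() is illustrative, and using the returned vma after unlocking is only safe in an exec-like, single-threaded setup such as the one this mirrors):

    #include <linux/mm.h>
    #include <linux/mmap_lock.h>

    /*
     * Sketch: look up, and if necessary expand, the stack vma covering 'addr'
     * while explicitly holding the mmap write lock, so the expansion path is
     * always permitted.  Returns NULL on failure or on a fatal signal.
     */
    static struct vm_area_struct *grow_stack_for(struct mm_struct *mm,
                                                 unsigned long addr)
    {
            struct vm_area_struct *vma;

            if (mmap_write_lock_killable(mm))
                    return NULL;
            vma = find_extend_vma_locked(mm, addr, true);   /* "yes, write-locked" */
            mmap_write_unlock(mm);
            return vma;
    }

Legacy find_extend_vma() callers keep compiling unchanged: the wrapper passes write_locked=false, so lookups of existing vmas still work under the read lock, while expansions that now require the write lock fail inside with -EAGAIN and come back as NULL.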
diff --git a/queue-6.4/mm-make-the-page-fault-mmap-locking-killable.patch b/queue-6.4/mm-make-the-page-fault-mmap-locking-killable.patch
new file mode 100644 (file)
index 0000000..9116f6a
--- /dev/null
@@ -0,0 +1,46 @@
+From eda0047296a16d65a7f2bc60a408f70d178b2014 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Thu, 15 Jun 2023 16:17:48 -0700
+Subject: mm: make the page fault mmap locking killable
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit eda0047296a16d65a7f2bc60a408f70d178b2014 upstream.
+
+This is done as a separate patch from introducing the new
+lock_mm_and_find_vma() helper, because while it's an obvious change,
+it's not what x86 used to do in this area.
+
+We already abort the page fault on fatal signals anyway, so why should
+we wait for the mmap lock only to then abort later? With the new helper
+function that returns without the lock held on failure anyway, this is
+particularly easy and straightforward.
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/memory.c |    6 ++----
+ 1 file changed, 2 insertions(+), 4 deletions(-)
+
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -5279,8 +5279,7 @@ static inline bool get_mmap_lock_careful
+                       return false;
+       }
+-      mmap_read_lock(mm);
+-      return true;
++      return !mmap_read_lock_killable(mm);
+ }
+ static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
+@@ -5304,8 +5303,7 @@ static inline bool upgrade_mmap_lock_car
+               if (!search_exception_tables(ip))
+                       return false;
+       }
+-      mmap_write_lock(mm);
+-      return true;
++      return !mmap_write_lock_killable(mm);
+ }
+ /*
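For reference, the contract the two-line change relies on, as a small sketch rather than code from the patch: the _killable lock primitives return 0 with the lock held, or -EINTR when a fatal signal arrives while sleeping, in which case no lock is held and the fault can simply be abandoned (the wrapper below is illustrative only).

    #include <linux/mm.h>
    #include <linux/mmap_lock.h>

    /* Sketch: both helpers in the hunk above now reduce to this pattern. */
    static bool lock_mm_killable_for_fault(struct mm_struct *mm, bool for_write)
    {
            int err = for_write ? mmap_write_lock_killable(mm)
                                : mmap_read_lock_killable(mm);

            return err == 0;        /* 0: lock held; -EINTR: no lock, give up */
    }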
diff --git a/queue-6.4/powerpc-mm-convert-coprocessor-fault-to-lock_mm_and_find_vma.patch b/queue-6.4/powerpc-mm-convert-coprocessor-fault-to-lock_mm_and_find_vma.patch
new file mode 100644 (file)
index 0000000..f297869
--- /dev/null
@@ -0,0 +1,47 @@
+From 2cd76c50d0b41cec5c87abfcdf25b236a2793fb6 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Sat, 24 Jun 2023 11:17:05 -0700
+Subject: powerpc/mm: convert coprocessor fault to lock_mm_and_find_vma()
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit 2cd76c50d0b41cec5c87abfcdf25b236a2793fb6 upstream.
+
+This is one of the simple cases, except there's no pt_regs pointer.
+Which is fine, as lock_mm_and_find_vma() is set up to work fine with a
+NULL pt_regs.
+
+Powerpc already enabled LOCK_MM_AND_FIND_VMA for the main CPU faulting,
+so we can just use the helper without any extra work.
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/mm/copro_fault.c |   14 +++-----------
+ 1 file changed, 3 insertions(+), 11 deletions(-)
+
+--- a/arch/powerpc/mm/copro_fault.c
++++ b/arch/powerpc/mm/copro_fault.c
+@@ -33,19 +33,11 @@ int copro_handle_mm_fault(struct mm_stru
+       if (mm->pgd == NULL)
+               return -EFAULT;
+-      mmap_read_lock(mm);
+-      ret = -EFAULT;
+-      vma = find_vma(mm, ea);
++      vma = lock_mm_and_find_vma(mm, ea, NULL);
+       if (!vma)
+-              goto out_unlock;
+-
+-      if (ea < vma->vm_start) {
+-              if (!(vma->vm_flags & VM_GROWSDOWN))
+-                      goto out_unlock;
+-              if (expand_stack(vma, ea))
+-                      goto out_unlock;
+-      }
++              return -EFAULT;
++      ret = -EFAULT;
+       is_write = dsisr & DSISR_ISSTORE;
+       if (is_write) {
+               if (!(vma->vm_flags & VM_WRITE))
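The reason a NULL pt_regs is fine follows directly from get_mmap_lock_carefully() as added (and made killable) earlier in this series: regs is consulted only for the kernel-fault exception-table sanity check, which is skipped when regs is NULL, so the coprocessor path simply takes the mmap lock killably. Condensed restatement of that branch for reference (this is the existing helper's logic, not new code):

    if (regs && !user_mode(regs)) {         /* regs == NULL: skip the check */
            if (!search_exception_tables(instruction_pointer(regs)))
                    return false;           /* unexpected kernel fault: refuse */
    }
    return !mmap_read_lock_killable(mm);    /* sleep (killably) and carry on */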
diff --git a/queue-6.4/powerpc-mm-convert-to-using-lock_mm_and_find_vma.patch b/queue-6.4/powerpc-mm-convert-to-using-lock_mm_and_find_vma.patch
new file mode 100644 (file)
index 0000000..0fcbcfb
--- /dev/null
@@ -0,0 +1,86 @@
+From e6fe228c4ffafdfc970cf6d46883a1f481baf7ea Mon Sep 17 00:00:00 2001
+From: Michael Ellerman <mpe@ellerman.id.au>
+Date: Fri, 16 Jun 2023 15:51:29 +1000
+Subject: powerpc/mm: Convert to using lock_mm_and_find_vma()
+
+From: Michael Ellerman <mpe@ellerman.id.au>
+
+commit e6fe228c4ffafdfc970cf6d46883a1f481baf7ea upstream.
+
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/powerpc/Kconfig    |    1 +
+ arch/powerpc/mm/fault.c |   41 ++++-------------------------------------
+ 2 files changed, 5 insertions(+), 37 deletions(-)
+
+--- a/arch/powerpc/Kconfig
++++ b/arch/powerpc/Kconfig
+@@ -278,6 +278,7 @@ config PPC
+       select IRQ_DOMAIN
+       select IRQ_FORCED_THREADING
+       select KASAN_VMALLOC                    if KASAN && MODULES
++      select LOCK_MM_AND_FIND_VMA
+       select MMU_GATHER_PAGE_SIZE
+       select MMU_GATHER_RCU_TABLE_FREE
+       select MMU_GATHER_MERGE_VMAS
+--- a/arch/powerpc/mm/fault.c
++++ b/arch/powerpc/mm/fault.c
+@@ -84,11 +84,6 @@ static int __bad_area(struct pt_regs *re
+       return __bad_area_nosemaphore(regs, address, si_code);
+ }
+-static noinline int bad_area(struct pt_regs *regs, unsigned long address)
+-{
+-      return __bad_area(regs, address, SEGV_MAPERR);
+-}
+-
+ static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address,
+                                   struct vm_area_struct *vma)
+ {
+@@ -515,40 +510,12 @@ lock_mmap:
+        * we will deadlock attempting to validate the fault against the
+        * address space.  Luckily the kernel only validly references user
+        * space from well defined areas of code, which are listed in the
+-       * exceptions table.
+-       *
+-       * As the vast majority of faults will be valid we will only perform
+-       * the source reference check when there is a possibility of a deadlock.
+-       * Attempt to lock the address space, if we cannot we then validate the
+-       * source.  If this is invalid we can skip the address space check,
+-       * thus avoiding the deadlock.
+-       */
+-      if (unlikely(!mmap_read_trylock(mm))) {
+-              if (!is_user && !search_exception_tables(regs->nip))
+-                      return bad_area_nosemaphore(regs, address);
+-
++       * exceptions table. lock_mm_and_find_vma() handles that logic.
++       */
+ retry:
+-              mmap_read_lock(mm);
+-      } else {
+-              /*
+-               * The above down_read_trylock() might have succeeded in
+-               * which case we'll have missed the might_sleep() from
+-               * down_read():
+-               */
+-              might_sleep();
+-      }
+-
+-      vma = find_vma(mm, address);
++      vma = lock_mm_and_find_vma(mm, address, regs);
+       if (unlikely(!vma))
+-              return bad_area(regs, address);
+-
+-      if (unlikely(vma->vm_start > address)) {
+-              if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
+-                      return bad_area(regs, address);
+-
+-              if (unlikely(expand_stack(vma, address)))
+-                      return bad_area(regs, address);
+-      }
++              return bad_area_nosemaphore(regs, address);
+       if (unlikely(access_pkey_error(is_write, is_exec,
+                                      (error_code & DSISR_KEYFAULT), vma)))
diff --git a/queue-6.4/riscv-mm-convert-to-using-lock_mm_and_find_vma.patch b/queue-6.4/riscv-mm-convert-to-using-lock_mm_and_find_vma.patch
new file mode 100644 (file)
index 0000000..b360426
--- /dev/null
@@ -0,0 +1,95 @@
+From 7267ef7b0b77f4ed23b7b3c87d8eca7bd9c2d007 Mon Sep 17 00:00:00 2001
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Thu, 22 Jun 2023 20:18:18 +0200
+Subject: riscv/mm: Convert to using lock_mm_and_find_vma()
+
+From: Ben Hutchings <ben@decadent.org.uk>
+
+commit 7267ef7b0b77f4ed23b7b3c87d8eca7bd9c2d007 upstream.
+
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/riscv/Kconfig    |    1 +
+ arch/riscv/mm/fault.c |   31 +++++++++++++------------------
+ 2 files changed, 14 insertions(+), 18 deletions(-)
+
+--- a/arch/riscv/Kconfig
++++ b/arch/riscv/Kconfig
+@@ -126,6 +126,7 @@ config RISCV
+       select IRQ_DOMAIN
+       select IRQ_FORCED_THREADING
+       select KASAN_VMALLOC if KASAN
++      select LOCK_MM_AND_FIND_VMA
+       select MODULES_USE_ELF_RELA if MODULES
+       select MODULE_SECTIONS if MODULES
+       select OF
+--- a/arch/riscv/mm/fault.c
++++ b/arch/riscv/mm/fault.c
+@@ -84,13 +84,13 @@ static inline void mm_fault_error(struct
+       BUG();
+ }
+-static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr)
++static inline void
++bad_area_nosemaphore(struct pt_regs *regs, int code, unsigned long addr)
+ {
+       /*
+        * Something tried to access memory that isn't in our memory map.
+        * Fix it, but check if it's kernel or user first.
+        */
+-      mmap_read_unlock(mm);
+       /* User mode accesses just cause a SIGSEGV */
+       if (user_mode(regs)) {
+               do_trap(regs, SIGSEGV, code, addr);
+@@ -100,6 +100,15 @@ static inline void bad_area(struct pt_re
+       no_context(regs, addr);
+ }
++static inline void
++bad_area(struct pt_regs *regs, struct mm_struct *mm, int code,
++       unsigned long addr)
++{
++      mmap_read_unlock(mm);
++
++      bad_area_nosemaphore(regs, code, addr);
++}
++
+ static inline void vmalloc_fault(struct pt_regs *regs, int code, unsigned long addr)
+ {
+       pgd_t *pgd, *pgd_k;
+@@ -287,23 +296,10 @@ void handle_page_fault(struct pt_regs *r
+       else if (cause == EXC_INST_PAGE_FAULT)
+               flags |= FAULT_FLAG_INSTRUCTION;
+ retry:
+-      mmap_read_lock(mm);
+-      vma = find_vma(mm, addr);
++      vma = lock_mm_and_find_vma(mm, addr, regs);
+       if (unlikely(!vma)) {
+               tsk->thread.bad_cause = cause;
+-              bad_area(regs, mm, code, addr);
+-              return;
+-      }
+-      if (likely(vma->vm_start <= addr))
+-              goto good_area;
+-      if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+-              tsk->thread.bad_cause = cause;
+-              bad_area(regs, mm, code, addr);
+-              return;
+-      }
+-      if (unlikely(expand_stack(vma, addr))) {
+-              tsk->thread.bad_cause = cause;
+-              bad_area(regs, mm, code, addr);
++              bad_area_nosemaphore(regs, code, addr);
+               return;
+       }
+@@ -311,7 +307,6 @@ retry:
+        * Ok, we have a good vm_area for this memory access, so
+        * we can handle it.
+        */
+-good_area:
+       code = SEGV_ACCERR;
+       if (unlikely(access_error(cause, vma))) {
index ca8e99a97aecec5332a4a1a42f87dd8a704bd04b..bbaeef50df2ec73b67f40fafc578cedf3f158fc7 100644 (file)
@@ -7,3 +7,16 @@ x86-smp-cure-kexec-vs.-mwait_play_dead-breakage.patch
 cpufreq-amd-pstate-make-amd-pstate-epp-driver-name-hyphenated.patch
 can-isotp-isotp_sendmsg-fix-return-error-fix-on-tx-path.patch
 maple_tree-fix-potential-out-of-bounds-access-in-mas_wr_end_piv.patch
+mm-introduce-new-lock_mm_and_find_vma-page-fault-helper.patch
+mm-make-the-page-fault-mmap-locking-killable.patch
+arm64-mm-convert-to-using-lock_mm_and_find_vma.patch
+powerpc-mm-convert-to-using-lock_mm_and_find_vma.patch
+mips-mm-convert-to-using-lock_mm_and_find_vma.patch
+riscv-mm-convert-to-using-lock_mm_and_find_vma.patch
+arm-mm-convert-to-using-lock_mm_and_find_vma.patch
+mm-fault-convert-remaining-simple-cases-to-lock_mm_and_find_vma.patch
+powerpc-mm-convert-coprocessor-fault-to-lock_mm_and_find_vma.patch
+mm-make-find_extend_vma-fail-if-write-lock-not-held.patch
+execve-expand-new-process-stack-manually-ahead-of-time.patch
+mm-always-expand-the-stack-with-the-mmap-write-lock-held.patch
+gup-add-warning-if-some-caller-would-seem-to-want-stack-expansion.patch