From: Greg Kroah-Hartman Date: Tue, 27 Jun 2023 19:25:07 +0000 (+0200) Subject: some mm patches for 6.1, 6.3, and 6.4 X-Git-Tag: v6.4.1~25 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=d69b5c8cc99b1f8da7f1c134f683341351f37a89;p=thirdparty%2Fkernel%2Fstable-queue.git some mm patches for 6.1, 6.3, and 6.4 --- diff --git a/queue-6.1/arm-mm-convert-to-using-lock_mm_and_find_vma.patch b/queue-6.1/arm-mm-convert-to-using-lock_mm_and_find_vma.patch new file mode 100644 index 00000000000..d5583bb7a51 --- /dev/null +++ b/queue-6.1/arm-mm-convert-to-using-lock_mm_and_find_vma.patch @@ -0,0 +1,138 @@ +From f8c4e35d716b886d05595706af1be757fede502d Mon Sep 17 00:00:00 2001 +From: Ben Hutchings +Date: Thu, 22 Jun 2023 21:24:30 +0200 +Subject: arm/mm: Convert to using lock_mm_and_find_vma() + +From: Ben Hutchings + +commit 8b35ca3e45e35a26a21427f35d4093606e93ad0a upstream. + +arm has an additional check for address < FIRST_USER_ADDRESS before +expanding the stack. Since FIRST_USER_ADDRESS is defined everywhere +(generally as 0), move that check to the generic expand_downwards(). + +Signed-off-by: Ben Hutchings +Signed-off-by: Linus Torvalds +Signed-off-by: Samuel Mendoza-Jonas +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm/Kconfig | 1 + arch/arm/mm/fault.c | 63 +++++++++++----------------------------------------- + mm/mmap.c | 2 - + 3 files changed, 16 insertions(+), 50 deletions(-) + +--- a/arch/arm/Kconfig ++++ b/arch/arm/Kconfig +@@ -122,6 +122,7 @@ config ARM + select HAVE_UID16 + select HAVE_VIRT_CPU_ACCOUNTING_GEN + select IRQ_FORCED_THREADING ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_REL + select NEED_DMA_MAP_STATE + select OF_EARLY_FLATTREE if OF +--- a/arch/arm/mm/fault.c ++++ b/arch/arm/mm/fault.c +@@ -231,37 +231,11 @@ static inline bool is_permission_fault(u + return false; + } + +-static vm_fault_t __kprobes +-__do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int flags, +- unsigned long vma_flags, struct pt_regs *regs) +-{ +- struct vm_area_struct *vma = find_vma(mm, addr); +- if (unlikely(!vma)) +- return VM_FAULT_BADMAP; +- +- if (unlikely(vma->vm_start > addr)) { +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- return VM_FAULT_BADMAP; +- if (addr < FIRST_USER_ADDRESS) +- return VM_FAULT_BADMAP; +- if (expand_stack(vma, addr)) +- return VM_FAULT_BADMAP; +- } +- +- /* +- * ok, we have a good vm_area for this memory access, check the +- * permissions on the VMA allow for the fault which occurred. +- */ +- if (!(vma->vm_flags & vma_flags)) +- return VM_FAULT_BADACCESS; +- +- return handle_mm_fault(vma, addr & PAGE_MASK, flags, regs); +-} +- + static int __kprobes + do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) + { + struct mm_struct *mm = current->mm; ++ struct vm_area_struct *vma; + int sig, code; + vm_fault_t fault; + unsigned int flags = FAULT_FLAG_DEFAULT; +@@ -300,31 +274,21 @@ do_page_fault(unsigned long addr, unsign + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); + +- /* +- * As per x86, we may deadlock here. However, since the kernel only +- * validly references user space from well defined areas of the code, +- * we can bug out early if this is from code which shouldn't. 
+- */ +- if (!mmap_read_trylock(mm)) { +- if (!user_mode(regs) && !search_exception_tables(regs->ARM_pc)) +- goto no_context; + retry: +- mmap_read_lock(mm); +- } else { +- /* +- * The above down_read_trylock() might have succeeded in +- * which case, we'll have missed the might_sleep() from +- * down_read() +- */ +- might_sleep(); +-#ifdef CONFIG_DEBUG_VM +- if (!user_mode(regs) && +- !search_exception_tables(regs->ARM_pc)) +- goto no_context; +-#endif ++ vma = lock_mm_and_find_vma(mm, addr, regs); ++ if (unlikely(!vma)) { ++ fault = VM_FAULT_BADMAP; ++ goto bad_area; + } + +- fault = __do_page_fault(mm, addr, flags, vm_flags, regs); ++ /* ++ * ok, we have a good vm_area for this memory access, check the ++ * permissions on the VMA allow for the fault which occurred. ++ */ ++ if (!(vma->vm_flags & vm_flags)) ++ fault = VM_FAULT_BADACCESS; ++ else ++ fault = handle_mm_fault(vma, addr & PAGE_MASK, flags, regs); + + /* If we need to retry but a fatal signal is pending, handle the + * signal first. We do not need to release the mmap_lock because +@@ -355,6 +319,7 @@ retry: + if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS)))) + return 0; + ++bad_area: + /* + * If we are in kernel mode at this point, we + * have no context to handle this fault with. +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -2045,7 +2045,7 @@ int expand_downwards(struct vm_area_stru + int error = 0; + + address &= PAGE_MASK; +- if (address < mmap_min_addr) ++ if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) + return -EPERM; + + /* Enforce stack_guard_gap */ diff --git a/queue-6.1/arm64-mm-convert-to-using-lock_mm_and_find_vma.patch b/queue-6.1/arm64-mm-convert-to-using-lock_mm_and_find_vma.patch new file mode 100644 index 00000000000..4320d8efb2a --- /dev/null +++ b/queue-6.1/arm64-mm-convert-to-using-lock_mm_and_find_vma.patch @@ -0,0 +1,120 @@ +From a45a8a9f70fe70c7c4479b9256b1eb1b5774df64 Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Thu, 15 Jun 2023 17:11:44 -0700 +Subject: arm64/mm: Convert to using lock_mm_and_find_vma() + +From: Linus Torvalds + +commit ae870a68b5d13d67cf4f18d47bb01ee3fee40acb upstream. + +This converts arm64 to use the new page fault helper. It was very +straightforward, but still needed a fix for the "obvious" conversion I +initially did. Thanks to Suren for the fix and testing. + +Fixed-and-tested-by: Suren Baghdasaryan +Unnecessary-code-removal-by: Liam R. 
Howlett +Signed-off-by: Linus Torvalds +[6.1: Ignore CONFIG_PER_VMA_LOCK context] +Signed-off-by: Samuel Mendoza-Jonas +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/Kconfig | 1 + + arch/arm64/mm/fault.c | 46 +++++++++------------------------------------- + 2 files changed, 10 insertions(+), 37 deletions(-) + +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -211,6 +211,7 @@ config ARM64 + select IRQ_DOMAIN + select IRQ_FORCED_THREADING + select KASAN_VMALLOC if KASAN ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA + select NEED_DMA_MAP_STATE + select NEED_SG_DMA_LENGTH +--- a/arch/arm64/mm/fault.c ++++ b/arch/arm64/mm/fault.c +@@ -483,27 +483,14 @@ static void do_bad_area(unsigned long fa + #define VM_FAULT_BADMAP ((__force vm_fault_t)0x010000) + #define VM_FAULT_BADACCESS ((__force vm_fault_t)0x020000) + +-static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr, ++static vm_fault_t __do_page_fault(struct mm_struct *mm, ++ struct vm_area_struct *vma, unsigned long addr, + unsigned int mm_flags, unsigned long vm_flags, + struct pt_regs *regs) + { +- struct vm_area_struct *vma = find_vma(mm, addr); +- +- if (unlikely(!vma)) +- return VM_FAULT_BADMAP; +- + /* + * Ok, we have a good vm_area for this memory access, so we can handle + * it. +- */ +- if (unlikely(vma->vm_start > addr)) { +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- return VM_FAULT_BADMAP; +- if (expand_stack(vma, addr)) +- return VM_FAULT_BADMAP; +- } +- +- /* + * Check that the permissions on the VMA allow for the fault which + * occurred. + */ +@@ -535,6 +522,7 @@ static int __kprobes do_page_fault(unsig + unsigned long vm_flags; + unsigned int mm_flags = FAULT_FLAG_DEFAULT; + unsigned long addr = untagged_addr(far); ++ struct vm_area_struct *vma; + + if (kprobe_page_fault(regs, esr)) + return 0; +@@ -585,31 +573,14 @@ static int __kprobes do_page_fault(unsig + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); + +- /* +- * As per x86, we may deadlock here. However, since the kernel only +- * validly references user space from well defined areas of the code, +- * we can bug out early if this is from code which shouldn't. +- */ +- if (!mmap_read_trylock(mm)) { +- if (!user_mode(regs) && !search_exception_tables(regs->pc)) +- goto no_context; + retry: +- mmap_read_lock(mm); +- } else { +- /* +- * The above mmap_read_trylock() might have succeeded in which +- * case, we'll have missed the might_sleep() from down_read(). +- */ +- might_sleep(); +-#ifdef CONFIG_DEBUG_VM +- if (!user_mode(regs) && !search_exception_tables(regs->pc)) { +- mmap_read_unlock(mm); +- goto no_context; +- } +-#endif ++ vma = lock_mm_and_find_vma(mm, addr, regs); ++ if (unlikely(!vma)) { ++ fault = VM_FAULT_BADMAP; ++ goto done; + } + +- fault = __do_page_fault(mm, addr, mm_flags, vm_flags, regs); ++ fault = __do_page_fault(mm, vma, addr, mm_flags, vm_flags, regs); + + /* Quick path to respond to signals */ + if (fault_signal_pending(fault, regs)) { +@@ -628,6 +599,7 @@ retry: + } + mmap_read_unlock(mm); + ++done: + /* + * Handle the "normal" (no error) case first. 
+ */ diff --git a/queue-6.1/execve-expand-new-process-stack-manually-ahead-of-time.patch b/queue-6.1/execve-expand-new-process-stack-manually-ahead-of-time.patch new file mode 100644 index 00000000000..833c47b61d4 --- /dev/null +++ b/queue-6.1/execve-expand-new-process-stack-manually-ahead-of-time.patch @@ -0,0 +1,91 @@ +From 9e1f3d01ba1f6ffa0ad902d594b1b44619568b74 Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Mon, 19 Jun 2023 11:34:15 -0700 +Subject: execve: expand new process stack manually ahead of time + +From: Linus Torvalds + +commit f313c51d26aa87e69633c9b46efb37a930faca71 upstream. + +This is a small step towards a model where GUP itself would not expand +the stack, and any user that needs GUP to not look up existing mappings, +but actually expand on them, would have to do so manually before-hand, +and with the mm lock held for writing. + +It turns out that execve() already did almost exactly that, except it +didn't take the mm lock at all (it's single-threaded so no locking +technically needed, but it could cause lockdep errors). And it only did +it for the CONFIG_STACK_GROWSUP case, since in that case GUP has +obviously never expanded the stack downwards. + +So just make that CONFIG_STACK_GROWSUP case do the right thing with +locking, and enable it generally. This will eventually help GUP, and in +the meantime avoids a special case and the lockdep issue. + +Signed-off-by: Linus Torvalds +[6.1 Minor context from still having FOLL_FORCE flags set] +Signed-off-by: Samuel Mendoza-Jonas +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +--- + fs/exec.c | 37 +++++++++++++++++++++---------------- + 1 file changed, 21 insertions(+), 16 deletions(-) + +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -198,34 +198,39 @@ static struct page *get_arg_page(struct + int write) + { + struct page *page; ++ struct vm_area_struct *vma = bprm->vma; ++ struct mm_struct *mm = bprm->mm; + int ret; +- unsigned int gup_flags = FOLL_FORCE; + +-#ifdef CONFIG_STACK_GROWSUP +- if (write) { +- /* We claim to hold the lock - nobody to race with */ +- ret = expand_downwards(bprm->vma, pos, true); +- if (ret < 0) ++ /* ++ * Avoid relying on expanding the stack down in GUP (which ++ * does not work for STACK_GROWSUP anyway), and just do it ++ * by hand ahead of time. ++ */ ++ if (write && pos < vma->vm_start) { ++ mmap_write_lock(mm); ++ ret = expand_downwards(vma, pos, true); ++ if (unlikely(ret < 0)) { ++ mmap_write_unlock(mm); + return NULL; +- } +-#endif +- +- if (write) +- gup_flags |= FOLL_WRITE; ++ } ++ mmap_write_downgrade(mm); ++ } else ++ mmap_read_lock(mm); + + /* + * We are doing an exec(). 'current' is the process +- * doing the exec and bprm->mm is the new process's mm. ++ * doing the exec and 'mm' is the new process's mm. + */ +- mmap_read_lock(bprm->mm); +- ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags, ++ ret = get_user_pages_remote(mm, pos, 1, ++ write ? 
FOLL_WRITE : 0, + &page, NULL, NULL); +- mmap_read_unlock(bprm->mm); ++ mmap_read_unlock(mm); + if (ret <= 0) + return NULL; + + if (write) +- acct_arg_size(bprm, vma_pages(bprm->vma)); ++ acct_arg_size(bprm, vma_pages(vma)); + + return page; + } diff --git a/queue-6.1/mips-mm-convert-to-using-lock_mm_and_find_vma.patch b/queue-6.1/mips-mm-convert-to-using-lock_mm_and_find_vma.patch new file mode 100644 index 00000000000..112cae5cb0b --- /dev/null +++ b/queue-6.1/mips-mm-convert-to-using-lock_mm_and_find_vma.patch @@ -0,0 +1,55 @@ +From f9ced2ac8976a6560505cc4bf14ffdf1c076e475 Mon Sep 17 00:00:00 2001 +From: Ben Hutchings +Date: Thu, 22 Jun 2023 18:47:40 +0200 +Subject: mips/mm: Convert to using lock_mm_and_find_vma() + +From: Ben Hutchings + +commit 4bce37a68ff884e821a02a731897a8119e0c37b7 upstream. + +Signed-off-by: Ben Hutchings +Signed-off-by: Linus Torvalds +Signed-off-by: Samuel Mendoza-Jonas +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +--- + arch/mips/Kconfig | 1 + + arch/mips/mm/fault.c | 12 ++---------- + 2 files changed, 3 insertions(+), 10 deletions(-) + +--- a/arch/mips/Kconfig ++++ b/arch/mips/Kconfig +@@ -94,6 +94,7 @@ config MIPS + select HAVE_VIRT_CPU_ACCOUNTING_GEN if 64BIT || !SMP + select IRQ_FORCED_THREADING + select ISA if EISA ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_REL if MODULES + select MODULES_USE_ELF_RELA if MODULES && 64BIT + select PERF_USE_VMALLOC +--- a/arch/mips/mm/fault.c ++++ b/arch/mips/mm/fault.c +@@ -99,21 +99,13 @@ static void __do_page_fault(struct pt_re + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + retry: +- mmap_read_lock(mm); +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (!vma) +- goto bad_area; +- if (vma->vm_start <= address) +- goto good_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (expand_stack(vma, address)) +- goto bad_area; ++ goto bad_area_nosemaphore; + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ +-good_area: + si_code = SEGV_ACCERR; + + if (write) { diff --git a/queue-6.1/mm-always-expand-the-stack-with-the-mmap-write-lock-held.patch b/queue-6.1/mm-always-expand-the-stack-with-the-mmap-write-lock-held.patch new file mode 100644 index 00000000000..013f951ce81 --- /dev/null +++ b/queue-6.1/mm-always-expand-the-stack-with-the-mmap-write-lock-held.patch @@ -0,0 +1,671 @@ +From 2956a81444985ffb601685f3a796e79470b56353 Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Sat, 24 Jun 2023 13:45:51 -0700 +Subject: mm: always expand the stack with the mmap write lock held + +From: Linus Torvalds + +commit 8d7071af890768438c14db6172cc8f9f4d04e184 upstream + +This finishes the job of always holding the mmap write lock when +extending the user stack vma, and removes the 'write_locked' argument +from the vm helper functions again. + +For some cases, we just avoid expanding the stack at all: drivers and +page pinning really shouldn't be extending any stacks. Let's see if any +strange users really wanted that. + +It's worth noting that architectures that weren't converted to the new +lock_mm_and_find_vma() helper function are left using the legacy +"expand_stack()" function, but it has been changed to drop the mmap_lock +and take it for writing while expanding the vma. This makes it fairly +straightforward to convert the remaining architectures. 
+ +As a result of dropping and re-taking the lock, the calling conventions +for this function have also changed, since the old vma may no longer be +valid. So it will now return the new vma if successful, and NULL - and +the lock dropped - if the area could not be extended. + +Signed-off-by: Linus Torvalds +[6.1: Patch drivers/iommu/io-pgfault.c instead] +Signed-off-by: Samuel Mendoza-Jonas +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +--- + arch/ia64/mm/fault.c | 36 ++---------- + arch/m68k/mm/fault.c | 9 ++- + arch/microblaze/mm/fault.c | 5 + + arch/openrisc/mm/fault.c | 5 + + arch/parisc/mm/fault.c | 23 +++----- + arch/s390/mm/fault.c | 5 + + arch/sparc/mm/fault_64.c | 8 +- + arch/um/kernel/trap.c | 11 ++- + drivers/iommu/amd/iommu_v2.c | 4 - + drivers/iommu/io-pgfault.c | 2 + fs/binfmt_elf.c | 2 + fs/exec.c | 4 - + include/linux/mm.h | 16 +---- + mm/gup.c | 6 +- + mm/memory.c | 10 +++ + mm/mmap.c | 121 ++++++++++++++++++++++++++++++++++--------- + mm/nommu.c | 18 ++---- + 17 files changed, 169 insertions(+), 116 deletions(-) + +--- a/arch/ia64/mm/fault.c ++++ b/arch/ia64/mm/fault.c +@@ -110,10 +110,12 @@ retry: + * register backing store that needs to expand upwards, in + * this case vma will be null, but prev_vma will ne non-null + */ +- if (( !vma && prev_vma ) || (address < vma->vm_start) ) +- goto check_expansion; ++ if (( !vma && prev_vma ) || (address < vma->vm_start) ) { ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto bad_area_nosemaphore; ++ } + +- good_area: + code = SEGV_ACCERR; + + /* OK, we've got a good vm_area for this memory area. Check the access permissions: */ +@@ -174,35 +176,9 @@ retry: + mmap_read_unlock(mm); + return; + +- check_expansion: +- if (!(prev_vma && (prev_vma->vm_flags & VM_GROWSUP) && (address == prev_vma->vm_end))) { +- if (!vma) +- goto bad_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start) +- || REGION_OFFSET(address) >= RGN_MAP_LIMIT) +- goto bad_area; +- if (expand_stack(vma, address)) +- goto bad_area; +- } else { +- vma = prev_vma; +- if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start) +- || REGION_OFFSET(address) >= RGN_MAP_LIMIT) +- goto bad_area; +- /* +- * Since the register backing store is accessed sequentially, +- * we disallow growing it by more than a page at a time. 
+- */ +- if (address > vma->vm_end + PAGE_SIZE - sizeof(long)) +- goto bad_area; +- if (expand_upwards(vma, address)) +- goto bad_area; +- } +- goto good_area; +- + bad_area: + mmap_read_unlock(mm); ++ bad_area_nosemaphore: + if ((isr & IA64_ISR_SP) + || ((isr & IA64_ISR_NA) && (isr & IA64_ISR_CODE_MASK) == IA64_ISR_CODE_LFETCH)) + { +--- a/arch/m68k/mm/fault.c ++++ b/arch/m68k/mm/fault.c +@@ -105,8 +105,9 @@ retry: + if (address + 256 < rdusp()) + goto map_err; + } +- if (expand_stack(vma, address)) +- goto map_err; ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto map_err_nosemaphore; + + /* + * Ok, we have a good vm_area for this memory access, so +@@ -193,10 +194,12 @@ bus_err: + goto send_sig; + + map_err: ++ mmap_read_unlock(mm); ++map_err_nosemaphore: + current->thread.signo = SIGSEGV; + current->thread.code = SEGV_MAPERR; + current->thread.faddr = address; +- goto send_sig; ++ return send_fault_sig(regs); + + acc_err: + current->thread.signo = SIGSEGV; +--- a/arch/microblaze/mm/fault.c ++++ b/arch/microblaze/mm/fault.c +@@ -192,8 +192,9 @@ retry: + && (kernel_mode(regs) || !store_updates_sp(regs))) + goto bad_area; + } +- if (expand_stack(vma, address)) +- goto bad_area; ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto bad_area_nosemaphore; + + good_area: + code = SEGV_ACCERR; +--- a/arch/openrisc/mm/fault.c ++++ b/arch/openrisc/mm/fault.c +@@ -127,8 +127,9 @@ retry: + if (address + PAGE_SIZE < regs->sp) + goto bad_area; + } +- if (expand_stack(vma, address)) +- goto bad_area; ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto bad_area_nosemaphore; + + /* + * Ok, we have a good vm_area for this memory access, so +--- a/arch/parisc/mm/fault.c ++++ b/arch/parisc/mm/fault.c +@@ -288,15 +288,19 @@ void do_page_fault(struct pt_regs *regs, + retry: + mmap_read_lock(mm); + vma = find_vma_prev(mm, address, &prev_vma); +- if (!vma || address < vma->vm_start) +- goto check_expansion; ++ if (!vma || address < vma->vm_start) { ++ if (!prev || !(prev->vm_flags & VM_GROWSUP)) ++ goto bad_area; ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto bad_area_nosemaphore; ++ } ++ + /* + * Ok, we have a good vm_area for this memory access. We still need to + * check the access permissions. + */ + +-good_area: +- + if ((vma->vm_flags & acc_type) != acc_type) + goto bad_area; + +@@ -342,17 +346,13 @@ good_area: + mmap_read_unlock(mm); + return; + +-check_expansion: +- vma = prev_vma; +- if (vma && (expand_stack(vma, address) == 0)) +- goto good_area; +- + /* + * Something tried to access memory that isn't in our memory map.. + */ + bad_area: + mmap_read_unlock(mm); + ++bad_area_nosemaphore: + if (user_mode(regs)) { + int signo, si_code; + +@@ -444,7 +444,7 @@ handle_nadtlb_fault(struct pt_regs *regs + { + unsigned long insn = regs->iir; + int breg, treg, xreg, val = 0; +- struct vm_area_struct *vma, *prev_vma; ++ struct vm_area_struct *vma; + struct task_struct *tsk; + struct mm_struct *mm; + unsigned long address; +@@ -480,7 +480,7 @@ handle_nadtlb_fault(struct pt_regs *regs + /* Search for VMA */ + address = regs->ior; + mmap_read_lock(mm); +- vma = find_vma_prev(mm, address, &prev_vma); ++ vma = vma_lookup(mm, address); + mmap_read_unlock(mm); + + /* +@@ -489,7 +489,6 @@ handle_nadtlb_fault(struct pt_regs *regs + */ + acc_type = (insn & 0x40) ? 
VM_WRITE : VM_READ; + if (vma +- && address >= vma->vm_start + && (vma->vm_flags & acc_type) == acc_type) + val = 1; + } +--- a/arch/s390/mm/fault.c ++++ b/arch/s390/mm/fault.c +@@ -429,8 +429,9 @@ retry: + if (unlikely(vma->vm_start > address)) { + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out_up; +- if (expand_stack(vma, address)) +- goto out_up; ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto out; + } + + /* +--- a/arch/sparc/mm/fault_64.c ++++ b/arch/sparc/mm/fault_64.c +@@ -383,8 +383,9 @@ continue_fault: + goto bad_area; + } + } +- if (expand_stack(vma, address)) +- goto bad_area; ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto bad_area_nosemaphore; + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. +@@ -482,8 +483,9 @@ exit_exception: + * Fix it, but check if it's kernel or user first.. + */ + bad_area: +- insn = get_fault_insn(regs, insn); + mmap_read_unlock(mm); ++bad_area_nosemaphore: ++ insn = get_fault_insn(regs, insn); + + handle_kernel_fault: + do_kernel_fault(regs, si_code, fault_code, insn, address); +--- a/arch/um/kernel/trap.c ++++ b/arch/um/kernel/trap.c +@@ -47,14 +47,15 @@ retry: + vma = find_vma(mm, address); + if (!vma) + goto out; +- else if (vma->vm_start <= address) ++ if (vma->vm_start <= address) + goto good_area; +- else if (!(vma->vm_flags & VM_GROWSDOWN)) ++ if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out; +- else if (is_user && !ARCH_IS_STACKGROW(address)) +- goto out; +- else if (expand_stack(vma, address)) ++ if (is_user && !ARCH_IS_STACKGROW(address)) + goto out; ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto out_nosemaphore; + + good_area: + *code_out = SEGV_ACCERR; +--- a/drivers/iommu/amd/iommu_v2.c ++++ b/drivers/iommu/amd/iommu_v2.c +@@ -485,8 +485,8 @@ static void do_fault(struct work_struct + flags |= FAULT_FLAG_REMOTE; + + mmap_read_lock(mm); +- vma = find_extend_vma(mm, address); +- if (!vma || address < vma->vm_start) ++ vma = vma_lookup(mm, address); ++ if (!vma) + /* failed to get a vma in the right range */ + goto out; + +--- a/drivers/iommu/io-pgfault.c ++++ b/drivers/iommu/io-pgfault.c +@@ -89,7 +89,7 @@ iopf_handle_single(struct iopf_fault *io + + mmap_read_lock(mm); + +- vma = find_extend_vma(mm, prm->addr); ++ vma = vma_lookup(mm, prm->addr); + if (!vma) + /* Unmapped area */ + goto out_put_mm; +--- a/fs/binfmt_elf.c ++++ b/fs/binfmt_elf.c +@@ -317,7 +317,7 @@ create_elf_tables(struct linux_binprm *b + */ + if (mmap_write_lock_killable(mm)) + return -EINTR; +- vma = find_extend_vma_locked(mm, bprm->p, true); ++ vma = find_extend_vma_locked(mm, bprm->p); + mmap_write_unlock(mm); + if (!vma) + return -EFAULT; +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -209,7 +209,7 @@ static struct page *get_arg_page(struct + */ + if (write && pos < vma->vm_start) { + mmap_write_lock(mm); +- ret = expand_downwards(vma, pos, true); ++ ret = expand_downwards(vma, pos); + if (unlikely(ret < 0)) { + mmap_write_unlock(mm); + return NULL; +@@ -860,7 +860,7 @@ int setup_arg_pages(struct linux_binprm + stack_base = vma->vm_start - stack_expand; + #endif + current->mm->start_stack = bprm->p; +- ret = expand_stack_locked(vma, stack_base, true); ++ ret = expand_stack_locked(vma, stack_base); + if (ret) + ret = -EFAULT; + +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -2810,18 +2810,11 @@ extern vm_fault_t filemap_page_mkwrite(s + + extern unsigned long stack_guard_gap; + /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ +-int expand_stack_locked(struct vm_area_struct *vma, 
unsigned long address, +- bool write_locked); +-#define expand_stack(vma,addr) expand_stack_locked(vma,addr,false) ++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address); ++struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr); + + /* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */ +-int expand_downwards(struct vm_area_struct *vma, unsigned long address, +- bool write_locked); +-#if VM_GROWSUP +-extern int expand_upwards(struct vm_area_struct *vma, unsigned long address); +-#else +- #define expand_upwards(vma, address) (0) +-#endif ++int expand_downwards(struct vm_area_struct *vma, unsigned long address); + + /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ + extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); +@@ -2916,9 +2909,8 @@ unsigned long change_prot_numa(struct vm + unsigned long start, unsigned long end); + #endif + +-struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); + struct vm_area_struct *find_extend_vma_locked(struct mm_struct *, +- unsigned long addr, bool write_locked); ++ unsigned long addr); + int remap_pfn_range(struct vm_area_struct *, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t); + int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, +--- a/mm/gup.c ++++ b/mm/gup.c +@@ -1182,7 +1182,7 @@ static long __get_user_pages(struct mm_s + + /* first iteration or cross vma bound */ + if (!vma || start >= vma->vm_end) { +- vma = find_extend_vma(mm, start); ++ vma = vma_lookup(mm, start); + if (!vma && in_gate_area(mm, start)) { + ret = get_gate_page(mm, start & PAGE_MASK, + gup_flags, &vma, +@@ -1351,8 +1351,8 @@ int fixup_user_fault(struct mm_struct *m + fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + + retry: +- vma = find_extend_vma(mm, address); +- if (!vma || address < vma->vm_start) ++ vma = vma_lookup(mm, address); ++ if (!vma) + return -EFAULT; + + if (!vma_permits_fault(vma, fault_flags)) +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -5352,7 +5352,7 @@ struct vm_area_struct *lock_mm_and_find_ + goto fail; + } + +- if (expand_stack_locked(vma, addr, true)) ++ if (expand_stack_locked(vma, addr)) + goto fail; + + success: +@@ -5636,6 +5636,14 @@ int __access_remote_vm(struct mm_struct + if (mmap_read_lock_killable(mm)) + return 0; + ++ /* We might need to expand the stack to access it */ ++ vma = vma_lookup(mm, addr); ++ if (!vma) { ++ vma = expand_stack(mm, addr); ++ if (!vma) ++ return 0; ++ } ++ + /* ignore errors, just check how much was successfully transferred */ + while (len) { + int bytes, ret, offset; +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -1945,8 +1945,7 @@ static int acct_stack_growth(struct vm_a + * PA-RISC uses this for its stack; IA64 for its Register Backing Store. + * vma is the last one with address > vma->vm_end. Have to extend vma. 
+ */ +-int expand_upwards(struct vm_area_struct *vma, unsigned long address, +- bool write_locked) ++static int expand_upwards(struct vm_area_struct *vma, unsigned long address) + { + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *next; +@@ -1970,8 +1969,6 @@ int expand_upwards(struct vm_area_struct + if (gap_addr < address || gap_addr > TASK_SIZE) + gap_addr = TASK_SIZE; + +- if (!write_locked) +- return -EAGAIN; + next = find_vma_intersection(mm, vma->vm_end, gap_addr); + if (next && vma_is_accessible(next)) { + if (!(next->vm_flags & VM_GROWSUP)) +@@ -2039,15 +2036,18 @@ int expand_upwards(struct vm_area_struct + + /* + * vma is the first one with address < vma->vm_start. Have to extend vma. ++ * mmap_lock held for writing. + */ +-int expand_downwards(struct vm_area_struct *vma, unsigned long address, +- bool write_locked) ++int expand_downwards(struct vm_area_struct *vma, unsigned long address) + { + struct mm_struct *mm = vma->vm_mm; + MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start); + struct vm_area_struct *prev; + int error = 0; + ++ if (!(vma->vm_flags & VM_GROWSDOWN)) ++ return -EFAULT; ++ + address &= PAGE_MASK; + if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) + return -EPERM; +@@ -2060,8 +2060,6 @@ int expand_downwards(struct vm_area_stru + vma_is_accessible(prev) && + (address - prev->vm_end < stack_guard_gap)) + return -ENOMEM; +- if (!write_locked && (prev->vm_end == address)) +- return -EAGAIN; + } + + if (mas_preallocate(&mas, vma, GFP_KERNEL)) +@@ -2139,14 +2137,12 @@ static int __init cmdline_parse_stack_gu + __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap); + + #ifdef CONFIG_STACK_GROWSUP +-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, +- bool write_locked) ++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address) + { +- return expand_upwards(vma, address, write_locked); ++ return expand_upwards(vma, address); + } + +-struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, +- unsigned long addr, bool write_locked) ++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr) + { + struct vm_area_struct *vma, *prev; + +@@ -2156,23 +2152,21 @@ struct vm_area_struct *find_extend_vma_l + return vma; + if (!prev) + return NULL; +- if (expand_stack_locked(prev, addr, write_locked)) ++ if (expand_stack_locked(prev, addr)) + return NULL; + if (prev->vm_flags & VM_LOCKED) + populate_vma_page_range(prev, addr, prev->vm_end, NULL); + return prev; + } + #else +-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, +- bool write_locked) ++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address) + { + if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) + return -EINVAL; +- return expand_downwards(vma, address, write_locked); ++ return expand_downwards(vma, address); + } + +-struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, +- unsigned long addr, bool write_locked) ++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr) + { + struct vm_area_struct *vma; + unsigned long start; +@@ -2184,7 +2178,7 @@ struct vm_area_struct *find_extend_vma_l + if (vma->vm_start <= addr) + return vma; + start = vma->vm_start; +- if (expand_stack_locked(vma, addr, write_locked)) ++ if (expand_stack_locked(vma, addr)) + return NULL; + if (vma->vm_flags & VM_LOCKED) + populate_vma_page_range(vma, addr, start, NULL); +@@ -2192,12 +2186,91 @@ struct vm_area_struct *find_extend_vma_l + } + #endif 
+ +-struct vm_area_struct *find_extend_vma(struct mm_struct *mm, +- unsigned long addr) ++/* ++ * IA64 has some horrid mapping rules: it can expand both up and down, ++ * but with various special rules. ++ * ++ * We'll get rid of this architecture eventually, so the ugliness is ++ * temporary. ++ */ ++#ifdef CONFIG_IA64 ++static inline bool vma_expand_ok(struct vm_area_struct *vma, unsigned long addr) ++{ ++ return REGION_NUMBER(addr) == REGION_NUMBER(vma->vm_start) && ++ REGION_OFFSET(addr) < RGN_MAP_LIMIT; ++} ++ ++/* ++ * IA64 stacks grow down, but there's a special register backing store ++ * that can grow up. Only sequentially, though, so the new address must ++ * match vm_end. ++ */ ++static inline int vma_expand_up(struct vm_area_struct *vma, unsigned long addr) ++{ ++ if (!vma_expand_ok(vma, addr)) ++ return -EFAULT; ++ if (vma->vm_end != (addr & PAGE_MASK)) ++ return -EFAULT; ++ return expand_upwards(vma, addr); ++} ++ ++static inline bool vma_expand_down(struct vm_area_struct *vma, unsigned long addr) ++{ ++ if (!vma_expand_ok(vma, addr)) ++ return -EFAULT; ++ return expand_downwards(vma, addr); ++} ++ ++#elif defined(CONFIG_STACK_GROWSUP) ++ ++#define vma_expand_up(vma,addr) expand_upwards(vma, addr) ++#define vma_expand_down(vma, addr) (-EFAULT) ++ ++#else ++ ++#define vma_expand_up(vma,addr) (-EFAULT) ++#define vma_expand_down(vma, addr) expand_downwards(vma, addr) ++ ++#endif ++ ++/* ++ * expand_stack(): legacy interface for page faulting. Don't use unless ++ * you have to. ++ * ++ * This is called with the mm locked for reading, drops the lock, takes ++ * the lock for writing, tries to look up a vma again, expands it if ++ * necessary, and downgrades the lock to reading again. ++ * ++ * If no vma is found or it can't be expanded, it returns NULL and has ++ * dropped the lock. 
++ */ ++struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr) + { +- return find_extend_vma_locked(mm, addr, false); ++ struct vm_area_struct *vma, *prev; ++ ++ mmap_read_unlock(mm); ++ if (mmap_write_lock_killable(mm)) ++ return NULL; ++ ++ vma = find_vma_prev(mm, addr, &prev); ++ if (vma && vma->vm_start <= addr) ++ goto success; ++ ++ if (prev && !vma_expand_up(prev, addr)) { ++ vma = prev; ++ goto success; ++ } ++ ++ if (vma && !vma_expand_down(vma, addr)) ++ goto success; ++ ++ mmap_write_unlock(mm); ++ return NULL; ++ ++success: ++ mmap_write_downgrade(mm); ++ return vma; + } +-EXPORT_SYMBOL_GPL(find_extend_vma); + + /* + * Ok - we have the memory areas we should free on a maple tree so release them, +--- a/mm/nommu.c ++++ b/mm/nommu.c +@@ -682,24 +682,20 @@ struct vm_area_struct *find_vma(struct m + EXPORT_SYMBOL(find_vma); + + /* +- * find a VMA +- * - we don't extend stack VMAs under NOMMU conditions +- */ +-struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) +-{ +- return find_vma(mm, addr); +-} +- +-/* + * expand a stack to a given address + * - not supported under NOMMU conditions + */ +-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, +- bool write_locked) ++int expand_stack_locked(struct vm_area_struct *vma, unsigned long addr) + { + return -ENOMEM; + } + ++struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr) ++{ ++ mmap_read_unlock(mm); ++ return NULL; ++} ++ + /* + * look up the first VMA exactly that exactly matches addr + * - should be called with mm->mmap_lock at least held readlocked diff --git a/queue-6.1/mm-fault-convert-remaining-simple-cases-to-lock_mm_and_find_vma.patch b/queue-6.1/mm-fault-convert-remaining-simple-cases-to-lock_mm_and_find_vma.patch new file mode 100644 index 00000000000..9120b2f6f91 --- /dev/null +++ b/queue-6.1/mm-fault-convert-remaining-simple-cases-to-lock_mm_and_find_vma.patch @@ -0,0 +1,491 @@ +From f128a1b1b5a6b39471d62f1398196631160a24a2 Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Sat, 24 Jun 2023 10:55:38 -0700 +Subject: mm/fault: convert remaining simple cases to lock_mm_and_find_vma() + +From: Linus Torvalds + +commit a050ba1e7422f2cc60ff8bfde3f96d34d00cb585 upstream. + +This does the simple pattern conversion of alpha, arc, csky, hexagon, +loongarch, nios2, sh, sparc32, and xtensa to the lock_mm_and_find_vma() +helper. They all have the regular fault handling pattern without odd +special cases. + +The remaining architectures all have something that keeps us from a +straightforward conversion: ia64 and parisc have stacks that can grow +both up as well as down (and ia64 has special address region checks). + +And m68k, microblaze, openrisc, sparc64, and um end up having extra +rules about only expanding the stack down a limited amount below the +user space stack pointer. That is something that x86 used to do too +(long long ago), and it probably could just be skipped, but it still +makes the conversion less than trivial. + +Note that this conversion was done manually and with the exception of +alpha without any build testing, because I have a fairly limited cross- +building environment. The cases are all simple, and I went through the +changes several times, but... 
+ +Signed-off-by: Linus Torvalds +Signed-off-by: Samuel Mendoza-Jonas +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +--- + arch/alpha/Kconfig | 1 + + arch/alpha/mm/fault.c | 13 +++---------- + arch/arc/Kconfig | 1 + + arch/arc/mm/fault.c | 11 +++-------- + arch/csky/Kconfig | 1 + + arch/csky/mm/fault.c | 22 +++++----------------- + arch/hexagon/Kconfig | 1 + + arch/hexagon/mm/vm_fault.c | 18 ++++-------------- + arch/loongarch/Kconfig | 1 + + arch/loongarch/mm/fault.c | 16 ++++++---------- + arch/nios2/Kconfig | 1 + + arch/nios2/mm/fault.c | 17 ++--------------- + arch/sh/Kconfig | 1 + + arch/sh/mm/fault.c | 17 ++--------------- + arch/sparc/Kconfig | 1 + + arch/sparc/mm/fault_32.c | 32 ++++++++------------------------ + arch/xtensa/Kconfig | 1 + + arch/xtensa/mm/fault.c | 14 +++----------- + 18 files changed, 45 insertions(+), 124 deletions(-) + +--- a/arch/alpha/Kconfig ++++ b/arch/alpha/Kconfig +@@ -28,6 +28,7 @@ config ALPHA + select GENERIC_SMP_IDLE_THREAD + select HAVE_ARCH_AUDITSYSCALL + select HAVE_MOD_ARCH_SPECIFIC ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA + select ODD_RT_SIGACTION + select OLD_SIGSUSPEND +--- a/arch/alpha/mm/fault.c ++++ b/arch/alpha/mm/fault.c +@@ -119,20 +119,12 @@ do_page_fault(unsigned long address, uns + flags |= FAULT_FLAG_USER; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + retry: +- mmap_read_lock(mm); +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (!vma) +- goto bad_area; +- if (vma->vm_start <= address) +- goto good_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (expand_stack(vma, address)) +- goto bad_area; ++ goto bad_area_nosemaphore; + + /* Ok, we have a good vm_area for this memory access, so + we can handle it. 
*/ +- good_area: + si_code = SEGV_ACCERR; + if (cause < 0) { + if (!(vma->vm_flags & VM_EXEC)) +@@ -189,6 +181,7 @@ retry: + bad_area: + mmap_read_unlock(mm); + ++ bad_area_nosemaphore: + if (user_mode(regs)) + goto do_sigsegv; + +--- a/arch/arc/Kconfig ++++ b/arch/arc/Kconfig +@@ -41,6 +41,7 @@ config ARC + select HAVE_PERF_EVENTS + select HAVE_SYSCALL_TRACEPOINTS + select IRQ_DOMAIN ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA + select OF + select OF_EARLY_FLATTREE +--- a/arch/arc/mm/fault.c ++++ b/arch/arc/mm/fault.c +@@ -113,15 +113,9 @@ void do_page_fault(unsigned long address + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + retry: +- mmap_read_lock(mm); +- +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (!vma) +- goto bad_area; +- if (unlikely(address < vma->vm_start)) { +- if (!(vma->vm_flags & VM_GROWSDOWN) || expand_stack(vma, address)) +- goto bad_area; +- } ++ goto bad_area_nosemaphore; + + /* + * vm_area is good, now check permissions for this memory access +@@ -161,6 +155,7 @@ retry: + bad_area: + mmap_read_unlock(mm); + ++bad_area_nosemaphore: + /* + * Major/minor page fault accounting + * (in case of retry we only land here once) +--- a/arch/csky/Kconfig ++++ b/arch/csky/Kconfig +@@ -96,6 +96,7 @@ config CSKY + select HAVE_RSEQ + select HAVE_STACKPROTECTOR + select HAVE_SYSCALL_TRACEPOINTS ++ select LOCK_MM_AND_FIND_VMA + select MAY_HAVE_SPARSE_IRQ + select MODULES_USE_ELF_RELA if MODULES + select OF +--- a/arch/csky/mm/fault.c ++++ b/arch/csky/mm/fault.c +@@ -97,13 +97,12 @@ static inline void mm_fault_error(struct + BUG(); + } + +-static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr) ++static inline void bad_area_nosemaphore(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr) + { + /* + * Something tried to access memory that isn't in our memory map. + * Fix it, but check if it's kernel or user first. + */ +- mmap_read_unlock(mm); + /* User mode accesses just cause a SIGSEGV */ + if (user_mode(regs)) { + do_trap(regs, SIGSEGV, code, addr); +@@ -238,20 +237,9 @@ asmlinkage void do_page_fault(struct pt_ + if (is_write(regs)) + flags |= FAULT_FLAG_WRITE; + retry: +- mmap_read_lock(mm); +- vma = find_vma(mm, addr); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (unlikely(!vma)) { +- bad_area(regs, mm, code, addr); +- return; +- } +- if (likely(vma->vm_start <= addr)) +- goto good_area; +- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { +- bad_area(regs, mm, code, addr); +- return; +- } +- if (unlikely(expand_stack(vma, addr))) { +- bad_area(regs, mm, code, addr); ++ bad_area_nosemaphore(regs, mm, code, addr); + return; + } + +@@ -259,11 +247,11 @@ retry: + * Ok, we have a good vm_area for this memory access, so + * we can handle it. 
+ */ +-good_area: + code = SEGV_ACCERR; + + if (unlikely(access_error(regs, vma))) { +- bad_area(regs, mm, code, addr); ++ mmap_read_unlock(mm); ++ bad_area_nosemaphore(regs, mm, code, addr); + return; + } + +--- a/arch/hexagon/Kconfig ++++ b/arch/hexagon/Kconfig +@@ -28,6 +28,7 @@ config HEXAGON + select GENERIC_SMP_IDLE_THREAD + select STACKTRACE_SUPPORT + select GENERIC_CLOCKEVENTS_BROADCAST ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA + select GENERIC_CPU_DEVICES + select ARCH_WANT_LD_ORPHAN_WARN +--- a/arch/hexagon/mm/vm_fault.c ++++ b/arch/hexagon/mm/vm_fault.c +@@ -57,21 +57,10 @@ void do_page_fault(unsigned long address + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + retry: +- mmap_read_lock(mm); +- vma = find_vma(mm, address); +- if (!vma) +- goto bad_area; ++ vma = lock_mm_and_find_vma(mm, address, regs); ++ if (unlikely(!vma)) ++ goto bad_area_nosemaphore; + +- if (vma->vm_start <= address) +- goto good_area; +- +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- +- if (expand_stack(vma, address)) +- goto bad_area; +- +-good_area: + /* Address space is OK. Now check access rights. */ + si_code = SEGV_ACCERR; + +@@ -140,6 +129,7 @@ good_area: + bad_area: + mmap_read_unlock(mm); + ++bad_area_nosemaphore: + if (user_mode(regs)) { + force_sig_fault(SIGSEGV, si_code, (void __user *)address); + return; +--- a/arch/loongarch/Kconfig ++++ b/arch/loongarch/Kconfig +@@ -107,6 +107,7 @@ config LOONGARCH + select HAVE_VIRT_CPU_ACCOUNTING_GEN if !SMP + select IRQ_FORCED_THREADING + select IRQ_LOONGARCH_CPU ++ select LOCK_MM_AND_FIND_VMA + select MMU_GATHER_MERGE_VMAS if MMU + select MODULES_USE_ELF_RELA if MODULES + select NEED_PER_CPU_EMBED_FIRST_CHUNK +--- a/arch/loongarch/mm/fault.c ++++ b/arch/loongarch/mm/fault.c +@@ -166,22 +166,18 @@ static void __kprobes __do_page_fault(st + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + retry: +- mmap_read_lock(mm); +- vma = find_vma(mm, address); +- if (!vma) +- goto bad_area; +- if (vma->vm_start <= address) +- goto good_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (!expand_stack(vma, address)) +- goto good_area; ++ vma = lock_mm_and_find_vma(mm, address, regs); ++ if (unlikely(!vma)) ++ goto bad_area_nosemaphore; ++ goto good_area; ++ + /* + * Something tried to access memory that isn't in our memory map.. + * Fix it, but check if it's kernel or user first.. + */ + bad_area: + mmap_read_unlock(mm); ++bad_area_nosemaphore: + do_sigsegv(regs, write, address, si_code); + return; + +--- a/arch/nios2/Kconfig ++++ b/arch/nios2/Kconfig +@@ -16,6 +16,7 @@ config NIOS2 + select HAVE_ARCH_TRACEHOOK + select HAVE_ARCH_KGDB + select IRQ_DOMAIN ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA + select OF + select OF_EARLY_FLATTREE +--- a/arch/nios2/mm/fault.c ++++ b/arch/nios2/mm/fault.c +@@ -86,27 +86,14 @@ asmlinkage void do_page_fault(struct pt_ + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + +- if (!mmap_read_trylock(mm)) { +- if (!user_mode(regs) && !search_exception_tables(regs->ea)) +- goto bad_area_nosemaphore; + retry: +- mmap_read_lock(mm); +- } +- +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (!vma) +- goto bad_area; +- if (vma->vm_start <= address) +- goto good_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (expand_stack(vma, address)) +- goto bad_area; ++ goto bad_area_nosemaphore; + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. 
+ */ +-good_area: + code = SEGV_ACCERR; + + switch (cause) { +--- a/arch/sh/Kconfig ++++ b/arch/sh/Kconfig +@@ -56,6 +56,7 @@ config SUPERH + select HAVE_STACKPROTECTOR + select HAVE_SYSCALL_TRACEPOINTS + select IRQ_FORCED_THREADING ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA + select NEED_SG_DMA_LENGTH + select NO_DMA if !MMU && !DMA_COHERENT +--- a/arch/sh/mm/fault.c ++++ b/arch/sh/mm/fault.c +@@ -439,21 +439,9 @@ asmlinkage void __kprobes do_page_fault( + } + + retry: +- mmap_read_lock(mm); +- +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (unlikely(!vma)) { +- bad_area(regs, error_code, address); +- return; +- } +- if (likely(vma->vm_start <= address)) +- goto good_area; +- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { +- bad_area(regs, error_code, address); +- return; +- } +- if (unlikely(expand_stack(vma, address))) { +- bad_area(regs, error_code, address); ++ bad_area_nosemaphore(regs, error_code, address); + return; + } + +@@ -461,7 +449,6 @@ retry: + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ +-good_area: + if (unlikely(access_error(error_code, vma))) { + bad_area_access_error(regs, error_code, address); + return; +--- a/arch/sparc/Kconfig ++++ b/arch/sparc/Kconfig +@@ -56,6 +56,7 @@ config SPARC32 + select DMA_DIRECT_REMAP + select GENERIC_ATOMIC64 + select HAVE_UID16 ++ select LOCK_MM_AND_FIND_VMA + select OLD_SIGACTION + select ZONE_DMA + +--- a/arch/sparc/mm/fault_32.c ++++ b/arch/sparc/mm/fault_32.c +@@ -143,28 +143,19 @@ asmlinkage void do_sparc_fault(struct pt + if (pagefault_disabled() || !mm) + goto no_context; + ++ if (!from_user && address >= PAGE_OFFSET) ++ goto no_context; ++ + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + + retry: +- mmap_read_lock(mm); +- +- if (!from_user && address >= PAGE_OFFSET) +- goto bad_area; +- +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (!vma) +- goto bad_area; +- if (vma->vm_start <= address) +- goto good_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (expand_stack(vma, address)) +- goto bad_area; ++ goto bad_area_nosemaphore; + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. 
+ */ +-good_area: + code = SEGV_ACCERR; + if (write) { + if (!(vma->vm_flags & VM_WRITE)) +@@ -318,17 +309,9 @@ static void force_user_fault(unsigned lo + + code = SEGV_MAPERR; + +- mmap_read_lock(mm); +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (!vma) +- goto bad_area; +- if (vma->vm_start <= address) +- goto good_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (expand_stack(vma, address)) +- goto bad_area; +-good_area: ++ goto bad_area_nosemaphore; + code = SEGV_ACCERR; + if (write) { + if (!(vma->vm_flags & VM_WRITE)) +@@ -347,6 +330,7 @@ good_area: + return; + bad_area: + mmap_read_unlock(mm); ++bad_area_nosemaphore: + __do_fault_siginfo(code, SIGSEGV, tsk->thread.kregs, address); + return; + +--- a/arch/xtensa/Kconfig ++++ b/arch/xtensa/Kconfig +@@ -49,6 +49,7 @@ config XTENSA + select HAVE_SYSCALL_TRACEPOINTS + select HAVE_VIRT_CPU_ACCOUNTING_GEN + select IRQ_DOMAIN ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA + select PERF_USE_VMALLOC + select TRACE_IRQFLAGS_SUPPORT +--- a/arch/xtensa/mm/fault.c ++++ b/arch/xtensa/mm/fault.c +@@ -130,23 +130,14 @@ void do_page_fault(struct pt_regs *regs) + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + + retry: +- mmap_read_lock(mm); +- vma = find_vma(mm, address); +- ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (!vma) +- goto bad_area; +- if (vma->vm_start <= address) +- goto good_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (expand_stack(vma, address)) +- goto bad_area; ++ goto bad_area_nosemaphore; + + /* Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ + +-good_area: + code = SEGV_ACCERR; + + if (is_write) { +@@ -205,6 +196,7 @@ good_area: + */ + bad_area: + mmap_read_unlock(mm); ++bad_area_nosemaphore: + if (user_mode(regs)) { + current->thread.bad_vaddr = address; + current->thread.error_code = is_write; diff --git a/queue-6.1/mm-introduce-new-lock_mm_and_find_vma-page-fault-helper.patch b/queue-6.1/mm-introduce-new-lock_mm_and_find_vma-page-fault-helper.patch new file mode 100644 index 00000000000..a290458cbec --- /dev/null +++ b/queue-6.1/mm-introduce-new-lock_mm_and_find_vma-page-fault-helper.patch @@ -0,0 +1,298 @@ +From 088826669e9cadc96824a9523a799bd6854a31ec Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Thu, 15 Jun 2023 15:17:36 -0700 +Subject: mm: introduce new 'lock_mm_and_find_vma()' page fault helper + +From: Linus Torvalds + +commit c2508ec5a58db67093f4fb8bf89a9a7c53a109e9 upstream. + +.. and make x86 use it. + +This basically extracts the existing x86 "find and expand faulting vma" +code, but extends it to also take the mmap lock for writing in case we +actually do need to expand the vma. + +We've historically short-circuited that case, and have some rather ugly +special logic to serialize the stack segment expansion (since we only +hold the mmap lock for reading) that doesn't match the normal VM +locking. + +That slight violation of locking worked well, right up until it didn't: +the maple tree code really does want proper locking even for simple +extension of an existing vma. + +So extract the code for "look up the vma of the fault" from x86, fix it +up to do the necessary write locking, and make it available as a helper +function for other architectures that can use the common helper. + +Note: I say "common helper", but it really only handles the normal +stack-grows-down case. Which is all architectures except for PA-RISC +and IA64. 
So some rare architectures can't use the helper, but if they +care they'll just need to open-code this logic. + +It's also worth pointing out that this code really would like to have an +optimistic "mmap_upgrade_trylock()" to make it quicker to go from a +read-lock (for the common case) to taking the write lock (for having to +extend the vma) in the normal single-threaded situation where there is +no other locking activity. + +But that _is_ all the very uncommon special case, so while it would be +nice to have such an operation, it probably doesn't matter in reality. +I did put in the skeleton code for such a possible future expansion, +even if it only acts as pseudo-documentation for what we're doing. + +Signed-off-by: Linus Torvalds +[6.1: Ignore CONFIG_PER_VMA_LOCK context] +Signed-off-by: Samuel Mendoza-Jonas +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/Kconfig | 1 + arch/x86/mm/fault.c | 52 ---------------------- + include/linux/mm.h | 2 + mm/Kconfig | 4 + + mm/memory.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++++++++ + 5 files changed, 130 insertions(+), 50 deletions(-) + +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -271,6 +271,7 @@ config X86 + select HAVE_GENERIC_VDSO + select HOTPLUG_SMT if SMP + select IRQ_FORCED_THREADING ++ select LOCK_MM_AND_FIND_VMA + select NEED_PER_CPU_EMBED_FIRST_CHUNK + select NEED_PER_CPU_PAGE_FIRST_CHUNK + select NEED_SG_DMA_LENGTH +--- a/arch/x86/mm/fault.c ++++ b/arch/x86/mm/fault.c +@@ -900,12 +900,6 @@ __bad_area(struct pt_regs *regs, unsigne + __bad_area_nosemaphore(regs, error_code, address, pkey, si_code); + } + +-static noinline void +-bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) +-{ +- __bad_area(regs, error_code, address, 0, SEGV_MAPERR); +-} +- + static inline bool bad_area_access_from_pkeys(unsigned long error_code, + struct vm_area_struct *vma) + { +@@ -1354,51 +1348,10 @@ void do_user_addr_fault(struct pt_regs * + } + #endif + +- /* +- * Kernel-mode access to the user address space should only occur +- * on well-defined single instructions listed in the exception +- * tables. But, an erroneous kernel fault occurring outside one of +- * those areas which also holds mmap_lock might deadlock attempting +- * to validate the fault against the address space. +- * +- * Only do the expensive exception table search when we might be at +- * risk of a deadlock. This happens if we +- * 1. Failed to acquire mmap_lock, and +- * 2. The access did not originate in userspace. +- */ +- if (unlikely(!mmap_read_trylock(mm))) { +- if (!user_mode(regs) && !search_exception_tables(regs->ip)) { +- /* +- * Fault from code in kernel from +- * which we do not expect faults. 
+- */ +- bad_area_nosemaphore(regs, error_code, address); +- return; +- } + retry: +- mmap_read_lock(mm); +- } else { +- /* +- * The above down_read_trylock() might have succeeded in +- * which case we'll have missed the might_sleep() from +- * down_read(): +- */ +- might_sleep(); +- } +- +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (unlikely(!vma)) { +- bad_area(regs, error_code, address); +- return; +- } +- if (likely(vma->vm_start <= address)) +- goto good_area; +- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { +- bad_area(regs, error_code, address); +- return; +- } +- if (unlikely(expand_stack(vma, address))) { +- bad_area(regs, error_code, address); ++ bad_area_nosemaphore(regs, error_code, address); + return; + } + +@@ -1406,7 +1359,6 @@ retry: + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ +-good_area: + if (unlikely(access_error(error_code, vma))) { + bad_area_access_error(regs, error_code, address, vma); + return; +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -1932,6 +1932,8 @@ void unmap_mapping_pages(struct address_ + pgoff_t start, pgoff_t nr, bool even_cows); + void unmap_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen, int even_cows); ++struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, ++ unsigned long address, struct pt_regs *regs); + #else + static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -1150,6 +1150,10 @@ config LRU_GEN_STATS + This option has a per-memcg and per-node memory overhead. + # } + ++config LOCK_MM_AND_FIND_VMA ++ bool ++ depends on !STACK_GROWSUP ++ + source "mm/damon/Kconfig" + + endmenu +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -5246,6 +5246,127 @@ vm_fault_t handle_mm_fault(struct vm_are + } + EXPORT_SYMBOL_GPL(handle_mm_fault); + ++#ifdef CONFIG_LOCK_MM_AND_FIND_VMA ++#include ++ ++static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) ++{ ++ /* Even if this succeeds, make it clear we *might* have slept */ ++ if (likely(mmap_read_trylock(mm))) { ++ might_sleep(); ++ return true; ++ } ++ ++ if (regs && !user_mode(regs)) { ++ unsigned long ip = instruction_pointer(regs); ++ if (!search_exception_tables(ip)) ++ return false; ++ } ++ ++ mmap_read_lock(mm); ++ return true; ++} ++ ++static inline bool mmap_upgrade_trylock(struct mm_struct *mm) ++{ ++ /* ++ * We don't have this operation yet. ++ * ++ * It should be easy enough to do: it's basically a ++ * atomic_long_try_cmpxchg_acquire() ++ * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but ++ * it also needs the proper lockdep magic etc. ++ */ ++ return false; ++} ++ ++static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) ++{ ++ mmap_read_unlock(mm); ++ if (regs && !user_mode(regs)) { ++ unsigned long ip = instruction_pointer(regs); ++ if (!search_exception_tables(ip)) ++ return false; ++ } ++ mmap_write_lock(mm); ++ return true; ++} ++ ++/* ++ * Helper for page fault handling. ++ * ++ * This is kind of equivalend to "mmap_read_lock()" followed ++ * by "find_extend_vma()", except it's a lot more careful about ++ * the locking (and will drop the lock on failure). 
++ * ++ * For example, if we have a kernel bug that causes a page ++ * fault, we don't want to just use mmap_read_lock() to get ++ * the mm lock, because that would deadlock if the bug were ++ * to happen while we're holding the mm lock for writing. ++ * ++ * So this checks the exception tables on kernel faults in ++ * order to only do this all for instructions that are actually ++ * expected to fault. ++ * ++ * We can also actually take the mm lock for writing if we ++ * need to extend the vma, which helps the VM layer a lot. ++ */ ++struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, ++ unsigned long addr, struct pt_regs *regs) ++{ ++ struct vm_area_struct *vma; ++ ++ if (!get_mmap_lock_carefully(mm, regs)) ++ return NULL; ++ ++ vma = find_vma(mm, addr); ++ if (likely(vma && (vma->vm_start <= addr))) ++ return vma; ++ ++ /* ++ * Well, dang. We might still be successful, but only ++ * if we can extend a vma to do so. ++ */ ++ if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) { ++ mmap_read_unlock(mm); ++ return NULL; ++ } ++ ++ /* ++ * We can try to upgrade the mmap lock atomically, ++ * in which case we can continue to use the vma ++ * we already looked up. ++ * ++ * Otherwise we'll have to drop the mmap lock and ++ * re-take it, and also look up the vma again, ++ * re-checking it. ++ */ ++ if (!mmap_upgrade_trylock(mm)) { ++ if (!upgrade_mmap_lock_carefully(mm, regs)) ++ return NULL; ++ ++ vma = find_vma(mm, addr); ++ if (!vma) ++ goto fail; ++ if (vma->vm_start <= addr) ++ goto success; ++ if (!(vma->vm_flags & VM_GROWSDOWN)) ++ goto fail; ++ } ++ ++ if (expand_stack(vma, addr)) ++ goto fail; ++ ++success: ++ mmap_write_downgrade(mm); ++ return vma; ++ ++fail: ++ mmap_write_unlock(mm); ++ return NULL; ++} ++#endif ++ + #ifndef __PAGETABLE_P4D_FOLDED + /* + * Allocate p4d page table. diff --git a/queue-6.1/mm-make-find_extend_vma-fail-if-write-lock-not-held.patch b/queue-6.1/mm-make-find_extend_vma-fail-if-write-lock-not-held.patch new file mode 100644 index 00000000000..37abf97c8f9 --- /dev/null +++ b/queue-6.1/mm-make-find_extend_vma-fail-if-write-lock-not-held.patch @@ -0,0 +1,244 @@ +From 37a9a30aeabe9fd620bcda2bb333f28a1593820d Mon Sep 17 00:00:00 2001 +From: "Liam R. Howlett" +Date: Fri, 16 Jun 2023 15:58:54 -0700 +Subject: mm: make find_extend_vma() fail if write lock not held + +From: "Liam R. Howlett" + +commit f440fa1ac955e2898893f9301568435eb5cdfc4b upstream. + +Make calls to extend_vma() and find_extend_vma() fail if the write lock +is required. + +To avoid making this a flag-day event, this still allows the old +read-locking case for the trivial situations, and passes in a flag to +say "is it write-locked". That way write-lockers can say "yes, I'm +being careful", and legacy users will continue to work in all the common +cases until they have been fully converted to the new world order. + +Co-Developed-by: Matthew Wilcox (Oracle) +Signed-off-by: Matthew Wilcox (Oracle) +Signed-off-by: Liam R. 
Howlett +Signed-off-by: Linus Torvalds +Signed-off-by: Samuel Mendoza-Jonas +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +--- + fs/binfmt_elf.c | 6 +++--- + fs/exec.c | 5 +++-- + include/linux/mm.h | 10 +++++++--- + mm/memory.c | 2 +- + mm/mmap.c | 50 +++++++++++++++++++++++++++++++++----------------- + mm/nommu.c | 3 ++- + 6 files changed, 49 insertions(+), 27 deletions(-) + +--- a/fs/binfmt_elf.c ++++ b/fs/binfmt_elf.c +@@ -315,10 +315,10 @@ create_elf_tables(struct linux_binprm *b + * Grow the stack manually; some architectures have a limit on how + * far ahead a user-space access may be in order to grow the stack. + */ +- if (mmap_read_lock_killable(mm)) ++ if (mmap_write_lock_killable(mm)) + return -EINTR; +- vma = find_extend_vma(mm, bprm->p); +- mmap_read_unlock(mm); ++ vma = find_extend_vma_locked(mm, bprm->p, true); ++ mmap_write_unlock(mm); + if (!vma) + return -EFAULT; + +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -203,7 +203,8 @@ static struct page *get_arg_page(struct + + #ifdef CONFIG_STACK_GROWSUP + if (write) { +- ret = expand_downwards(bprm->vma, pos); ++ /* We claim to hold the lock - nobody to race with */ ++ ret = expand_downwards(bprm->vma, pos, true); + if (ret < 0) + return NULL; + } +@@ -854,7 +855,7 @@ int setup_arg_pages(struct linux_binprm + stack_base = vma->vm_start - stack_expand; + #endif + current->mm->start_stack = bprm->p; +- ret = expand_stack(vma, stack_base); ++ ret = expand_stack_locked(vma, stack_base, true); + if (ret) + ret = -EFAULT; + +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -2810,11 +2810,13 @@ extern vm_fault_t filemap_page_mkwrite(s + + extern unsigned long stack_guard_gap; + /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ +-extern int expand_stack(struct vm_area_struct *vma, unsigned long address); ++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, ++ bool write_locked); ++#define expand_stack(vma,addr) expand_stack_locked(vma,addr,false) + + /* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */ +-extern int expand_downwards(struct vm_area_struct *vma, +- unsigned long address); ++int expand_downwards(struct vm_area_struct *vma, unsigned long address, ++ bool write_locked); + #if VM_GROWSUP + extern int expand_upwards(struct vm_area_struct *vma, unsigned long address); + #else +@@ -2915,6 +2917,8 @@ unsigned long change_prot_numa(struct vm + #endif + + struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); ++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *, ++ unsigned long addr, bool write_locked); + int remap_pfn_range(struct vm_area_struct *, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t); + int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -5352,7 +5352,7 @@ struct vm_area_struct *lock_mm_and_find_ + goto fail; + } + +- if (expand_stack(vma, addr)) ++ if (expand_stack_locked(vma, addr, true)) + goto fail; + + success: +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -1945,7 +1945,8 @@ static int acct_stack_growth(struct vm_a + * PA-RISC uses this for its stack; IA64 for its Register Backing Store. + * vma is the last one with address > vma->vm_end. Have to extend vma. 
+ */ +-int expand_upwards(struct vm_area_struct *vma, unsigned long address) ++int expand_upwards(struct vm_area_struct *vma, unsigned long address, ++ bool write_locked) + { + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *next; +@@ -1969,6 +1970,8 @@ int expand_upwards(struct vm_area_struct + if (gap_addr < address || gap_addr > TASK_SIZE) + gap_addr = TASK_SIZE; + ++ if (!write_locked) ++ return -EAGAIN; + next = find_vma_intersection(mm, vma->vm_end, gap_addr); + if (next && vma_is_accessible(next)) { + if (!(next->vm_flags & VM_GROWSUP)) +@@ -2037,7 +2040,8 @@ int expand_upwards(struct vm_area_struct + /* + * vma is the first one with address < vma->vm_start. Have to extend vma. + */ +-int expand_downwards(struct vm_area_struct *vma, unsigned long address) ++int expand_downwards(struct vm_area_struct *vma, unsigned long address, ++ bool write_locked) + { + struct mm_struct *mm = vma->vm_mm; + MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start); +@@ -2051,10 +2055,13 @@ int expand_downwards(struct vm_area_stru + /* Enforce stack_guard_gap */ + prev = mas_prev(&mas, 0); + /* Check that both stack segments have the same anon_vma? */ +- if (prev && !(prev->vm_flags & VM_GROWSDOWN) && +- vma_is_accessible(prev)) { +- if (address - prev->vm_end < stack_guard_gap) ++ if (prev) { ++ if (!(prev->vm_flags & VM_GROWSDOWN) && ++ vma_is_accessible(prev) && ++ (address - prev->vm_end < stack_guard_gap)) + return -ENOMEM; ++ if (!write_locked && (prev->vm_end == address)) ++ return -EAGAIN; + } + + if (mas_preallocate(&mas, vma, GFP_KERNEL)) +@@ -2132,13 +2139,14 @@ static int __init cmdline_parse_stack_gu + __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap); + + #ifdef CONFIG_STACK_GROWSUP +-int expand_stack(struct vm_area_struct *vma, unsigned long address) ++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, ++ bool write_locked) + { +- return expand_upwards(vma, address); ++ return expand_upwards(vma, address, write_locked); + } + +-struct vm_area_struct * +-find_extend_vma(struct mm_struct *mm, unsigned long addr) ++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, ++ unsigned long addr, bool write_locked) + { + struct vm_area_struct *vma, *prev; + +@@ -2146,20 +2154,25 @@ find_extend_vma(struct mm_struct *mm, un + vma = find_vma_prev(mm, addr, &prev); + if (vma && (vma->vm_start <= addr)) + return vma; +- if (!prev || expand_stack(prev, addr)) ++ if (!prev) ++ return NULL; ++ if (expand_stack_locked(prev, addr, write_locked)) + return NULL; + if (prev->vm_flags & VM_LOCKED) + populate_vma_page_range(prev, addr, prev->vm_end, NULL); + return prev; + } + #else +-int expand_stack(struct vm_area_struct *vma, unsigned long address) ++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, ++ bool write_locked) + { +- return expand_downwards(vma, address); ++ if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) ++ return -EINVAL; ++ return expand_downwards(vma, address, write_locked); + } + +-struct vm_area_struct * +-find_extend_vma(struct mm_struct *mm, unsigned long addr) ++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, ++ unsigned long addr, bool write_locked) + { + struct vm_area_struct *vma; + unsigned long start; +@@ -2170,10 +2183,8 @@ find_extend_vma(struct mm_struct *mm, un + return NULL; + if (vma->vm_start <= addr) + return vma; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- return NULL; + start = vma->vm_start; +- if (expand_stack(vma, addr)) ++ if (expand_stack_locked(vma, addr, 
write_locked)) + return NULL; + if (vma->vm_flags & VM_LOCKED) + populate_vma_page_range(vma, addr, start, NULL); +@@ -2181,6 +2192,11 @@ find_extend_vma(struct mm_struct *mm, un + } + #endif + ++struct vm_area_struct *find_extend_vma(struct mm_struct *mm, ++ unsigned long addr) ++{ ++ return find_extend_vma_locked(mm, addr, false); ++} + EXPORT_SYMBOL_GPL(find_extend_vma); + + /* +--- a/mm/nommu.c ++++ b/mm/nommu.c +@@ -694,7 +694,8 @@ struct vm_area_struct *find_extend_vma(s + * expand a stack to a given address + * - not supported under NOMMU conditions + */ +-int expand_stack(struct vm_area_struct *vma, unsigned long address) ++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, ++ bool write_locked) + { + return -ENOMEM; + } diff --git a/queue-6.1/mm-make-the-page-fault-mmap-locking-killable.patch b/queue-6.1/mm-make-the-page-fault-mmap-locking-killable.patch new file mode 100644 index 00000000000..a78615038c3 --- /dev/null +++ b/queue-6.1/mm-make-the-page-fault-mmap-locking-killable.patch @@ -0,0 +1,48 @@ +From 92a6879f1c3fc7fdf6660b10be045c457ec697c6 Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Thu, 15 Jun 2023 16:17:48 -0700 +Subject: mm: make the page fault mmap locking killable + +From: Linus Torvalds + +commit eda0047296a16d65a7f2bc60a408f70d178b2014 upstream. + +This is done as a separate patch from introducing the new +lock_mm_and_find_vma() helper, because while it's an obvious change, +it's not what x86 used to do in this area. + +We already abort the page fault on fatal signals anyway, so why should +we wait for the mmap lock only to then abort later? With the new helper +function that returns without the lock held on failure anyway, this is +particularly easy and straightforward. + +Signed-off-by: Linus Torvalds +Signed-off-by: Samuel Mendoza-Jonas +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +--- + mm/memory.c | 6 ++---- + 1 file changed, 2 insertions(+), 4 deletions(-) + +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -5263,8 +5263,7 @@ static inline bool get_mmap_lock_careful + return false; + } + +- mmap_read_lock(mm); +- return true; ++ return !mmap_read_lock_killable(mm); + } + + static inline bool mmap_upgrade_trylock(struct mm_struct *mm) +@@ -5288,8 +5287,7 @@ static inline bool upgrade_mmap_lock_car + if (!search_exception_tables(ip)) + return false; + } +- mmap_write_lock(mm); +- return true; ++ return !mmap_write_lock_killable(mm); + } + + /* diff --git a/queue-6.1/powerpc-mm-convert-coprocessor-fault-to-lock_mm_and_find_vma.patch b/queue-6.1/powerpc-mm-convert-coprocessor-fault-to-lock_mm_and_find_vma.patch new file mode 100644 index 00000000000..04d32490115 --- /dev/null +++ b/queue-6.1/powerpc-mm-convert-coprocessor-fault-to-lock_mm_and_find_vma.patch @@ -0,0 +1,49 @@ +From d47a1e567e9744a2a097ae2a39a2b028619d1f15 Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Sat, 24 Jun 2023 11:17:05 -0700 +Subject: powerpc/mm: convert coprocessor fault to lock_mm_and_find_vma() + +From: Linus Torvalds + +commit 2cd76c50d0b41cec5c87abfcdf25b236a2793fb6 upstream. + +This is one of the simple cases, except there's no pt_regs pointer. +Which is fine, as lock_mm_and_find_vma() is set up to work fine with a +NULL pt_regs. + +Powerpc already enabled LOCK_MM_AND_FIND_VMA for the main CPU faulting, +so we can just use the helper without any extra work. 
+ +Signed-off-by: Linus Torvalds +Signed-off-by: Samuel Mendoza-Jonas +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +--- + arch/powerpc/mm/copro_fault.c | 14 +++----------- + 1 file changed, 3 insertions(+), 11 deletions(-) + +--- a/arch/powerpc/mm/copro_fault.c ++++ b/arch/powerpc/mm/copro_fault.c +@@ -33,19 +33,11 @@ int copro_handle_mm_fault(struct mm_stru + if (mm->pgd == NULL) + return -EFAULT; + +- mmap_read_lock(mm); +- ret = -EFAULT; +- vma = find_vma(mm, ea); ++ vma = lock_mm_and_find_vma(mm, ea, NULL); + if (!vma) +- goto out_unlock; +- +- if (ea < vma->vm_start) { +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto out_unlock; +- if (expand_stack(vma, ea)) +- goto out_unlock; +- } ++ return -EFAULT; + ++ ret = -EFAULT; + is_write = dsisr & DSISR_ISSTORE; + if (is_write) { + if (!(vma->vm_flags & VM_WRITE)) diff --git a/queue-6.1/powerpc-mm-convert-to-using-lock_mm_and_find_vma.patch b/queue-6.1/powerpc-mm-convert-to-using-lock_mm_and_find_vma.patch new file mode 100644 index 00000000000..38a8974c7b4 --- /dev/null +++ b/queue-6.1/powerpc-mm-convert-to-using-lock_mm_and_find_vma.patch @@ -0,0 +1,88 @@ +From 689298e7d498f2c6d3e8116bce0a7c769e5369dc Mon Sep 17 00:00:00 2001 +From: Michael Ellerman +Date: Fri, 16 Jun 2023 15:51:29 +1000 +Subject: powerpc/mm: Convert to using lock_mm_and_find_vma() + +From: Michael Ellerman + +commit e6fe228c4ffafdfc970cf6d46883a1f481baf7ea upstream. + +Signed-off-by: Michael Ellerman +Signed-off-by: Linus Torvalds +Signed-off-by: Samuel Mendoza-Jonas +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +--- + arch/powerpc/Kconfig | 1 + + arch/powerpc/mm/fault.c | 41 ++++------------------------------------- + 2 files changed, 5 insertions(+), 37 deletions(-) + +--- a/arch/powerpc/Kconfig ++++ b/arch/powerpc/Kconfig +@@ -257,6 +257,7 @@ config PPC + select IRQ_DOMAIN + select IRQ_FORCED_THREADING + select KASAN_VMALLOC if KASAN && MODULES ++ select LOCK_MM_AND_FIND_VMA + select MMU_GATHER_PAGE_SIZE + select MMU_GATHER_RCU_TABLE_FREE + select MMU_GATHER_MERGE_VMAS +--- a/arch/powerpc/mm/fault.c ++++ b/arch/powerpc/mm/fault.c +@@ -84,11 +84,6 @@ static int __bad_area(struct pt_regs *re + return __bad_area_nosemaphore(regs, address, si_code); + } + +-static noinline int bad_area(struct pt_regs *regs, unsigned long address) +-{ +- return __bad_area(regs, address, SEGV_MAPERR); +-} +- + static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address, + struct vm_area_struct *vma) + { +@@ -481,40 +476,12 @@ static int ___do_page_fault(struct pt_re + * we will deadlock attempting to validate the fault against the + * address space. Luckily the kernel only validly references user + * space from well defined areas of code, which are listed in the +- * exceptions table. +- * +- * As the vast majority of faults will be valid we will only perform +- * the source reference check when there is a possibility of a deadlock. +- * Attempt to lock the address space, if we cannot we then validate the +- * source. If this is invalid we can skip the address space check, +- * thus avoiding the deadlock. +- */ +- if (unlikely(!mmap_read_trylock(mm))) { +- if (!is_user && !search_exception_tables(regs->nip)) +- return bad_area_nosemaphore(regs, address); +- ++ * exceptions table. lock_mm_and_find_vma() handles that logic. 
++ */ + retry: +- mmap_read_lock(mm); +- } else { +- /* +- * The above down_read_trylock() might have succeeded in +- * which case we'll have missed the might_sleep() from +- * down_read(): +- */ +- might_sleep(); +- } +- +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (unlikely(!vma)) +- return bad_area(regs, address); +- +- if (unlikely(vma->vm_start > address)) { +- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) +- return bad_area(regs, address); +- +- if (unlikely(expand_stack(vma, address))) +- return bad_area(regs, address); +- } ++ return bad_area_nosemaphore(regs, address); + + if (unlikely(access_pkey_error(is_write, is_exec, + (error_code & DSISR_KEYFAULT), vma))) diff --git a/queue-6.1/riscv-mm-convert-to-using-lock_mm_and_find_vma.patch b/queue-6.1/riscv-mm-convert-to-using-lock_mm_and_find_vma.patch new file mode 100644 index 00000000000..aa14f598a26 --- /dev/null +++ b/queue-6.1/riscv-mm-convert-to-using-lock_mm_and_find_vma.patch @@ -0,0 +1,98 @@ +From a907c689b4e7014c73c7c34fb1520431b75c787c Mon Sep 17 00:00:00 2001 +From: Ben Hutchings +Date: Thu, 22 Jun 2023 20:18:18 +0200 +Subject: riscv/mm: Convert to using lock_mm_and_find_vma() + +From: Ben Hutchings + +commit 7267ef7b0b77f4ed23b7b3c87d8eca7bd9c2d007 upstream. + +Signed-off-by: Ben Hutchings +Signed-off-by: Linus Torvalds +[6.1: Kconfig context] +Signed-off-by: Samuel Mendoza-Jonas +Signed-off-by: David Woodhouse +Signed-off-by: Greg Kroah-Hartman +--- + arch/riscv/Kconfig | 1 + + arch/riscv/mm/fault.c | 31 +++++++++++++------------------ + 2 files changed, 14 insertions(+), 18 deletions(-) + +--- a/arch/riscv/Kconfig ++++ b/arch/riscv/Kconfig +@@ -114,6 +114,7 @@ config RISCV + select HAVE_RSEQ + select IRQ_DOMAIN + select IRQ_FORCED_THREADING ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA if MODULES + select MODULE_SECTIONS if MODULES + select OF +--- a/arch/riscv/mm/fault.c ++++ b/arch/riscv/mm/fault.c +@@ -83,13 +83,13 @@ static inline void mm_fault_error(struct + BUG(); + } + +-static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr) ++static inline void ++bad_area_nosemaphore(struct pt_regs *regs, int code, unsigned long addr) + { + /* + * Something tried to access memory that isn't in our memory map. + * Fix it, but check if it's kernel or user first. 
+ */ +- mmap_read_unlock(mm); + /* User mode accesses just cause a SIGSEGV */ + if (user_mode(regs)) { + do_trap(regs, SIGSEGV, code, addr); +@@ -99,6 +99,15 @@ static inline void bad_area(struct pt_re + no_context(regs, addr); + } + ++static inline void ++bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, ++ unsigned long addr) ++{ ++ mmap_read_unlock(mm); ++ ++ bad_area_nosemaphore(regs, code, addr); ++} ++ + static inline void vmalloc_fault(struct pt_regs *regs, int code, unsigned long addr) + { + pgd_t *pgd, *pgd_k; +@@ -281,23 +290,10 @@ asmlinkage void do_page_fault(struct pt_ + else if (cause == EXC_INST_PAGE_FAULT) + flags |= FAULT_FLAG_INSTRUCTION; + retry: +- mmap_read_lock(mm); +- vma = find_vma(mm, addr); ++ vma = lock_mm_and_find_vma(mm, addr, regs); + if (unlikely(!vma)) { + tsk->thread.bad_cause = cause; +- bad_area(regs, mm, code, addr); +- return; +- } +- if (likely(vma->vm_start <= addr)) +- goto good_area; +- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { +- tsk->thread.bad_cause = cause; +- bad_area(regs, mm, code, addr); +- return; +- } +- if (unlikely(expand_stack(vma, addr))) { +- tsk->thread.bad_cause = cause; +- bad_area(regs, mm, code, addr); ++ bad_area_nosemaphore(regs, code, addr); + return; + } + +@@ -305,7 +301,6 @@ retry: + * Ok, we have a good vm_area for this memory access, so + * we can handle it. + */ +-good_area: + code = SEGV_ACCERR; + + if (unlikely(access_error(cause, vma))) { diff --git a/queue-6.1/series b/queue-6.1/series index eebea1092ff..34905d9a593 100644 --- a/queue-6.1/series +++ b/queue-6.1/series @@ -11,3 +11,16 @@ x86-smp-use-dedicated-cache-line-for-mwait_play_dead.patch x86-smp-cure-kexec-vs.-mwait_play_dead-breakage.patch can-isotp-isotp_sendmsg-fix-return-error-fix-on-tx-path.patch maple_tree-fix-potential-out-of-bounds-access-in-mas_wr_end_piv.patch + +mm-introduce-new-lock_mm_and_find_vma-page-fault-helper.patch +mm-make-the-page-fault-mmap-locking-killable.patch +arm64-mm-convert-to-using-lock_mm_and_find_vma.patch +powerpc-mm-convert-to-using-lock_mm_and_find_vma.patch +mips-mm-convert-to-using-lock_mm_and_find_vma.patch +riscv-mm-convert-to-using-lock_mm_and_find_vma.patch +arm-mm-convert-to-using-lock_mm_and_find_vma.patch +mm-fault-convert-remaining-simple-cases-to-lock_mm_and_find_vma.patch +powerpc-mm-convert-coprocessor-fault-to-lock_mm_and_find_vma.patch +mm-make-find_extend_vma-fail-if-write-lock-not-held.patch +execve-expand-new-process-stack-manually-ahead-of-time.patch +mm-always-expand-the-stack-with-the-mmap-write-lock-held.patch diff --git a/queue-6.3/arm-mm-convert-to-using-lock_mm_and_find_vma.patch b/queue-6.3/arm-mm-convert-to-using-lock_mm_and_find_vma.patch new file mode 100644 index 00000000000..708b2a7de56 --- /dev/null +++ b/queue-6.3/arm-mm-convert-to-using-lock_mm_and_find_vma.patch @@ -0,0 +1,136 @@ +From 8b35ca3e45e35a26a21427f35d4093606e93ad0a Mon Sep 17 00:00:00 2001 +From: Ben Hutchings +Date: Thu, 22 Jun 2023 21:24:30 +0200 +Subject: arm/mm: Convert to using lock_mm_and_find_vma() + +From: Ben Hutchings + +commit 8b35ca3e45e35a26a21427f35d4093606e93ad0a upstream. + +arm has an additional check for address < FIRST_USER_ADDRESS before +expanding the stack. Since FIRST_USER_ADDRESS is defined everywhere +(generally as 0), move that check to the generic expand_downwards(). 
+ +Signed-off-by: Ben Hutchings +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm/Kconfig | 1 + arch/arm/mm/fault.c | 63 +++++++++++----------------------------------------- + mm/mmap.c | 2 - + 3 files changed, 16 insertions(+), 50 deletions(-) + +--- a/arch/arm/Kconfig ++++ b/arch/arm/Kconfig +@@ -125,6 +125,7 @@ config ARM + select HAVE_UID16 + select HAVE_VIRT_CPU_ACCOUNTING_GEN + select IRQ_FORCED_THREADING ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_REL + select NEED_DMA_MAP_STATE + select OF_EARLY_FLATTREE if OF +--- a/arch/arm/mm/fault.c ++++ b/arch/arm/mm/fault.c +@@ -232,37 +232,11 @@ static inline bool is_permission_fault(u + return false; + } + +-static vm_fault_t __kprobes +-__do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int flags, +- unsigned long vma_flags, struct pt_regs *regs) +-{ +- struct vm_area_struct *vma = find_vma(mm, addr); +- if (unlikely(!vma)) +- return VM_FAULT_BADMAP; +- +- if (unlikely(vma->vm_start > addr)) { +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- return VM_FAULT_BADMAP; +- if (addr < FIRST_USER_ADDRESS) +- return VM_FAULT_BADMAP; +- if (expand_stack(vma, addr)) +- return VM_FAULT_BADMAP; +- } +- +- /* +- * ok, we have a good vm_area for this memory access, check the +- * permissions on the VMA allow for the fault which occurred. +- */ +- if (!(vma->vm_flags & vma_flags)) +- return VM_FAULT_BADACCESS; +- +- return handle_mm_fault(vma, addr & PAGE_MASK, flags, regs); +-} +- + static int __kprobes + do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) + { + struct mm_struct *mm = current->mm; ++ struct vm_area_struct *vma; + int sig, code; + vm_fault_t fault; + unsigned int flags = FAULT_FLAG_DEFAULT; +@@ -301,31 +275,21 @@ do_page_fault(unsigned long addr, unsign + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); + +- /* +- * As per x86, we may deadlock here. However, since the kernel only +- * validly references user space from well defined areas of the code, +- * we can bug out early if this is from code which shouldn't. +- */ +- if (!mmap_read_trylock(mm)) { +- if (!user_mode(regs) && !search_exception_tables(regs->ARM_pc)) +- goto no_context; + retry: +- mmap_read_lock(mm); +- } else { +- /* +- * The above down_read_trylock() might have succeeded in +- * which case, we'll have missed the might_sleep() from +- * down_read() +- */ +- might_sleep(); +-#ifdef CONFIG_DEBUG_VM +- if (!user_mode(regs) && +- !search_exception_tables(regs->ARM_pc)) +- goto no_context; +-#endif ++ vma = lock_mm_and_find_vma(mm, addr, regs); ++ if (unlikely(!vma)) { ++ fault = VM_FAULT_BADMAP; ++ goto bad_area; + } + +- fault = __do_page_fault(mm, addr, flags, vm_flags, regs); ++ /* ++ * ok, we have a good vm_area for this memory access, check the ++ * permissions on the VMA allow for the fault which occurred. ++ */ ++ if (!(vma->vm_flags & vm_flags)) ++ fault = VM_FAULT_BADACCESS; ++ else ++ fault = handle_mm_fault(vma, addr & PAGE_MASK, flags, regs); + + /* If we need to retry but a fatal signal is pending, handle the + * signal first. We do not need to release the mmap_lock because +@@ -356,6 +320,7 @@ retry: + if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS)))) + return 0; + ++bad_area: + /* + * If we are in kernel mode at this point, we + * have no context to handle this fault with. 
+--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -1999,7 +1999,7 @@ int expand_downwards(struct vm_area_stru + int error = 0; + + address &= PAGE_MASK; +- if (address < mmap_min_addr) ++ if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) + return -EPERM; + + /* Enforce stack_guard_gap */ diff --git a/queue-6.3/arm64-mm-convert-to-using-lock_mm_and_find_vma.patch b/queue-6.3/arm64-mm-convert-to-using-lock_mm_and_find_vma.patch new file mode 100644 index 00000000000..4d947455b65 --- /dev/null +++ b/queue-6.3/arm64-mm-convert-to-using-lock_mm_and_find_vma.patch @@ -0,0 +1,101 @@ +From ae870a68b5d13d67cf4f18d47bb01ee3fee40acb Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Thu, 15 Jun 2023 17:11:44 -0700 +Subject: arm64/mm: Convert to using lock_mm_and_find_vma() + +From: Linus Torvalds + +commit ae870a68b5d13d67cf4f18d47bb01ee3fee40acb upstream. + +This converts arm64 to use the new page fault helper. It was very +straightforward, but still needed a fix for the "obvious" conversion I +initially did. Thanks to Suren for the fix and testing. + +Fixed-and-tested-by: Suren Baghdasaryan +Unnecessary-code-removal-by: Liam R. Howlett +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/Kconfig | 1 + + arch/arm64/mm/fault.c | 44 +++++++------------------------------------- + 2 files changed, 8 insertions(+), 37 deletions(-) + +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -219,6 +219,7 @@ config ARM64 + select IRQ_DOMAIN + select IRQ_FORCED_THREADING + select KASAN_VMALLOC if KASAN ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA + select NEED_DMA_MAP_STATE + select NEED_SG_DMA_LENGTH +--- a/arch/arm64/mm/fault.c ++++ b/arch/arm64/mm/fault.c +@@ -483,27 +483,14 @@ static void do_bad_area(unsigned long fa + #define VM_FAULT_BADMAP ((__force vm_fault_t)0x010000) + #define VM_FAULT_BADACCESS ((__force vm_fault_t)0x020000) + +-static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr, ++static vm_fault_t __do_page_fault(struct mm_struct *mm, ++ struct vm_area_struct *vma, unsigned long addr, + unsigned int mm_flags, unsigned long vm_flags, + struct pt_regs *regs) + { +- struct vm_area_struct *vma = find_vma(mm, addr); +- +- if (unlikely(!vma)) +- return VM_FAULT_BADMAP; +- + /* + * Ok, we have a good vm_area for this memory access, so we can handle + * it. +- */ +- if (unlikely(vma->vm_start > addr)) { +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- return VM_FAULT_BADMAP; +- if (expand_stack(vma, addr)) +- return VM_FAULT_BADMAP; +- } +- +- /* + * Check that the permissions on the VMA allow for the fault which + * occurred. + */ +@@ -585,31 +572,14 @@ static int __kprobes do_page_fault(unsig + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); + +- /* +- * As per x86, we may deadlock here. However, since the kernel only +- * validly references user space from well defined areas of the code, +- * we can bug out early if this is from code which shouldn't. +- */ +- if (!mmap_read_trylock(mm)) { +- if (!user_mode(regs) && !search_exception_tables(regs->pc)) +- goto no_context; + retry: +- mmap_read_lock(mm); +- } else { +- /* +- * The above mmap_read_trylock() might have succeeded in which +- * case, we'll have missed the might_sleep() from down_read(). 
+- */ +- might_sleep(); +-#ifdef CONFIG_DEBUG_VM +- if (!user_mode(regs) && !search_exception_tables(regs->pc)) { +- mmap_read_unlock(mm); +- goto no_context; +- } +-#endif ++ vma = lock_mm_and_find_vma(mm, addr, regs); ++ if (unlikely(!vma)) { ++ fault = VM_FAULT_BADMAP; ++ goto done; + } + +- fault = __do_page_fault(mm, addr, mm_flags, vm_flags, regs); ++ fault = __do_page_fault(mm, vma, addr, mm_flags, vm_flags, regs); + + /* Quick path to respond to signals */ + if (fault_signal_pending(fault, regs)) { diff --git a/queue-6.3/execve-expand-new-process-stack-manually-ahead-of-time.patch b/queue-6.3/execve-expand-new-process-stack-manually-ahead-of-time.patch new file mode 100644 index 00000000000..75b8e5af50f --- /dev/null +++ b/queue-6.3/execve-expand-new-process-stack-manually-ahead-of-time.patch @@ -0,0 +1,88 @@ +From f313c51d26aa87e69633c9b46efb37a930faca71 Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Mon, 19 Jun 2023 11:34:15 -0700 +Subject: execve: expand new process stack manually ahead of time + +From: Linus Torvalds + +commit f313c51d26aa87e69633c9b46efb37a930faca71 upstream. + +This is a small step towards a model where GUP itself would not expand +the stack, and any user that needs GUP to not look up existing mappings, +but actually expand on them, would have to do so manually before-hand, +and with the mm lock held for writing. + +It turns out that execve() already did almost exactly that, except it +didn't take the mm lock at all (it's single-threaded so no locking +technically needed, but it could cause lockdep errors). And it only did +it for the CONFIG_STACK_GROWSUP case, since in that case GUP has +obviously never expanded the stack downwards. + +So just make that CONFIG_STACK_GROWSUP case do the right thing with +locking, and enable it generally. This will eventually help GUP, and in +the meantime avoids a special case and the lockdep issue. + +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + fs/exec.c | 37 +++++++++++++++++++++---------------- + 1 file changed, 21 insertions(+), 16 deletions(-) + +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -199,34 +199,39 @@ static struct page *get_arg_page(struct + int write) + { + struct page *page; ++ struct vm_area_struct *vma = bprm->vma; ++ struct mm_struct *mm = bprm->mm; + int ret; +- unsigned int gup_flags = 0; + +-#ifdef CONFIG_STACK_GROWSUP +- if (write) { +- /* We claim to hold the lock - nobody to race with */ +- ret = expand_downwards(bprm->vma, pos, true); +- if (ret < 0) ++ /* ++ * Avoid relying on expanding the stack down in GUP (which ++ * does not work for STACK_GROWSUP anyway), and just do it ++ * by hand ahead of time. ++ */ ++ if (write && pos < vma->vm_start) { ++ mmap_write_lock(mm); ++ ret = expand_downwards(vma, pos, true); ++ if (unlikely(ret < 0)) { ++ mmap_write_unlock(mm); + return NULL; +- } +-#endif +- +- if (write) +- gup_flags |= FOLL_WRITE; ++ } ++ mmap_write_downgrade(mm); ++ } else ++ mmap_read_lock(mm); + + /* + * We are doing an exec(). 'current' is the process +- * doing the exec and bprm->mm is the new process's mm. ++ * doing the exec and 'mm' is the new process's mm. + */ +- mmap_read_lock(bprm->mm); +- ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags, ++ ret = get_user_pages_remote(mm, pos, 1, ++ write ? 
FOLL_WRITE : 0, + &page, NULL, NULL); +- mmap_read_unlock(bprm->mm); ++ mmap_read_unlock(mm); + if (ret <= 0) + return NULL; + + if (write) +- acct_arg_size(bprm, vma_pages(bprm->vma)); ++ acct_arg_size(bprm, vma_pages(vma)); + + return page; + } diff --git a/queue-6.3/gup-add-warning-if-some-caller-would-seem-to-want-stack-expansion.patch b/queue-6.3/gup-add-warning-if-some-caller-would-seem-to-want-stack-expansion.patch new file mode 100644 index 00000000000..cc1b1efc4d6 --- /dev/null +++ b/queue-6.3/gup-add-warning-if-some-caller-would-seem-to-want-stack-expansion.patch @@ -0,0 +1,59 @@ +From a425ac5365f6cb3cc47bf83e6bff0213c10445f7 Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Sun, 25 Jun 2023 14:02:25 -0700 +Subject: gup: add warning if some caller would seem to want stack expansion + +From: Linus Torvalds + +commit a425ac5365f6cb3cc47bf83e6bff0213c10445f7 upstream. + +It feels very unlikely that anybody would want to do a GUP in an +unmapped area under the stack pointer, but real users sometimes do some +really strange things. So add a (temporary) warning for the case where +a GUP fails and expanding the stack might have made it work. + +It's trivial to do the expansion in the caller as part of getting the mm +lock in the first place - see __access_remote_vm() for ptrace, for +example - it's just that it's unnecessarily painful to do it deep in the +guts of the GUP lookup when we might have to drop and re-take the lock. + +I doubt anybody actually does anything quite this strange, but let's be +proactive: adding these warnings is simple, and will make debugging it +much easier if they trigger. + +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/gup.c | 12 ++++++++++-- + 1 file changed, 10 insertions(+), 2 deletions(-) + +--- a/mm/gup.c ++++ b/mm/gup.c +@@ -1096,7 +1096,11 @@ static long __get_user_pages(struct mm_s + + /* first iteration or cross vma bound */ + if (!vma || start >= vma->vm_end) { +- vma = vma_lookup(mm, start); ++ vma = find_vma(mm, start); ++ if (vma && (start < vma->vm_start)) { ++ WARN_ON_ONCE(vma->vm_flags & VM_GROWSDOWN); ++ vma = NULL; ++ } + if (!vma && in_gate_area(mm, start)) { + ret = get_gate_page(mm, start & PAGE_MASK, + gup_flags, &vma, +@@ -1265,9 +1269,13 @@ int fixup_user_fault(struct mm_struct *m + fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + + retry: +- vma = vma_lookup(mm, address); ++ vma = find_vma(mm, address); + if (!vma) + return -EFAULT; ++ if (address < vma->vm_start ) { ++ WARN_ON_ONCE(vma->vm_flags & VM_GROWSDOWN); ++ return -EFAULT; ++ } + + if (!vma_permits_fault(vma, fault_flags)) + return -EFAULT; diff --git a/queue-6.3/mips-mm-convert-to-using-lock_mm_and_find_vma.patch b/queue-6.3/mips-mm-convert-to-using-lock_mm_and_find_vma.patch new file mode 100644 index 00000000000..58efcc62b48 --- /dev/null +++ b/queue-6.3/mips-mm-convert-to-using-lock_mm_and_find_vma.patch @@ -0,0 +1,53 @@ +From 4bce37a68ff884e821a02a731897a8119e0c37b7 Mon Sep 17 00:00:00 2001 +From: Ben Hutchings +Date: Thu, 22 Jun 2023 18:47:40 +0200 +Subject: mips/mm: Convert to using lock_mm_and_find_vma() + +From: Ben Hutchings + +commit 4bce37a68ff884e821a02a731897a8119e0c37b7 upstream. 
+ +Signed-off-by: Ben Hutchings +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + arch/mips/Kconfig | 1 + + arch/mips/mm/fault.c | 12 ++---------- + 2 files changed, 3 insertions(+), 10 deletions(-) + +--- a/arch/mips/Kconfig ++++ b/arch/mips/Kconfig +@@ -94,6 +94,7 @@ config MIPS + select HAVE_VIRT_CPU_ACCOUNTING_GEN if 64BIT || !SMP + select IRQ_FORCED_THREADING + select ISA if EISA ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_REL if MODULES + select MODULES_USE_ELF_RELA if MODULES && 64BIT + select PERF_USE_VMALLOC +--- a/arch/mips/mm/fault.c ++++ b/arch/mips/mm/fault.c +@@ -99,21 +99,13 @@ static void __do_page_fault(struct pt_re + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + retry: +- mmap_read_lock(mm); +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (!vma) +- goto bad_area; +- if (vma->vm_start <= address) +- goto good_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (expand_stack(vma, address)) +- goto bad_area; ++ goto bad_area_nosemaphore; + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ +-good_area: + si_code = SEGV_ACCERR; + + if (write) { diff --git a/queue-6.3/mm-always-expand-the-stack-with-the-mmap-write-lock-held.patch b/queue-6.3/mm-always-expand-the-stack-with-the-mmap-write-lock-held.patch new file mode 100644 index 00000000000..e8d712681f0 --- /dev/null +++ b/queue-6.3/mm-always-expand-the-stack-with-the-mmap-write-lock-held.patch @@ -0,0 +1,671 @@ +From 8d7071af890768438c14db6172cc8f9f4d04e184 Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Sat, 24 Jun 2023 13:45:51 -0700 +Subject: mm: always expand the stack with the mmap write lock held + +From: Linus Torvalds + +commit 8d7071af890768438c14db6172cc8f9f4d04e184 upstream. + +This finishes the job of always holding the mmap write lock when +extending the user stack vma, and removes the 'write_locked' argument +from the vm helper functions again. + +For some cases, we just avoid expanding the stack at all: drivers and +page pinning really shouldn't be extending any stacks. Let's see if any +strange users really wanted that. + +It's worth noting that architectures that weren't converted to the new +lock_mm_and_find_vma() helper function are left using the legacy +"expand_stack()" function, but it has been changed to drop the mmap_lock +and take it for writing while expanding the vma. This makes it fairly +straightforward to convert the remaining architectures. + +As a result of dropping and re-taking the lock, the calling conventions +for this function have also changed, since the old vma may no longer be +valid. So it will now return the new vma if successful, and NULL - and +the lock dropped - if the area could not be extended. 
+ +Tested-by: Vegard Nossum +Tested-by: John Paul Adrian Glaubitz # ia64 +Tested-by: Frank Scheiner # ia64 +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + arch/ia64/mm/fault.c | 36 ++---------- + arch/m68k/mm/fault.c | 9 ++- + arch/microblaze/mm/fault.c | 5 + + arch/openrisc/mm/fault.c | 5 + + arch/parisc/mm/fault.c | 23 +++----- + arch/s390/mm/fault.c | 5 + + arch/sparc/mm/fault_64.c | 8 +- + arch/um/kernel/trap.c | 11 ++- + drivers/iommu/amd/iommu_v2.c | 4 - + drivers/iommu/iommu-sva.c | 2 + fs/binfmt_elf.c | 2 + fs/exec.c | 4 - + include/linux/mm.h | 16 +---- + mm/gup.c | 6 +- + mm/memory.c | 10 +++ + mm/mmap.c | 121 ++++++++++++++++++++++++++++++++++--------- + mm/nommu.c | 18 ++---- + 17 files changed, 169 insertions(+), 116 deletions(-) + +--- a/arch/ia64/mm/fault.c ++++ b/arch/ia64/mm/fault.c +@@ -110,10 +110,12 @@ retry: + * register backing store that needs to expand upwards, in + * this case vma will be null, but prev_vma will ne non-null + */ +- if (( !vma && prev_vma ) || (address < vma->vm_start) ) +- goto check_expansion; ++ if (( !vma && prev_vma ) || (address < vma->vm_start) ) { ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto bad_area_nosemaphore; ++ } + +- good_area: + code = SEGV_ACCERR; + + /* OK, we've got a good vm_area for this memory area. Check the access permissions: */ +@@ -177,35 +179,9 @@ retry: + mmap_read_unlock(mm); + return; + +- check_expansion: +- if (!(prev_vma && (prev_vma->vm_flags & VM_GROWSUP) && (address == prev_vma->vm_end))) { +- if (!vma) +- goto bad_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start) +- || REGION_OFFSET(address) >= RGN_MAP_LIMIT) +- goto bad_area; +- if (expand_stack(vma, address)) +- goto bad_area; +- } else { +- vma = prev_vma; +- if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start) +- || REGION_OFFSET(address) >= RGN_MAP_LIMIT) +- goto bad_area; +- /* +- * Since the register backing store is accessed sequentially, +- * we disallow growing it by more than a page at a time. 
+- */ +- if (address > vma->vm_end + PAGE_SIZE - sizeof(long)) +- goto bad_area; +- if (expand_upwards(vma, address)) +- goto bad_area; +- } +- goto good_area; +- + bad_area: + mmap_read_unlock(mm); ++ bad_area_nosemaphore: + if ((isr & IA64_ISR_SP) + || ((isr & IA64_ISR_NA) && (isr & IA64_ISR_CODE_MASK) == IA64_ISR_CODE_LFETCH)) + { +--- a/arch/m68k/mm/fault.c ++++ b/arch/m68k/mm/fault.c +@@ -105,8 +105,9 @@ retry: + if (address + 256 < rdusp()) + goto map_err; + } +- if (expand_stack(vma, address)) +- goto map_err; ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto map_err_nosemaphore; + + /* + * Ok, we have a good vm_area for this memory access, so +@@ -196,10 +197,12 @@ bus_err: + goto send_sig; + + map_err: ++ mmap_read_unlock(mm); ++map_err_nosemaphore: + current->thread.signo = SIGSEGV; + current->thread.code = SEGV_MAPERR; + current->thread.faddr = address; +- goto send_sig; ++ return send_fault_sig(regs); + + acc_err: + current->thread.signo = SIGSEGV; +--- a/arch/microblaze/mm/fault.c ++++ b/arch/microblaze/mm/fault.c +@@ -192,8 +192,9 @@ retry: + && (kernel_mode(regs) || !store_updates_sp(regs))) + goto bad_area; + } +- if (expand_stack(vma, address)) +- goto bad_area; ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto bad_area_nosemaphore; + + good_area: + code = SEGV_ACCERR; +--- a/arch/openrisc/mm/fault.c ++++ b/arch/openrisc/mm/fault.c +@@ -127,8 +127,9 @@ retry: + if (address + PAGE_SIZE < regs->sp) + goto bad_area; + } +- if (expand_stack(vma, address)) +- goto bad_area; ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto bad_area_nosemaphore; + + /* + * Ok, we have a good vm_area for this memory access, so +--- a/arch/parisc/mm/fault.c ++++ b/arch/parisc/mm/fault.c +@@ -288,15 +288,19 @@ void do_page_fault(struct pt_regs *regs, + retry: + mmap_read_lock(mm); + vma = find_vma_prev(mm, address, &prev_vma); +- if (!vma || address < vma->vm_start) +- goto check_expansion; ++ if (!vma || address < vma->vm_start) { ++ if (!prev || !(prev->vm_flags & VM_GROWSUP)) ++ goto bad_area; ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto bad_area_nosemaphore; ++ } ++ + /* + * Ok, we have a good vm_area for this memory access. We still need to + * check the access permissions. + */ + +-good_area: +- + if ((vma->vm_flags & acc_type) != acc_type) + goto bad_area; + +@@ -347,17 +351,13 @@ good_area: + mmap_read_unlock(mm); + return; + +-check_expansion: +- vma = prev_vma; +- if (vma && (expand_stack(vma, address) == 0)) +- goto good_area; +- + /* + * Something tried to access memory that isn't in our memory map.. + */ + bad_area: + mmap_read_unlock(mm); + ++bad_area_nosemaphore: + if (user_mode(regs)) { + int signo, si_code; + +@@ -449,7 +449,7 @@ handle_nadtlb_fault(struct pt_regs *regs + { + unsigned long insn = regs->iir; + int breg, treg, xreg, val = 0; +- struct vm_area_struct *vma, *prev_vma; ++ struct vm_area_struct *vma; + struct task_struct *tsk; + struct mm_struct *mm; + unsigned long address; +@@ -485,7 +485,7 @@ handle_nadtlb_fault(struct pt_regs *regs + /* Search for VMA */ + address = regs->ior; + mmap_read_lock(mm); +- vma = find_vma_prev(mm, address, &prev_vma); ++ vma = vma_lookup(mm, address); + mmap_read_unlock(mm); + + /* +@@ -494,7 +494,6 @@ handle_nadtlb_fault(struct pt_regs *regs + */ + acc_type = (insn & 0x40) ? 
VM_WRITE : VM_READ; + if (vma +- && address >= vma->vm_start + && (vma->vm_flags & acc_type) == acc_type) + val = 1; + } +--- a/arch/s390/mm/fault.c ++++ b/arch/s390/mm/fault.c +@@ -433,8 +433,9 @@ retry: + if (unlikely(vma->vm_start > address)) { + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out_up; +- if (expand_stack(vma, address)) +- goto out_up; ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto out; + } + + /* +--- a/arch/sparc/mm/fault_64.c ++++ b/arch/sparc/mm/fault_64.c +@@ -383,8 +383,9 @@ continue_fault: + goto bad_area; + } + } +- if (expand_stack(vma, address)) +- goto bad_area; ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto bad_area_nosemaphore; + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. +@@ -487,8 +488,9 @@ exit_exception: + * Fix it, but check if it's kernel or user first.. + */ + bad_area: +- insn = get_fault_insn(regs, insn); + mmap_read_unlock(mm); ++bad_area_nosemaphore: ++ insn = get_fault_insn(regs, insn); + + handle_kernel_fault: + do_kernel_fault(regs, si_code, fault_code, insn, address); +--- a/arch/um/kernel/trap.c ++++ b/arch/um/kernel/trap.c +@@ -47,14 +47,15 @@ retry: + vma = find_vma(mm, address); + if (!vma) + goto out; +- else if (vma->vm_start <= address) ++ if (vma->vm_start <= address) + goto good_area; +- else if (!(vma->vm_flags & VM_GROWSDOWN)) ++ if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out; +- else if (is_user && !ARCH_IS_STACKGROW(address)) +- goto out; +- else if (expand_stack(vma, address)) ++ if (is_user && !ARCH_IS_STACKGROW(address)) + goto out; ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto out_nosemaphore; + + good_area: + *code_out = SEGV_ACCERR; +--- a/drivers/iommu/amd/iommu_v2.c ++++ b/drivers/iommu/amd/iommu_v2.c +@@ -485,8 +485,8 @@ static void do_fault(struct work_struct + flags |= FAULT_FLAG_REMOTE; + + mmap_read_lock(mm); +- vma = find_extend_vma(mm, address); +- if (!vma || address < vma->vm_start) ++ vma = vma_lookup(mm, address); ++ if (!vma) + /* failed to get a vma in the right range */ + goto out; + +--- a/drivers/iommu/iommu-sva.c ++++ b/drivers/iommu/iommu-sva.c +@@ -203,7 +203,7 @@ iommu_sva_handle_iopf(struct iommu_fault + + mmap_read_lock(mm); + +- vma = find_extend_vma(mm, prm->addr); ++ vma = vma_lookup(mm, prm->addr); + if (!vma) + /* Unmapped area */ + goto out_put_mm; +--- a/fs/binfmt_elf.c ++++ b/fs/binfmt_elf.c +@@ -322,7 +322,7 @@ create_elf_tables(struct linux_binprm *b + */ + if (mmap_write_lock_killable(mm)) + return -EINTR; +- vma = find_extend_vma_locked(mm, bprm->p, true); ++ vma = find_extend_vma_locked(mm, bprm->p); + mmap_write_unlock(mm); + if (!vma) + return -EFAULT; +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -210,7 +210,7 @@ static struct page *get_arg_page(struct + */ + if (write && pos < vma->vm_start) { + mmap_write_lock(mm); +- ret = expand_downwards(vma, pos, true); ++ ret = expand_downwards(vma, pos); + if (unlikely(ret < 0)) { + mmap_write_unlock(mm); + return NULL; +@@ -858,7 +858,7 @@ int setup_arg_pages(struct linux_binprm + stack_base = vma->vm_end - stack_expand; + #endif + current->mm->start_stack = bprm->p; +- ret = expand_stack_locked(vma, stack_base, true); ++ ret = expand_stack_locked(vma, stack_base); + if (ret) + ret = -EFAULT; + +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -3065,18 +3065,11 @@ extern vm_fault_t filemap_page_mkwrite(s + + extern unsigned long stack_guard_gap; + /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ +-int expand_stack_locked(struct vm_area_struct *vma, 
unsigned long address, +- bool write_locked); +-#define expand_stack(vma,addr) expand_stack_locked(vma,addr,false) ++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address); ++struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr); + + /* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */ +-int expand_downwards(struct vm_area_struct *vma, unsigned long address, +- bool write_locked); +-#if VM_GROWSUP +-extern int expand_upwards(struct vm_area_struct *vma, unsigned long address); +-#else +- #define expand_upwards(vma, address) (0) +-#endif ++int expand_downwards(struct vm_area_struct *vma, unsigned long address); + + /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ + extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); +@@ -3171,9 +3164,8 @@ unsigned long change_prot_numa(struct vm + unsigned long start, unsigned long end); + #endif + +-struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); + struct vm_area_struct *find_extend_vma_locked(struct mm_struct *, +- unsigned long addr, bool write_locked); ++ unsigned long addr); + int remap_pfn_range(struct vm_area_struct *, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t); + int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, +--- a/mm/gup.c ++++ b/mm/gup.c +@@ -1096,7 +1096,7 @@ static long __get_user_pages(struct mm_s + + /* first iteration or cross vma bound */ + if (!vma || start >= vma->vm_end) { +- vma = find_extend_vma(mm, start); ++ vma = vma_lookup(mm, start); + if (!vma && in_gate_area(mm, start)) { + ret = get_gate_page(mm, start & PAGE_MASK, + gup_flags, &vma, +@@ -1265,8 +1265,8 @@ int fixup_user_fault(struct mm_struct *m + fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + + retry: +- vma = find_extend_vma(mm, address); +- if (!vma || address < vma->vm_start) ++ vma = vma_lookup(mm, address); ++ if (!vma) + return -EFAULT; + + if (!vma_permits_fault(vma, fault_flags)) +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -5336,7 +5336,7 @@ struct vm_area_struct *lock_mm_and_find_ + goto fail; + } + +- if (expand_stack_locked(vma, addr, true)) ++ if (expand_stack_locked(vma, addr)) + goto fail; + + success: +@@ -5620,6 +5620,14 @@ int __access_remote_vm(struct mm_struct + if (mmap_read_lock_killable(mm)) + return 0; + ++ /* We might need to expand the stack to access it */ ++ vma = vma_lookup(mm, addr); ++ if (!vma) { ++ vma = expand_stack(mm, addr); ++ if (!vma) ++ return 0; ++ } ++ + /* ignore errors, just check how much was successfully transferred */ + while (len) { + int bytes, ret, offset; +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -1898,8 +1898,7 @@ static int acct_stack_growth(struct vm_a + * PA-RISC uses this for its stack; IA64 for its Register Backing Store. + * vma is the last one with address > vma->vm_end. Have to extend vma. 
+ */ +-int expand_upwards(struct vm_area_struct *vma, unsigned long address, +- bool write_locked) ++static int expand_upwards(struct vm_area_struct *vma, unsigned long address) + { + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *next; +@@ -1923,8 +1922,6 @@ int expand_upwards(struct vm_area_struct + if (gap_addr < address || gap_addr > TASK_SIZE) + gap_addr = TASK_SIZE; + +- if (!write_locked) +- return -EAGAIN; + next = find_vma_intersection(mm, vma->vm_end, gap_addr); + if (next && vma_is_accessible(next)) { + if (!(next->vm_flags & VM_GROWSUP)) +@@ -1993,15 +1990,18 @@ int expand_upwards(struct vm_area_struct + + /* + * vma is the first one with address < vma->vm_start. Have to extend vma. ++ * mmap_lock held for writing. + */ +-int expand_downwards(struct vm_area_struct *vma, unsigned long address, +- bool write_locked) ++int expand_downwards(struct vm_area_struct *vma, unsigned long address) + { + struct mm_struct *mm = vma->vm_mm; + MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start); + struct vm_area_struct *prev; + int error = 0; + ++ if (!(vma->vm_flags & VM_GROWSDOWN)) ++ return -EFAULT; ++ + address &= PAGE_MASK; + if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) + return -EPERM; +@@ -2014,8 +2014,6 @@ int expand_downwards(struct vm_area_stru + vma_is_accessible(prev) && + (address - prev->vm_end < stack_guard_gap)) + return -ENOMEM; +- if (!write_locked && (prev->vm_end == address)) +- return -EAGAIN; + } + + if (mas_preallocate(&mas, GFP_KERNEL)) +@@ -2094,14 +2092,12 @@ static int __init cmdline_parse_stack_gu + __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap); + + #ifdef CONFIG_STACK_GROWSUP +-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, +- bool write_locked) ++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address) + { +- return expand_upwards(vma, address, write_locked); ++ return expand_upwards(vma, address); + } + +-struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, +- unsigned long addr, bool write_locked) ++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr) + { + struct vm_area_struct *vma, *prev; + +@@ -2111,23 +2107,21 @@ struct vm_area_struct *find_extend_vma_l + return vma; + if (!prev) + return NULL; +- if (expand_stack_locked(prev, addr, write_locked)) ++ if (expand_stack_locked(prev, addr)) + return NULL; + if (prev->vm_flags & VM_LOCKED) + populate_vma_page_range(prev, addr, prev->vm_end, NULL); + return prev; + } + #else +-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, +- bool write_locked) ++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address) + { + if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) + return -EINVAL; +- return expand_downwards(vma, address, write_locked); ++ return expand_downwards(vma, address); + } + +-struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, +- unsigned long addr, bool write_locked) ++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr) + { + struct vm_area_struct *vma; + unsigned long start; +@@ -2139,7 +2133,7 @@ struct vm_area_struct *find_extend_vma_l + if (vma->vm_start <= addr) + return vma; + start = vma->vm_start; +- if (expand_stack_locked(vma, addr, write_locked)) ++ if (expand_stack_locked(vma, addr)) + return NULL; + if (vma->vm_flags & VM_LOCKED) + populate_vma_page_range(vma, addr, start, NULL); +@@ -2147,12 +2141,91 @@ struct vm_area_struct *find_extend_vma_l + } + #endif + 
+-struct vm_area_struct *find_extend_vma(struct mm_struct *mm, +- unsigned long addr) ++/* ++ * IA64 has some horrid mapping rules: it can expand both up and down, ++ * but with various special rules. ++ * ++ * We'll get rid of this architecture eventually, so the ugliness is ++ * temporary. ++ */ ++#ifdef CONFIG_IA64 ++static inline bool vma_expand_ok(struct vm_area_struct *vma, unsigned long addr) ++{ ++ return REGION_NUMBER(addr) == REGION_NUMBER(vma->vm_start) && ++ REGION_OFFSET(addr) < RGN_MAP_LIMIT; ++} ++ ++/* ++ * IA64 stacks grow down, but there's a special register backing store ++ * that can grow up. Only sequentially, though, so the new address must ++ * match vm_end. ++ */ ++static inline int vma_expand_up(struct vm_area_struct *vma, unsigned long addr) ++{ ++ if (!vma_expand_ok(vma, addr)) ++ return -EFAULT; ++ if (vma->vm_end != (addr & PAGE_MASK)) ++ return -EFAULT; ++ return expand_upwards(vma, addr); ++} ++ ++static inline bool vma_expand_down(struct vm_area_struct *vma, unsigned long addr) ++{ ++ if (!vma_expand_ok(vma, addr)) ++ return -EFAULT; ++ return expand_downwards(vma, addr); ++} ++ ++#elif defined(CONFIG_STACK_GROWSUP) ++ ++#define vma_expand_up(vma,addr) expand_upwards(vma, addr) ++#define vma_expand_down(vma, addr) (-EFAULT) ++ ++#else ++ ++#define vma_expand_up(vma,addr) (-EFAULT) ++#define vma_expand_down(vma, addr) expand_downwards(vma, addr) ++ ++#endif ++ ++/* ++ * expand_stack(): legacy interface for page faulting. Don't use unless ++ * you have to. ++ * ++ * This is called with the mm locked for reading, drops the lock, takes ++ * the lock for writing, tries to look up a vma again, expands it if ++ * necessary, and downgrades the lock to reading again. ++ * ++ * If no vma is found or it can't be expanded, it returns NULL and has ++ * dropped the lock. 
++ */ ++struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr) + { +- return find_extend_vma_locked(mm, addr, false); ++ struct vm_area_struct *vma, *prev; ++ ++ mmap_read_unlock(mm); ++ if (mmap_write_lock_killable(mm)) ++ return NULL; ++ ++ vma = find_vma_prev(mm, addr, &prev); ++ if (vma && vma->vm_start <= addr) ++ goto success; ++ ++ if (prev && !vma_expand_up(prev, addr)) { ++ vma = prev; ++ goto success; ++ } ++ ++ if (vma && !vma_expand_down(vma, addr)) ++ goto success; ++ ++ mmap_write_unlock(mm); ++ return NULL; ++ ++success: ++ mmap_write_downgrade(mm); ++ return vma; + } +-EXPORT_SYMBOL_GPL(find_extend_vma); + + /* + * Ok - we have the memory areas we should free on a maple tree so release them, +--- a/mm/nommu.c ++++ b/mm/nommu.c +@@ -631,24 +631,20 @@ struct vm_area_struct *find_vma(struct m + EXPORT_SYMBOL(find_vma); + + /* +- * find a VMA +- * - we don't extend stack VMAs under NOMMU conditions +- */ +-struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) +-{ +- return find_vma(mm, addr); +-} +- +-/* + * expand a stack to a given address + * - not supported under NOMMU conditions + */ +-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, +- bool write_locked) ++int expand_stack_locked(struct vm_area_struct *vma, unsigned long addr) + { + return -ENOMEM; + } + ++struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr) ++{ ++ mmap_read_unlock(mm); ++ return NULL; ++} ++ + /* + * look up the first VMA exactly that exactly matches addr + * - should be called with mm->mmap_lock at least held readlocked diff --git a/queue-6.3/mm-fault-convert-remaining-simple-cases-to-lock_mm_and_find_vma.patch b/queue-6.3/mm-fault-convert-remaining-simple-cases-to-lock_mm_and_find_vma.patch new file mode 100644 index 00000000000..63b4467d83a --- /dev/null +++ b/queue-6.3/mm-fault-convert-remaining-simple-cases-to-lock_mm_and_find_vma.patch @@ -0,0 +1,489 @@ +From a050ba1e7422f2cc60ff8bfde3f96d34d00cb585 Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Sat, 24 Jun 2023 10:55:38 -0700 +Subject: mm/fault: convert remaining simple cases to lock_mm_and_find_vma() + +From: Linus Torvalds + +commit a050ba1e7422f2cc60ff8bfde3f96d34d00cb585 upstream. + +This does the simple pattern conversion of alpha, arc, csky, hexagon, +loongarch, nios2, sh, sparc32, and xtensa to the lock_mm_and_find_vma() +helper. They all have the regular fault handling pattern without odd +special cases. + +The remaining architectures all have something that keeps us from a +straightforward conversion: ia64 and parisc have stacks that can grow +both up as well as down (and ia64 has special address region checks). + +And m68k, microblaze, openrisc, sparc64, and um end up having extra +rules about only expanding the stack down a limited amount below the +user space stack pointer. That is something that x86 used to do too +(long long ago), and it probably could just be skipped, but it still +makes the conversion less than trivial. + +Note that this conversion was done manually and with the exception of +alpha without any build testing, because I have a fairly limited cross- +building environment. The cases are all simple, and I went through the +changes several times, but... 
+ +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + arch/alpha/Kconfig | 1 + + arch/alpha/mm/fault.c | 13 +++---------- + arch/arc/Kconfig | 1 + + arch/arc/mm/fault.c | 11 +++-------- + arch/csky/Kconfig | 1 + + arch/csky/mm/fault.c | 22 +++++----------------- + arch/hexagon/Kconfig | 1 + + arch/hexagon/mm/vm_fault.c | 18 ++++-------------- + arch/loongarch/Kconfig | 1 + + arch/loongarch/mm/fault.c | 16 ++++++---------- + arch/nios2/Kconfig | 1 + + arch/nios2/mm/fault.c | 17 ++--------------- + arch/sh/Kconfig | 1 + + arch/sh/mm/fault.c | 17 ++--------------- + arch/sparc/Kconfig | 1 + + arch/sparc/mm/fault_32.c | 32 ++++++++------------------------ + arch/xtensa/Kconfig | 1 + + arch/xtensa/mm/fault.c | 14 +++----------- + 18 files changed, 45 insertions(+), 124 deletions(-) + +--- a/arch/alpha/Kconfig ++++ b/arch/alpha/Kconfig +@@ -29,6 +29,7 @@ config ALPHA + select GENERIC_SMP_IDLE_THREAD + select HAVE_ARCH_AUDITSYSCALL + select HAVE_MOD_ARCH_SPECIFIC ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA + select ODD_RT_SIGACTION + select OLD_SIGSUSPEND +--- a/arch/alpha/mm/fault.c ++++ b/arch/alpha/mm/fault.c +@@ -119,20 +119,12 @@ do_page_fault(unsigned long address, uns + flags |= FAULT_FLAG_USER; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + retry: +- mmap_read_lock(mm); +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (!vma) +- goto bad_area; +- if (vma->vm_start <= address) +- goto good_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (expand_stack(vma, address)) +- goto bad_area; ++ goto bad_area_nosemaphore; + + /* Ok, we have a good vm_area for this memory access, so + we can handle it. */ +- good_area: + si_code = SEGV_ACCERR; + if (cause < 0) { + if (!(vma->vm_flags & VM_EXEC)) +@@ -192,6 +184,7 @@ retry: + bad_area: + mmap_read_unlock(mm); + ++ bad_area_nosemaphore: + if (user_mode(regs)) + goto do_sigsegv; + +--- a/arch/arc/Kconfig ++++ b/arch/arc/Kconfig +@@ -41,6 +41,7 @@ config ARC + select HAVE_PERF_EVENTS + select HAVE_SYSCALL_TRACEPOINTS + select IRQ_DOMAIN ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA + select OF + select OF_EARLY_FLATTREE +--- a/arch/arc/mm/fault.c ++++ b/arch/arc/mm/fault.c +@@ -113,15 +113,9 @@ void do_page_fault(unsigned long address + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + retry: +- mmap_read_lock(mm); +- +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (!vma) +- goto bad_area; +- if (unlikely(address < vma->vm_start)) { +- if (!(vma->vm_flags & VM_GROWSDOWN) || expand_stack(vma, address)) +- goto bad_area; +- } ++ goto bad_area_nosemaphore; + + /* + * vm_area is good, now check permissions for this memory access +@@ -161,6 +155,7 @@ retry: + bad_area: + mmap_read_unlock(mm); + ++bad_area_nosemaphore: + /* + * Major/minor page fault accounting + * (in case of retry we only land here once) +--- a/arch/csky/Kconfig ++++ b/arch/csky/Kconfig +@@ -96,6 +96,7 @@ config CSKY + select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_STACKPROTECTOR + select HAVE_SYSCALL_TRACEPOINTS ++ select LOCK_MM_AND_FIND_VMA + select MAY_HAVE_SPARSE_IRQ + select MODULES_USE_ELF_RELA if MODULES + select OF +--- a/arch/csky/mm/fault.c ++++ b/arch/csky/mm/fault.c +@@ -97,13 +97,12 @@ static inline void mm_fault_error(struct + BUG(); + } + +-static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr) ++static inline void bad_area_nosemaphore(struct 
pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr) + { + /* + * Something tried to access memory that isn't in our memory map. + * Fix it, but check if it's kernel or user first. + */ +- mmap_read_unlock(mm); + /* User mode accesses just cause a SIGSEGV */ + if (user_mode(regs)) { + do_trap(regs, SIGSEGV, code, addr); +@@ -238,20 +237,9 @@ asmlinkage void do_page_fault(struct pt_ + if (is_write(regs)) + flags |= FAULT_FLAG_WRITE; + retry: +- mmap_read_lock(mm); +- vma = find_vma(mm, addr); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (unlikely(!vma)) { +- bad_area(regs, mm, code, addr); +- return; +- } +- if (likely(vma->vm_start <= addr)) +- goto good_area; +- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { +- bad_area(regs, mm, code, addr); +- return; +- } +- if (unlikely(expand_stack(vma, addr))) { +- bad_area(regs, mm, code, addr); ++ bad_area_nosemaphore(regs, mm, code, addr); + return; + } + +@@ -259,11 +247,11 @@ retry: + * Ok, we have a good vm_area for this memory access, so + * we can handle it. + */ +-good_area: + code = SEGV_ACCERR; + + if (unlikely(access_error(regs, vma))) { +- bad_area(regs, mm, code, addr); ++ mmap_read_unlock(mm); ++ bad_area_nosemaphore(regs, mm, code, addr); + return; + } + +--- a/arch/hexagon/Kconfig ++++ b/arch/hexagon/Kconfig +@@ -28,6 +28,7 @@ config HEXAGON + select GENERIC_SMP_IDLE_THREAD + select STACKTRACE_SUPPORT + select GENERIC_CLOCKEVENTS_BROADCAST ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA + select GENERIC_CPU_DEVICES + select ARCH_WANT_LD_ORPHAN_WARN +--- a/arch/hexagon/mm/vm_fault.c ++++ b/arch/hexagon/mm/vm_fault.c +@@ -57,21 +57,10 @@ void do_page_fault(unsigned long address + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + retry: +- mmap_read_lock(mm); +- vma = find_vma(mm, address); +- if (!vma) +- goto bad_area; ++ vma = lock_mm_and_find_vma(mm, address, regs); ++ if (unlikely(!vma)) ++ goto bad_area_nosemaphore; + +- if (vma->vm_start <= address) +- goto good_area; +- +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- +- if (expand_stack(vma, address)) +- goto bad_area; +- +-good_area: + /* Address space is OK. Now check access rights. */ + si_code = SEGV_ACCERR; + +@@ -143,6 +132,7 @@ good_area: + bad_area: + mmap_read_unlock(mm); + ++bad_area_nosemaphore: + if (user_mode(regs)) { + force_sig_fault(SIGSEGV, si_code, (void __user *)address); + return; +--- a/arch/loongarch/Kconfig ++++ b/arch/loongarch/Kconfig +@@ -125,6 +125,7 @@ config LOONGARCH + select HAVE_VIRT_CPU_ACCOUNTING_GEN if !SMP + select IRQ_FORCED_THREADING + select IRQ_LOONGARCH_CPU ++ select LOCK_MM_AND_FIND_VMA + select MMU_GATHER_MERGE_VMAS if MMU + select MODULES_USE_ELF_RELA if MODULES + select NEED_PER_CPU_EMBED_FIRST_CHUNK +--- a/arch/loongarch/mm/fault.c ++++ b/arch/loongarch/mm/fault.c +@@ -169,22 +169,18 @@ static void __kprobes __do_page_fault(st + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + retry: +- mmap_read_lock(mm); +- vma = find_vma(mm, address); +- if (!vma) +- goto bad_area; +- if (vma->vm_start <= address) +- goto good_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (!expand_stack(vma, address)) +- goto good_area; ++ vma = lock_mm_and_find_vma(mm, address, regs); ++ if (unlikely(!vma)) ++ goto bad_area_nosemaphore; ++ goto good_area; ++ + /* + * Something tried to access memory that isn't in our memory map.. + * Fix it, but check if it's kernel or user first.. 
+ */ + bad_area: + mmap_read_unlock(mm); ++bad_area_nosemaphore: + do_sigsegv(regs, write, address, si_code); + return; + +--- a/arch/nios2/Kconfig ++++ b/arch/nios2/Kconfig +@@ -16,6 +16,7 @@ config NIOS2 + select HAVE_ARCH_TRACEHOOK + select HAVE_ARCH_KGDB + select IRQ_DOMAIN ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA + select OF + select OF_EARLY_FLATTREE +--- a/arch/nios2/mm/fault.c ++++ b/arch/nios2/mm/fault.c +@@ -86,27 +86,14 @@ asmlinkage void do_page_fault(struct pt_ + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + +- if (!mmap_read_trylock(mm)) { +- if (!user_mode(regs) && !search_exception_tables(regs->ea)) +- goto bad_area_nosemaphore; + retry: +- mmap_read_lock(mm); +- } +- +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (!vma) +- goto bad_area; +- if (vma->vm_start <= address) +- goto good_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (expand_stack(vma, address)) +- goto bad_area; ++ goto bad_area_nosemaphore; + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ +-good_area: + code = SEGV_ACCERR; + + switch (cause) { +--- a/arch/sh/Kconfig ++++ b/arch/sh/Kconfig +@@ -56,6 +56,7 @@ config SUPERH + select HAVE_STACKPROTECTOR + select HAVE_SYSCALL_TRACEPOINTS + select IRQ_FORCED_THREADING ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA + select NEED_SG_DMA_LENGTH + select NO_DMA if !MMU && !DMA_COHERENT +--- a/arch/sh/mm/fault.c ++++ b/arch/sh/mm/fault.c +@@ -439,21 +439,9 @@ asmlinkage void __kprobes do_page_fault( + } + + retry: +- mmap_read_lock(mm); +- +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (unlikely(!vma)) { +- bad_area(regs, error_code, address); +- return; +- } +- if (likely(vma->vm_start <= address)) +- goto good_area; +- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { +- bad_area(regs, error_code, address); +- return; +- } +- if (unlikely(expand_stack(vma, address))) { +- bad_area(regs, error_code, address); ++ bad_area_nosemaphore(regs, error_code, address); + return; + } + +@@ -461,7 +449,6 @@ retry: + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ +-good_area: + if (unlikely(access_error(error_code, vma))) { + bad_area_access_error(regs, error_code, address); + return; +--- a/arch/sparc/Kconfig ++++ b/arch/sparc/Kconfig +@@ -56,6 +56,7 @@ config SPARC32 + select DMA_DIRECT_REMAP + select GENERIC_ATOMIC64 + select HAVE_UID16 ++ select LOCK_MM_AND_FIND_VMA + select OLD_SIGACTION + select ZONE_DMA + +--- a/arch/sparc/mm/fault_32.c ++++ b/arch/sparc/mm/fault_32.c +@@ -143,28 +143,19 @@ asmlinkage void do_sparc_fault(struct pt + if (pagefault_disabled() || !mm) + goto no_context; + ++ if (!from_user && address >= PAGE_OFFSET) ++ goto no_context; ++ + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + + retry: +- mmap_read_lock(mm); +- +- if (!from_user && address >= PAGE_OFFSET) +- goto bad_area; +- +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (!vma) +- goto bad_area; +- if (vma->vm_start <= address) +- goto good_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (expand_stack(vma, address)) +- goto bad_area; ++ goto bad_area_nosemaphore; + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. 
+ */ +-good_area: + code = SEGV_ACCERR; + if (write) { + if (!(vma->vm_flags & VM_WRITE)) +@@ -321,17 +312,9 @@ static void force_user_fault(unsigned lo + + code = SEGV_MAPERR; + +- mmap_read_lock(mm); +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (!vma) +- goto bad_area; +- if (vma->vm_start <= address) +- goto good_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (expand_stack(vma, address)) +- goto bad_area; +-good_area: ++ goto bad_area_nosemaphore; + code = SEGV_ACCERR; + if (write) { + if (!(vma->vm_flags & VM_WRITE)) +@@ -350,6 +333,7 @@ good_area: + return; + bad_area: + mmap_read_unlock(mm); ++bad_area_nosemaphore: + __do_fault_siginfo(code, SIGSEGV, tsk->thread.kregs, address); + return; + +--- a/arch/xtensa/Kconfig ++++ b/arch/xtensa/Kconfig +@@ -49,6 +49,7 @@ config XTENSA + select HAVE_SYSCALL_TRACEPOINTS + select HAVE_VIRT_CPU_ACCOUNTING_GEN + select IRQ_DOMAIN ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA + select PERF_USE_VMALLOC + select TRACE_IRQFLAGS_SUPPORT +--- a/arch/xtensa/mm/fault.c ++++ b/arch/xtensa/mm/fault.c +@@ -130,23 +130,14 @@ void do_page_fault(struct pt_regs *regs) + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + + retry: +- mmap_read_lock(mm); +- vma = find_vma(mm, address); +- ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (!vma) +- goto bad_area; +- if (vma->vm_start <= address) +- goto good_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (expand_stack(vma, address)) +- goto bad_area; ++ goto bad_area_nosemaphore; + + /* Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ + +-good_area: + code = SEGV_ACCERR; + + if (is_write) { +@@ -205,6 +196,7 @@ good_area: + */ + bad_area: + mmap_read_unlock(mm); ++bad_area_nosemaphore: + if (user_mode(regs)) { + force_sig_fault(SIGSEGV, code, (void *) address); + return; diff --git a/queue-6.3/mm-introduce-new-lock_mm_and_find_vma-page-fault-helper.patch b/queue-6.3/mm-introduce-new-lock_mm_and_find_vma-page-fault-helper.patch new file mode 100644 index 00000000000..aa638f12bbe --- /dev/null +++ b/queue-6.3/mm-introduce-new-lock_mm_and_find_vma-page-fault-helper.patch @@ -0,0 +1,295 @@ +From c2508ec5a58db67093f4fb8bf89a9a7c53a109e9 Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Thu, 15 Jun 2023 15:17:36 -0700 +Subject: mm: introduce new 'lock_mm_and_find_vma()' page fault helper + +From: Linus Torvalds + +commit c2508ec5a58db67093f4fb8bf89a9a7c53a109e9 upstream. + +.. and make x86 use it. + +This basically extracts the existing x86 "find and expand faulting vma" +code, but extends it to also take the mmap lock for writing in case we +actually do need to expand the vma. + +We've historically short-circuited that case, and have some rather ugly +special logic to serialize the stack segment expansion (since we only +hold the mmap lock for reading) that doesn't match the normal VM +locking. + +That slight violation of locking worked well, right up until it didn't: +the maple tree code really does want proper locking even for simple +extension of an existing vma. + +So extract the code for "look up the vma of the fault" from x86, fix it +up to do the necessary write locking, and make it available as a helper +function for other architectures that can use the common helper. + +Note: I say "common helper", but it really only handles the normal +stack-grows-down case. Which is all architectures except for PA-RISC +and IA64. 
So some rare architectures can't use the helper, but if they +care they'll just need to open-code this logic. + +It's also worth pointing out that this code really would like to have an +optimistic "mmap_upgrade_trylock()" to make it quicker to go from a +read-lock (for the common case) to taking the write lock (for having to +extend the vma) in the normal single-threaded situation where there is +no other locking activity. + +But that _is_ all the very uncommon special case, so while it would be +nice to have such an operation, it probably doesn't matter in reality. +I did put in the skeleton code for such a possible future expansion, +even if it only acts as pseudo-documentation for what we're doing. + +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/Kconfig | 1 + arch/x86/mm/fault.c | 52 ---------------------- + include/linux/mm.h | 2 + mm/Kconfig | 4 + + mm/memory.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++++++++ + 5 files changed, 130 insertions(+), 50 deletions(-) + +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -274,6 +274,7 @@ config X86 + select HAVE_GENERIC_VDSO + select HOTPLUG_SMT if SMP + select IRQ_FORCED_THREADING ++ select LOCK_MM_AND_FIND_VMA + select NEED_PER_CPU_EMBED_FIRST_CHUNK + select NEED_PER_CPU_PAGE_FIRST_CHUNK + select NEED_SG_DMA_LENGTH +--- a/arch/x86/mm/fault.c ++++ b/arch/x86/mm/fault.c +@@ -879,12 +879,6 @@ __bad_area(struct pt_regs *regs, unsigne + __bad_area_nosemaphore(regs, error_code, address, pkey, si_code); + } + +-static noinline void +-bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) +-{ +- __bad_area(regs, error_code, address, 0, SEGV_MAPERR); +-} +- + static inline bool bad_area_access_from_pkeys(unsigned long error_code, + struct vm_area_struct *vma) + { +@@ -1333,51 +1327,10 @@ void do_user_addr_fault(struct pt_regs * + } + #endif + +- /* +- * Kernel-mode access to the user address space should only occur +- * on well-defined single instructions listed in the exception +- * tables. But, an erroneous kernel fault occurring outside one of +- * those areas which also holds mmap_lock might deadlock attempting +- * to validate the fault against the address space. +- * +- * Only do the expensive exception table search when we might be at +- * risk of a deadlock. This happens if we +- * 1. Failed to acquire mmap_lock, and +- * 2. The access did not originate in userspace. +- */ +- if (unlikely(!mmap_read_trylock(mm))) { +- if (!user_mode(regs) && !search_exception_tables(regs->ip)) { +- /* +- * Fault from code in kernel from +- * which we do not expect faults. +- */ +- bad_area_nosemaphore(regs, error_code, address); +- return; +- } + retry: +- mmap_read_lock(mm); +- } else { +- /* +- * The above down_read_trylock() might have succeeded in +- * which case we'll have missed the might_sleep() from +- * down_read(): +- */ +- might_sleep(); +- } +- +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (unlikely(!vma)) { +- bad_area(regs, error_code, address); +- return; +- } +- if (likely(vma->vm_start <= address)) +- goto good_area; +- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { +- bad_area(regs, error_code, address); +- return; +- } +- if (unlikely(expand_stack(vma, address))) { +- bad_area(regs, error_code, address); ++ bad_area_nosemaphore(regs, error_code, address); + return; + } + +@@ -1385,7 +1338,6 @@ retry: + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. 
+ */ +-good_area: + if (unlikely(access_error(error_code, vma))) { + bad_area_access_error(regs, error_code, address, vma); + return; +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -2190,6 +2190,8 @@ void unmap_mapping_pages(struct address_ + pgoff_t start, pgoff_t nr, bool even_cows); + void unmap_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen, int even_cows); ++struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, ++ unsigned long address, struct pt_regs *regs); + #else + static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -1202,6 +1202,10 @@ config LRU_GEN_STATS + This option has a per-memcg and per-node memory overhead. + # } + ++config LOCK_MM_AND_FIND_VMA ++ bool ++ depends on !STACK_GROWSUP ++ + source "mm/damon/Kconfig" + + endmenu +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -5230,6 +5230,127 @@ vm_fault_t handle_mm_fault(struct vm_are + } + EXPORT_SYMBOL_GPL(handle_mm_fault); + ++#ifdef CONFIG_LOCK_MM_AND_FIND_VMA ++#include ++ ++static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) ++{ ++ /* Even if this succeeds, make it clear we *might* have slept */ ++ if (likely(mmap_read_trylock(mm))) { ++ might_sleep(); ++ return true; ++ } ++ ++ if (regs && !user_mode(regs)) { ++ unsigned long ip = instruction_pointer(regs); ++ if (!search_exception_tables(ip)) ++ return false; ++ } ++ ++ mmap_read_lock(mm); ++ return true; ++} ++ ++static inline bool mmap_upgrade_trylock(struct mm_struct *mm) ++{ ++ /* ++ * We don't have this operation yet. ++ * ++ * It should be easy enough to do: it's basically a ++ * atomic_long_try_cmpxchg_acquire() ++ * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but ++ * it also needs the proper lockdep magic etc. ++ */ ++ return false; ++} ++ ++static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) ++{ ++ mmap_read_unlock(mm); ++ if (regs && !user_mode(regs)) { ++ unsigned long ip = instruction_pointer(regs); ++ if (!search_exception_tables(ip)) ++ return false; ++ } ++ mmap_write_lock(mm); ++ return true; ++} ++ ++/* ++ * Helper for page fault handling. ++ * ++ * This is kind of equivalend to "mmap_read_lock()" followed ++ * by "find_extend_vma()", except it's a lot more careful about ++ * the locking (and will drop the lock on failure). ++ * ++ * For example, if we have a kernel bug that causes a page ++ * fault, we don't want to just use mmap_read_lock() to get ++ * the mm lock, because that would deadlock if the bug were ++ * to happen while we're holding the mm lock for writing. ++ * ++ * So this checks the exception tables on kernel faults in ++ * order to only do this all for instructions that are actually ++ * expected to fault. ++ * ++ * We can also actually take the mm lock for writing if we ++ * need to extend the vma, which helps the VM layer a lot. ++ */ ++struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, ++ unsigned long addr, struct pt_regs *regs) ++{ ++ struct vm_area_struct *vma; ++ ++ if (!get_mmap_lock_carefully(mm, regs)) ++ return NULL; ++ ++ vma = find_vma(mm, addr); ++ if (likely(vma && (vma->vm_start <= addr))) ++ return vma; ++ ++ /* ++ * Well, dang. We might still be successful, but only ++ * if we can extend a vma to do so. 
++ */ ++ if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) { ++ mmap_read_unlock(mm); ++ return NULL; ++ } ++ ++ /* ++ * We can try to upgrade the mmap lock atomically, ++ * in which case we can continue to use the vma ++ * we already looked up. ++ * ++ * Otherwise we'll have to drop the mmap lock and ++ * re-take it, and also look up the vma again, ++ * re-checking it. ++ */ ++ if (!mmap_upgrade_trylock(mm)) { ++ if (!upgrade_mmap_lock_carefully(mm, regs)) ++ return NULL; ++ ++ vma = find_vma(mm, addr); ++ if (!vma) ++ goto fail; ++ if (vma->vm_start <= addr) ++ goto success; ++ if (!(vma->vm_flags & VM_GROWSDOWN)) ++ goto fail; ++ } ++ ++ if (expand_stack(vma, addr)) ++ goto fail; ++ ++success: ++ mmap_write_downgrade(mm); ++ return vma; ++ ++fail: ++ mmap_write_unlock(mm); ++ return NULL; ++} ++#endif ++ + #ifndef __PAGETABLE_P4D_FOLDED + /* + * Allocate p4d page table. diff --git a/queue-6.3/mm-make-find_extend_vma-fail-if-write-lock-not-held.patch b/queue-6.3/mm-make-find_extend_vma-fail-if-write-lock-not-held.patch new file mode 100644 index 00000000000..6cdca9992e5 --- /dev/null +++ b/queue-6.3/mm-make-find_extend_vma-fail-if-write-lock-not-held.patch @@ -0,0 +1,242 @@ +From f440fa1ac955e2898893f9301568435eb5cdfc4b Mon Sep 17 00:00:00 2001 +From: "Liam R. Howlett" +Date: Fri, 16 Jun 2023 15:58:54 -0700 +Subject: mm: make find_extend_vma() fail if write lock not held + +From: Liam R. Howlett + +commit f440fa1ac955e2898893f9301568435eb5cdfc4b upstream. + +Make calls to extend_vma() and find_extend_vma() fail if the write lock +is required. + +To avoid making this a flag-day event, this still allows the old +read-locking case for the trivial situations, and passes in a flag to +say "is it write-locked". That way write-lockers can say "yes, I'm +being careful", and legacy users will continue to work in all the common +cases until they have been fully converted to the new world order. + +Co-Developed-by: Matthew Wilcox (Oracle) +Signed-off-by: Matthew Wilcox (Oracle) +Signed-off-by: Liam R. Howlett +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + fs/binfmt_elf.c | 6 +++--- + fs/exec.c | 5 +++-- + include/linux/mm.h | 10 +++++++--- + mm/memory.c | 2 +- + mm/mmap.c | 50 +++++++++++++++++++++++++++++++++----------------- + mm/nommu.c | 3 ++- + 6 files changed, 49 insertions(+), 27 deletions(-) + +--- a/fs/binfmt_elf.c ++++ b/fs/binfmt_elf.c +@@ -320,10 +320,10 @@ create_elf_tables(struct linux_binprm *b + * Grow the stack manually; some architectures have a limit on how + * far ahead a user-space access may be in order to grow the stack. 
+ */ +- if (mmap_read_lock_killable(mm)) ++ if (mmap_write_lock_killable(mm)) + return -EINTR; +- vma = find_extend_vma(mm, bprm->p); +- mmap_read_unlock(mm); ++ vma = find_extend_vma_locked(mm, bprm->p, true); ++ mmap_write_unlock(mm); + if (!vma) + return -EFAULT; + +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -204,7 +204,8 @@ static struct page *get_arg_page(struct + + #ifdef CONFIG_STACK_GROWSUP + if (write) { +- ret = expand_downwards(bprm->vma, pos); ++ /* We claim to hold the lock - nobody to race with */ ++ ret = expand_downwards(bprm->vma, pos, true); + if (ret < 0) + return NULL; + } +@@ -852,7 +853,7 @@ int setup_arg_pages(struct linux_binprm + stack_base = vma->vm_end - stack_expand; + #endif + current->mm->start_stack = bprm->p; +- ret = expand_stack(vma, stack_base); ++ ret = expand_stack_locked(vma, stack_base, true); + if (ret) + ret = -EFAULT; + +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -3065,11 +3065,13 @@ extern vm_fault_t filemap_page_mkwrite(s + + extern unsigned long stack_guard_gap; + /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ +-extern int expand_stack(struct vm_area_struct *vma, unsigned long address); ++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, ++ bool write_locked); ++#define expand_stack(vma,addr) expand_stack_locked(vma,addr,false) + + /* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */ +-extern int expand_downwards(struct vm_area_struct *vma, +- unsigned long address); ++int expand_downwards(struct vm_area_struct *vma, unsigned long address, ++ bool write_locked); + #if VM_GROWSUP + extern int expand_upwards(struct vm_area_struct *vma, unsigned long address); + #else +@@ -3170,6 +3172,8 @@ unsigned long change_prot_numa(struct vm + #endif + + struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); ++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *, ++ unsigned long addr, bool write_locked); + int remap_pfn_range(struct vm_area_struct *, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t); + int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -5336,7 +5336,7 @@ struct vm_area_struct *lock_mm_and_find_ + goto fail; + } + +- if (expand_stack(vma, addr)) ++ if (expand_stack_locked(vma, addr, true)) + goto fail; + + success: +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -1898,7 +1898,8 @@ static int acct_stack_growth(struct vm_a + * PA-RISC uses this for its stack; IA64 for its Register Backing Store. + * vma is the last one with address > vma->vm_end. Have to extend vma. + */ +-int expand_upwards(struct vm_area_struct *vma, unsigned long address) ++int expand_upwards(struct vm_area_struct *vma, unsigned long address, ++ bool write_locked) + { + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *next; +@@ -1922,6 +1923,8 @@ int expand_upwards(struct vm_area_struct + if (gap_addr < address || gap_addr > TASK_SIZE) + gap_addr = TASK_SIZE; + ++ if (!write_locked) ++ return -EAGAIN; + next = find_vma_intersection(mm, vma->vm_end, gap_addr); + if (next && vma_is_accessible(next)) { + if (!(next->vm_flags & VM_GROWSUP)) +@@ -1991,7 +1994,8 @@ int expand_upwards(struct vm_area_struct + /* + * vma is the first one with address < vma->vm_start. Have to extend vma. 
+ */ +-int expand_downwards(struct vm_area_struct *vma, unsigned long address) ++int expand_downwards(struct vm_area_struct *vma, unsigned long address, ++ bool write_locked) + { + struct mm_struct *mm = vma->vm_mm; + MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start); +@@ -2005,10 +2009,13 @@ int expand_downwards(struct vm_area_stru + /* Enforce stack_guard_gap */ + prev = mas_prev(&mas, 0); + /* Check that both stack segments have the same anon_vma? */ +- if (prev && !(prev->vm_flags & VM_GROWSDOWN) && +- vma_is_accessible(prev)) { +- if (address - prev->vm_end < stack_guard_gap) ++ if (prev) { ++ if (!(prev->vm_flags & VM_GROWSDOWN) && ++ vma_is_accessible(prev) && ++ (address - prev->vm_end < stack_guard_gap)) + return -ENOMEM; ++ if (!write_locked && (prev->vm_end == address)) ++ return -EAGAIN; + } + + if (mas_preallocate(&mas, GFP_KERNEL)) +@@ -2087,13 +2094,14 @@ static int __init cmdline_parse_stack_gu + __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap); + + #ifdef CONFIG_STACK_GROWSUP +-int expand_stack(struct vm_area_struct *vma, unsigned long address) ++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, ++ bool write_locked) + { +- return expand_upwards(vma, address); ++ return expand_upwards(vma, address, write_locked); + } + +-struct vm_area_struct * +-find_extend_vma(struct mm_struct *mm, unsigned long addr) ++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, ++ unsigned long addr, bool write_locked) + { + struct vm_area_struct *vma, *prev; + +@@ -2101,20 +2109,25 @@ find_extend_vma(struct mm_struct *mm, un + vma = find_vma_prev(mm, addr, &prev); + if (vma && (vma->vm_start <= addr)) + return vma; +- if (!prev || expand_stack(prev, addr)) ++ if (!prev) ++ return NULL; ++ if (expand_stack_locked(prev, addr, write_locked)) + return NULL; + if (prev->vm_flags & VM_LOCKED) + populate_vma_page_range(prev, addr, prev->vm_end, NULL); + return prev; + } + #else +-int expand_stack(struct vm_area_struct *vma, unsigned long address) ++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, ++ bool write_locked) + { +- return expand_downwards(vma, address); ++ if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) ++ return -EINVAL; ++ return expand_downwards(vma, address, write_locked); + } + +-struct vm_area_struct * +-find_extend_vma(struct mm_struct *mm, unsigned long addr) ++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, ++ unsigned long addr, bool write_locked) + { + struct vm_area_struct *vma; + unsigned long start; +@@ -2125,10 +2138,8 @@ find_extend_vma(struct mm_struct *mm, un + return NULL; + if (vma->vm_start <= addr) + return vma; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- return NULL; + start = vma->vm_start; +- if (expand_stack(vma, addr)) ++ if (expand_stack_locked(vma, addr, write_locked)) + return NULL; + if (vma->vm_flags & VM_LOCKED) + populate_vma_page_range(vma, addr, start, NULL); +@@ -2136,6 +2147,11 @@ find_extend_vma(struct mm_struct *mm, un + } + #endif + ++struct vm_area_struct *find_extend_vma(struct mm_struct *mm, ++ unsigned long addr) ++{ ++ return find_extend_vma_locked(mm, addr, false); ++} + EXPORT_SYMBOL_GPL(find_extend_vma); + + /* +--- a/mm/nommu.c ++++ b/mm/nommu.c +@@ -643,7 +643,8 @@ struct vm_area_struct *find_extend_vma(s + * expand a stack to a given address + * - not supported under NOMMU conditions + */ +-int expand_stack(struct vm_area_struct *vma, unsigned long address) ++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, ++ bool 
write_locked) + { + return -ENOMEM; + } diff --git a/queue-6.3/mm-make-the-page-fault-mmap-locking-killable.patch b/queue-6.3/mm-make-the-page-fault-mmap-locking-killable.patch new file mode 100644 index 00000000000..b5337eedfbd --- /dev/null +++ b/queue-6.3/mm-make-the-page-fault-mmap-locking-killable.patch @@ -0,0 +1,46 @@ +From eda0047296a16d65a7f2bc60a408f70d178b2014 Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Thu, 15 Jun 2023 16:17:48 -0700 +Subject: mm: make the page fault mmap locking killable + +From: Linus Torvalds + +commit eda0047296a16d65a7f2bc60a408f70d178b2014 upstream. + +This is done as a separate patch from introducing the new +lock_mm_and_find_vma() helper, because while it's an obvious change, +it's not what x86 used to do in this area. + +We already abort the page fault on fatal signals anyway, so why should +we wait for the mmap lock only to then abort later? With the new helper +function that returns without the lock held on failure anyway, this is +particularly easy and straightforward. + +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/memory.c | 6 ++---- + 1 file changed, 2 insertions(+), 4 deletions(-) + +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -5247,8 +5247,7 @@ static inline bool get_mmap_lock_careful + return false; + } + +- mmap_read_lock(mm); +- return true; ++ return !mmap_read_lock_killable(mm); + } + + static inline bool mmap_upgrade_trylock(struct mm_struct *mm) +@@ -5272,8 +5271,7 @@ static inline bool upgrade_mmap_lock_car + if (!search_exception_tables(ip)) + return false; + } +- mmap_write_lock(mm); +- return true; ++ return !mmap_write_lock_killable(mm); + } + + /* diff --git a/queue-6.3/powerpc-mm-convert-coprocessor-fault-to-lock_mm_and_find_vma.patch b/queue-6.3/powerpc-mm-convert-coprocessor-fault-to-lock_mm_and_find_vma.patch new file mode 100644 index 00000000000..f29786928b8 --- /dev/null +++ b/queue-6.3/powerpc-mm-convert-coprocessor-fault-to-lock_mm_and_find_vma.patch @@ -0,0 +1,47 @@ +From 2cd76c50d0b41cec5c87abfcdf25b236a2793fb6 Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Sat, 24 Jun 2023 11:17:05 -0700 +Subject: powerpc/mm: convert coprocessor fault to lock_mm_and_find_vma() + +From: Linus Torvalds + +commit 2cd76c50d0b41cec5c87abfcdf25b236a2793fb6 upstream. + +This is one of the simple cases, except there's no pt_regs pointer. +Which is fine, as lock_mm_and_find_vma() is set up to work fine with a +NULL pt_regs. + +Powerpc already enabled LOCK_MM_AND_FIND_VMA for the main CPU faulting, +so we can just use the helper without any extra work. 
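
For context on the "NULL pt_regs is fine" point, here is a distilled sketch of the lock-taking path after the helper patch and the killable-lock patch above are both applied; the function name is illustrative, the calls (mmap_read_trylock(), search_exception_tables(), mmap_read_lock_killable()) are the ones used in the mm/memory.c hunks earlier in this series, and it is kernel-internal code, so it is a sketch rather than something buildable on its own:

	static bool lock_carefully_sketch(struct mm_struct *mm, struct pt_regs *regs)
	{
		/* Even if this succeeds, make it clear we *might* have slept */
		if (mmap_read_trylock(mm)) {
			might_sleep();
			return true;
		}
		/* Only a non-NULL kernel-mode regs triggers the exception-table bailout */
		if (regs && !user_mode(regs) &&
		    !search_exception_tables(instruction_pointer(regs)))
			return false;
		/* A NULL regs simply falls through to a plain killable read lock */
		return !mmap_read_lock_killable(mm);
	}

So a caller without a pt_regs pointer just loses the "unexpected kernel fault" short-circuit and otherwise behaves like any other fault path.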
+ +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + arch/powerpc/mm/copro_fault.c | 14 +++----------- + 1 file changed, 3 insertions(+), 11 deletions(-) + +--- a/arch/powerpc/mm/copro_fault.c ++++ b/arch/powerpc/mm/copro_fault.c +@@ -33,19 +33,11 @@ int copro_handle_mm_fault(struct mm_stru + if (mm->pgd == NULL) + return -EFAULT; + +- mmap_read_lock(mm); +- ret = -EFAULT; +- vma = find_vma(mm, ea); ++ vma = lock_mm_and_find_vma(mm, ea, NULL); + if (!vma) +- goto out_unlock; +- +- if (ea < vma->vm_start) { +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto out_unlock; +- if (expand_stack(vma, ea)) +- goto out_unlock; +- } ++ return -EFAULT; + ++ ret = -EFAULT; + is_write = dsisr & DSISR_ISSTORE; + if (is_write) { + if (!(vma->vm_flags & VM_WRITE)) diff --git a/queue-6.3/powerpc-mm-convert-to-using-lock_mm_and_find_vma.patch b/queue-6.3/powerpc-mm-convert-to-using-lock_mm_and_find_vma.patch new file mode 100644 index 00000000000..95fca6c0808 --- /dev/null +++ b/queue-6.3/powerpc-mm-convert-to-using-lock_mm_and_find_vma.patch @@ -0,0 +1,86 @@ +From e6fe228c4ffafdfc970cf6d46883a1f481baf7ea Mon Sep 17 00:00:00 2001 +From: Michael Ellerman +Date: Fri, 16 Jun 2023 15:51:29 +1000 +Subject: powerpc/mm: Convert to using lock_mm_and_find_vma() + +From: Michael Ellerman + +commit e6fe228c4ffafdfc970cf6d46883a1f481baf7ea upstream. + +Signed-off-by: Michael Ellerman +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + arch/powerpc/Kconfig | 1 + + arch/powerpc/mm/fault.c | 41 ++++------------------------------------- + 2 files changed, 5 insertions(+), 37 deletions(-) + +--- a/arch/powerpc/Kconfig ++++ b/arch/powerpc/Kconfig +@@ -263,6 +263,7 @@ config PPC + select IRQ_DOMAIN + select IRQ_FORCED_THREADING + select KASAN_VMALLOC if KASAN && MODULES ++ select LOCK_MM_AND_FIND_VMA + select MMU_GATHER_PAGE_SIZE + select MMU_GATHER_RCU_TABLE_FREE + select MMU_GATHER_MERGE_VMAS +--- a/arch/powerpc/mm/fault.c ++++ b/arch/powerpc/mm/fault.c +@@ -84,11 +84,6 @@ static int __bad_area(struct pt_regs *re + return __bad_area_nosemaphore(regs, address, si_code); + } + +-static noinline int bad_area(struct pt_regs *regs, unsigned long address) +-{ +- return __bad_area(regs, address, SEGV_MAPERR); +-} +- + static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address, + struct vm_area_struct *vma) + { +@@ -481,40 +476,12 @@ static int ___do_page_fault(struct pt_re + * we will deadlock attempting to validate the fault against the + * address space. Luckily the kernel only validly references user + * space from well defined areas of code, which are listed in the +- * exceptions table. +- * +- * As the vast majority of faults will be valid we will only perform +- * the source reference check when there is a possibility of a deadlock. +- * Attempt to lock the address space, if we cannot we then validate the +- * source. If this is invalid we can skip the address space check, +- * thus avoiding the deadlock. +- */ +- if (unlikely(!mmap_read_trylock(mm))) { +- if (!is_user && !search_exception_tables(regs->nip)) +- return bad_area_nosemaphore(regs, address); +- ++ * exceptions table. lock_mm_and_find_vma() handles that logic. 
++ */ + retry: +- mmap_read_lock(mm); +- } else { +- /* +- * The above down_read_trylock() might have succeeded in +- * which case we'll have missed the might_sleep() from +- * down_read(): +- */ +- might_sleep(); +- } +- +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (unlikely(!vma)) +- return bad_area(regs, address); +- +- if (unlikely(vma->vm_start > address)) { +- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) +- return bad_area(regs, address); +- +- if (unlikely(expand_stack(vma, address))) +- return bad_area(regs, address); +- } ++ return bad_area_nosemaphore(regs, address); + + if (unlikely(access_pkey_error(is_write, is_exec, + (error_code & DSISR_KEYFAULT), vma))) diff --git a/queue-6.3/riscv-mm-convert-to-using-lock_mm_and_find_vma.patch b/queue-6.3/riscv-mm-convert-to-using-lock_mm_and_find_vma.patch new file mode 100644 index 00000000000..2999c21ff3b --- /dev/null +++ b/queue-6.3/riscv-mm-convert-to-using-lock_mm_and_find_vma.patch @@ -0,0 +1,95 @@ +From 7267ef7b0b77f4ed23b7b3c87d8eca7bd9c2d007 Mon Sep 17 00:00:00 2001 +From: Ben Hutchings +Date: Thu, 22 Jun 2023 20:18:18 +0200 +Subject: riscv/mm: Convert to using lock_mm_and_find_vma() + +From: Ben Hutchings + +commit 7267ef7b0b77f4ed23b7b3c87d8eca7bd9c2d007 upstream. + +Signed-off-by: Ben Hutchings +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + arch/riscv/Kconfig | 1 + + arch/riscv/mm/fault.c | 31 +++++++++++++------------------ + 2 files changed, 14 insertions(+), 18 deletions(-) + +--- a/arch/riscv/Kconfig ++++ b/arch/riscv/Kconfig +@@ -119,6 +119,7 @@ config RISCV + select HAVE_SYSCALL_TRACEPOINTS + select IRQ_DOMAIN + select IRQ_FORCED_THREADING ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA if MODULES + select MODULE_SECTIONS if MODULES + select OF +--- a/arch/riscv/mm/fault.c ++++ b/arch/riscv/mm/fault.c +@@ -83,13 +83,13 @@ static inline void mm_fault_error(struct + BUG(); + } + +-static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr) ++static inline void ++bad_area_nosemaphore(struct pt_regs *regs, int code, unsigned long addr) + { + /* + * Something tried to access memory that isn't in our memory map. + * Fix it, but check if it's kernel or user first. 
+ */ +- mmap_read_unlock(mm); + /* User mode accesses just cause a SIGSEGV */ + if (user_mode(regs)) { + do_trap(regs, SIGSEGV, code, addr); +@@ -99,6 +99,15 @@ static inline void bad_area(struct pt_re + no_context(regs, addr); + } + ++static inline void ++bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, ++ unsigned long addr) ++{ ++ mmap_read_unlock(mm); ++ ++ bad_area_nosemaphore(regs, code, addr); ++} ++ + static inline void vmalloc_fault(struct pt_regs *regs, int code, unsigned long addr) + { + pgd_t *pgd, *pgd_k; +@@ -286,23 +295,10 @@ asmlinkage void do_page_fault(struct pt_ + else if (cause == EXC_INST_PAGE_FAULT) + flags |= FAULT_FLAG_INSTRUCTION; + retry: +- mmap_read_lock(mm); +- vma = find_vma(mm, addr); ++ vma = lock_mm_and_find_vma(mm, addr, regs); + if (unlikely(!vma)) { + tsk->thread.bad_cause = cause; +- bad_area(regs, mm, code, addr); +- return; +- } +- if (likely(vma->vm_start <= addr)) +- goto good_area; +- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { +- tsk->thread.bad_cause = cause; +- bad_area(regs, mm, code, addr); +- return; +- } +- if (unlikely(expand_stack(vma, addr))) { +- tsk->thread.bad_cause = cause; +- bad_area(regs, mm, code, addr); ++ bad_area_nosemaphore(regs, code, addr); + return; + } + +@@ -310,7 +306,6 @@ retry: + * Ok, we have a good vm_area for this memory access, so + * we can handle it. + */ +-good_area: + code = SEGV_ACCERR; + + if (unlikely(access_error(cause, vma))) { diff --git a/queue-6.3/series b/queue-6.3/series index 44c48b4009c..fcec13792a1 100644 --- a/queue-6.3/series +++ b/queue-6.3/series @@ -9,3 +9,17 @@ x86-smp-cure-kexec-vs.-mwait_play_dead-breakage.patch cpufreq-amd-pstate-make-amd-pstate-epp-driver-name-hyphenated.patch can-isotp-isotp_sendmsg-fix-return-error-fix-on-tx-path.patch maple_tree-fix-potential-out-of-bounds-access-in-mas_wr_end_piv.patch + +mm-introduce-new-lock_mm_and_find_vma-page-fault-helper.patch +mm-make-the-page-fault-mmap-locking-killable.patch +arm64-mm-convert-to-using-lock_mm_and_find_vma.patch +powerpc-mm-convert-to-using-lock_mm_and_find_vma.patch +mips-mm-convert-to-using-lock_mm_and_find_vma.patch +riscv-mm-convert-to-using-lock_mm_and_find_vma.patch +arm-mm-convert-to-using-lock_mm_and_find_vma.patch +mm-fault-convert-remaining-simple-cases-to-lock_mm_and_find_vma.patch +powerpc-mm-convert-coprocessor-fault-to-lock_mm_and_find_vma.patch +mm-make-find_extend_vma-fail-if-write-lock-not-held.patch +execve-expand-new-process-stack-manually-ahead-of-time.patch +mm-always-expand-the-stack-with-the-mmap-write-lock-held.patch +gup-add-warning-if-some-caller-would-seem-to-want-stack-expansion.patch diff --git a/queue-6.4/arm-mm-convert-to-using-lock_mm_and_find_vma.patch b/queue-6.4/arm-mm-convert-to-using-lock_mm_and_find_vma.patch new file mode 100644 index 00000000000..3be848b4221 --- /dev/null +++ b/queue-6.4/arm-mm-convert-to-using-lock_mm_and_find_vma.patch @@ -0,0 +1,136 @@ +From 8b35ca3e45e35a26a21427f35d4093606e93ad0a Mon Sep 17 00:00:00 2001 +From: Ben Hutchings +Date: Thu, 22 Jun 2023 21:24:30 +0200 +Subject: arm/mm: Convert to using lock_mm_and_find_vma() + +From: Ben Hutchings + +commit 8b35ca3e45e35a26a21427f35d4093606e93ad0a upstream. + +arm has an additional check for address < FIRST_USER_ADDRESS before +expanding the stack. Since FIRST_USER_ADDRESS is defined everywhere +(generally as 0), move that check to the generic expand_downwards(). 
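
A minimal sketch of the lower bound the generic expand_downwards() enforces once this check has moved out of the arm fault path (the helper name is made up for the sketch; on the many architectures where FIRST_USER_ADDRESS is 0 it adds nothing beyond the existing mmap_min_addr test):

	static bool downward_expansion_allowed_sketch(unsigned long address)
	{
		address &= PAGE_MASK;
		return address >= mmap_min_addr &&
		       address >= FIRST_USER_ADDRESS;	/* the check that used to live in arm */
	}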
+ +Signed-off-by: Ben Hutchings +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm/Kconfig | 1 + arch/arm/mm/fault.c | 63 +++++++++++----------------------------------------- + mm/mmap.c | 2 - + 3 files changed, 16 insertions(+), 50 deletions(-) + +--- a/arch/arm/Kconfig ++++ b/arch/arm/Kconfig +@@ -125,6 +125,7 @@ config ARM + select HAVE_UID16 + select HAVE_VIRT_CPU_ACCOUNTING_GEN + select IRQ_FORCED_THREADING ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_REL + select NEED_DMA_MAP_STATE + select OF_EARLY_FLATTREE if OF +--- a/arch/arm/mm/fault.c ++++ b/arch/arm/mm/fault.c +@@ -232,37 +232,11 @@ static inline bool is_permission_fault(u + return false; + } + +-static vm_fault_t __kprobes +-__do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int flags, +- unsigned long vma_flags, struct pt_regs *regs) +-{ +- struct vm_area_struct *vma = find_vma(mm, addr); +- if (unlikely(!vma)) +- return VM_FAULT_BADMAP; +- +- if (unlikely(vma->vm_start > addr)) { +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- return VM_FAULT_BADMAP; +- if (addr < FIRST_USER_ADDRESS) +- return VM_FAULT_BADMAP; +- if (expand_stack(vma, addr)) +- return VM_FAULT_BADMAP; +- } +- +- /* +- * ok, we have a good vm_area for this memory access, check the +- * permissions on the VMA allow for the fault which occurred. +- */ +- if (!(vma->vm_flags & vma_flags)) +- return VM_FAULT_BADACCESS; +- +- return handle_mm_fault(vma, addr & PAGE_MASK, flags, regs); +-} +- + static int __kprobes + do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) + { + struct mm_struct *mm = current->mm; ++ struct vm_area_struct *vma; + int sig, code; + vm_fault_t fault; + unsigned int flags = FAULT_FLAG_DEFAULT; +@@ -301,31 +275,21 @@ do_page_fault(unsigned long addr, unsign + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); + +- /* +- * As per x86, we may deadlock here. However, since the kernel only +- * validly references user space from well defined areas of the code, +- * we can bug out early if this is from code which shouldn't. +- */ +- if (!mmap_read_trylock(mm)) { +- if (!user_mode(regs) && !search_exception_tables(regs->ARM_pc)) +- goto no_context; + retry: +- mmap_read_lock(mm); +- } else { +- /* +- * The above down_read_trylock() might have succeeded in +- * which case, we'll have missed the might_sleep() from +- * down_read() +- */ +- might_sleep(); +-#ifdef CONFIG_DEBUG_VM +- if (!user_mode(regs) && +- !search_exception_tables(regs->ARM_pc)) +- goto no_context; +-#endif ++ vma = lock_mm_and_find_vma(mm, addr, regs); ++ if (unlikely(!vma)) { ++ fault = VM_FAULT_BADMAP; ++ goto bad_area; + } + +- fault = __do_page_fault(mm, addr, flags, vm_flags, regs); ++ /* ++ * ok, we have a good vm_area for this memory access, check the ++ * permissions on the VMA allow for the fault which occurred. ++ */ ++ if (!(vma->vm_flags & vm_flags)) ++ fault = VM_FAULT_BADACCESS; ++ else ++ fault = handle_mm_fault(vma, addr & PAGE_MASK, flags, regs); + + /* If we need to retry but a fatal signal is pending, handle the + * signal first. We do not need to release the mmap_lock because +@@ -356,6 +320,7 @@ retry: + if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS)))) + return 0; + ++bad_area: + /* + * If we are in kernel mode at this point, we + * have no context to handle this fault with. 
+--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -2036,7 +2036,7 @@ int expand_downwards(struct vm_area_stru + int error = 0; + + address &= PAGE_MASK; +- if (address < mmap_min_addr) ++ if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) + return -EPERM; + + /* Enforce stack_guard_gap */ diff --git a/queue-6.4/arm64-mm-convert-to-using-lock_mm_and_find_vma.patch b/queue-6.4/arm64-mm-convert-to-using-lock_mm_and_find_vma.patch new file mode 100644 index 00000000000..dc6485a3714 --- /dev/null +++ b/queue-6.4/arm64-mm-convert-to-using-lock_mm_and_find_vma.patch @@ -0,0 +1,112 @@ +From ae870a68b5d13d67cf4f18d47bb01ee3fee40acb Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Thu, 15 Jun 2023 17:11:44 -0700 +Subject: arm64/mm: Convert to using lock_mm_and_find_vma() + +From: Linus Torvalds + +commit ae870a68b5d13d67cf4f18d47bb01ee3fee40acb upstream. + +This converts arm64 to use the new page fault helper. It was very +straightforward, but still needed a fix for the "obvious" conversion I +initially did. Thanks to Suren for the fix and testing. + +Fixed-and-tested-by: Suren Baghdasaryan +Unnecessary-code-removal-by: Liam R. Howlett +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/Kconfig | 1 + + arch/arm64/mm/fault.c | 47 ++++++++--------------------------------------- + 2 files changed, 9 insertions(+), 39 deletions(-) + +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -225,6 +225,7 @@ config ARM64 + select IRQ_DOMAIN + select IRQ_FORCED_THREADING + select KASAN_VMALLOC if KASAN ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA + select NEED_DMA_MAP_STATE + select NEED_SG_DMA_LENGTH +--- a/arch/arm64/mm/fault.c ++++ b/arch/arm64/mm/fault.c +@@ -483,27 +483,14 @@ static void do_bad_area(unsigned long fa + #define VM_FAULT_BADMAP ((__force vm_fault_t)0x010000) + #define VM_FAULT_BADACCESS ((__force vm_fault_t)0x020000) + +-static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr, ++static vm_fault_t __do_page_fault(struct mm_struct *mm, ++ struct vm_area_struct *vma, unsigned long addr, + unsigned int mm_flags, unsigned long vm_flags, + struct pt_regs *regs) + { +- struct vm_area_struct *vma = find_vma(mm, addr); +- +- if (unlikely(!vma)) +- return VM_FAULT_BADMAP; +- + /* + * Ok, we have a good vm_area for this memory access, so we can handle + * it. +- */ +- if (unlikely(vma->vm_start > addr)) { +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- return VM_FAULT_BADMAP; +- if (expand_stack(vma, addr)) +- return VM_FAULT_BADMAP; +- } +- +- /* + * Check that the permissions on the VMA allow for the fault which + * occurred. + */ +@@ -617,31 +604,15 @@ static int __kprobes do_page_fault(unsig + } + lock_mmap: + #endif /* CONFIG_PER_VMA_LOCK */ +- /* +- * As per x86, we may deadlock here. However, since the kernel only +- * validly references user space from well defined areas of the code, +- * we can bug out early if this is from code which shouldn't. +- */ +- if (!mmap_read_trylock(mm)) { +- if (!user_mode(regs) && !search_exception_tables(regs->pc)) +- goto no_context; ++ + retry: +- mmap_read_lock(mm); +- } else { +- /* +- * The above mmap_read_trylock() might have succeeded in which +- * case, we'll have missed the might_sleep() from down_read(). 
+- */ +- might_sleep(); +-#ifdef CONFIG_DEBUG_VM +- if (!user_mode(regs) && !search_exception_tables(regs->pc)) { +- mmap_read_unlock(mm); +- goto no_context; +- } +-#endif ++ vma = lock_mm_and_find_vma(mm, addr, regs); ++ if (unlikely(!vma)) { ++ fault = VM_FAULT_BADMAP; ++ goto done; + } + +- fault = __do_page_fault(mm, addr, mm_flags, vm_flags, regs); ++ fault = __do_page_fault(mm, vma, addr, mm_flags, vm_flags, regs); + + /* Quick path to respond to signals */ + if (fault_signal_pending(fault, regs)) { +@@ -660,9 +631,7 @@ retry: + } + mmap_read_unlock(mm); + +-#ifdef CONFIG_PER_VMA_LOCK + done: +-#endif + /* + * Handle the "normal" (no error) case first. + */ diff --git a/queue-6.4/execve-expand-new-process-stack-manually-ahead-of-time.patch b/queue-6.4/execve-expand-new-process-stack-manually-ahead-of-time.patch new file mode 100644 index 00000000000..82044d28bfa --- /dev/null +++ b/queue-6.4/execve-expand-new-process-stack-manually-ahead-of-time.patch @@ -0,0 +1,88 @@ +From f313c51d26aa87e69633c9b46efb37a930faca71 Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Mon, 19 Jun 2023 11:34:15 -0700 +Subject: execve: expand new process stack manually ahead of time + +From: Linus Torvalds + +commit f313c51d26aa87e69633c9b46efb37a930faca71 upstream. + +This is a small step towards a model where GUP itself would not expand +the stack, and any user that needs GUP to not look up existing mappings, +but actually expand on them, would have to do so manually before-hand, +and with the mm lock held for writing. + +It turns out that execve() already did almost exactly that, except it +didn't take the mm lock at all (it's single-threaded so no locking +technically needed, but it could cause lockdep errors). And it only did +it for the CONFIG_STACK_GROWSUP case, since in that case GUP has +obviously never expanded the stack downwards. + +So just make that CONFIG_STACK_GROWSUP case do the right thing with +locking, and enable it generally. This will eventually help GUP, and in +the meantime avoids a special case and the lockdep issue. + +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + fs/exec.c | 37 +++++++++++++++++++++---------------- + 1 file changed, 21 insertions(+), 16 deletions(-) + +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -200,34 +200,39 @@ static struct page *get_arg_page(struct + int write) + { + struct page *page; ++ struct vm_area_struct *vma = bprm->vma; ++ struct mm_struct *mm = bprm->mm; + int ret; +- unsigned int gup_flags = 0; + +-#ifdef CONFIG_STACK_GROWSUP +- if (write) { +- /* We claim to hold the lock - nobody to race with */ +- ret = expand_downwards(bprm->vma, pos, true); +- if (ret < 0) ++ /* ++ * Avoid relying on expanding the stack down in GUP (which ++ * does not work for STACK_GROWSUP anyway), and just do it ++ * by hand ahead of time. ++ */ ++ if (write && pos < vma->vm_start) { ++ mmap_write_lock(mm); ++ ret = expand_downwards(vma, pos, true); ++ if (unlikely(ret < 0)) { ++ mmap_write_unlock(mm); + return NULL; +- } +-#endif +- +- if (write) +- gup_flags |= FOLL_WRITE; ++ } ++ mmap_write_downgrade(mm); ++ } else ++ mmap_read_lock(mm); + + /* + * We are doing an exec(). 'current' is the process +- * doing the exec and bprm->mm is the new process's mm. ++ * doing the exec and 'mm' is the new process's mm. + */ +- mmap_read_lock(bprm->mm); +- ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags, ++ ret = get_user_pages_remote(mm, pos, 1, ++ write ? 
FOLL_WRITE : 0, + &page, NULL, NULL); +- mmap_read_unlock(bprm->mm); ++ mmap_read_unlock(mm); + if (ret <= 0) + return NULL; + + if (write) +- acct_arg_size(bprm, vma_pages(bprm->vma)); ++ acct_arg_size(bprm, vma_pages(vma)); + + return page; + } diff --git a/queue-6.4/gup-add-warning-if-some-caller-would-seem-to-want-stack-expansion.patch b/queue-6.4/gup-add-warning-if-some-caller-would-seem-to-want-stack-expansion.patch new file mode 100644 index 00000000000..cc1b1efc4d6 --- /dev/null +++ b/queue-6.4/gup-add-warning-if-some-caller-would-seem-to-want-stack-expansion.patch @@ -0,0 +1,59 @@ +From a425ac5365f6cb3cc47bf83e6bff0213c10445f7 Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Sun, 25 Jun 2023 14:02:25 -0700 +Subject: gup: add warning if some caller would seem to want stack expansion + +From: Linus Torvalds + +commit a425ac5365f6cb3cc47bf83e6bff0213c10445f7 upstream. + +It feels very unlikely that anybody would want to do a GUP in an +unmapped area under the stack pointer, but real users sometimes do some +really strange things. So add a (temporary) warning for the case where +a GUP fails and expanding the stack might have made it work. + +It's trivial to do the expansion in the caller as part of getting the mm +lock in the first place - see __access_remote_vm() for ptrace, for +example - it's just that it's unnecessarily painful to do it deep in the +guts of the GUP lookup when we might have to drop and re-take the lock. + +I doubt anybody actually does anything quite this strange, but let's be +proactive: adding these warnings is simple, and will make debugging it +much easier if they trigger. + +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/gup.c | 12 ++++++++++-- + 1 file changed, 10 insertions(+), 2 deletions(-) + +--- a/mm/gup.c ++++ b/mm/gup.c +@@ -1096,7 +1096,11 @@ static long __get_user_pages(struct mm_s + + /* first iteration or cross vma bound */ + if (!vma || start >= vma->vm_end) { +- vma = vma_lookup(mm, start); ++ vma = find_vma(mm, start); ++ if (vma && (start < vma->vm_start)) { ++ WARN_ON_ONCE(vma->vm_flags & VM_GROWSDOWN); ++ vma = NULL; ++ } + if (!vma && in_gate_area(mm, start)) { + ret = get_gate_page(mm, start & PAGE_MASK, + gup_flags, &vma, +@@ -1265,9 +1269,13 @@ int fixup_user_fault(struct mm_struct *m + fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + + retry: +- vma = vma_lookup(mm, address); ++ vma = find_vma(mm, address); + if (!vma) + return -EFAULT; ++ if (address < vma->vm_start ) { ++ WARN_ON_ONCE(vma->vm_flags & VM_GROWSDOWN); ++ return -EFAULT; ++ } + + if (!vma_permits_fault(vma, fault_flags)) + return -EFAULT; diff --git a/queue-6.4/mips-mm-convert-to-using-lock_mm_and_find_vma.patch b/queue-6.4/mips-mm-convert-to-using-lock_mm_and_find_vma.patch new file mode 100644 index 00000000000..915e050ad67 --- /dev/null +++ b/queue-6.4/mips-mm-convert-to-using-lock_mm_and_find_vma.patch @@ -0,0 +1,53 @@ +From 4bce37a68ff884e821a02a731897a8119e0c37b7 Mon Sep 17 00:00:00 2001 +From: Ben Hutchings +Date: Thu, 22 Jun 2023 18:47:40 +0200 +Subject: mips/mm: Convert to using lock_mm_and_find_vma() + +From: Ben Hutchings + +commit 4bce37a68ff884e821a02a731897a8119e0c37b7 upstream. 
+ +Signed-off-by: Ben Hutchings +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + arch/mips/Kconfig | 1 + + arch/mips/mm/fault.c | 12 ++---------- + 2 files changed, 3 insertions(+), 10 deletions(-) + +--- a/arch/mips/Kconfig ++++ b/arch/mips/Kconfig +@@ -91,6 +91,7 @@ config MIPS + select HAVE_VIRT_CPU_ACCOUNTING_GEN if 64BIT || !SMP + select IRQ_FORCED_THREADING + select ISA if EISA ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_REL if MODULES + select MODULES_USE_ELF_RELA if MODULES && 64BIT + select PERF_USE_VMALLOC +--- a/arch/mips/mm/fault.c ++++ b/arch/mips/mm/fault.c +@@ -99,21 +99,13 @@ static void __do_page_fault(struct pt_re + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + retry: +- mmap_read_lock(mm); +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (!vma) +- goto bad_area; +- if (vma->vm_start <= address) +- goto good_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (expand_stack(vma, address)) +- goto bad_area; ++ goto bad_area_nosemaphore; + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ +-good_area: + si_code = SEGV_ACCERR; + + if (write) { diff --git a/queue-6.4/mm-always-expand-the-stack-with-the-mmap-write-lock-held.patch b/queue-6.4/mm-always-expand-the-stack-with-the-mmap-write-lock-held.patch new file mode 100644 index 00000000000..5965c6d7baf --- /dev/null +++ b/queue-6.4/mm-always-expand-the-stack-with-the-mmap-write-lock-held.patch @@ -0,0 +1,671 @@ +From 8d7071af890768438c14db6172cc8f9f4d04e184 Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Sat, 24 Jun 2023 13:45:51 -0700 +Subject: mm: always expand the stack with the mmap write lock held + +From: Linus Torvalds + +commit 8d7071af890768438c14db6172cc8f9f4d04e184 upstream. + +This finishes the job of always holding the mmap write lock when +extending the user stack vma, and removes the 'write_locked' argument +from the vm helper functions again. + +For some cases, we just avoid expanding the stack at all: drivers and +page pinning really shouldn't be extending any stacks. Let's see if any +strange users really wanted that. + +It's worth noting that architectures that weren't converted to the new +lock_mm_and_find_vma() helper function are left using the legacy +"expand_stack()" function, but it has been changed to drop the mmap_lock +and take it for writing while expanding the vma. This makes it fairly +straightforward to convert the remaining architectures. + +As a result of dropping and re-taking the lock, the calling conventions +for this function have also changed, since the old vma may no longer be +valid. So it will now return the new vma if successful, and NULL - and +the lock dropped - if the area could not be extended. 
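
The caller-side shape this produces can be sketched as follows; it is a minimal illustration of the new convention rather than any one architecture's handler, the function name and comments are made up for the sketch, and find_vma(), expand_stack() and the mmap lock helpers are the kernel-internal ones changed in the hunks below:

	static struct vm_area_struct *fault_lookup_sketch(struct mm_struct *mm,
							  unsigned long address)
	{
		struct vm_area_struct *vma;

		mmap_read_lock(mm);
		vma = find_vma(mm, address);
		if (!vma) {
			mmap_read_unlock(mm);	/* failure with the lock still ours to drop */
			return NULL;
		}
		if (address < vma->vm_start) {
			/* May drop mmap_lock, retake it for writing, then downgrade */
			vma = expand_stack(mm, address);
			if (!vma)
				return NULL;	/* lock already dropped by expand_stack() */
		}
		return vma;			/* read lock held; vma may be a fresh lookup */
	}

Note that the error path after expand_stack() must not unlock again, which is why the converted handlers below grow a separate "nosemaphore"-style label.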
+ +Tested-by: Vegard Nossum +Tested-by: John Paul Adrian Glaubitz # ia64 +Tested-by: Frank Scheiner # ia64 +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + arch/ia64/mm/fault.c | 36 ++---------- + arch/m68k/mm/fault.c | 9 ++- + arch/microblaze/mm/fault.c | 5 + + arch/openrisc/mm/fault.c | 5 + + arch/parisc/mm/fault.c | 23 +++----- + arch/s390/mm/fault.c | 5 + + arch/sparc/mm/fault_64.c | 8 +- + arch/um/kernel/trap.c | 11 ++- + drivers/iommu/amd/iommu_v2.c | 4 - + drivers/iommu/iommu-sva.c | 2 + fs/binfmt_elf.c | 2 + fs/exec.c | 4 - + include/linux/mm.h | 16 +---- + mm/gup.c | 6 +- + mm/memory.c | 10 +++ + mm/mmap.c | 121 ++++++++++++++++++++++++++++++++++--------- + mm/nommu.c | 18 ++---- + 17 files changed, 169 insertions(+), 116 deletions(-) + +--- a/arch/ia64/mm/fault.c ++++ b/arch/ia64/mm/fault.c +@@ -110,10 +110,12 @@ retry: + * register backing store that needs to expand upwards, in + * this case vma will be null, but prev_vma will ne non-null + */ +- if (( !vma && prev_vma ) || (address < vma->vm_start) ) +- goto check_expansion; ++ if (( !vma && prev_vma ) || (address < vma->vm_start) ) { ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto bad_area_nosemaphore; ++ } + +- good_area: + code = SEGV_ACCERR; + + /* OK, we've got a good vm_area for this memory area. Check the access permissions: */ +@@ -177,35 +179,9 @@ retry: + mmap_read_unlock(mm); + return; + +- check_expansion: +- if (!(prev_vma && (prev_vma->vm_flags & VM_GROWSUP) && (address == prev_vma->vm_end))) { +- if (!vma) +- goto bad_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start) +- || REGION_OFFSET(address) >= RGN_MAP_LIMIT) +- goto bad_area; +- if (expand_stack(vma, address)) +- goto bad_area; +- } else { +- vma = prev_vma; +- if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start) +- || REGION_OFFSET(address) >= RGN_MAP_LIMIT) +- goto bad_area; +- /* +- * Since the register backing store is accessed sequentially, +- * we disallow growing it by more than a page at a time. 
+- */ +- if (address > vma->vm_end + PAGE_SIZE - sizeof(long)) +- goto bad_area; +- if (expand_upwards(vma, address)) +- goto bad_area; +- } +- goto good_area; +- + bad_area: + mmap_read_unlock(mm); ++ bad_area_nosemaphore: + if ((isr & IA64_ISR_SP) + || ((isr & IA64_ISR_NA) && (isr & IA64_ISR_CODE_MASK) == IA64_ISR_CODE_LFETCH)) + { +--- a/arch/m68k/mm/fault.c ++++ b/arch/m68k/mm/fault.c +@@ -105,8 +105,9 @@ retry: + if (address + 256 < rdusp()) + goto map_err; + } +- if (expand_stack(vma, address)) +- goto map_err; ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto map_err_nosemaphore; + + /* + * Ok, we have a good vm_area for this memory access, so +@@ -196,10 +197,12 @@ bus_err: + goto send_sig; + + map_err: ++ mmap_read_unlock(mm); ++map_err_nosemaphore: + current->thread.signo = SIGSEGV; + current->thread.code = SEGV_MAPERR; + current->thread.faddr = address; +- goto send_sig; ++ return send_fault_sig(regs); + + acc_err: + current->thread.signo = SIGSEGV; +--- a/arch/microblaze/mm/fault.c ++++ b/arch/microblaze/mm/fault.c +@@ -192,8 +192,9 @@ retry: + && (kernel_mode(regs) || !store_updates_sp(regs))) + goto bad_area; + } +- if (expand_stack(vma, address)) +- goto bad_area; ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto bad_area_nosemaphore; + + good_area: + code = SEGV_ACCERR; +--- a/arch/openrisc/mm/fault.c ++++ b/arch/openrisc/mm/fault.c +@@ -127,8 +127,9 @@ retry: + if (address + PAGE_SIZE < regs->sp) + goto bad_area; + } +- if (expand_stack(vma, address)) +- goto bad_area; ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto bad_area_nosemaphore; + + /* + * Ok, we have a good vm_area for this memory access, so +--- a/arch/parisc/mm/fault.c ++++ b/arch/parisc/mm/fault.c +@@ -288,15 +288,19 @@ void do_page_fault(struct pt_regs *regs, + retry: + mmap_read_lock(mm); + vma = find_vma_prev(mm, address, &prev_vma); +- if (!vma || address < vma->vm_start) +- goto check_expansion; ++ if (!vma || address < vma->vm_start) { ++ if (!prev || !(prev->vm_flags & VM_GROWSUP)) ++ goto bad_area; ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto bad_area_nosemaphore; ++ } ++ + /* + * Ok, we have a good vm_area for this memory access. We still need to + * check the access permissions. + */ + +-good_area: +- + if ((vma->vm_flags & acc_type) != acc_type) + goto bad_area; + +@@ -347,17 +351,13 @@ good_area: + mmap_read_unlock(mm); + return; + +-check_expansion: +- vma = prev_vma; +- if (vma && (expand_stack(vma, address) == 0)) +- goto good_area; +- + /* + * Something tried to access memory that isn't in our memory map.. + */ + bad_area: + mmap_read_unlock(mm); + ++bad_area_nosemaphore: + if (user_mode(regs)) { + int signo, si_code; + +@@ -449,7 +449,7 @@ handle_nadtlb_fault(struct pt_regs *regs + { + unsigned long insn = regs->iir; + int breg, treg, xreg, val = 0; +- struct vm_area_struct *vma, *prev_vma; ++ struct vm_area_struct *vma; + struct task_struct *tsk; + struct mm_struct *mm; + unsigned long address; +@@ -485,7 +485,7 @@ handle_nadtlb_fault(struct pt_regs *regs + /* Search for VMA */ + address = regs->ior; + mmap_read_lock(mm); +- vma = find_vma_prev(mm, address, &prev_vma); ++ vma = vma_lookup(mm, address); + mmap_read_unlock(mm); + + /* +@@ -494,7 +494,6 @@ handle_nadtlb_fault(struct pt_regs *regs + */ + acc_type = (insn & 0x40) ? 
VM_WRITE : VM_READ; + if (vma +- && address >= vma->vm_start + && (vma->vm_flags & acc_type) == acc_type) + val = 1; + } +--- a/arch/s390/mm/fault.c ++++ b/arch/s390/mm/fault.c +@@ -457,8 +457,9 @@ retry: + if (unlikely(vma->vm_start > address)) { + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out_up; +- if (expand_stack(vma, address)) +- goto out_up; ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto out; + } + + /* +--- a/arch/sparc/mm/fault_64.c ++++ b/arch/sparc/mm/fault_64.c +@@ -383,8 +383,9 @@ continue_fault: + goto bad_area; + } + } +- if (expand_stack(vma, address)) +- goto bad_area; ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto bad_area_nosemaphore; + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. +@@ -487,8 +488,9 @@ exit_exception: + * Fix it, but check if it's kernel or user first.. + */ + bad_area: +- insn = get_fault_insn(regs, insn); + mmap_read_unlock(mm); ++bad_area_nosemaphore: ++ insn = get_fault_insn(regs, insn); + + handle_kernel_fault: + do_kernel_fault(regs, si_code, fault_code, insn, address); +--- a/arch/um/kernel/trap.c ++++ b/arch/um/kernel/trap.c +@@ -47,14 +47,15 @@ retry: + vma = find_vma(mm, address); + if (!vma) + goto out; +- else if (vma->vm_start <= address) ++ if (vma->vm_start <= address) + goto good_area; +- else if (!(vma->vm_flags & VM_GROWSDOWN)) ++ if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out; +- else if (is_user && !ARCH_IS_STACKGROW(address)) +- goto out; +- else if (expand_stack(vma, address)) ++ if (is_user && !ARCH_IS_STACKGROW(address)) + goto out; ++ vma = expand_stack(mm, address); ++ if (!vma) ++ goto out_nosemaphore; + + good_area: + *code_out = SEGV_ACCERR; +--- a/drivers/iommu/amd/iommu_v2.c ++++ b/drivers/iommu/amd/iommu_v2.c +@@ -485,8 +485,8 @@ static void do_fault(struct work_struct + flags |= FAULT_FLAG_REMOTE; + + mmap_read_lock(mm); +- vma = find_extend_vma(mm, address); +- if (!vma || address < vma->vm_start) ++ vma = vma_lookup(mm, address); ++ if (!vma) + /* failed to get a vma in the right range */ + goto out; + +--- a/drivers/iommu/iommu-sva.c ++++ b/drivers/iommu/iommu-sva.c +@@ -175,7 +175,7 @@ iommu_sva_handle_iopf(struct iommu_fault + + mmap_read_lock(mm); + +- vma = find_extend_vma(mm, prm->addr); ++ vma = vma_lookup(mm, prm->addr); + if (!vma) + /* Unmapped area */ + goto out_put_mm; +--- a/fs/binfmt_elf.c ++++ b/fs/binfmt_elf.c +@@ -322,7 +322,7 @@ create_elf_tables(struct linux_binprm *b + */ + if (mmap_write_lock_killable(mm)) + return -EINTR; +- vma = find_extend_vma_locked(mm, bprm->p, true); ++ vma = find_extend_vma_locked(mm, bprm->p); + mmap_write_unlock(mm); + if (!vma) + return -EFAULT; +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -211,7 +211,7 @@ static struct page *get_arg_page(struct + */ + if (write && pos < vma->vm_start) { + mmap_write_lock(mm); +- ret = expand_downwards(vma, pos, true); ++ ret = expand_downwards(vma, pos); + if (unlikely(ret < 0)) { + mmap_write_unlock(mm); + return NULL; +@@ -859,7 +859,7 @@ int setup_arg_pages(struct linux_binprm + stack_base = vma->vm_end - stack_expand; + #endif + current->mm->start_stack = bprm->p; +- ret = expand_stack_locked(vma, stack_base, true); ++ ret = expand_stack_locked(vma, stack_base); + if (ret) + ret = -EFAULT; + +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -3192,18 +3192,11 @@ extern vm_fault_t filemap_page_mkwrite(s + + extern unsigned long stack_guard_gap; + /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ +-int expand_stack_locked(struct vm_area_struct *vma, 
unsigned long address, +- bool write_locked); +-#define expand_stack(vma,addr) expand_stack_locked(vma,addr,false) ++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address); ++struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr); + + /* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */ +-int expand_downwards(struct vm_area_struct *vma, unsigned long address, +- bool write_locked); +-#if VM_GROWSUP +-extern int expand_upwards(struct vm_area_struct *vma, unsigned long address); +-#else +- #define expand_upwards(vma, address) (0) +-#endif ++int expand_downwards(struct vm_area_struct *vma, unsigned long address); + + /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ + extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); +@@ -3298,9 +3291,8 @@ unsigned long change_prot_numa(struct vm + unsigned long start, unsigned long end); + #endif + +-struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); + struct vm_area_struct *find_extend_vma_locked(struct mm_struct *, +- unsigned long addr, bool write_locked); ++ unsigned long addr); + int remap_pfn_range(struct vm_area_struct *, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t); + int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, +--- a/mm/gup.c ++++ b/mm/gup.c +@@ -1096,7 +1096,7 @@ static long __get_user_pages(struct mm_s + + /* first iteration or cross vma bound */ + if (!vma || start >= vma->vm_end) { +- vma = find_extend_vma(mm, start); ++ vma = vma_lookup(mm, start); + if (!vma && in_gate_area(mm, start)) { + ret = get_gate_page(mm, start & PAGE_MASK, + gup_flags, &vma, +@@ -1265,8 +1265,8 @@ int fixup_user_fault(struct mm_struct *m + fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + + retry: +- vma = find_extend_vma(mm, address); +- if (!vma || address < vma->vm_start) ++ vma = vma_lookup(mm, address); ++ if (!vma) + return -EFAULT; + + if (!vma_permits_fault(vma, fault_flags)) +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -5368,7 +5368,7 @@ struct vm_area_struct *lock_mm_and_find_ + goto fail; + } + +- if (expand_stack_locked(vma, addr, true)) ++ if (expand_stack_locked(vma, addr)) + goto fail; + + success: +@@ -5713,6 +5713,14 @@ int __access_remote_vm(struct mm_struct + if (mmap_read_lock_killable(mm)) + return 0; + ++ /* We might need to expand the stack to access it */ ++ vma = vma_lookup(mm, addr); ++ if (!vma) { ++ vma = expand_stack(mm, addr); ++ if (!vma) ++ return 0; ++ } ++ + /* ignore errors, just check how much was successfully transferred */ + while (len) { + int bytes, ret, offset; +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -1935,8 +1935,7 @@ static int acct_stack_growth(struct vm_a + * PA-RISC uses this for its stack; IA64 for its Register Backing Store. + * vma is the last one with address > vma->vm_end. Have to extend vma. 
+ */ +-int expand_upwards(struct vm_area_struct *vma, unsigned long address, +- bool write_locked) ++static int expand_upwards(struct vm_area_struct *vma, unsigned long address) + { + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *next; +@@ -1960,8 +1959,6 @@ int expand_upwards(struct vm_area_struct + if (gap_addr < address || gap_addr > TASK_SIZE) + gap_addr = TASK_SIZE; + +- if (!write_locked) +- return -EAGAIN; + next = find_vma_intersection(mm, vma->vm_end, gap_addr); + if (next && vma_is_accessible(next)) { + if (!(next->vm_flags & VM_GROWSUP)) +@@ -2030,15 +2027,18 @@ int expand_upwards(struct vm_area_struct + + /* + * vma is the first one with address < vma->vm_start. Have to extend vma. ++ * mmap_lock held for writing. + */ +-int expand_downwards(struct vm_area_struct *vma, unsigned long address, +- bool write_locked) ++int expand_downwards(struct vm_area_struct *vma, unsigned long address) + { + struct mm_struct *mm = vma->vm_mm; + MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start); + struct vm_area_struct *prev; + int error = 0; + ++ if (!(vma->vm_flags & VM_GROWSDOWN)) ++ return -EFAULT; ++ + address &= PAGE_MASK; + if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) + return -EPERM; +@@ -2051,8 +2051,6 @@ int expand_downwards(struct vm_area_stru + vma_is_accessible(prev) && + (address - prev->vm_end < stack_guard_gap)) + return -ENOMEM; +- if (!write_locked && (prev->vm_end == address)) +- return -EAGAIN; + } + + if (mas_preallocate(&mas, GFP_KERNEL)) +@@ -2131,14 +2129,12 @@ static int __init cmdline_parse_stack_gu + __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap); + + #ifdef CONFIG_STACK_GROWSUP +-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, +- bool write_locked) ++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address) + { +- return expand_upwards(vma, address, write_locked); ++ return expand_upwards(vma, address); + } + +-struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, +- unsigned long addr, bool write_locked) ++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr) + { + struct vm_area_struct *vma, *prev; + +@@ -2148,23 +2144,21 @@ struct vm_area_struct *find_extend_vma_l + return vma; + if (!prev) + return NULL; +- if (expand_stack_locked(prev, addr, write_locked)) ++ if (expand_stack_locked(prev, addr)) + return NULL; + if (prev->vm_flags & VM_LOCKED) + populate_vma_page_range(prev, addr, prev->vm_end, NULL); + return prev; + } + #else +-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, +- bool write_locked) ++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address) + { + if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) + return -EINVAL; +- return expand_downwards(vma, address, write_locked); ++ return expand_downwards(vma, address); + } + +-struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, +- unsigned long addr, bool write_locked) ++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr) + { + struct vm_area_struct *vma; + unsigned long start; +@@ -2176,7 +2170,7 @@ struct vm_area_struct *find_extend_vma_l + if (vma->vm_start <= addr) + return vma; + start = vma->vm_start; +- if (expand_stack_locked(vma, addr, write_locked)) ++ if (expand_stack_locked(vma, addr)) + return NULL; + if (vma->vm_flags & VM_LOCKED) + populate_vma_page_range(vma, addr, start, NULL); +@@ -2184,12 +2178,91 @@ struct vm_area_struct *find_extend_vma_l + } + #endif + 
+-struct vm_area_struct *find_extend_vma(struct mm_struct *mm, +- unsigned long addr) ++/* ++ * IA64 has some horrid mapping rules: it can expand both up and down, ++ * but with various special rules. ++ * ++ * We'll get rid of this architecture eventually, so the ugliness is ++ * temporary. ++ */ ++#ifdef CONFIG_IA64 ++static inline bool vma_expand_ok(struct vm_area_struct *vma, unsigned long addr) ++{ ++ return REGION_NUMBER(addr) == REGION_NUMBER(vma->vm_start) && ++ REGION_OFFSET(addr) < RGN_MAP_LIMIT; ++} ++ ++/* ++ * IA64 stacks grow down, but there's a special register backing store ++ * that can grow up. Only sequentially, though, so the new address must ++ * match vm_end. ++ */ ++static inline int vma_expand_up(struct vm_area_struct *vma, unsigned long addr) ++{ ++ if (!vma_expand_ok(vma, addr)) ++ return -EFAULT; ++ if (vma->vm_end != (addr & PAGE_MASK)) ++ return -EFAULT; ++ return expand_upwards(vma, addr); ++} ++ ++static inline bool vma_expand_down(struct vm_area_struct *vma, unsigned long addr) ++{ ++ if (!vma_expand_ok(vma, addr)) ++ return -EFAULT; ++ return expand_downwards(vma, addr); ++} ++ ++#elif defined(CONFIG_STACK_GROWSUP) ++ ++#define vma_expand_up(vma,addr) expand_upwards(vma, addr) ++#define vma_expand_down(vma, addr) (-EFAULT) ++ ++#else ++ ++#define vma_expand_up(vma,addr) (-EFAULT) ++#define vma_expand_down(vma, addr) expand_downwards(vma, addr) ++ ++#endif ++ ++/* ++ * expand_stack(): legacy interface for page faulting. Don't use unless ++ * you have to. ++ * ++ * This is called with the mm locked for reading, drops the lock, takes ++ * the lock for writing, tries to look up a vma again, expands it if ++ * necessary, and downgrades the lock to reading again. ++ * ++ * If no vma is found or it can't be expanded, it returns NULL and has ++ * dropped the lock. 
++ */ ++struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr) + { +- return find_extend_vma_locked(mm, addr, false); ++ struct vm_area_struct *vma, *prev; ++ ++ mmap_read_unlock(mm); ++ if (mmap_write_lock_killable(mm)) ++ return NULL; ++ ++ vma = find_vma_prev(mm, addr, &prev); ++ if (vma && vma->vm_start <= addr) ++ goto success; ++ ++ if (prev && !vma_expand_up(prev, addr)) { ++ vma = prev; ++ goto success; ++ } ++ ++ if (vma && !vma_expand_down(vma, addr)) ++ goto success; ++ ++ mmap_write_unlock(mm); ++ return NULL; ++ ++success: ++ mmap_write_downgrade(mm); ++ return vma; + } +-EXPORT_SYMBOL_GPL(find_extend_vma); + + /* + * Ok - we have the memory areas we should free on a maple tree so release them, +--- a/mm/nommu.c ++++ b/mm/nommu.c +@@ -631,24 +631,20 @@ struct vm_area_struct *find_vma(struct m + EXPORT_SYMBOL(find_vma); + + /* +- * find a VMA +- * - we don't extend stack VMAs under NOMMU conditions +- */ +-struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) +-{ +- return find_vma(mm, addr); +-} +- +-/* + * expand a stack to a given address + * - not supported under NOMMU conditions + */ +-int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, +- bool write_locked) ++int expand_stack_locked(struct vm_area_struct *vma, unsigned long addr) + { + return -ENOMEM; + } + ++struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr) ++{ ++ mmap_read_unlock(mm); ++ return NULL; ++} ++ + /* + * look up the first VMA exactly that exactly matches addr + * - should be called with mm->mmap_lock at least held readlocked diff --git a/queue-6.4/mm-fault-convert-remaining-simple-cases-to-lock_mm_and_find_vma.patch b/queue-6.4/mm-fault-convert-remaining-simple-cases-to-lock_mm_and_find_vma.patch new file mode 100644 index 00000000000..77c9efbf783 --- /dev/null +++ b/queue-6.4/mm-fault-convert-remaining-simple-cases-to-lock_mm_and_find_vma.patch @@ -0,0 +1,489 @@ +From a050ba1e7422f2cc60ff8bfde3f96d34d00cb585 Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Sat, 24 Jun 2023 10:55:38 -0700 +Subject: mm/fault: convert remaining simple cases to lock_mm_and_find_vma() + +From: Linus Torvalds + +commit a050ba1e7422f2cc60ff8bfde3f96d34d00cb585 upstream. + +This does the simple pattern conversion of alpha, arc, csky, hexagon, +loongarch, nios2, sh, sparc32, and xtensa to the lock_mm_and_find_vma() +helper. They all have the regular fault handling pattern without odd +special cases. + +The remaining architectures all have something that keeps us from a +straightforward conversion: ia64 and parisc have stacks that can grow +both up as well as down (and ia64 has special address region checks). + +And m68k, microblaze, openrisc, sparc64, and um end up having extra +rules about only expanding the stack down a limited amount below the +user space stack pointer. That is something that x86 used to do too +(long long ago), and it probably could just be skipped, but it still +makes the conversion less than trivial. + +Note that this conversion was done manually and with the exception of +alpha without any build testing, because I have a fairly limited cross- +building environment. The cases are all simple, and I went through the +changes several times, but... 
+ +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + arch/alpha/Kconfig | 1 + + arch/alpha/mm/fault.c | 13 +++---------- + arch/arc/Kconfig | 1 + + arch/arc/mm/fault.c | 11 +++-------- + arch/csky/Kconfig | 1 + + arch/csky/mm/fault.c | 22 +++++----------------- + arch/hexagon/Kconfig | 1 + + arch/hexagon/mm/vm_fault.c | 18 ++++-------------- + arch/loongarch/Kconfig | 1 + + arch/loongarch/mm/fault.c | 16 ++++++---------- + arch/nios2/Kconfig | 1 + + arch/nios2/mm/fault.c | 17 ++--------------- + arch/sh/Kconfig | 1 + + arch/sh/mm/fault.c | 17 ++--------------- + arch/sparc/Kconfig | 1 + + arch/sparc/mm/fault_32.c | 32 ++++++++------------------------ + arch/xtensa/Kconfig | 1 + + arch/xtensa/mm/fault.c | 14 +++----------- + 18 files changed, 45 insertions(+), 124 deletions(-) + +--- a/arch/alpha/Kconfig ++++ b/arch/alpha/Kconfig +@@ -30,6 +30,7 @@ config ALPHA + select HAS_IOPORT + select HAVE_ARCH_AUDITSYSCALL + select HAVE_MOD_ARCH_SPECIFIC ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA + select ODD_RT_SIGACTION + select OLD_SIGSUSPEND +--- a/arch/alpha/mm/fault.c ++++ b/arch/alpha/mm/fault.c +@@ -119,20 +119,12 @@ do_page_fault(unsigned long address, uns + flags |= FAULT_FLAG_USER; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + retry: +- mmap_read_lock(mm); +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (!vma) +- goto bad_area; +- if (vma->vm_start <= address) +- goto good_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (expand_stack(vma, address)) +- goto bad_area; ++ goto bad_area_nosemaphore; + + /* Ok, we have a good vm_area for this memory access, so + we can handle it. */ +- good_area: + si_code = SEGV_ACCERR; + if (cause < 0) { + if (!(vma->vm_flags & VM_EXEC)) +@@ -192,6 +184,7 @@ retry: + bad_area: + mmap_read_unlock(mm); + ++ bad_area_nosemaphore: + if (user_mode(regs)) + goto do_sigsegv; + +--- a/arch/arc/Kconfig ++++ b/arch/arc/Kconfig +@@ -41,6 +41,7 @@ config ARC + select HAVE_PERF_EVENTS + select HAVE_SYSCALL_TRACEPOINTS + select IRQ_DOMAIN ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA + select OF + select OF_EARLY_FLATTREE +--- a/arch/arc/mm/fault.c ++++ b/arch/arc/mm/fault.c +@@ -113,15 +113,9 @@ void do_page_fault(unsigned long address + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + retry: +- mmap_read_lock(mm); +- +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (!vma) +- goto bad_area; +- if (unlikely(address < vma->vm_start)) { +- if (!(vma->vm_flags & VM_GROWSDOWN) || expand_stack(vma, address)) +- goto bad_area; +- } ++ goto bad_area_nosemaphore; + + /* + * vm_area is good, now check permissions for this memory access +@@ -161,6 +155,7 @@ retry: + bad_area: + mmap_read_unlock(mm); + ++bad_area_nosemaphore: + /* + * Major/minor page fault accounting + * (in case of retry we only land here once) +--- a/arch/csky/Kconfig ++++ b/arch/csky/Kconfig +@@ -96,6 +96,7 @@ config CSKY + select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_STACKPROTECTOR + select HAVE_SYSCALL_TRACEPOINTS ++ select LOCK_MM_AND_FIND_VMA + select MAY_HAVE_SPARSE_IRQ + select MODULES_USE_ELF_RELA if MODULES + select OF +--- a/arch/csky/mm/fault.c ++++ b/arch/csky/mm/fault.c +@@ -97,13 +97,12 @@ static inline void mm_fault_error(struct + BUG(); + } + +-static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr) ++static inline void bad_area_nosemaphore(struct pt_regs *regs, 
struct mm_struct *mm, int code, unsigned long addr) + { + /* + * Something tried to access memory that isn't in our memory map. + * Fix it, but check if it's kernel or user first. + */ +- mmap_read_unlock(mm); + /* User mode accesses just cause a SIGSEGV */ + if (user_mode(regs)) { + do_trap(regs, SIGSEGV, code, addr); +@@ -238,20 +237,9 @@ asmlinkage void do_page_fault(struct pt_ + if (is_write(regs)) + flags |= FAULT_FLAG_WRITE; + retry: +- mmap_read_lock(mm); +- vma = find_vma(mm, addr); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (unlikely(!vma)) { +- bad_area(regs, mm, code, addr); +- return; +- } +- if (likely(vma->vm_start <= addr)) +- goto good_area; +- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { +- bad_area(regs, mm, code, addr); +- return; +- } +- if (unlikely(expand_stack(vma, addr))) { +- bad_area(regs, mm, code, addr); ++ bad_area_nosemaphore(regs, mm, code, addr); + return; + } + +@@ -259,11 +247,11 @@ retry: + * Ok, we have a good vm_area for this memory access, so + * we can handle it. + */ +-good_area: + code = SEGV_ACCERR; + + if (unlikely(access_error(regs, vma))) { +- bad_area(regs, mm, code, addr); ++ mmap_read_unlock(mm); ++ bad_area_nosemaphore(regs, mm, code, addr); + return; + } + +--- a/arch/hexagon/Kconfig ++++ b/arch/hexagon/Kconfig +@@ -28,6 +28,7 @@ config HEXAGON + select GENERIC_SMP_IDLE_THREAD + select STACKTRACE_SUPPORT + select GENERIC_CLOCKEVENTS_BROADCAST ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA + select GENERIC_CPU_DEVICES + select ARCH_WANT_LD_ORPHAN_WARN +--- a/arch/hexagon/mm/vm_fault.c ++++ b/arch/hexagon/mm/vm_fault.c +@@ -57,21 +57,10 @@ void do_page_fault(unsigned long address + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + retry: +- mmap_read_lock(mm); +- vma = find_vma(mm, address); +- if (!vma) +- goto bad_area; ++ vma = lock_mm_and_find_vma(mm, address, regs); ++ if (unlikely(!vma)) ++ goto bad_area_nosemaphore; + +- if (vma->vm_start <= address) +- goto good_area; +- +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- +- if (expand_stack(vma, address)) +- goto bad_area; +- +-good_area: + /* Address space is OK. Now check access rights. */ + si_code = SEGV_ACCERR; + +@@ -143,6 +132,7 @@ good_area: + bad_area: + mmap_read_unlock(mm); + ++bad_area_nosemaphore: + if (user_mode(regs)) { + force_sig_fault(SIGSEGV, si_code, (void __user *)address); + return; +--- a/arch/loongarch/Kconfig ++++ b/arch/loongarch/Kconfig +@@ -130,6 +130,7 @@ config LOONGARCH + select HAVE_VIRT_CPU_ACCOUNTING_GEN if !SMP + select IRQ_FORCED_THREADING + select IRQ_LOONGARCH_CPU ++ select LOCK_MM_AND_FIND_VMA + select MMU_GATHER_MERGE_VMAS if MMU + select MODULES_USE_ELF_RELA if MODULES + select NEED_PER_CPU_EMBED_FIRST_CHUNK +--- a/arch/loongarch/mm/fault.c ++++ b/arch/loongarch/mm/fault.c +@@ -169,22 +169,18 @@ static void __kprobes __do_page_fault(st + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + retry: +- mmap_read_lock(mm); +- vma = find_vma(mm, address); +- if (!vma) +- goto bad_area; +- if (vma->vm_start <= address) +- goto good_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (!expand_stack(vma, address)) +- goto good_area; ++ vma = lock_mm_and_find_vma(mm, address, regs); ++ if (unlikely(!vma)) ++ goto bad_area_nosemaphore; ++ goto good_area; ++ + /* + * Something tried to access memory that isn't in our memory map.. + * Fix it, but check if it's kernel or user first.. 
+ */ + bad_area: + mmap_read_unlock(mm); ++bad_area_nosemaphore: + do_sigsegv(regs, write, address, si_code); + return; + +--- a/arch/nios2/Kconfig ++++ b/arch/nios2/Kconfig +@@ -16,6 +16,7 @@ config NIOS2 + select HAVE_ARCH_TRACEHOOK + select HAVE_ARCH_KGDB + select IRQ_DOMAIN ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA + select OF + select OF_EARLY_FLATTREE +--- a/arch/nios2/mm/fault.c ++++ b/arch/nios2/mm/fault.c +@@ -86,27 +86,14 @@ asmlinkage void do_page_fault(struct pt_ + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + +- if (!mmap_read_trylock(mm)) { +- if (!user_mode(regs) && !search_exception_tables(regs->ea)) +- goto bad_area_nosemaphore; + retry: +- mmap_read_lock(mm); +- } +- +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (!vma) +- goto bad_area; +- if (vma->vm_start <= address) +- goto good_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (expand_stack(vma, address)) +- goto bad_area; ++ goto bad_area_nosemaphore; + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ +-good_area: + code = SEGV_ACCERR; + + switch (cause) { +--- a/arch/sh/Kconfig ++++ b/arch/sh/Kconfig +@@ -59,6 +59,7 @@ config SUPERH + select HAVE_STACKPROTECTOR + select HAVE_SYSCALL_TRACEPOINTS + select IRQ_FORCED_THREADING ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA + select NEED_SG_DMA_LENGTH + select NO_DMA if !MMU && !DMA_COHERENT +--- a/arch/sh/mm/fault.c ++++ b/arch/sh/mm/fault.c +@@ -439,21 +439,9 @@ asmlinkage void __kprobes do_page_fault( + } + + retry: +- mmap_read_lock(mm); +- +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (unlikely(!vma)) { +- bad_area(regs, error_code, address); +- return; +- } +- if (likely(vma->vm_start <= address)) +- goto good_area; +- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { +- bad_area(regs, error_code, address); +- return; +- } +- if (unlikely(expand_stack(vma, address))) { +- bad_area(regs, error_code, address); ++ bad_area_nosemaphore(regs, error_code, address); + return; + } + +@@ -461,7 +449,6 @@ retry: + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ +-good_area: + if (unlikely(access_error(error_code, vma))) { + bad_area_access_error(regs, error_code, address); + return; +--- a/arch/sparc/Kconfig ++++ b/arch/sparc/Kconfig +@@ -57,6 +57,7 @@ config SPARC32 + select DMA_DIRECT_REMAP + select GENERIC_ATOMIC64 + select HAVE_UID16 ++ select LOCK_MM_AND_FIND_VMA + select OLD_SIGACTION + select ZONE_DMA + +--- a/arch/sparc/mm/fault_32.c ++++ b/arch/sparc/mm/fault_32.c +@@ -143,28 +143,19 @@ asmlinkage void do_sparc_fault(struct pt + if (pagefault_disabled() || !mm) + goto no_context; + ++ if (!from_user && address >= PAGE_OFFSET) ++ goto no_context; ++ + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + + retry: +- mmap_read_lock(mm); +- +- if (!from_user && address >= PAGE_OFFSET) +- goto bad_area; +- +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (!vma) +- goto bad_area; +- if (vma->vm_start <= address) +- goto good_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (expand_stack(vma, address)) +- goto bad_area; ++ goto bad_area_nosemaphore; + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. 
+ */ +-good_area: + code = SEGV_ACCERR; + if (write) { + if (!(vma->vm_flags & VM_WRITE)) +@@ -321,17 +312,9 @@ static void force_user_fault(unsigned lo + + code = SEGV_MAPERR; + +- mmap_read_lock(mm); +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (!vma) +- goto bad_area; +- if (vma->vm_start <= address) +- goto good_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (expand_stack(vma, address)) +- goto bad_area; +-good_area: ++ goto bad_area_nosemaphore; + code = SEGV_ACCERR; + if (write) { + if (!(vma->vm_flags & VM_WRITE)) +@@ -350,6 +333,7 @@ good_area: + return; + bad_area: + mmap_read_unlock(mm); ++bad_area_nosemaphore: + __do_fault_siginfo(code, SIGSEGV, tsk->thread.kregs, address); + return; + +--- a/arch/xtensa/Kconfig ++++ b/arch/xtensa/Kconfig +@@ -49,6 +49,7 @@ config XTENSA + select HAVE_SYSCALL_TRACEPOINTS + select HAVE_VIRT_CPU_ACCOUNTING_GEN + select IRQ_DOMAIN ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA + select PERF_USE_VMALLOC + select TRACE_IRQFLAGS_SUPPORT +--- a/arch/xtensa/mm/fault.c ++++ b/arch/xtensa/mm/fault.c +@@ -130,23 +130,14 @@ void do_page_fault(struct pt_regs *regs) + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + + retry: +- mmap_read_lock(mm); +- vma = find_vma(mm, address); +- ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (!vma) +- goto bad_area; +- if (vma->vm_start <= address) +- goto good_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; +- if (expand_stack(vma, address)) +- goto bad_area; ++ goto bad_area_nosemaphore; + + /* Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ + +-good_area: + code = SEGV_ACCERR; + + if (is_write) { +@@ -205,6 +196,7 @@ good_area: + */ + bad_area: + mmap_read_unlock(mm); ++bad_area_nosemaphore: + if (user_mode(regs)) { + force_sig_fault(SIGSEGV, code, (void *) address); + return; diff --git a/queue-6.4/mm-introduce-new-lock_mm_and_find_vma-page-fault-helper.patch b/queue-6.4/mm-introduce-new-lock_mm_and_find_vma-page-fault-helper.patch new file mode 100644 index 00000000000..88491d4c4ec --- /dev/null +++ b/queue-6.4/mm-introduce-new-lock_mm_and_find_vma-page-fault-helper.patch @@ -0,0 +1,295 @@ +From c2508ec5a58db67093f4fb8bf89a9a7c53a109e9 Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Thu, 15 Jun 2023 15:17:36 -0700 +Subject: mm: introduce new 'lock_mm_and_find_vma()' page fault helper + +From: Linus Torvalds + +commit c2508ec5a58db67093f4fb8bf89a9a7c53a109e9 upstream. + +.. and make x86 use it. + +This basically extracts the existing x86 "find and expand faulting vma" +code, but extends it to also take the mmap lock for writing in case we +actually do need to expand the vma. + +We've historically short-circuited that case, and have some rather ugly +special logic to serialize the stack segment expansion (since we only +hold the mmap lock for reading) that doesn't match the normal VM +locking. + +That slight violation of locking worked well, right up until it didn't: +the maple tree code really does want proper locking even for simple +extension of an existing vma. + +So extract the code for "look up the vma of the fault" from x86, fix it +up to do the necessary write locking, and make it available as a helper +function for other architectures that can use the common helper. + +Note: I say "common helper", but it really only handles the normal +stack-grows-down case. Which is all architectures except for PA-RISC +and IA64. 
So some rare architectures can't use the helper, but if they +care they'll just need to open-code this logic. + +It's also worth pointing out that this code really would like to have an +optimistic "mmap_upgrade_trylock()" to make it quicker to go from a +read-lock (for the common case) to taking the write lock (for having to +extend the vma) in the normal single-threaded situation where there is +no other locking activity. + +But that _is_ all the very uncommon special case, so while it would be +nice to have such an operation, it probably doesn't matter in reality. +I did put in the skeleton code for such a possible future expansion, +even if it only acts as pseudo-documentation for what we're doing. + +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/Kconfig | 1 + arch/x86/mm/fault.c | 52 ---------------------- + include/linux/mm.h | 2 + mm/Kconfig | 4 + + mm/memory.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++++++++ + 5 files changed, 130 insertions(+), 50 deletions(-) + +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -276,6 +276,7 @@ config X86 + select HAVE_GENERIC_VDSO + select HOTPLUG_SMT if SMP + select IRQ_FORCED_THREADING ++ select LOCK_MM_AND_FIND_VMA + select NEED_PER_CPU_EMBED_FIRST_CHUNK + select NEED_PER_CPU_PAGE_FIRST_CHUNK + select NEED_SG_DMA_LENGTH +--- a/arch/x86/mm/fault.c ++++ b/arch/x86/mm/fault.c +@@ -880,12 +880,6 @@ __bad_area(struct pt_regs *regs, unsigne + __bad_area_nosemaphore(regs, error_code, address, pkey, si_code); + } + +-static noinline void +-bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) +-{ +- __bad_area(regs, error_code, address, 0, SEGV_MAPERR); +-} +- + static inline bool bad_area_access_from_pkeys(unsigned long error_code, + struct vm_area_struct *vma) + { +@@ -1366,51 +1360,10 @@ void do_user_addr_fault(struct pt_regs * + lock_mmap: + #endif /* CONFIG_PER_VMA_LOCK */ + +- /* +- * Kernel-mode access to the user address space should only occur +- * on well-defined single instructions listed in the exception +- * tables. But, an erroneous kernel fault occurring outside one of +- * those areas which also holds mmap_lock might deadlock attempting +- * to validate the fault against the address space. +- * +- * Only do the expensive exception table search when we might be at +- * risk of a deadlock. This happens if we +- * 1. Failed to acquire mmap_lock, and +- * 2. The access did not originate in userspace. +- */ +- if (unlikely(!mmap_read_trylock(mm))) { +- if (!user_mode(regs) && !search_exception_tables(regs->ip)) { +- /* +- * Fault from code in kernel from +- * which we do not expect faults. +- */ +- bad_area_nosemaphore(regs, error_code, address); +- return; +- } + retry: +- mmap_read_lock(mm); +- } else { +- /* +- * The above down_read_trylock() might have succeeded in +- * which case we'll have missed the might_sleep() from +- * down_read(): +- */ +- might_sleep(); +- } +- +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (unlikely(!vma)) { +- bad_area(regs, error_code, address); +- return; +- } +- if (likely(vma->vm_start <= address)) +- goto good_area; +- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { +- bad_area(regs, error_code, address); +- return; +- } +- if (unlikely(expand_stack(vma, address))) { +- bad_area(regs, error_code, address); ++ bad_area_nosemaphore(regs, error_code, address); + return; + } + +@@ -1418,7 +1371,6 @@ retry: + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. 
+ */ +-good_area: + if (unlikely(access_error(error_code, vma))) { + bad_area_access_error(regs, error_code, address, vma); + return; +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -2325,6 +2325,8 @@ void unmap_mapping_pages(struct address_ + pgoff_t start, pgoff_t nr, bool even_cows); + void unmap_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen, int even_cows); ++struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, ++ unsigned long address, struct pt_regs *regs); + #else + static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -1206,6 +1206,10 @@ config PER_VMA_LOCK + This feature allows locking each virtual memory area separately when + handling page faults instead of taking mmap_lock. + ++config LOCK_MM_AND_FIND_VMA ++ bool ++ depends on !STACK_GROWSUP ++ + source "mm/damon/Kconfig" + + endmenu +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -5262,6 +5262,127 @@ out: + } + EXPORT_SYMBOL_GPL(handle_mm_fault); + ++#ifdef CONFIG_LOCK_MM_AND_FIND_VMA ++#include ++ ++static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) ++{ ++ /* Even if this succeeds, make it clear we *might* have slept */ ++ if (likely(mmap_read_trylock(mm))) { ++ might_sleep(); ++ return true; ++ } ++ ++ if (regs && !user_mode(regs)) { ++ unsigned long ip = instruction_pointer(regs); ++ if (!search_exception_tables(ip)) ++ return false; ++ } ++ ++ mmap_read_lock(mm); ++ return true; ++} ++ ++static inline bool mmap_upgrade_trylock(struct mm_struct *mm) ++{ ++ /* ++ * We don't have this operation yet. ++ * ++ * It should be easy enough to do: it's basically a ++ * atomic_long_try_cmpxchg_acquire() ++ * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but ++ * it also needs the proper lockdep magic etc. ++ */ ++ return false; ++} ++ ++static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) ++{ ++ mmap_read_unlock(mm); ++ if (regs && !user_mode(regs)) { ++ unsigned long ip = instruction_pointer(regs); ++ if (!search_exception_tables(ip)) ++ return false; ++ } ++ mmap_write_lock(mm); ++ return true; ++} ++ ++/* ++ * Helper for page fault handling. ++ * ++ * This is kind of equivalend to "mmap_read_lock()" followed ++ * by "find_extend_vma()", except it's a lot more careful about ++ * the locking (and will drop the lock on failure). ++ * ++ * For example, if we have a kernel bug that causes a page ++ * fault, we don't want to just use mmap_read_lock() to get ++ * the mm lock, because that would deadlock if the bug were ++ * to happen while we're holding the mm lock for writing. ++ * ++ * So this checks the exception tables on kernel faults in ++ * order to only do this all for instructions that are actually ++ * expected to fault. ++ * ++ * We can also actually take the mm lock for writing if we ++ * need to extend the vma, which helps the VM layer a lot. ++ */ ++struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, ++ unsigned long addr, struct pt_regs *regs) ++{ ++ struct vm_area_struct *vma; ++ ++ if (!get_mmap_lock_carefully(mm, regs)) ++ return NULL; ++ ++ vma = find_vma(mm, addr); ++ if (likely(vma && (vma->vm_start <= addr))) ++ return vma; ++ ++ /* ++ * Well, dang. We might still be successful, but only ++ * if we can extend a vma to do so. 
++ */ ++ if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) { ++ mmap_read_unlock(mm); ++ return NULL; ++ } ++ ++ /* ++ * We can try to upgrade the mmap lock atomically, ++ * in which case we can continue to use the vma ++ * we already looked up. ++ * ++ * Otherwise we'll have to drop the mmap lock and ++ * re-take it, and also look up the vma again, ++ * re-checking it. ++ */ ++ if (!mmap_upgrade_trylock(mm)) { ++ if (!upgrade_mmap_lock_carefully(mm, regs)) ++ return NULL; ++ ++ vma = find_vma(mm, addr); ++ if (!vma) ++ goto fail; ++ if (vma->vm_start <= addr) ++ goto success; ++ if (!(vma->vm_flags & VM_GROWSDOWN)) ++ goto fail; ++ } ++ ++ if (expand_stack(vma, addr)) ++ goto fail; ++ ++success: ++ mmap_write_downgrade(mm); ++ return vma; ++ ++fail: ++ mmap_write_unlock(mm); ++ return NULL; ++} ++#endif ++ + #ifdef CONFIG_PER_VMA_LOCK + /* + * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be diff --git a/queue-6.4/mm-make-find_extend_vma-fail-if-write-lock-not-held.patch b/queue-6.4/mm-make-find_extend_vma-fail-if-write-lock-not-held.patch new file mode 100644 index 00000000000..ee7ca86c04d --- /dev/null +++ b/queue-6.4/mm-make-find_extend_vma-fail-if-write-lock-not-held.patch @@ -0,0 +1,242 @@ +From f440fa1ac955e2898893f9301568435eb5cdfc4b Mon Sep 17 00:00:00 2001 +From: "Liam R. Howlett" +Date: Fri, 16 Jun 2023 15:58:54 -0700 +Subject: mm: make find_extend_vma() fail if write lock not held + +From: Liam R. Howlett + +commit f440fa1ac955e2898893f9301568435eb5cdfc4b upstream. + +Make calls to extend_vma() and find_extend_vma() fail if the write lock +is required. + +To avoid making this a flag-day event, this still allows the old +read-locking case for the trivial situations, and passes in a flag to +say "is it write-locked". That way write-lockers can say "yes, I'm +being careful", and legacy users will continue to work in all the common +cases until they have been fully converted to the new world order. + +Co-Developed-by: Matthew Wilcox (Oracle) +Signed-off-by: Matthew Wilcox (Oracle) +Signed-off-by: Liam R. Howlett +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + fs/binfmt_elf.c | 6 +++--- + fs/exec.c | 5 +++-- + include/linux/mm.h | 10 +++++++--- + mm/memory.c | 2 +- + mm/mmap.c | 50 +++++++++++++++++++++++++++++++++----------------- + mm/nommu.c | 3 ++- + 6 files changed, 49 insertions(+), 27 deletions(-) + +--- a/fs/binfmt_elf.c ++++ b/fs/binfmt_elf.c +@@ -320,10 +320,10 @@ create_elf_tables(struct linux_binprm *b + * Grow the stack manually; some architectures have a limit on how + * far ahead a user-space access may be in order to grow the stack. 
+ */ +- if (mmap_read_lock_killable(mm)) ++ if (mmap_write_lock_killable(mm)) + return -EINTR; +- vma = find_extend_vma(mm, bprm->p); +- mmap_read_unlock(mm); ++ vma = find_extend_vma_locked(mm, bprm->p, true); ++ mmap_write_unlock(mm); + if (!vma) + return -EFAULT; + +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -205,7 +205,8 @@ static struct page *get_arg_page(struct + + #ifdef CONFIG_STACK_GROWSUP + if (write) { +- ret = expand_downwards(bprm->vma, pos); ++ /* We claim to hold the lock - nobody to race with */ ++ ret = expand_downwards(bprm->vma, pos, true); + if (ret < 0) + return NULL; + } +@@ -853,7 +854,7 @@ int setup_arg_pages(struct linux_binprm + stack_base = vma->vm_end - stack_expand; + #endif + current->mm->start_stack = bprm->p; +- ret = expand_stack(vma, stack_base); ++ ret = expand_stack_locked(vma, stack_base, true); + if (ret) + ret = -EFAULT; + +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -3192,11 +3192,13 @@ extern vm_fault_t filemap_page_mkwrite(s + + extern unsigned long stack_guard_gap; + /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ +-extern int expand_stack(struct vm_area_struct *vma, unsigned long address); ++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, ++ bool write_locked); ++#define expand_stack(vma,addr) expand_stack_locked(vma,addr,false) + + /* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */ +-extern int expand_downwards(struct vm_area_struct *vma, +- unsigned long address); ++int expand_downwards(struct vm_area_struct *vma, unsigned long address, ++ bool write_locked); + #if VM_GROWSUP + extern int expand_upwards(struct vm_area_struct *vma, unsigned long address); + #else +@@ -3297,6 +3299,8 @@ unsigned long change_prot_numa(struct vm + #endif + + struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); ++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *, ++ unsigned long addr, bool write_locked); + int remap_pfn_range(struct vm_area_struct *, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t); + int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -5368,7 +5368,7 @@ struct vm_area_struct *lock_mm_and_find_ + goto fail; + } + +- if (expand_stack(vma, addr)) ++ if (expand_stack_locked(vma, addr, true)) + goto fail; + + success: +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -1935,7 +1935,8 @@ static int acct_stack_growth(struct vm_a + * PA-RISC uses this for its stack; IA64 for its Register Backing Store. + * vma is the last one with address > vma->vm_end. Have to extend vma. + */ +-int expand_upwards(struct vm_area_struct *vma, unsigned long address) ++int expand_upwards(struct vm_area_struct *vma, unsigned long address, ++ bool write_locked) + { + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *next; +@@ -1959,6 +1960,8 @@ int expand_upwards(struct vm_area_struct + if (gap_addr < address || gap_addr > TASK_SIZE) + gap_addr = TASK_SIZE; + ++ if (!write_locked) ++ return -EAGAIN; + next = find_vma_intersection(mm, vma->vm_end, gap_addr); + if (next && vma_is_accessible(next)) { + if (!(next->vm_flags & VM_GROWSUP)) +@@ -2028,7 +2031,8 @@ int expand_upwards(struct vm_area_struct + /* + * vma is the first one with address < vma->vm_start. Have to extend vma. 
+ */ +-int expand_downwards(struct vm_area_struct *vma, unsigned long address) ++int expand_downwards(struct vm_area_struct *vma, unsigned long address, ++ bool write_locked) + { + struct mm_struct *mm = vma->vm_mm; + MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start); +@@ -2042,10 +2046,13 @@ int expand_downwards(struct vm_area_stru + /* Enforce stack_guard_gap */ + prev = mas_prev(&mas, 0); + /* Check that both stack segments have the same anon_vma? */ +- if (prev && !(prev->vm_flags & VM_GROWSDOWN) && +- vma_is_accessible(prev)) { +- if (address - prev->vm_end < stack_guard_gap) ++ if (prev) { ++ if (!(prev->vm_flags & VM_GROWSDOWN) && ++ vma_is_accessible(prev) && ++ (address - prev->vm_end < stack_guard_gap)) + return -ENOMEM; ++ if (!write_locked && (prev->vm_end == address)) ++ return -EAGAIN; + } + + if (mas_preallocate(&mas, GFP_KERNEL)) +@@ -2124,13 +2131,14 @@ static int __init cmdline_parse_stack_gu + __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap); + + #ifdef CONFIG_STACK_GROWSUP +-int expand_stack(struct vm_area_struct *vma, unsigned long address) ++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, ++ bool write_locked) + { +- return expand_upwards(vma, address); ++ return expand_upwards(vma, address, write_locked); + } + +-struct vm_area_struct * +-find_extend_vma(struct mm_struct *mm, unsigned long addr) ++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, ++ unsigned long addr, bool write_locked) + { + struct vm_area_struct *vma, *prev; + +@@ -2138,20 +2146,25 @@ find_extend_vma(struct mm_struct *mm, un + vma = find_vma_prev(mm, addr, &prev); + if (vma && (vma->vm_start <= addr)) + return vma; +- if (!prev || expand_stack(prev, addr)) ++ if (!prev) ++ return NULL; ++ if (expand_stack_locked(prev, addr, write_locked)) + return NULL; + if (prev->vm_flags & VM_LOCKED) + populate_vma_page_range(prev, addr, prev->vm_end, NULL); + return prev; + } + #else +-int expand_stack(struct vm_area_struct *vma, unsigned long address) ++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, ++ bool write_locked) + { +- return expand_downwards(vma, address); ++ if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) ++ return -EINVAL; ++ return expand_downwards(vma, address, write_locked); + } + +-struct vm_area_struct * +-find_extend_vma(struct mm_struct *mm, unsigned long addr) ++struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, ++ unsigned long addr, bool write_locked) + { + struct vm_area_struct *vma; + unsigned long start; +@@ -2162,10 +2175,8 @@ find_extend_vma(struct mm_struct *mm, un + return NULL; + if (vma->vm_start <= addr) + return vma; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- return NULL; + start = vma->vm_start; +- if (expand_stack(vma, addr)) ++ if (expand_stack_locked(vma, addr, write_locked)) + return NULL; + if (vma->vm_flags & VM_LOCKED) + populate_vma_page_range(vma, addr, start, NULL); +@@ -2173,6 +2184,11 @@ find_extend_vma(struct mm_struct *mm, un + } + #endif + ++struct vm_area_struct *find_extend_vma(struct mm_struct *mm, ++ unsigned long addr) ++{ ++ return find_extend_vma_locked(mm, addr, false); ++} + EXPORT_SYMBOL_GPL(find_extend_vma); + + /* +--- a/mm/nommu.c ++++ b/mm/nommu.c +@@ -643,7 +643,8 @@ struct vm_area_struct *find_extend_vma(s + * expand a stack to a given address + * - not supported under NOMMU conditions + */ +-int expand_stack(struct vm_area_struct *vma, unsigned long address) ++int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, ++ bool 
write_locked) + { + return -ENOMEM; + } diff --git a/queue-6.4/mm-make-the-page-fault-mmap-locking-killable.patch b/queue-6.4/mm-make-the-page-fault-mmap-locking-killable.patch new file mode 100644 index 00000000000..9116f6a7524 --- /dev/null +++ b/queue-6.4/mm-make-the-page-fault-mmap-locking-killable.patch @@ -0,0 +1,46 @@ +From eda0047296a16d65a7f2bc60a408f70d178b2014 Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Thu, 15 Jun 2023 16:17:48 -0700 +Subject: mm: make the page fault mmap locking killable + +From: Linus Torvalds + +commit eda0047296a16d65a7f2bc60a408f70d178b2014 upstream. + +This is done as a separate patch from introducing the new +lock_mm_and_find_vma() helper, because while it's an obvious change, +it's not what x86 used to do in this area. + +We already abort the page fault on fatal signals anyway, so why should +we wait for the mmap lock only to then abort later? With the new helper +function that returns without the lock held on failure anyway, this is +particularly easy and straightforward. + +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/memory.c | 6 ++---- + 1 file changed, 2 insertions(+), 4 deletions(-) + +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -5279,8 +5279,7 @@ static inline bool get_mmap_lock_careful + return false; + } + +- mmap_read_lock(mm); +- return true; ++ return !mmap_read_lock_killable(mm); + } + + static inline bool mmap_upgrade_trylock(struct mm_struct *mm) +@@ -5304,8 +5303,7 @@ static inline bool upgrade_mmap_lock_car + if (!search_exception_tables(ip)) + return false; + } +- mmap_write_lock(mm); +- return true; ++ return !mmap_write_lock_killable(mm); + } + + /* diff --git a/queue-6.4/powerpc-mm-convert-coprocessor-fault-to-lock_mm_and_find_vma.patch b/queue-6.4/powerpc-mm-convert-coprocessor-fault-to-lock_mm_and_find_vma.patch new file mode 100644 index 00000000000..f29786928b8 --- /dev/null +++ b/queue-6.4/powerpc-mm-convert-coprocessor-fault-to-lock_mm_and_find_vma.patch @@ -0,0 +1,47 @@ +From 2cd76c50d0b41cec5c87abfcdf25b236a2793fb6 Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Sat, 24 Jun 2023 11:17:05 -0700 +Subject: powerpc/mm: convert coprocessor fault to lock_mm_and_find_vma() + +From: Linus Torvalds + +commit 2cd76c50d0b41cec5c87abfcdf25b236a2793fb6 upstream. + +This is one of the simple cases, except there's no pt_regs pointer. +Which is fine, as lock_mm_and_find_vma() is set up to work fine with a +NULL pt_regs. + +Powerpc already enabled LOCK_MM_AND_FIND_VMA for the main CPU faulting, +so we can just use the helper without any extra work. 
+ +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + arch/powerpc/mm/copro_fault.c | 14 +++----------- + 1 file changed, 3 insertions(+), 11 deletions(-) + +--- a/arch/powerpc/mm/copro_fault.c ++++ b/arch/powerpc/mm/copro_fault.c +@@ -33,19 +33,11 @@ int copro_handle_mm_fault(struct mm_stru + if (mm->pgd == NULL) + return -EFAULT; + +- mmap_read_lock(mm); +- ret = -EFAULT; +- vma = find_vma(mm, ea); ++ vma = lock_mm_and_find_vma(mm, ea, NULL); + if (!vma) +- goto out_unlock; +- +- if (ea < vma->vm_start) { +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto out_unlock; +- if (expand_stack(vma, ea)) +- goto out_unlock; +- } ++ return -EFAULT; + ++ ret = -EFAULT; + is_write = dsisr & DSISR_ISSTORE; + if (is_write) { + if (!(vma->vm_flags & VM_WRITE)) diff --git a/queue-6.4/powerpc-mm-convert-to-using-lock_mm_and_find_vma.patch b/queue-6.4/powerpc-mm-convert-to-using-lock_mm_and_find_vma.patch new file mode 100644 index 00000000000..0fcbcfb8e60 --- /dev/null +++ b/queue-6.4/powerpc-mm-convert-to-using-lock_mm_and_find_vma.patch @@ -0,0 +1,86 @@ +From e6fe228c4ffafdfc970cf6d46883a1f481baf7ea Mon Sep 17 00:00:00 2001 +From: Michael Ellerman +Date: Fri, 16 Jun 2023 15:51:29 +1000 +Subject: powerpc/mm: Convert to using lock_mm_and_find_vma() + +From: Michael Ellerman + +commit e6fe228c4ffafdfc970cf6d46883a1f481baf7ea upstream. + +Signed-off-by: Michael Ellerman +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + arch/powerpc/Kconfig | 1 + + arch/powerpc/mm/fault.c | 41 ++++------------------------------------- + 2 files changed, 5 insertions(+), 37 deletions(-) + +--- a/arch/powerpc/Kconfig ++++ b/arch/powerpc/Kconfig +@@ -278,6 +278,7 @@ config PPC + select IRQ_DOMAIN + select IRQ_FORCED_THREADING + select KASAN_VMALLOC if KASAN && MODULES ++ select LOCK_MM_AND_FIND_VMA + select MMU_GATHER_PAGE_SIZE + select MMU_GATHER_RCU_TABLE_FREE + select MMU_GATHER_MERGE_VMAS +--- a/arch/powerpc/mm/fault.c ++++ b/arch/powerpc/mm/fault.c +@@ -84,11 +84,6 @@ static int __bad_area(struct pt_regs *re + return __bad_area_nosemaphore(regs, address, si_code); + } + +-static noinline int bad_area(struct pt_regs *regs, unsigned long address) +-{ +- return __bad_area(regs, address, SEGV_MAPERR); +-} +- + static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address, + struct vm_area_struct *vma) + { +@@ -515,40 +510,12 @@ lock_mmap: + * we will deadlock attempting to validate the fault against the + * address space. Luckily the kernel only validly references user + * space from well defined areas of code, which are listed in the +- * exceptions table. +- * +- * As the vast majority of faults will be valid we will only perform +- * the source reference check when there is a possibility of a deadlock. +- * Attempt to lock the address space, if we cannot we then validate the +- * source. If this is invalid we can skip the address space check, +- * thus avoiding the deadlock. +- */ +- if (unlikely(!mmap_read_trylock(mm))) { +- if (!is_user && !search_exception_tables(regs->nip)) +- return bad_area_nosemaphore(regs, address); +- ++ * exceptions table. lock_mm_and_find_vma() handles that logic. 
++ */ + retry: +- mmap_read_lock(mm); +- } else { +- /* +- * The above down_read_trylock() might have succeeded in +- * which case we'll have missed the might_sleep() from +- * down_read(): +- */ +- might_sleep(); +- } +- +- vma = find_vma(mm, address); ++ vma = lock_mm_and_find_vma(mm, address, regs); + if (unlikely(!vma)) +- return bad_area(regs, address); +- +- if (unlikely(vma->vm_start > address)) { +- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) +- return bad_area(regs, address); +- +- if (unlikely(expand_stack(vma, address))) +- return bad_area(regs, address); +- } ++ return bad_area_nosemaphore(regs, address); + + if (unlikely(access_pkey_error(is_write, is_exec, + (error_code & DSISR_KEYFAULT), vma))) diff --git a/queue-6.4/riscv-mm-convert-to-using-lock_mm_and_find_vma.patch b/queue-6.4/riscv-mm-convert-to-using-lock_mm_and_find_vma.patch new file mode 100644 index 00000000000..b3604267d47 --- /dev/null +++ b/queue-6.4/riscv-mm-convert-to-using-lock_mm_and_find_vma.patch @@ -0,0 +1,95 @@ +From 7267ef7b0b77f4ed23b7b3c87d8eca7bd9c2d007 Mon Sep 17 00:00:00 2001 +From: Ben Hutchings +Date: Thu, 22 Jun 2023 20:18:18 +0200 +Subject: riscv/mm: Convert to using lock_mm_and_find_vma() + +From: Ben Hutchings + +commit 7267ef7b0b77f4ed23b7b3c87d8eca7bd9c2d007 upstream. + +Signed-off-by: Ben Hutchings +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + arch/riscv/Kconfig | 1 + + arch/riscv/mm/fault.c | 31 +++++++++++++------------------ + 2 files changed, 14 insertions(+), 18 deletions(-) + +--- a/arch/riscv/Kconfig ++++ b/arch/riscv/Kconfig +@@ -126,6 +126,7 @@ config RISCV + select IRQ_DOMAIN + select IRQ_FORCED_THREADING + select KASAN_VMALLOC if KASAN ++ select LOCK_MM_AND_FIND_VMA + select MODULES_USE_ELF_RELA if MODULES + select MODULE_SECTIONS if MODULES + select OF +--- a/arch/riscv/mm/fault.c ++++ b/arch/riscv/mm/fault.c +@@ -84,13 +84,13 @@ static inline void mm_fault_error(struct + BUG(); + } + +-static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr) ++static inline void ++bad_area_nosemaphore(struct pt_regs *regs, int code, unsigned long addr) + { + /* + * Something tried to access memory that isn't in our memory map. + * Fix it, but check if it's kernel or user first. 
+ */ +- mmap_read_unlock(mm); + /* User mode accesses just cause a SIGSEGV */ + if (user_mode(regs)) { + do_trap(regs, SIGSEGV, code, addr); +@@ -100,6 +100,15 @@ static inline void bad_area(struct pt_re + no_context(regs, addr); + } + ++static inline void ++bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, ++ unsigned long addr) ++{ ++ mmap_read_unlock(mm); ++ ++ bad_area_nosemaphore(regs, code, addr); ++} ++ + static inline void vmalloc_fault(struct pt_regs *regs, int code, unsigned long addr) + { + pgd_t *pgd, *pgd_k; +@@ -287,23 +296,10 @@ void handle_page_fault(struct pt_regs *r + else if (cause == EXC_INST_PAGE_FAULT) + flags |= FAULT_FLAG_INSTRUCTION; + retry: +- mmap_read_lock(mm); +- vma = find_vma(mm, addr); ++ vma = lock_mm_and_find_vma(mm, addr, regs); + if (unlikely(!vma)) { + tsk->thread.bad_cause = cause; +- bad_area(regs, mm, code, addr); +- return; +- } +- if (likely(vma->vm_start <= addr)) +- goto good_area; +- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { +- tsk->thread.bad_cause = cause; +- bad_area(regs, mm, code, addr); +- return; +- } +- if (unlikely(expand_stack(vma, addr))) { +- tsk->thread.bad_cause = cause; +- bad_area(regs, mm, code, addr); ++ bad_area_nosemaphore(regs, code, addr); + return; + } + +@@ -311,7 +307,6 @@ retry: + * Ok, we have a good vm_area for this memory access, so + * we can handle it. + */ +-good_area: + code = SEGV_ACCERR; + + if (unlikely(access_error(cause, vma))) { diff --git a/queue-6.4/series b/queue-6.4/series index ca8e99a97ae..bbaeef50df2 100644 --- a/queue-6.4/series +++ b/queue-6.4/series @@ -7,3 +7,16 @@ x86-smp-cure-kexec-vs.-mwait_play_dead-breakage.patch cpufreq-amd-pstate-make-amd-pstate-epp-driver-name-hyphenated.patch can-isotp-isotp_sendmsg-fix-return-error-fix-on-tx-path.patch maple_tree-fix-potential-out-of-bounds-access-in-mas_wr_end_piv.patch +mm-introduce-new-lock_mm_and_find_vma-page-fault-helper.patch +mm-make-the-page-fault-mmap-locking-killable.patch +arm64-mm-convert-to-using-lock_mm_and_find_vma.patch +powerpc-mm-convert-to-using-lock_mm_and_find_vma.patch +mips-mm-convert-to-using-lock_mm_and_find_vma.patch +riscv-mm-convert-to-using-lock_mm_and_find_vma.patch +arm-mm-convert-to-using-lock_mm_and_find_vma.patch +mm-fault-convert-remaining-simple-cases-to-lock_mm_and_find_vma.patch +powerpc-mm-convert-coprocessor-fault-to-lock_mm_and_find_vma.patch +mm-make-find_extend_vma-fail-if-write-lock-not-held.patch +execve-expand-new-process-stack-manually-ahead-of-time.patch +mm-always-expand-the-stack-with-the-mmap-write-lock-held.patch +gup-add-warning-if-some-caller-would-seem-to-want-stack-expansion.patch
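
For reference, the common shape that these queued conversions give an architecture's user-address fault path is roughly the following. This is only an illustrative sketch distilled from the diffs above, not part of any patch in the queue: lock_mm_and_find_vma(), handle_mm_fault(), FAULT_FLAG_DEFAULT and the VM_FAULT_* bits are the names used in the patches, while the arch-specific access checks, retry loop and signal delivery are elided.

/*
 * Illustrative sketch only (not a queued patch): the fault-handling shape
 * after conversion to lock_mm_and_find_vma().  Arch-specific permission
 * checks, the retry loop and signal reporting are left out.
 */
#include <linux/mm.h>
#include <linux/sched.h>

static vm_fault_t sketch_handle_user_fault(unsigned long address,
					   struct pt_regs *regs)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	vm_fault_t fault;

	/*
	 * Takes mmap_lock for reading, upgrading to the write lock only if
	 * the stack vma must be expanded; returns with the lock held for
	 * reading on success, or with the lock already dropped on failure.
	 */
	vma = lock_mm_and_find_vma(mm, address, regs);
	if (unlikely(!vma))
		return VM_FAULT_SIGSEGV;	/* the "bad_area_nosemaphore" path */

	/* ... check the access type against vma->vm_flags here ... */

	fault = handle_mm_fault(vma, address & PAGE_MASK,
				FAULT_FLAG_DEFAULT, regs);

	/*
	 * On VM_FAULT_RETRY/VM_FAULT_COMPLETED the lock has already been
	 * released by the core VM, so only unlock on the other paths.
	 */
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		mmap_read_unlock(mm);

	return fault;
}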