From: Greg Kroah-Hartman Date: Wed, 19 Nov 2014 17:48:08 +0000 (-0800) Subject: 3.10-stable patches X-Git-Tag: v3.10.61~7 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=a232e522f1928b2654af59fdef11ad4a05927381;p=thirdparty%2Fkernel%2Fstable-queue.git 3.10-stable patches added patches: arch-mm-do-not-invoke-oom-killer-on-kernel-fault-oom.patch arch-mm-pass-userspace-fault-flag-to-generic-fault-handler.patch arch-mm-remove-obsolete-init-oom-protection.patch --- diff --git a/queue-3.10/arch-mm-do-not-invoke-oom-killer-on-kernel-fault-oom.patch b/queue-3.10/arch-mm-do-not-invoke-oom-killer-on-kernel-fault-oom.patch new file mode 100644 index 00000000000..80e6cdae951 --- /dev/null +++ b/queue-3.10/arch-mm-do-not-invoke-oom-killer-on-kernel-fault-oom.patch @@ -0,0 +1,160 @@ +From 871341023c771ad233620b7a1fb3d9c7031c4e5c Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Thu, 12 Sep 2013 15:13:38 -0700 +Subject: arch: mm: do not invoke OOM killer on kernel fault OOM + +From: Johannes Weiner + +commit 871341023c771ad233620b7a1fb3d9c7031c4e5c upstream. + +Kernel faults are expected to handle OOM conditions gracefully (gup, +uaccess etc.), so they should never invoke the OOM killer. Reserve this +for faults triggered in user context when it is the only option. + +Most architectures already do this, fix up the remaining few. + +Signed-off-by: Johannes Weiner +Reviewed-by: Michal Hocko +Acked-by: KOSAKI Motohiro +Cc: David Rientjes +Cc: KAMEZAWA Hiroyuki +Cc: azurIt +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Cong Wang +Signed-off-by: Greg Kroah-Hartman + +--- + arch/arm/mm/fault.c | 14 +++++++------- + arch/arm64/mm/fault.c | 14 +++++++------- + arch/avr32/mm/fault.c | 2 +- + arch/mips/mm/fault.c | 2 ++ + arch/um/kernel/trap.c | 2 ++ + arch/unicore32/mm/fault.c | 14 +++++++------- + 6 files changed, 26 insertions(+), 22 deletions(-) + +--- a/arch/arm/mm/fault.c ++++ b/arch/arm/mm/fault.c +@@ -349,6 +349,13 @@ retry: + if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS)))) + return 0; + ++ /* ++ * If we are in kernel mode at this point, we ++ * have no context to handle this fault with. ++ */ ++ if (!user_mode(regs)) ++ goto no_context; ++ + if (fault & VM_FAULT_OOM) { + /* + * We ran out of memory, call the OOM killer, and return to +@@ -359,13 +366,6 @@ retry: + return 0; + } + +- /* +- * If we are in kernel mode at this point, we +- * have no context to handle this fault with. +- */ +- if (!user_mode(regs)) +- goto no_context; +- + if (fault & VM_FAULT_SIGBUS) { + /* + * We had some memory, but were unable to +--- a/arch/arm64/mm/fault.c ++++ b/arch/arm64/mm/fault.c +@@ -288,6 +288,13 @@ retry: + VM_FAULT_BADACCESS)))) + return 0; + ++ /* ++ * If we are in kernel mode at this point, we have no context to ++ * handle this fault with. ++ */ ++ if (!user_mode(regs)) ++ goto no_context; ++ + if (fault & VM_FAULT_OOM) { + /* + * We ran out of memory, call the OOM killer, and return to +@@ -298,13 +305,6 @@ retry: + return 0; + } + +- /* +- * If we are in kernel mode at this point, we have no context to +- * handle this fault with. 
+- */ +- if (!user_mode(regs)) +- goto no_context; +- + if (fault & VM_FAULT_SIGBUS) { + /* + * We had some memory, but were unable to successfully fix up +--- a/arch/avr32/mm/fault.c ++++ b/arch/avr32/mm/fault.c +@@ -228,9 +228,9 @@ no_context: + */ + out_of_memory: + up_read(&mm->mmap_sem); +- pagefault_out_of_memory(); + if (!user_mode(regs)) + goto no_context; ++ pagefault_out_of_memory(); + return; + + do_sigbus: +--- a/arch/mips/mm/fault.c ++++ b/arch/mips/mm/fault.c +@@ -240,6 +240,8 @@ out_of_memory: + * (which will retry the fault, or kill us if we got oom-killed). + */ + up_read(&mm->mmap_sem); ++ if (!user_mode(regs)) ++ goto no_context; + pagefault_out_of_memory(); + return; + +--- a/arch/um/kernel/trap.c ++++ b/arch/um/kernel/trap.c +@@ -124,6 +124,8 @@ out_of_memory: + * (which will retry the fault, or kill us if we got oom-killed). + */ + up_read(&mm->mmap_sem); ++ if (!is_user) ++ goto out_nosemaphore; + pagefault_out_of_memory(); + return 0; + } +--- a/arch/unicore32/mm/fault.c ++++ b/arch/unicore32/mm/fault.c +@@ -278,6 +278,13 @@ retry: + (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS)))) + return 0; + ++ /* ++ * If we are in kernel mode at this point, we ++ * have no context to handle this fault with. ++ */ ++ if (!user_mode(regs)) ++ goto no_context; ++ + if (fault & VM_FAULT_OOM) { + /* + * We ran out of memory, call the OOM killer, and return to +@@ -288,13 +295,6 @@ retry: + return 0; + } + +- /* +- * If we are in kernel mode at this point, we +- * have no context to handle this fault with. +- */ +- if (!user_mode(regs)) +- goto no_context; +- + if (fault & VM_FAULT_SIGBUS) { + /* + * We had some memory, but were unable to diff --git a/queue-3.10/arch-mm-pass-userspace-fault-flag-to-generic-fault-handler.patch b/queue-3.10/arch-mm-pass-userspace-fault-flag-to-generic-fault-handler.patch new file mode 100644 index 00000000000..abb17f6e85a --- /dev/null +++ b/queue-3.10/arch-mm-pass-userspace-fault-flag-to-generic-fault-handler.patch @@ -0,0 +1,835 @@ +From 759496ba6407c6994d6a5ce3a5e74937d7816208 Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Thu, 12 Sep 2013 15:13:39 -0700 +Subject: arch: mm: pass userspace fault flag to generic fault handler + +From: Johannes Weiner + +commit 759496ba6407c6994d6a5ce3a5e74937d7816208 upstream. + +Unlike global OOM handling, memory cgroup code will invoke the OOM killer +in any OOM situation because it has no way of telling faults occuring in +kernel context - which could be handled more gracefully - from +user-triggered faults. + +Pass a flag that identifies faults originating in user space from the +architecture-specific fault handlers to generic code so that memcg OOM +handling can be improved. 
+ +Signed-off-by: Johannes Weiner +Reviewed-by: Michal Hocko +Cc: David Rientjes +Cc: KAMEZAWA Hiroyuki +Cc: azurIt +Cc: KOSAKI Motohiro +Signed-off-by: Andrew Morton +Signed-off-by: Cong Wang +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + arch/alpha/mm/fault.c | 7 ++++--- + arch/arc/mm/fault.c | 6 ++++-- + arch/arm/mm/fault.c | 9 ++++++--- + arch/arm64/mm/fault.c | 17 ++++++++++------- + arch/avr32/mm/fault.c | 2 ++ + arch/cris/mm/fault.c | 6 ++++-- + arch/frv/mm/fault.c | 10 ++++++---- + arch/hexagon/mm/vm_fault.c | 6 ++++-- + arch/ia64/mm/fault.c | 6 ++++-- + arch/m32r/mm/fault.c | 10 ++++++---- + arch/m68k/mm/fault.c | 2 ++ + arch/metag/mm/fault.c | 6 ++++-- + arch/microblaze/mm/fault.c | 7 +++++-- + arch/mips/mm/fault.c | 6 ++++-- + arch/mn10300/mm/fault.c | 2 ++ + arch/openrisc/mm/fault.c | 1 + + arch/parisc/mm/fault.c | 7 +++++-- + arch/powerpc/mm/fault.c | 7 ++++--- + arch/s390/mm/fault.c | 2 ++ + arch/score/mm/fault.c | 7 ++++++- + arch/sh/mm/fault.c | 9 ++++++--- + arch/sparc/mm/fault_32.c | 12 +++++++++--- + arch/sparc/mm/fault_64.c | 6 ++++-- + arch/tile/mm/fault.c | 7 +++++-- + arch/um/kernel/trap.c | 20 ++++++++++++-------- + arch/unicore32/mm/fault.c | 8 ++++++-- + arch/x86/mm/fault.c | 8 +++++--- + arch/xtensa/mm/fault.c | 2 ++ + include/linux/mm.h | 1 + + 29 files changed, 135 insertions(+), 64 deletions(-) + +--- a/arch/alpha/mm/fault.c ++++ b/arch/alpha/mm/fault.c +@@ -89,8 +89,7 @@ do_page_fault(unsigned long address, uns + const struct exception_table_entry *fixup; + int fault, si_code = SEGV_MAPERR; + siginfo_t info; +- unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | +- (cause > 0 ? FAULT_FLAG_WRITE : 0)); ++ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + + /* As of EV6, a load into $31/$f31 is a prefetch, and never faults + (or is suppressed by the PALcode). Support that for older CPUs +@@ -115,7 +114,8 @@ do_page_fault(unsigned long address, uns + if (address >= TASK_SIZE) + goto vmalloc_fault; + #endif +- ++ if (user_mode(regs)) ++ flags |= FAULT_FLAG_USER; + retry: + down_read(&mm->mmap_sem); + vma = find_vma(mm, address); +@@ -142,6 +142,7 @@ retry: + } else { + if (!(vma->vm_flags & VM_WRITE)) + goto bad_area; ++ flags |= FAULT_FLAG_WRITE; + } + + /* If for any reason at all we couldn't handle the fault, +--- a/arch/arc/mm/fault.c ++++ b/arch/arc/mm/fault.c +@@ -59,8 +59,7 @@ void do_page_fault(struct pt_regs *regs, + struct mm_struct *mm = tsk->mm; + siginfo_t info; + int fault, ret; +- unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | +- (write ? FAULT_FLAG_WRITE : 0); ++ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + + /* + * We fault-in kernel-space virtual memory on-demand. The +@@ -88,6 +87,8 @@ void do_page_fault(struct pt_regs *regs, + if (in_atomic() || !mm) + goto no_context; + ++ if (user_mode(regs)) ++ flags |= FAULT_FLAG_USER; + retry: + down_read(&mm->mmap_sem); + vma = find_vma(mm, address); +@@ -115,6 +116,7 @@ good_area: + if (write) { + if (!(vma->vm_flags & VM_WRITE)) + goto bad_area; ++ flags |= FAULT_FLAG_WRITE; + } else { + if (!(vma->vm_flags & (VM_READ | VM_EXEC))) + goto bad_area; +--- a/arch/arm/mm/fault.c ++++ b/arch/arm/mm/fault.c +@@ -261,9 +261,7 @@ do_page_fault(unsigned long addr, unsign + struct task_struct *tsk; + struct mm_struct *mm; + int fault, sig, code; +- int write = fsr & FSR_WRITE; +- unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | +- (write ? 
FAULT_FLAG_WRITE : 0); ++ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + + if (notify_page_fault(regs, fsr)) + return 0; +@@ -282,6 +280,11 @@ do_page_fault(unsigned long addr, unsign + if (in_atomic() || !mm) + goto no_context; + ++ if (user_mode(regs)) ++ flags |= FAULT_FLAG_USER; ++ if (fsr & FSR_WRITE) ++ flags |= FAULT_FLAG_WRITE; ++ + /* + * As per x86, we may deadlock here. However, since the kernel only + * validly references user space from well defined areas of the code, +--- a/arch/arm64/mm/fault.c ++++ b/arch/arm64/mm/fault.c +@@ -199,13 +199,6 @@ static int __kprobes do_page_fault(unsig + unsigned long vm_flags = VM_READ | VM_WRITE | VM_EXEC; + unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + +- if (esr & ESR_LNX_EXEC) { +- vm_flags = VM_EXEC; +- } else if ((esr & ESR_WRITE) && !(esr & ESR_CM)) { +- vm_flags = VM_WRITE; +- mm_flags |= FAULT_FLAG_WRITE; +- } +- + tsk = current; + mm = tsk->mm; + +@@ -220,6 +213,16 @@ static int __kprobes do_page_fault(unsig + if (in_atomic() || !mm) + goto no_context; + ++ if (user_mode(regs)) ++ mm_flags |= FAULT_FLAG_USER; ++ ++ if (esr & ESR_LNX_EXEC) { ++ vm_flags = VM_EXEC; ++ } else if ((esr & ESR_WRITE) && !(esr & ESR_CM)) { ++ vm_flags = VM_WRITE; ++ mm_flags |= FAULT_FLAG_WRITE; ++ } ++ + /* + * As per x86, we may deadlock here. However, since the kernel only + * validly references user space from well defined areas of the code, +--- a/arch/avr32/mm/fault.c ++++ b/arch/avr32/mm/fault.c +@@ -86,6 +86,8 @@ asmlinkage void do_page_fault(unsigned l + + local_irq_enable(); + ++ if (user_mode(regs)) ++ flags |= FAULT_FLAG_USER; + retry: + down_read(&mm->mmap_sem); + +--- a/arch/cris/mm/fault.c ++++ b/arch/cris/mm/fault.c +@@ -58,8 +58,7 @@ do_page_fault(unsigned long address, str + struct vm_area_struct * vma; + siginfo_t info; + int fault; +- unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | +- ((writeaccess & 1) ? 
FAULT_FLAG_WRITE : 0); ++ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + + D(printk(KERN_DEBUG + "Page fault for %lX on %X at %lX, prot %d write %d\n", +@@ -117,6 +116,8 @@ do_page_fault(unsigned long address, str + if (in_atomic() || !mm) + goto no_context; + ++ if (user_mode(regs)) ++ flags |= FAULT_FLAG_USER; + retry: + down_read(&mm->mmap_sem); + vma = find_vma(mm, address); +@@ -155,6 +156,7 @@ retry: + } else if (writeaccess == 1) { + if (!(vma->vm_flags & VM_WRITE)) + goto bad_area; ++ flags |= FAULT_FLAG_WRITE; + } else { + if (!(vma->vm_flags & (VM_READ | VM_EXEC))) + goto bad_area; +--- a/arch/frv/mm/fault.c ++++ b/arch/frv/mm/fault.c +@@ -34,11 +34,11 @@ asmlinkage void do_page_fault(int datamm + struct vm_area_struct *vma; + struct mm_struct *mm; + unsigned long _pme, lrai, lrad, fixup; ++ unsigned long flags = 0; + siginfo_t info; + pgd_t *pge; + pud_t *pue; + pte_t *pte; +- int write; + int fault; + + #if 0 +@@ -81,6 +81,9 @@ asmlinkage void do_page_fault(int datamm + if (in_atomic() || !mm) + goto no_context; + ++ if (user_mode(__frame)) ++ flags |= FAULT_FLAG_USER; ++ + down_read(&mm->mmap_sem); + + vma = find_vma(mm, ear0); +@@ -129,7 +132,6 @@ asmlinkage void do_page_fault(int datamm + */ + good_area: + info.si_code = SEGV_ACCERR; +- write = 0; + switch (esr0 & ESR0_ATXC) { + default: + /* handle write to write protected page */ +@@ -140,7 +142,7 @@ asmlinkage void do_page_fault(int datamm + #endif + if (!(vma->vm_flags & VM_WRITE)) + goto bad_area; +- write = 1; ++ flags |= FAULT_FLAG_WRITE; + break; + + /* handle read from protected page */ +@@ -162,7 +164,7 @@ asmlinkage void do_page_fault(int datamm + * make sure we exit gracefully rather than endlessly redo + * the fault. + */ +- fault = handle_mm_fault(mm, vma, ear0, write ? FAULT_FLAG_WRITE : 0); ++ fault = handle_mm_fault(mm, vma, ear0, flags); + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) + goto out_of_memory; +--- a/arch/hexagon/mm/vm_fault.c ++++ b/arch/hexagon/mm/vm_fault.c +@@ -53,8 +53,7 @@ void do_page_fault(unsigned long address + int si_code = SEGV_MAPERR; + int fault; + const struct exception_table_entry *fixup; +- unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | +- (cause > 0 ? FAULT_FLAG_WRITE : 0); ++ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + + /* + * If we're in an interrupt or have no user context, +@@ -65,6 +64,8 @@ void do_page_fault(unsigned long address + + local_irq_enable(); + ++ if (user_mode(regs)) ++ flags |= FAULT_FLAG_USER; + retry: + down_read(&mm->mmap_sem); + vma = find_vma(mm, address); +@@ -96,6 +97,7 @@ good_area: + case FLT_STORE: + if (!(vma->vm_flags & VM_WRITE)) + goto bad_area; ++ flags |= FAULT_FLAG_WRITE; + break; + } + +--- a/arch/ia64/mm/fault.c ++++ b/arch/ia64/mm/fault.c +@@ -90,8 +90,6 @@ ia64_do_page_fault (unsigned long addres + mask = ((((isr >> IA64_ISR_X_BIT) & 1UL) << VM_EXEC_BIT) + | (((isr >> IA64_ISR_W_BIT) & 1UL) << VM_WRITE_BIT)); + +- flags |= ((mask & VM_WRITE) ? FAULT_FLAG_WRITE : 0); +- + /* mmap_sem is performance critical.... 
*/ + prefetchw(&mm->mmap_sem); + +@@ -119,6 +117,10 @@ ia64_do_page_fault (unsigned long addres + if (notify_page_fault(regs, TRAP_BRKPT)) + return; + ++ if (user_mode(regs)) ++ flags |= FAULT_FLAG_USER; ++ if (mask & VM_WRITE) ++ flags |= FAULT_FLAG_WRITE; + retry: + down_read(&mm->mmap_sem); + +--- a/arch/m32r/mm/fault.c ++++ b/arch/m32r/mm/fault.c +@@ -78,7 +78,7 @@ asmlinkage void do_page_fault(struct pt_ + struct mm_struct *mm; + struct vm_area_struct * vma; + unsigned long page, addr; +- int write; ++ unsigned long flags = 0; + int fault; + siginfo_t info; + +@@ -117,6 +117,9 @@ asmlinkage void do_page_fault(struct pt_ + if (in_atomic() || !mm) + goto bad_area_nosemaphore; + ++ if (error_code & ACE_USERMODE) ++ flags |= FAULT_FLAG_USER; ++ + /* When running in the kernel we expect faults to occur only to + * addresses in user space. All other faults represent errors in the + * kernel and should generate an OOPS. Unfortunately, in the case of an +@@ -166,14 +169,13 @@ asmlinkage void do_page_fault(struct pt_ + */ + good_area: + info.si_code = SEGV_ACCERR; +- write = 0; + switch (error_code & (ACE_WRITE|ACE_PROTECTION)) { + default: /* 3: write, present */ + /* fall through */ + case ACE_WRITE: /* write, not present */ + if (!(vma->vm_flags & VM_WRITE)) + goto bad_area; +- write++; ++ flags |= FAULT_FLAG_WRITE; + break; + case ACE_PROTECTION: /* read, present */ + case 0: /* read, not present */ +@@ -194,7 +196,7 @@ good_area: + */ + addr = (address & PAGE_MASK); + set_thread_fault_code(error_code); +- fault = handle_mm_fault(mm, vma, addr, write ? FAULT_FLAG_WRITE : 0); ++ fault = handle_mm_fault(mm, vma, addr, flags); + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) + goto out_of_memory; +--- a/arch/m68k/mm/fault.c ++++ b/arch/m68k/mm/fault.c +@@ -88,6 +88,8 @@ int do_page_fault(struct pt_regs *regs, + if (in_atomic() || !mm) + goto no_context; + ++ if (user_mode(regs)) ++ flags |= FAULT_FLAG_USER; + retry: + down_read(&mm->mmap_sem); + +--- a/arch/metag/mm/fault.c ++++ b/arch/metag/mm/fault.c +@@ -53,8 +53,7 @@ int do_page_fault(struct pt_regs *regs, + struct vm_area_struct *vma, *prev_vma; + siginfo_t info; + int fault; +- unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | +- (write_access ? FAULT_FLAG_WRITE : 0); ++ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + + tsk = current; + +@@ -109,6 +108,8 @@ int do_page_fault(struct pt_regs *regs, + if (in_atomic() || !mm) + goto no_context; + ++ if (user_mode(regs)) ++ flags |= FAULT_FLAG_USER; + retry: + down_read(&mm->mmap_sem); + +@@ -121,6 +122,7 @@ good_area: + if (write_access) { + if (!(vma->vm_flags & VM_WRITE)) + goto bad_area; ++ flags |= FAULT_FLAG_WRITE; + } else { + if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) + goto bad_area; +--- a/arch/microblaze/mm/fault.c ++++ b/arch/microblaze/mm/fault.c +@@ -92,8 +92,7 @@ void do_page_fault(struct pt_regs *regs, + int code = SEGV_MAPERR; + int is_write = error_code & ESR_S; + int fault; +- unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | +- (is_write ? FAULT_FLAG_WRITE : 0); ++ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + + regs->ear = address; + regs->esr = error_code; +@@ -121,6 +120,9 @@ void do_page_fault(struct pt_regs *regs, + die("Weird page fault", regs, SIGSEGV); + } + ++ if (user_mode(regs)) ++ flags |= FAULT_FLAG_USER; ++ + /* When running in the kernel we expect faults to occur only to + * addresses in user space. 
All other faults represent errors in the + * kernel and should generate an OOPS. Unfortunately, in the case of an +@@ -199,6 +201,7 @@ good_area: + if (unlikely(is_write)) { + if (unlikely(!(vma->vm_flags & VM_WRITE))) + goto bad_area; ++ flags |= FAULT_FLAG_WRITE; + /* a read */ + } else { + /* protection fault */ +--- a/arch/mips/mm/fault.c ++++ b/arch/mips/mm/fault.c +@@ -41,8 +41,7 @@ asmlinkage void __kprobes do_page_fault( + const int field = sizeof(unsigned long) * 2; + siginfo_t info; + int fault; +- unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | +- (write ? FAULT_FLAG_WRITE : 0); ++ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + + #if 0 + printk("Cpu%d[%s:%d:%0*lx:%ld:%0*lx]\n", raw_smp_processor_id(), +@@ -92,6 +91,8 @@ asmlinkage void __kprobes do_page_fault( + if (in_atomic() || !mm) + goto bad_area_nosemaphore; + ++ if (user_mode(regs)) ++ flags |= FAULT_FLAG_USER; + retry: + down_read(&mm->mmap_sem); + vma = find_vma(mm, address); +@@ -113,6 +114,7 @@ good_area: + if (write) { + if (!(vma->vm_flags & VM_WRITE)) + goto bad_area; ++ flags |= FAULT_FLAG_WRITE; + } else { + if (cpu_has_rixi) { + if (address == regs->cp0_epc && !(vma->vm_flags & VM_EXEC)) { +--- a/arch/mn10300/mm/fault.c ++++ b/arch/mn10300/mm/fault.c +@@ -171,6 +171,8 @@ asmlinkage void do_page_fault(struct pt_ + if (in_atomic() || !mm) + goto no_context; + ++ if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR) ++ flags |= FAULT_FLAG_USER; + retry: + down_read(&mm->mmap_sem); + +--- a/arch/openrisc/mm/fault.c ++++ b/arch/openrisc/mm/fault.c +@@ -86,6 +86,7 @@ asmlinkage void do_page_fault(struct pt_ + if (user_mode(regs)) { + /* Exception was in userspace: reenable interrupts */ + local_irq_enable(); ++ flags |= FAULT_FLAG_USER; + } else { + /* If exception was in a syscall, then IRQ's may have + * been enabled or disabled. If they were enabled, +--- a/arch/parisc/mm/fault.c ++++ b/arch/parisc/mm/fault.c +@@ -180,6 +180,10 @@ void do_page_fault(struct pt_regs *regs, + if (in_atomic() || !mm) + goto no_context; + ++ if (user_mode(regs)) ++ flags |= FAULT_FLAG_USER; ++ if (acc_type & VM_WRITE) ++ flags |= FAULT_FLAG_WRITE; + retry: + down_read(&mm->mmap_sem); + vma = find_vma_prev(mm, address, &prev_vma); +@@ -203,8 +207,7 @@ good_area: + * fault. + */ + +- fault = handle_mm_fault(mm, vma, address, +- flags | ((acc_type & VM_WRITE) ? FAULT_FLAG_WRITE : 0)); ++ fault = handle_mm_fault(mm, vma, address, flags); + + if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) + return; +--- a/arch/powerpc/mm/fault.c ++++ b/arch/powerpc/mm/fault.c +@@ -223,9 +223,6 @@ int __kprobes do_page_fault(struct pt_re + is_write = error_code & ESR_DST; + #endif /* CONFIG_4xx || CONFIG_BOOKE */ + +- if (is_write) +- flags |= FAULT_FLAG_WRITE; +- + #ifdef CONFIG_PPC_ICSWX + /* + * we need to do this early because this "data storage +@@ -280,6 +277,9 @@ int __kprobes do_page_fault(struct pt_re + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + ++ if (user_mode(regs)) ++ flags |= FAULT_FLAG_USER; ++ + /* When running in the kernel we expect faults to occur only to + * addresses in user space. All other faults represent errors in the + * kernel and should generate an OOPS. 
Unfortunately, in the case of an +@@ -408,6 +408,7 @@ good_area: + } else if (is_write) { + if (!(vma->vm_flags & VM_WRITE)) + goto bad_area; ++ flags |= FAULT_FLAG_WRITE; + /* a read */ + } else { + /* protection fault */ +--- a/arch/s390/mm/fault.c ++++ b/arch/s390/mm/fault.c +@@ -302,6 +302,8 @@ static inline int do_exception(struct pt + address = trans_exc_code & __FAIL_ADDR_MASK; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; ++ if (user_mode(regs)) ++ flags |= FAULT_FLAG_USER; + if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400) + flags |= FAULT_FLAG_WRITE; + down_read(&mm->mmap_sem); +--- a/arch/score/mm/fault.c ++++ b/arch/score/mm/fault.c +@@ -47,6 +47,7 @@ asmlinkage void do_page_fault(struct pt_ + struct task_struct *tsk = current; + struct mm_struct *mm = tsk->mm; + const int field = sizeof(unsigned long) * 2; ++ unsigned long flags = 0; + siginfo_t info; + int fault; + +@@ -75,6 +76,9 @@ asmlinkage void do_page_fault(struct pt_ + if (in_atomic() || !mm) + goto bad_area_nosemaphore; + ++ if (user_mode(regs)) ++ flags |= FAULT_FLAG_USER; ++ + down_read(&mm->mmap_sem); + vma = find_vma(mm, address); + if (!vma) +@@ -95,6 +99,7 @@ good_area: + if (write) { + if (!(vma->vm_flags & VM_WRITE)) + goto bad_area; ++ flags |= FAULT_FLAG_WRITE; + } else { + if (!(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))) + goto bad_area; +@@ -105,7 +110,7 @@ good_area: + * make sure we exit gracefully rather than endlessly redo + * the fault. + */ +- fault = handle_mm_fault(mm, vma, address, write); ++ fault = handle_mm_fault(mm, vma, address, flags); + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) + goto out_of_memory; +--- a/arch/sh/mm/fault.c ++++ b/arch/sh/mm/fault.c +@@ -400,9 +400,7 @@ asmlinkage void __kprobes do_page_fault( + struct mm_struct *mm; + struct vm_area_struct * vma; + int fault; +- int write = error_code & FAULT_CODE_WRITE; +- unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | +- (write ? FAULT_FLAG_WRITE : 0)); ++ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + + tsk = current; + mm = tsk->mm; +@@ -476,6 +474,11 @@ good_area: + + set_thread_fault_code(error_code); + ++ if (user_mode(regs)) ++ flags |= FAULT_FLAG_USER; ++ if (error_code & FAULT_CODE_WRITE) ++ flags |= FAULT_FLAG_WRITE; ++ + /* + * If for any reason at all we couldn't handle the fault, + * make sure we exit gracefully rather than endlessly redo +--- a/arch/sparc/mm/fault_32.c ++++ b/arch/sparc/mm/fault_32.c +@@ -177,8 +177,7 @@ asmlinkage void do_sparc_fault(struct pt + unsigned long g2; + int from_user = !(regs->psr & PSR_PS); + int fault, code; +- unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | +- (write ? 
FAULT_FLAG_WRITE : 0)); ++ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + + if (text_fault) + address = regs->pc; +@@ -235,6 +234,11 @@ good_area: + goto bad_area; + } + ++ if (from_user) ++ flags |= FAULT_FLAG_USER; ++ if (write) ++ flags |= FAULT_FLAG_WRITE; ++ + /* + * If for any reason at all we couldn't handle the fault, + * make sure we exit gracefully rather than endlessly redo +@@ -383,6 +387,7 @@ static void force_user_fault(unsigned lo + struct vm_area_struct *vma; + struct task_struct *tsk = current; + struct mm_struct *mm = tsk->mm; ++ unsigned int flags = FAULT_FLAG_USER; + int code; + + code = SEGV_MAPERR; +@@ -402,11 +407,12 @@ good_area: + if (write) { + if (!(vma->vm_flags & VM_WRITE)) + goto bad_area; ++ flags |= FAULT_FLAG_WRITE; + } else { + if (!(vma->vm_flags & (VM_READ | VM_EXEC))) + goto bad_area; + } +- switch (handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0)) { ++ switch (handle_mm_fault(mm, vma, address, flags)) { + case VM_FAULT_SIGBUS: + case VM_FAULT_OOM: + goto do_sigbus; +--- a/arch/sparc/mm/fault_64.c ++++ b/arch/sparc/mm/fault_64.c +@@ -323,7 +323,8 @@ asmlinkage void __kprobes do_sparc64_fau + bad_kernel_pc(regs, address); + return; + } +- } ++ } else ++ flags |= FAULT_FLAG_USER; + + /* + * If we're in an interrupt or have no user +@@ -426,13 +427,14 @@ good_area: + vma->vm_file != NULL) + set_thread_fault_code(fault_code | + FAULT_CODE_BLKCOMMIT); ++ ++ flags |= FAULT_FLAG_WRITE; + } else { + /* Allow reads even for write-only mappings */ + if (!(vma->vm_flags & (VM_READ | VM_EXEC))) + goto bad_area; + } + +- flags |= ((fault_code & FAULT_CODE_WRITE) ? FAULT_FLAG_WRITE : 0); + fault = handle_mm_fault(mm, vma, address, flags); + + if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) +--- a/arch/tile/mm/fault.c ++++ b/arch/tile/mm/fault.c +@@ -280,8 +280,7 @@ static int handle_page_fault(struct pt_r + if (!is_page_fault) + write = 1; + +- flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | +- (write ? FAULT_FLAG_WRITE : 0)); ++ flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + + is_kernel_mode = (EX1_PL(regs->ex1) != USER_PL); + +@@ -365,6 +364,9 @@ static int handle_page_fault(struct pt_r + goto bad_area_nosemaphore; + } + ++ if (!is_kernel_mode) ++ flags |= FAULT_FLAG_USER; ++ + /* + * When running in the kernel we expect faults to occur only to + * addresses in user space. All other faults represent errors in the +@@ -425,6 +427,7 @@ good_area: + #endif + if (!(vma->vm_flags & VM_WRITE)) + goto bad_area; ++ flags |= FAULT_FLAG_WRITE; + } else { + if (!is_page_fault || !(vma->vm_flags & VM_READ)) + goto bad_area; +--- a/arch/um/kernel/trap.c ++++ b/arch/um/kernel/trap.c +@@ -30,8 +30,7 @@ int handle_page_fault(unsigned long addr + pmd_t *pmd; + pte_t *pte; + int err = -EFAULT; +- unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | +- (is_write ? FAULT_FLAG_WRITE : 0); ++ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + + *code_out = SEGV_MAPERR; + +@@ -42,6 +41,8 @@ int handle_page_fault(unsigned long addr + if (in_atomic()) + goto out_nosemaphore; + ++ if (is_user) ++ flags |= FAULT_FLAG_USER; + retry: + down_read(&mm->mmap_sem); + vma = find_vma(mm, address); +@@ -58,12 +59,15 @@ retry: + + good_area: + *code_out = SEGV_ACCERR; +- if (is_write && !(vma->vm_flags & VM_WRITE)) +- goto out; +- +- /* Don't require VM_READ|VM_EXEC for write faults! 
*/ +- if (!is_write && !(vma->vm_flags & (VM_READ | VM_EXEC))) +- goto out; ++ if (is_write) { ++ if (!(vma->vm_flags & VM_WRITE)) ++ goto out; ++ flags |= FAULT_FLAG_WRITE; ++ } else { ++ /* Don't require VM_READ|VM_EXEC for write faults! */ ++ if (!(vma->vm_flags & (VM_READ | VM_EXEC))) ++ goto out; ++ } + + do { + int fault; +--- a/arch/unicore32/mm/fault.c ++++ b/arch/unicore32/mm/fault.c +@@ -209,8 +209,7 @@ static int do_pf(unsigned long addr, uns + struct task_struct *tsk; + struct mm_struct *mm; + int fault, sig, code; +- unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | +- ((!(fsr ^ 0x12)) ? FAULT_FLAG_WRITE : 0); ++ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + + tsk = current; + mm = tsk->mm; +@@ -222,6 +221,11 @@ static int do_pf(unsigned long addr, uns + if (in_atomic() || !mm) + goto no_context; + ++ if (user_mode(regs)) ++ flags |= FAULT_FLAG_USER; ++ if (!(fsr ^ 0x12)) ++ flags |= FAULT_FLAG_WRITE; ++ + /* + * As per x86, we may deadlock here. However, since the kernel only + * validly references user space from well defined areas of the code, +--- a/arch/x86/mm/fault.c ++++ b/arch/x86/mm/fault.c +@@ -1017,9 +1017,7 @@ __do_page_fault(struct pt_regs *regs, un + unsigned long address; + struct mm_struct *mm; + int fault; +- int write = error_code & PF_WRITE; +- unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | +- (write ? FAULT_FLAG_WRITE : 0); ++ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + + tsk = current; + mm = tsk->mm; +@@ -1089,6 +1087,7 @@ __do_page_fault(struct pt_regs *regs, un + if (user_mode_vm(regs)) { + local_irq_enable(); + error_code |= PF_USER; ++ flags |= FAULT_FLAG_USER; + } else { + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); +@@ -1113,6 +1112,9 @@ __do_page_fault(struct pt_regs *regs, un + return; + } + ++ if (error_code & PF_WRITE) ++ flags |= FAULT_FLAG_WRITE; ++ + /* + * When running in the kernel we expect faults to occur only to + * addresses in user space. All other faults represent errors in +--- a/arch/xtensa/mm/fault.c ++++ b/arch/xtensa/mm/fault.c +@@ -72,6 +72,8 @@ void do_page_fault(struct pt_regs *regs) + address, exccause, regs->pc, is_write? "w":"", is_exec? "x":""); + #endif + ++ if (user_mode(regs)) ++ flags |= FAULT_FLAG_USER; + retry: + down_read(&mm->mmap_sem); + vma = find_vma(mm, address); +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -167,6 +167,7 @@ extern pgprot_t protection_map[16]; + #define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */ + #define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */ + #define FAULT_FLAG_TRIED 0x40 /* second try */ ++#define FAULT_FLAG_USER 0x80 /* The fault originated in userspace */ + + /* + * vm_fault is filled by the the pagefault handler and passed to the vma's diff --git a/queue-3.10/arch-mm-remove-obsolete-init-oom-protection.patch b/queue-3.10/arch-mm-remove-obsolete-init-oom-protection.patch new file mode 100644 index 00000000000..7bb5cc44dea --- /dev/null +++ b/queue-3.10/arch-mm-remove-obsolete-init-oom-protection.patch @@ -0,0 +1,127 @@ +From 94bce453c78996cc4373d5da6cfabe07fcc6d9f9 Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Thu, 12 Sep 2013 15:13:36 -0700 +Subject: arch: mm: remove obsolete init OOM protection + +From: Johannes Weiner + +commit 94bce453c78996cc4373d5da6cfabe07fcc6d9f9 upstream. + +The memcg code can trap tasks in the context of the failing allocation +until an OOM situation is resolved. 
They can hold all kinds of locks +(fs, mm) at this point, which makes it prone to deadlocking. + +This series converts memcg OOM handling into a two step process that is +started in the charge context, but any waiting is done after the fault +stack is fully unwound. + +Patches 1-4 prepare architecture handlers to support the new memcg +requirements, but in doing so they also remove old cruft and unify +out-of-memory behavior across architectures. + +Patch 5 disables the memcg OOM handling for syscalls, readahead, kernel +faults, because they can gracefully unwind the stack with -ENOMEM. OOM +handling is restricted to user triggered faults that have no other +option. + +Patch 6 reworks memcg's hierarchical OOM locking to make it a little +more obvious wth is going on in there: reduce locked regions, rename +locking functions, reorder and document. + +Patch 7 implements the two-part OOM handling such that tasks are never +trapped with the full charge stack in an OOM situation. + +This patch: + +Back before smart OOM killing, when faulting tasks were killed directly on +allocation failures, the arch-specific fault handlers needed special +protection for the init process. + +Now that all fault handlers call into the generic OOM killer (see commit +609838cfed97: "mm: invoke oom-killer from remaining unconverted page +fault handlers"), which already provides init protection, the +arch-specific leftovers can be removed. + +Signed-off-by: Johannes Weiner +Reviewed-by: Michal Hocko +Acked-by: KOSAKI Motohiro +Cc: David Rientjes +Cc: KAMEZAWA Hiroyuki +Cc: azurIt +Acked-by: Vineet Gupta [arch/arc bits] +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Cong Wang +Signed-off-by: Greg Kroah-Hartman + +--- + arch/arc/mm/fault.c | 5 ----- + arch/score/mm/fault.c | 6 ------ + arch/tile/mm/fault.c | 6 ------ + 3 files changed, 17 deletions(-) + +--- a/arch/arc/mm/fault.c ++++ b/arch/arc/mm/fault.c +@@ -120,7 +120,6 @@ good_area: + goto bad_area; + } + +-survive: + /* + * If for any reason at all we couldn't handle the fault, + * make sure we exit gracefully rather than endlessly redo +@@ -200,10 +199,6 @@ no_context: + die("Oops", regs, address, cause_code); + + out_of_memory: +- if (is_global_init(tsk)) { +- yield(); +- goto survive; +- } + up_read(&mm->mmap_sem); + + if (user_mode(regs)) { +--- a/arch/score/mm/fault.c ++++ b/arch/score/mm/fault.c +@@ -100,7 +100,6 @@ good_area: + goto bad_area; + } + +-survive: + /* + * If for any reason at all we couldn't handle the fault, + * make sure we exit gracefully rather than endlessly redo +@@ -167,11 +166,6 @@ no_context: + */ + out_of_memory: + up_read(&mm->mmap_sem); +- if (is_global_init(tsk)) { +- yield(); +- down_read(&mm->mmap_sem); +- goto survive; +- } + if (!user_mode(regs)) + goto no_context; + pagefault_out_of_memory(); +--- a/arch/tile/mm/fault.c ++++ b/arch/tile/mm/fault.c +@@ -430,7 +430,6 @@ good_area: + goto bad_area; + } + +- survive: + /* + * If for any reason at all we couldn't handle the fault, + * make sure we exit gracefully rather than endlessly redo +@@ -568,11 +567,6 @@ no_context: + */ + out_of_memory: + up_read(&mm->mmap_sem); +- if (is_global_init(tsk)) { +- yield(); +- down_read(&mm->mmap_sem); +- goto survive; +- } + if (is_kernel_mode) + goto no_context; + pagefault_out_of_memory(); diff --git a/queue-3.10/series b/queue-3.10/series index 83fea6dea13..b532774f346 100644 --- a/queue-3.10/series +++ b/queue-3.10/series @@ -60,3 +60,6 @@ net-sctp-fix-remote-memory-pressure-from-excessive-queueing.patch 
 net-sctp-fix-panic-on-duplicate-asconf-chunks.patch
 net-sctp-fix-skb_over_panic-when-receiving-malformed-asconf-chunks.patch
 mm-invoke-oom-killer-from-remaining-unconverted-page-fault-handlers.patch
+arch-mm-remove-obsolete-init-oom-protection.patch
+arch-mm-do-not-invoke-oom-killer-on-kernel-fault-oom.patch
+arch-mm-pass-userspace-fault-flag-to-generic-fault-handler.patch