From 23bbcf0fcce27a4783c63cafb9836230c11e7633 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Wed, 19 Nov 2014 11:19:11 -0800
Subject: [PATCH] 3.10-stable patches

added patches:
      mm-memcg-handle-non-error-oom-situations-more-gracefully.patch
---
 ...error-oom-situations-more-gracefully.patch | 421 ++++++++++++++++++
 queue-3.10/series                             |   1 +
 2 files changed, 422 insertions(+)
 create mode 100644 queue-3.10/mm-memcg-handle-non-error-oom-situations-more-gracefully.patch

diff --git a/queue-3.10/mm-memcg-handle-non-error-oom-situations-more-gracefully.patch b/queue-3.10/mm-memcg-handle-non-error-oom-situations-more-gracefully.patch
new file mode 100644
index 00000000000..3dafb0f0f4e
--- /dev/null
+++ b/queue-3.10/mm-memcg-handle-non-error-oom-situations-more-gracefully.patch
@@ -0,0 +1,421 @@
+From 4942642080ea82d99ab5b653abb9a12b7ba31f4a Mon Sep 17 00:00:00 2001
+From: Johannes Weiner
+Date: Wed, 16 Oct 2013 13:46:59 -0700
+Subject: mm: memcg: handle non-error OOM situations more gracefully
+
+From: Johannes Weiner
+
+commit 4942642080ea82d99ab5b653abb9a12b7ba31f4a upstream.
+
+Commit 3812c8c8f395 ("mm: memcg: do not trap chargers with full
+callstack on OOM") assumed that only a few places that can trigger a
+memcg OOM situation do not return VM_FAULT_OOM, like optional page cache
+readahead. But there are many more and it's impractical to annotate
+them all.
+
+First of all, we don't want to invoke the OOM killer when the failed
+allocation is gracefully handled, so defer the actual kill to the end of
+the fault handling as well. This simplifies the code quite a bit for
+added bonus.
+
+Second, since a failed allocation might not be the abrupt end of the
+fault, the memcg OOM handler needs to be re-entrant until the fault
+finishes for subsequent allocation attempts. If an allocation is
+attempted after the task already OOMed, allow it to bypass the limit so
+that it can quickly finish the fault and invoke the OOM killer.
+
+Reported-by: azurIt
+Signed-off-by: Johannes Weiner
+Cc: Michal Hocko
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Cong Wang
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ include/linux/memcontrol.h |   50 +++-------------
+ include/linux/sched.h      |    7 --
+ mm/filemap.c               |   11 ---
+ mm/memcontrol.c            |  139 ++++++++++++++++-----------------------
+ mm/memory.c                |   18 +++--
+ mm/oom_kill.c              |    2 
+ 6 files changed, 79 insertions(+), 148 deletions(-)
+
+--- a/include/linux/memcontrol.h
++++ b/include/linux/memcontrol.h
+@@ -124,47 +124,24 @@ extern void mem_cgroup_print_oom_info(st
+ extern void mem_cgroup_replace_page_cache(struct page *oldpage,
+ 	struct page *newpage);
+ 
+-/**
+- * mem_cgroup_toggle_oom - toggle the memcg OOM killer for the current task
+- * @new: true to enable, false to disable
+- *
+- * Toggle whether a failed memcg charge should invoke the OOM killer
+- * or just return -ENOMEM. Returns the previous toggle state.
+- *
+- * NOTE: Any path that enables the OOM killer before charging must
+- * call mem_cgroup_oom_synchronize() afterward to finalize the
+- * OOM handling and clean up.
+- */
+-static inline bool mem_cgroup_toggle_oom(bool new)
++static inline void mem_cgroup_oom_enable(void)
+ {
+-	bool old;
+-
+-	old = current->memcg_oom.may_oom;
+-	current->memcg_oom.may_oom = new;
+-
+-	return old;
++	WARN_ON(current->memcg_oom.may_oom);
++	current->memcg_oom.may_oom = 1;
+ }
+ 
+-static inline void mem_cgroup_enable_oom(void)
++static inline void mem_cgroup_oom_disable(void)
+ {
+-	bool old = mem_cgroup_toggle_oom(true);
+-
+-	WARN_ON(old == true);
+-}
+-
+-static inline void mem_cgroup_disable_oom(void)
+-{
+-	bool old = mem_cgroup_toggle_oom(false);
+-
+-	WARN_ON(old == false);
++	WARN_ON(!current->memcg_oom.may_oom);
++	current->memcg_oom.may_oom = 0;
+ }
+ 
+ static inline bool task_in_memcg_oom(struct task_struct *p)
+ {
+-	return p->memcg_oom.in_memcg_oom;
++	return p->memcg_oom.memcg;
+ }
+ 
+-bool mem_cgroup_oom_synchronize(void);
++bool mem_cgroup_oom_synchronize(bool wait);
+ 
+ #ifdef CONFIG_MEMCG_SWAP
+ extern int do_swap_account;
+@@ -389,16 +366,11 @@ static inline void mem_cgroup_end_update
+ {
+ }
+ 
+-static inline bool mem_cgroup_toggle_oom(bool new)
+-{
+-	return false;
+-}
+-
+-static inline void mem_cgroup_enable_oom(void)
++static inline void mem_cgroup_oom_enable(void)
+ {
+ }
+ 
+-static inline void mem_cgroup_disable_oom(void)
++static inline void mem_cgroup_oom_disable(void)
+ {
+ }
+ 
+@@ -407,7 +379,7 @@ static inline bool task_in_memcg_oom(str
+ 	return false;
+ }
+ 
+-static inline bool mem_cgroup_oom_synchronize(void)
++static inline bool mem_cgroup_oom_synchronize(bool wait)
+ {
+ 	return false;
+ }
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -1412,11 +1412,10 @@ struct task_struct {
+ 	} memcg_batch;
+ 	unsigned int memcg_kmem_skip_account;
+ 	struct memcg_oom_info {
++		struct mem_cgroup *memcg;
++		gfp_t gfp_mask;
++		int order;
+ 		unsigned int may_oom:1;
+-		unsigned int in_memcg_oom:1;
+-		unsigned int oom_locked:1;
+-		int wakeups;
+-		struct mem_cgroup *wait_on_memcg;
+ 	} memcg_oom;
+ #endif
+ #ifdef CONFIG_HAVE_HW_BREAKPOINT
+--- a/mm/filemap.c
++++ b/mm/filemap.c
+@@ -1614,7 +1614,6 @@ int filemap_fault(struct vm_area_struct
+ 	struct inode *inode = mapping->host;
+ 	pgoff_t offset = vmf->pgoff;
+ 	struct page *page;
+-	bool memcg_oom;
+ 	pgoff_t size;
+ 	int ret = 0;
+ 
+@@ -1623,11 +1622,7 @@ int filemap_fault(struct vm_area_struct
+ 		return VM_FAULT_SIGBUS;
+ 
+ 	/*
+-	 * Do we have something in the page cache already? Either
+-	 * way, try readahead, but disable the memcg OOM killer for it
+-	 * as readahead is optional and no errors are propagated up
+-	 * the fault stack. The OOM killer is enabled while trying to
+-	 * instantiate the faulting page individually below.
++	 * Do we have something in the page cache already?
+ 	 */
+ 	page = find_get_page(mapping, offset);
+ 	if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
+@@ -1635,14 +1630,10 @@ int filemap_fault(struct vm_area_struct
+ 		 * We found the page, so try async readahead before
+ 		 * waiting for the lock.
+ 		 */
+-		memcg_oom = mem_cgroup_toggle_oom(false);
+ 		do_async_mmap_readahead(vma, ra, file, page, offset);
+-		mem_cgroup_toggle_oom(memcg_oom);
+ 	} else if (!page) {
+ 		/* No page in the page cache at all */
+-		memcg_oom = mem_cgroup_toggle_oom(false);
+ 		do_sync_mmap_readahead(vma, ra, file, offset);
+-		mem_cgroup_toggle_oom(memcg_oom);
+ 		count_vm_event(PGMAJFAULT);
+ 		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+ 		ret = VM_FAULT_MAJOR;
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -2191,110 +2191,59 @@ static void memcg_oom_recover(struct mem
+ 		memcg_wakeup_oom(memcg);
+ }
+ 
+-/*
+- * try to call OOM killer
+- */
+ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
+ {
+-	bool locked;
+-	int wakeups;
+-
+ 	if (!current->memcg_oom.may_oom)
+ 		return;
+-
+-	current->memcg_oom.in_memcg_oom = 1;
+-
+ 	/*
+-	 * As with any blocking lock, a contender needs to start
+-	 * listening for wakeups before attempting the trylock,
+-	 * otherwise it can miss the wakeup from the unlock and sleep
+-	 * indefinitely. This is just open-coded because our locking
+-	 * is so particular to memcg hierarchies.
++	 * We are in the middle of the charge context here, so we
++	 * don't want to block when potentially sitting on a callstack
++	 * that holds all kinds of filesystem and mm locks.
++	 *
++	 * Also, the caller may handle a failed allocation gracefully
++	 * (like optional page cache readahead) and so an OOM killer
++	 * invocation might not even be necessary.
++	 *
++	 * That's why we don't do anything here except remember the
++	 * OOM context and then deal with it at the end of the page
++	 * fault when the stack is unwound, the locks are released,
++	 * and when we know whether the fault was overall successful.
+ 	 */
+-	wakeups = atomic_read(&memcg->oom_wakeups);
+-	mem_cgroup_mark_under_oom(memcg);
+-
+-	locked = mem_cgroup_oom_trylock(memcg);
+-
+-	if (locked)
+-		mem_cgroup_oom_notify(memcg);
+-
+-	if (locked && !memcg->oom_kill_disable) {
+-		mem_cgroup_unmark_under_oom(memcg);
+-		mem_cgroup_out_of_memory(memcg, mask, order);
+-		mem_cgroup_oom_unlock(memcg);
+-		/*
+-		 * There is no guarantee that an OOM-lock contender
+-		 * sees the wakeups triggered by the OOM kill
+-		 * uncharges. Wake any sleepers explicitely.
+-		 */
+-		memcg_oom_recover(memcg);
+-	} else {
+-		/*
+-		 * A system call can just return -ENOMEM, but if this
+-		 * is a page fault and somebody else is handling the
+-		 * OOM already, we need to sleep on the OOM waitqueue
+-		 * for this memcg until the situation is resolved.
+-		 * Which can take some time because it might be
+-		 * handled by a userspace task.
+-		 *
+-		 * However, this is the charge context, which means
+-		 * that we may sit on a large call stack and hold
+-		 * various filesystem locks, the mmap_sem etc. and we
+-		 * don't want the OOM handler to deadlock on them
+-		 * while we sit here and wait. Store the current OOM
+-		 * context in the task_struct, then return -ENOMEM.
+-		 * At the end of the page fault handler, with the
+-		 * stack unwound, pagefault_out_of_memory() will check
+-		 * back with us by calling
+-		 * mem_cgroup_oom_synchronize(), possibly putting the
+-		 * task to sleep.
+-		 */
+-		current->memcg_oom.oom_locked = locked;
+-		current->memcg_oom.wakeups = wakeups;
+-		css_get(&memcg->css);
+-		current->memcg_oom.wait_on_memcg = memcg;
+-	}
++	css_get(&memcg->css);
++	current->memcg_oom.memcg = memcg;
++	current->memcg_oom.gfp_mask = mask;
++	current->memcg_oom.order = order;
+ }
+ 
+ /**
+  * mem_cgroup_oom_synchronize - complete memcg OOM handling
++ * @handle: actually kill/wait or just clean up the OOM state
+  *
+- * This has to be called at the end of a page fault if the the memcg
+- * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
++ * This has to be called at the end of a page fault if the memcg OOM
++ * handler was enabled.
+  *
+- * Memcg supports userspace OOM handling, so failed allocations must
++ * Memcg supports userspace OOM handling where failed allocations must
+  * sleep on a waitqueue until the userspace task resolves the
+  * situation. Sleeping directly in the charge context with all kinds
+  * of locks held is not a good idea, instead we remember an OOM state
+  * in the task and mem_cgroup_oom_synchronize() has to be called at
+- * the end of the page fault to put the task to sleep and clean up the
+- * OOM state.
++ * the end of the page fault to complete the OOM handling.
+  *
+  * Returns %true if an ongoing memcg OOM situation was detected and
+- * finalized, %false otherwise.
++ * completed, %false otherwise.
+  */
+-bool mem_cgroup_oom_synchronize(void)
++bool mem_cgroup_oom_synchronize(bool handle)
+ {
++	struct mem_cgroup *memcg = current->memcg_oom.memcg;
+ 	struct oom_wait_info owait;
+-	struct mem_cgroup *memcg;
++	bool locked;
+ 
+ 	/* OOM is global, do not handle */
+-	if (!current->memcg_oom.in_memcg_oom)
+-		return false;
+-
+-	/*
+-	 * We invoked the OOM killer but there is a chance that a kill
+-	 * did not free up any charges. Everybody else might already
+-	 * be sleeping, so restart the fault and keep the rampage
+-	 * going until some charges are released.
+-	 */
+-	memcg = current->memcg_oom.wait_on_memcg;
+ 	if (!memcg)
+-		goto out;
++		return false;
+ 
+-	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+-		goto out_memcg;
++	if (!handle)
++		goto cleanup;
+ 
+ 	owait.memcg = memcg;
+ 	owait.wait.flags = 0;
+@@ -2303,13 +2252,25 @@ bool mem_cgroup_oom_synchronize(void)
+ 	INIT_LIST_HEAD(&owait.wait.task_list);
+ 
+ 	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+-	/* Only sleep if we didn't miss any wakeups since OOM */
+-	if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
++	mem_cgroup_mark_under_oom(memcg);
++
++	locked = mem_cgroup_oom_trylock(memcg);
++
++	if (locked)
++		mem_cgroup_oom_notify(memcg);
++
++	if (locked && !memcg->oom_kill_disable) {
++		mem_cgroup_unmark_under_oom(memcg);
++		finish_wait(&memcg_oom_waitq, &owait.wait);
++		mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
++					 current->memcg_oom.order);
++	} else {
+ 		schedule();
+-	finish_wait(&memcg_oom_waitq, &owait.wait);
+-out_memcg:
+-	mem_cgroup_unmark_under_oom(memcg);
+-	if (current->memcg_oom.oom_locked) {
++		mem_cgroup_unmark_under_oom(memcg);
++		finish_wait(&memcg_oom_waitq, &owait.wait);
++	}
++
++	if (locked) {
+ 		mem_cgroup_oom_unlock(memcg);
+ 		/*
+ 		 * There is no guarantee that an OOM-lock contender
+@@ -2318,10 +2279,9 @@ out_memcg:
+ 		 */
+ 		memcg_oom_recover(memcg);
+ 	}
++cleanup:
++	current->memcg_oom.memcg = NULL;
+ 	css_put(&memcg->css);
+-	current->memcg_oom.wait_on_memcg = NULL;
+-out:
+-	current->memcg_oom.in_memcg_oom = 0;
+ 	return true;
+ }
+ 
+@@ -2742,6 +2702,9 @@ static int __mem_cgroup_try_charge(struc
+ 		     || fatal_signal_pending(current)))
+ 		goto bypass;
+ 
++	if (unlikely(task_in_memcg_oom(current)))
++		goto bypass;
++
+ 	/*
+ 	 * We always charge the cgroup the mm_struct belongs to.
+ 	 * The mm_struct's mem_cgroup changes on task migration if the
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -3860,15 +3860,21 @@ int handle_mm_fault(struct mm_struct *mm
+ 	 * space. Kernel faults are handled more gracefully.
+ 	 */
+ 	if (flags & FAULT_FLAG_USER)
+-		mem_cgroup_enable_oom();
++		mem_cgroup_oom_enable();
+ 
+ 	ret = __handle_mm_fault(mm, vma, address, flags);
+ 
+-	if (flags & FAULT_FLAG_USER)
+-		mem_cgroup_disable_oom();
+-
+-	if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)))
+-		mem_cgroup_oom_synchronize();
++	if (flags & FAULT_FLAG_USER) {
++		mem_cgroup_oom_disable();
++		/*
++		 * The task may have entered a memcg OOM situation but
++		 * if the allocation error was handled gracefully (no
++		 * VM_FAULT_OOM), there is no need to kill anything.
++		 * Just clean up the OOM state peacefully.
++		 */
++		if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
++			mem_cgroup_oom_synchronize(false);
++	}
+ 
+ 	return ret;
+ }
+--- a/mm/oom_kill.c
++++ b/mm/oom_kill.c
+@@ -704,7 +704,7 @@ void pagefault_out_of_memory(void)
+ {
+ 	struct zonelist *zonelist;
+ 
+-	if (mem_cgroup_oom_synchronize())
++	if (mem_cgroup_oom_synchronize(true))
+ 		return;
+ 
+ 	zonelist = node_zonelist(first_online_node, GFP_KERNEL);
diff --git a/queue-3.10/series b/queue-3.10/series
index 5525e476713..94fc458e91d 100644
--- a/queue-3.10/series
+++ b/queue-3.10/series
@@ -67,3 +67,4 @@ x86-finish-user-fault-error-path-with-fatal-signal.patch
 mm-memcg-enable-memcg-oom-killer-only-for-user-faults.patch
 mm-memcg-rework-and-document-oom-waiting-and-wakeup.patch
 mm-memcg-do-not-trap-chargers-with-full-callstack-on-oom.patch
+mm-memcg-handle-non-error-oom-situations-more-gracefully.patch
-- 
2.47.3