From d62f5c2cc111e40a038074dc0f1bbcadc941af8b Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 19 Nov 2014 10:48:46 -0800 Subject: [PATCH] 3.10-stable patches added patches: mm-memcg-do-not-trap-chargers-with-full-callstack-on-oom.patch mm-memcg-rework-and-document-oom-waiting-and-wakeup.patch --- ...-chargers-with-full-callstack-on-oom.patch | 436 ++++++++++++++++++ ...-and-document-oom-waiting-and-wakeup.patch | 217 +++++++++ queue-3.10/series | 2 + 3 files changed, 655 insertions(+) create mode 100644 queue-3.10/mm-memcg-do-not-trap-chargers-with-full-callstack-on-oom.patch create mode 100644 queue-3.10/mm-memcg-rework-and-document-oom-waiting-and-wakeup.patch diff --git a/queue-3.10/mm-memcg-do-not-trap-chargers-with-full-callstack-on-oom.patch b/queue-3.10/mm-memcg-do-not-trap-chargers-with-full-callstack-on-oom.patch new file mode 100644 index 00000000000..95e5f53b0d0 --- /dev/null +++ b/queue-3.10/mm-memcg-do-not-trap-chargers-with-full-callstack-on-oom.patch @@ -0,0 +1,436 @@ +From 3812c8c8f3953921ef18544110dafc3505c1ac62 Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Thu, 12 Sep 2013 15:13:44 -0700 +Subject: mm: memcg: do not trap chargers with full callstack on OOM + +From: Johannes Weiner + +commit 3812c8c8f3953921ef18544110dafc3505c1ac62 upstream. + +The memcg OOM handling is incredibly fragile and can deadlock. When a +task fails to charge memory, it invokes the OOM killer and loops right +there in the charge code until it succeeds. Comparably, any other task +that enters the charge path at this point will go to a waitqueue right +then and there and sleep until the OOM situation is resolved. The problem +is that these tasks may hold filesystem locks and the mmap_sem; locks that +the selected OOM victim may need to exit. + +For example, in one reported case, the task invoking the OOM killer was +about to charge a page cache page during a write(), which holds the +i_mutex. 
The OOM killer selected a task that was just entering truncate() +and trying to acquire the i_mutex: + +OOM invoking task: + mem_cgroup_handle_oom+0x241/0x3b0 + mem_cgroup_cache_charge+0xbe/0xe0 + add_to_page_cache_locked+0x4c/0x140 + add_to_page_cache_lru+0x22/0x50 + grab_cache_page_write_begin+0x8b/0xe0 + ext3_write_begin+0x88/0x270 + generic_file_buffered_write+0x116/0x290 + __generic_file_aio_write+0x27c/0x480 + generic_file_aio_write+0x76/0xf0 # takes ->i_mutex + do_sync_write+0xea/0x130 + vfs_write+0xf3/0x1f0 + sys_write+0x51/0x90 + system_call_fastpath+0x18/0x1d + +OOM kill victim: + do_truncate+0x58/0xa0 # takes i_mutex + do_last+0x250/0xa30 + path_openat+0xd7/0x440 + do_filp_open+0x49/0xa0 + do_sys_open+0x106/0x240 + sys_open+0x20/0x30 + system_call_fastpath+0x18/0x1d + +The OOM handling task will retry the charge indefinitely while the OOM +killed task is not releasing any resources. + +A similar scenario can happen when the kernel OOM killer for a memcg is +disabled and a userspace task is in charge of resolving OOM situations. +In this case, ALL tasks that enter the OOM path will be made to sleep on +the OOM waitqueue and wait for userspace to free resources or increase +the group's limit. But a userspace OOM handler is prone to deadlock +itself on the locks held by the waiting tasks. For example one of the +sleeping tasks may be stuck in a brk() call with the mmap_sem held for +writing but the userspace handler, in order to pick an optimal victim, +may need to read files from /proc/, which tries to acquire the same +mmap_sem for reading and deadlocks. + +This patch changes the way tasks behave after detecting a memcg OOM and +makes sure nobody loops or sleeps with locks held: + +1. When OOMing in a user fault, invoke the OOM killer and restart the + fault instead of looping on the charge attempt. This way, the OOM + victim can not get stuck on locks the looping task may hold. + +2. 
When OOMing in a user fault but somebody else is handling it + (either the kernel OOM killer or a userspace handler), don't go to + sleep in the charge context. Instead, remember the OOMing memcg in + the task struct and then fully unwind the page fault stack with + -ENOMEM. pagefault_out_of_memory() will then call back into the + memcg code to check if the -ENOMEM came from the memcg, and then + either put the task to sleep on the memcg's OOM waitqueue or just + restart the fault. The OOM victim can no longer get stuck on any + lock a sleeping task may hold. + +Debugged by Michal Hocko. + +Signed-off-by: Johannes Weiner +Reported-by: azurIt +Acked-by: Michal Hocko +Cc: David Rientjes +Cc: KAMEZAWA Hiroyuki +Cc: KOSAKI Motohiro +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Cong Wang +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/memcontrol.h | 21 ++++++ + include/linux/sched.h | 4 + + mm/memcontrol.c | 154 +++++++++++++++++++++++++++++++-------------- + mm/memory.c | 3 + mm/oom_kill.c | 7 +- + 5 files changed, 140 insertions(+), 49 deletions(-) + +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -130,6 +130,10 @@ extern void mem_cgroup_replace_page_cach + * + * Toggle whether a failed memcg charge should invoke the OOM killer + * or just return -ENOMEM. Returns the previous toggle state. ++ * ++ * NOTE: Any path that enables the OOM killer before charging must ++ * call mem_cgroup_oom_synchronize() afterward to finalize the ++ * OOM handling and clean up. 
+ */ + static inline bool mem_cgroup_toggle_oom(bool new) + { +@@ -155,6 +159,13 @@ static inline void mem_cgroup_disable_oo + WARN_ON(old == false); + } + ++static inline bool task_in_memcg_oom(struct task_struct *p) ++{ ++ return p->memcg_oom.in_memcg_oom; ++} ++ ++bool mem_cgroup_oom_synchronize(void); ++ + #ifdef CONFIG_MEMCG_SWAP + extern int do_swap_account; + #endif +@@ -391,6 +402,16 @@ static inline void mem_cgroup_disable_oo + { + } + ++static inline bool task_in_memcg_oom(struct task_struct *p) ++{ ++ return false; ++} ++ ++static inline bool mem_cgroup_oom_synchronize(void) ++{ ++ return false; ++} ++ + static inline void mem_cgroup_inc_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) + { +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1413,6 +1413,10 @@ struct task_struct { + unsigned int memcg_kmem_skip_account; + struct memcg_oom_info { + unsigned int may_oom:1; ++ unsigned int in_memcg_oom:1; ++ unsigned int oom_locked:1; ++ int wakeups; ++ struct mem_cgroup *wait_on_memcg; + } memcg_oom; + #endif + #ifdef CONFIG_HAVE_HW_BREAKPOINT +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -302,6 +302,7 @@ struct mem_cgroup { + + bool oom_lock; + atomic_t under_oom; ++ atomic_t oom_wakeups; + + atomic_t refcnt; + +@@ -2179,6 +2180,7 @@ static int memcg_oom_wake_function(wait_ + + static void memcg_wakeup_oom(struct mem_cgroup *memcg) + { ++ atomic_inc(&memcg->oom_wakeups); + /* for filtering, pass "memcg" as argument. */ + __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); + } +@@ -2190,19 +2192,17 @@ static void memcg_oom_recover(struct mem + } + + /* +- * try to call OOM killer. returns false if we should exit memory-reclaim loop. 
++ * try to call OOM killer + */ +-static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, +- int order) ++static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) + { +- struct oom_wait_info owait; + bool locked; ++ int wakeups; + +- owait.memcg = memcg; +- owait.wait.flags = 0; +- owait.wait.func = memcg_oom_wake_function; +- owait.wait.private = current; +- INIT_LIST_HEAD(&owait.wait.task_list); ++ if (!current->memcg_oom.may_oom) ++ return; ++ ++ current->memcg_oom.in_memcg_oom = 1; + + /* + * As with any blocking lock, a contender needs to start +@@ -2210,12 +2210,8 @@ static bool mem_cgroup_handle_oom(struct + * otherwise it can miss the wakeup from the unlock and sleep + * indefinitely. This is just open-coded because our locking + * is so particular to memcg hierarchies. +- * +- * Even if signal_pending(), we can't quit charge() loop without +- * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL +- * under OOM is always welcomed, use TASK_KILLABLE here. + */ +- prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); ++ wakeups = atomic_read(&memcg->oom_wakeups); + mem_cgroup_mark_under_oom(memcg); + + locked = mem_cgroup_oom_trylock(memcg); +@@ -2225,15 +2221,95 @@ static bool mem_cgroup_handle_oom(struct + + if (locked && !memcg->oom_kill_disable) { + mem_cgroup_unmark_under_oom(memcg); +- finish_wait(&memcg_oom_waitq, &owait.wait); + mem_cgroup_out_of_memory(memcg, mask, order); ++ mem_cgroup_oom_unlock(memcg); ++ /* ++ * There is no guarantee that an OOM-lock contender ++ * sees the wakeups triggered by the OOM kill ++ * uncharges. Wake any sleepers explicitely. 
++ */ ++ memcg_oom_recover(memcg); + } else { +- schedule(); +- mem_cgroup_unmark_under_oom(memcg); +- finish_wait(&memcg_oom_waitq, &owait.wait); ++ /* ++ * A system call can just return -ENOMEM, but if this ++ * is a page fault and somebody else is handling the ++ * OOM already, we need to sleep on the OOM waitqueue ++ * for this memcg until the situation is resolved. ++ * Which can take some time because it might be ++ * handled by a userspace task. ++ * ++ * However, this is the charge context, which means ++ * that we may sit on a large call stack and hold ++ * various filesystem locks, the mmap_sem etc. and we ++ * don't want the OOM handler to deadlock on them ++ * while we sit here and wait. Store the current OOM ++ * context in the task_struct, then return -ENOMEM. ++ * At the end of the page fault handler, with the ++ * stack unwound, pagefault_out_of_memory() will check ++ * back with us by calling ++ * mem_cgroup_oom_synchronize(), possibly putting the ++ * task to sleep. ++ */ ++ current->memcg_oom.oom_locked = locked; ++ current->memcg_oom.wakeups = wakeups; ++ css_get(&memcg->css); ++ current->memcg_oom.wait_on_memcg = memcg; + } ++} + +- if (locked) { ++/** ++ * mem_cgroup_oom_synchronize - complete memcg OOM handling ++ * ++ * This has to be called at the end of a page fault if the the memcg ++ * OOM handler was enabled and the fault is returning %VM_FAULT_OOM. ++ * ++ * Memcg supports userspace OOM handling, so failed allocations must ++ * sleep on a waitqueue until the userspace task resolves the ++ * situation. Sleeping directly in the charge context with all kinds ++ * of locks held is not a good idea, instead we remember an OOM state ++ * in the task and mem_cgroup_oom_synchronize() has to be called at ++ * the end of the page fault to put the task to sleep and clean up the ++ * OOM state. ++ * ++ * Returns %true if an ongoing memcg OOM situation was detected and ++ * finalized, %false otherwise. 
++ */ ++bool mem_cgroup_oom_synchronize(void) ++{ ++ struct oom_wait_info owait; ++ struct mem_cgroup *memcg; ++ ++ /* OOM is global, do not handle */ ++ if (!current->memcg_oom.in_memcg_oom) ++ return false; ++ ++ /* ++ * We invoked the OOM killer but there is a chance that a kill ++ * did not free up any charges. Everybody else might already ++ * be sleeping, so restart the fault and keep the rampage ++ * going until some charges are released. ++ */ ++ memcg = current->memcg_oom.wait_on_memcg; ++ if (!memcg) ++ goto out; ++ ++ if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) ++ goto out_memcg; ++ ++ owait.memcg = memcg; ++ owait.wait.flags = 0; ++ owait.wait.func = memcg_oom_wake_function; ++ owait.wait.private = current; ++ INIT_LIST_HEAD(&owait.wait.task_list); ++ ++ prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); ++ /* Only sleep if we didn't miss any wakeups since OOM */ ++ if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups) ++ schedule(); ++ finish_wait(&memcg_oom_waitq, &owait.wait); ++out_memcg: ++ mem_cgroup_unmark_under_oom(memcg); ++ if (current->memcg_oom.oom_locked) { + mem_cgroup_oom_unlock(memcg); + /* + * There is no guarantee that an OOM-lock contender +@@ -2242,11 +2318,10 @@ static bool mem_cgroup_handle_oom(struct + */ + memcg_oom_recover(memcg); + } +- +- if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) +- return false; +- /* Give chance to dying process */ +- schedule_timeout_uninterruptible(1); ++ css_put(&memcg->css); ++ current->memcg_oom.wait_on_memcg = NULL; ++out: ++ current->memcg_oom.in_memcg_oom = 0; + return true; + } + +@@ -2559,12 +2634,11 @@ enum { + CHARGE_RETRY, /* need to retry but retry is not bad */ + CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ + CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. 
*/ +- CHARGE_OOM_DIE, /* the current is killed because of OOM */ + }; + + static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, + unsigned int nr_pages, unsigned int min_pages, +- bool oom_check) ++ bool invoke_oom) + { + unsigned long csize = nr_pages * PAGE_SIZE; + struct mem_cgroup *mem_over_limit; +@@ -2621,14 +2695,10 @@ static int mem_cgroup_do_charge(struct m + if (mem_cgroup_wait_acct_move(mem_over_limit)) + return CHARGE_RETRY; + +- /* If we don't need to call oom-killer at el, return immediately */ +- if (!oom_check || !current->memcg_oom.may_oom) +- return CHARGE_NOMEM; +- /* check OOM */ +- if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize))) +- return CHARGE_OOM_DIE; ++ if (invoke_oom) ++ mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize)); + +- return CHARGE_RETRY; ++ return CHARGE_NOMEM; + } + + /* +@@ -2731,7 +2801,7 @@ again: + } + + do { +- bool oom_check; ++ bool invoke_oom = oom && !nr_oom_retries; + + /* If killed, bypass charge */ + if (fatal_signal_pending(current)) { +@@ -2739,14 +2809,8 @@ again: + goto bypass; + } + +- oom_check = false; +- if (oom && !nr_oom_retries) { +- oom_check = true; +- nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; +- } +- +- ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages, +- oom_check); ++ ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, ++ nr_pages, invoke_oom); + switch (ret) { + case CHARGE_OK: + break; +@@ -2759,16 +2823,12 @@ again: + css_put(&memcg->css); + goto nomem; + case CHARGE_NOMEM: /* OOM routine works */ +- if (!oom) { ++ if (!oom || invoke_oom) { + css_put(&memcg->css); + goto nomem; + } +- /* If oom, we never return -ENOMEM */ + nr_oom_retries--; + break; +- case CHARGE_OOM_DIE: /* Killed by OOM Killer */ +- css_put(&memcg->css); +- goto bypass; + } + } while (ret != CHARGE_OK); + +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -3867,6 +3867,9 @@ int handle_mm_fault(struct mm_struct *mm + if (flags & FAULT_FLAG_USER) + mem_cgroup_disable_oom(); 
+ ++ if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))) ++ mem_cgroup_oom_synchronize(); ++ + return ret; + } + +--- a/mm/oom_kill.c ++++ b/mm/oom_kill.c +@@ -702,9 +702,12 @@ out: + */ + void pagefault_out_of_memory(void) + { +- struct zonelist *zonelist = node_zonelist(first_online_node, +- GFP_KERNEL); ++ struct zonelist *zonelist; + ++ if (mem_cgroup_oom_synchronize()) ++ return; ++ ++ zonelist = node_zonelist(first_online_node, GFP_KERNEL); + if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) { + out_of_memory(NULL, 0, 0, NULL, false); + clear_zonelist_oom(zonelist, GFP_KERNEL); diff --git a/queue-3.10/mm-memcg-rework-and-document-oom-waiting-and-wakeup.patch b/queue-3.10/mm-memcg-rework-and-document-oom-waiting-and-wakeup.patch new file mode 100644 index 00000000000..2709693909f --- /dev/null +++ b/queue-3.10/mm-memcg-rework-and-document-oom-waiting-and-wakeup.patch @@ -0,0 +1,217 @@ +From fb2a6fc56be66c169f8b80e07ed999ba453a2db2 Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Thu, 12 Sep 2013 15:13:43 -0700 +Subject: mm: memcg: rework and document OOM waiting and wakeup + +From: Johannes Weiner + +commit fb2a6fc56be66c169f8b80e07ed999ba453a2db2 upstream. + +The memcg OOM handler open-codes a sleeping lock for OOM serialization +(trylock, wait, repeat) because the required locking is so specific to +memcg hierarchies. However, it would be nice if this construct would be +clearly recognizable and not be as obfuscated as it is right now. Clean +up as follows: + +1. Remove the return value of mem_cgroup_oom_unlock() + +2. Rename mem_cgroup_oom_lock() to mem_cgroup_oom_trylock(). + +3. Pull the prepare_to_wait() out of the memcg_oom_lock scope. This + makes it more obvious that the task has to be on the waitqueue + before attempting to OOM-trylock the hierarchy, to not miss any + wakeups before going to sleep. It just didn't matter until now + because it was all lumped together into the global memcg_oom_lock + spinlock section. + +4. 
Pull the mem_cgroup_oom_notify() out of the memcg_oom_lock scope. + It is protected by the hierarchical OOM-lock. + +5. The memcg_oom_lock spinlock is only required to propagate the OOM + lock in any given hierarchy atomically. Restrict its scope to + mem_cgroup_oom_(trylock|unlock). + +6. Do not wake up the waitqueue unconditionally at the end of the + function. Only the lockholder has to wake up the next in line + after releasing the lock. + + Note that the lockholder kicks off the OOM-killer, which in turn + leads to wakeups from the uncharges of the exiting task. But a + contender is not guaranteed to see them if it enters the OOM path + after the OOM kills but before the lockholder releases the lock. + Thus there has to be an explicit wakeup after releasing the lock. + +7. Put the OOM task on the waitqueue before marking the hierarchy as + under OOM as that is the point where we start to receive wakeups. + No point in listening before being on the waitqueue. + +8. Likewise, unmark the hierarchy before finishing the sleep, for + symmetry. + +Signed-off-by: Johannes Weiner +Acked-by: Michal Hocko +Cc: David Rientjes +Cc: KAMEZAWA Hiroyuki +Cc: azurIt +Cc: KOSAKI Motohiro +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Cong Wang +Signed-off-by: Greg Kroah-Hartman + +--- + mm/memcontrol.c | 83 +++++++++++++++++++++++++++++++------------------------- + 1 file changed, 46 insertions(+), 37 deletions(-) + +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -2075,15 +2075,18 @@ static int mem_cgroup_soft_reclaim(struc + return total; + } + ++static DEFINE_SPINLOCK(memcg_oom_lock); ++ + /* + * Check OOM-Killer is already running under our hierarchy. + * If someone is running, return false. 
+- * Has to be called with memcg_oom_lock + */ +-static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) ++static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) + { + struct mem_cgroup *iter, *failed = NULL; + ++ spin_lock(&memcg_oom_lock); ++ + for_each_mem_cgroup_tree(iter, memcg) { + if (iter->oom_lock) { + /* +@@ -2097,33 +2100,33 @@ static bool mem_cgroup_oom_lock(struct m + iter->oom_lock = true; + } + +- if (!failed) +- return true; +- +- /* +- * OK, we failed to lock the whole subtree so we have to clean up +- * what we set up to the failing subtree +- */ +- for_each_mem_cgroup_tree(iter, memcg) { +- if (iter == failed) { +- mem_cgroup_iter_break(memcg, iter); +- break; ++ if (failed) { ++ /* ++ * OK, we failed to lock the whole subtree so we have ++ * to clean up what we set up to the failing subtree ++ */ ++ for_each_mem_cgroup_tree(iter, memcg) { ++ if (iter == failed) { ++ mem_cgroup_iter_break(memcg, iter); ++ break; ++ } ++ iter->oom_lock = false; + } +- iter->oom_lock = false; + } +- return false; ++ ++ spin_unlock(&memcg_oom_lock); ++ ++ return !failed; + } + +-/* +- * Has to be called with memcg_oom_lock +- */ +-static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg) ++static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) + { + struct mem_cgroup *iter; + ++ spin_lock(&memcg_oom_lock); + for_each_mem_cgroup_tree(iter, memcg) + iter->oom_lock = false; +- return 0; ++ spin_unlock(&memcg_oom_lock); + } + + static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) +@@ -2147,7 +2150,6 @@ static void mem_cgroup_unmark_under_oom( + atomic_add_unless(&iter->under_oom, -1, 0); + } + +-static DEFINE_SPINLOCK(memcg_oom_lock); + static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); + + struct oom_wait_info { +@@ -2194,45 +2196,52 @@ static bool mem_cgroup_handle_oom(struct + int order) + { + struct oom_wait_info owait; +- bool locked, need_to_kill; ++ bool locked; + + owait.memcg = memcg; + owait.wait.flags = 0; + owait.wait.func = 
memcg_oom_wake_function; + owait.wait.private = current; + INIT_LIST_HEAD(&owait.wait.task_list); +- need_to_kill = true; +- mem_cgroup_mark_under_oom(memcg); + +- /* At first, try to OOM lock hierarchy under memcg.*/ +- spin_lock(&memcg_oom_lock); +- locked = mem_cgroup_oom_lock(memcg); + /* ++ * As with any blocking lock, a contender needs to start ++ * listening for wakeups before attempting the trylock, ++ * otherwise it can miss the wakeup from the unlock and sleep ++ * indefinitely. This is just open-coded because our locking ++ * is so particular to memcg hierarchies. ++ * + * Even if signal_pending(), we can't quit charge() loop without + * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL + * under OOM is always welcomed, use TASK_KILLABLE here. + */ + prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); +- if (!locked || memcg->oom_kill_disable) +- need_to_kill = false; ++ mem_cgroup_mark_under_oom(memcg); ++ ++ locked = mem_cgroup_oom_trylock(memcg); ++ + if (locked) + mem_cgroup_oom_notify(memcg); +- spin_unlock(&memcg_oom_lock); + +- if (need_to_kill) { ++ if (locked && !memcg->oom_kill_disable) { ++ mem_cgroup_unmark_under_oom(memcg); + finish_wait(&memcg_oom_waitq, &owait.wait); + mem_cgroup_out_of_memory(memcg, mask, order); + } else { + schedule(); ++ mem_cgroup_unmark_under_oom(memcg); + finish_wait(&memcg_oom_waitq, &owait.wait); + } +- spin_lock(&memcg_oom_lock); +- if (locked) +- mem_cgroup_oom_unlock(memcg); +- memcg_wakeup_oom(memcg); +- spin_unlock(&memcg_oom_lock); + +- mem_cgroup_unmark_under_oom(memcg); ++ if (locked) { ++ mem_cgroup_oom_unlock(memcg); ++ /* ++ * There is no guarantee that an OOM-lock contender ++ * sees the wakeups triggered by the OOM kill ++ * uncharges. Wake any sleepers explicitely. 
++ */ ++ memcg_oom_recover(memcg); ++ } + + if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) + return false; diff --git a/queue-3.10/series b/queue-3.10/series index 2bda20e3f48..5525e476713 100644 --- a/queue-3.10/series +++ b/queue-3.10/series @@ -65,3 +65,5 @@ arch-mm-do-not-invoke-oom-killer-on-kernel-fault-oom.patch arch-mm-pass-userspace-fault-flag-to-generic-fault-handler.patch x86-finish-user-fault-error-path-with-fatal-signal.patch mm-memcg-enable-memcg-oom-killer-only-for-user-faults.patch +mm-memcg-rework-and-document-oom-waiting-and-wakeup.patch +mm-memcg-do-not-trap-chargers-with-full-callstack-on-oom.patch -- 2.47.3