--- /dev/null
+From 3812c8c8f3953921ef18544110dafc3505c1ac62 Mon Sep 17 00:00:00 2001
+From: Johannes Weiner <hannes@cmpxchg.org>
+Date: Thu, 12 Sep 2013 15:13:44 -0700
+Subject: mm: memcg: do not trap chargers with full callstack on OOM
+
+From: Johannes Weiner <hannes@cmpxchg.org>
+
+commit 3812c8c8f3953921ef18544110dafc3505c1ac62 upstream.
+
+The memcg OOM handling is incredibly fragile and can deadlock. When a
+task fails to charge memory, it invokes the OOM killer and loops right
+there in the charge code until it succeeds. Comparably, any other task
+that enters the charge path at this point will go to a waitqueue right
+then and there and sleep until the OOM situation is resolved. The problem
+is that these tasks may hold filesystem locks and the mmap_sem; locks that
+the selected OOM victim may need to exit.
+
+For example, in one reported case, the task invoking the OOM killer was
+about to charge a page cache page during a write(), which holds the
+i_mutex. The OOM killer selected a task that was just entering truncate()
+and trying to acquire the i_mutex:
+
+OOM invoking task:
+ mem_cgroup_handle_oom+0x241/0x3b0
+ mem_cgroup_cache_charge+0xbe/0xe0
+ add_to_page_cache_locked+0x4c/0x140
+ add_to_page_cache_lru+0x22/0x50
+ grab_cache_page_write_begin+0x8b/0xe0
+ ext3_write_begin+0x88/0x270
+ generic_file_buffered_write+0x116/0x290
+ __generic_file_aio_write+0x27c/0x480
+ generic_file_aio_write+0x76/0xf0 # takes ->i_mutex
+ do_sync_write+0xea/0x130
+ vfs_write+0xf3/0x1f0
+ sys_write+0x51/0x90
+ system_call_fastpath+0x18/0x1d
+
+OOM kill victim:
+ do_truncate+0x58/0xa0 # takes i_mutex
+ do_last+0x250/0xa30
+ path_openat+0xd7/0x440
+ do_filp_open+0x49/0xa0
+ do_sys_open+0x106/0x240
+ sys_open+0x20/0x30
+ system_call_fastpath+0x18/0x1d
+
+The OOM handling task will retry the charge indefinitely while the OOM
+killed task is not releasing any resources.
+
+A similar scenario can happen when the kernel OOM killer for a memcg is
+disabled and a userspace task is in charge of resolving OOM situations.
+In this case, ALL tasks that enter the OOM path will be made to sleep on
+the OOM waitqueue and wait for userspace to free resources or increase
+the group's limit. But a userspace OOM handler is prone to deadlock
+itself on the locks held by the waiting tasks. For example, one of the
+sleeping tasks may be stuck in a brk() call with the mmap_sem held for
+writing but the userspace handler, in order to pick an optimal victim,
+may need to read files from /proc/<pid>, which tries to acquire the same
+mmap_sem for reading and deadlocks.
+
+This patch changes the way tasks behave after detecting a memcg OOM and
+makes sure nobody loops or sleeps with locks held:
+
+1. When OOMing in a user fault, invoke the OOM killer and restart the
+ fault instead of looping on the charge attempt. This way, the OOM
+ victim can not get stuck on locks the looping task may hold.
+
+2. When OOMing in a user fault but somebody else is handling it
+ (either the kernel OOM killer or a userspace handler), don't go to
+ sleep in the charge context. Instead, remember the OOMing memcg in
+ the task struct and then fully unwind the page fault stack with
+ -ENOMEM. pagefault_out_of_memory() will then call back into the
+ memcg code to check if the -ENOMEM came from the memcg, and then
+ either put the task to sleep on the memcg's OOM waitqueue or just
+ restart the fault. The OOM victim can no longer get stuck on any
+ lock a sleeping task may hold.
+
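+Roughly, the resulting control flow looks like this (a condensed sketch
+of the changes below; the locking, refcounting and wakeup bookkeeping
+are omitted here):
+
+  /* charge path: never sleeps, only records the OOM context */
+  static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
+  {
+          if (!current->memcg_oom.may_oom)        /* only set for user faults */
+                  return;
+          current->memcg_oom.in_memcg_oom = 1;
+          if (mem_cgroup_oom_trylock(memcg) && !memcg->oom_kill_disable)
+                  mem_cgroup_out_of_memory(memcg, mask, order);
+          else
+                  current->memcg_oom.wait_on_memcg = memcg;
+          /* the charge fails, the fault unwinds with VM_FAULT_OOM */
+  }
+
+  /* end of the page fault, all locks dropped */
+  void pagefault_out_of_memory(void)
+  {
+          if (mem_cgroup_oom_synchronize())
+                  return;         /* slept on the memcg waitqueue, or restart */
+          /* otherwise this is a global OOM, handle it as before */
+  }
+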
+Debugged by Michal Hocko.
+
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+Reported-by: azurIt <azurit@pobox.sk>
+Acked-by: Michal Hocko <mhocko@suse.cz>
+Cc: David Rientjes <rientjes@google.com>
+Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
+Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/memcontrol.h | 21 ++++++
+ include/linux/sched.h | 4 +
+ mm/memcontrol.c | 154 +++++++++++++++++++++++++++++++--------------
+ mm/memory.c | 3
+ mm/oom_kill.c | 7 +-
+ 5 files changed, 140 insertions(+), 49 deletions(-)
+
+--- a/include/linux/memcontrol.h
++++ b/include/linux/memcontrol.h
+@@ -130,6 +130,10 @@ extern void mem_cgroup_replace_page_cach
+ *
+ * Toggle whether a failed memcg charge should invoke the OOM killer
+ * or just return -ENOMEM. Returns the previous toggle state.
++ *
++ * NOTE: Any path that enables the OOM killer before charging must
++ * call mem_cgroup_oom_synchronize() afterward to finalize the
++ * OOM handling and clean up.
+ */
+ static inline bool mem_cgroup_toggle_oom(bool new)
+ {
+@@ -155,6 +159,13 @@ static inline void mem_cgroup_disable_oo
+ WARN_ON(old == false);
+ }
+
++static inline bool task_in_memcg_oom(struct task_struct *p)
++{
++ return p->memcg_oom.in_memcg_oom;
++}
++
++bool mem_cgroup_oom_synchronize(void);
++
+ #ifdef CONFIG_MEMCG_SWAP
+ extern int do_swap_account;
+ #endif
+@@ -391,6 +402,16 @@ static inline void mem_cgroup_disable_oo
+ {
+ }
+
++static inline bool task_in_memcg_oom(struct task_struct *p)
++{
++ return false;
++}
++
++static inline bool mem_cgroup_oom_synchronize(void)
++{
++ return false;
++}
++
+ static inline void mem_cgroup_inc_page_stat(struct page *page,
+ enum mem_cgroup_page_stat_item idx)
+ {
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -1413,6 +1413,10 @@ struct task_struct {
+ unsigned int memcg_kmem_skip_account;
+ struct memcg_oom_info {
+ unsigned int may_oom:1;
++ unsigned int in_memcg_oom:1;
++ unsigned int oom_locked:1;
++ int wakeups;
++ struct mem_cgroup *wait_on_memcg;
+ } memcg_oom;
+ #endif
+ #ifdef CONFIG_HAVE_HW_BREAKPOINT
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -302,6 +302,7 @@ struct mem_cgroup {
+
+ bool oom_lock;
+ atomic_t under_oom;
++ atomic_t oom_wakeups;
+
+ atomic_t refcnt;
+
+@@ -2179,6 +2180,7 @@ static int memcg_oom_wake_function(wait_
+
+ static void memcg_wakeup_oom(struct mem_cgroup *memcg)
+ {
++ atomic_inc(&memcg->oom_wakeups);
+ /* for filtering, pass "memcg" as argument. */
+ __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
+ }
+@@ -2190,19 +2192,17 @@ static void memcg_oom_recover(struct mem
+ }
+
+ /*
+- * try to call OOM killer. returns false if we should exit memory-reclaim loop.
++ * try to call OOM killer
+ */
+-static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
+- int order)
++static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
+ {
+- struct oom_wait_info owait;
+ bool locked;
++ int wakeups;
+
+- owait.memcg = memcg;
+- owait.wait.flags = 0;
+- owait.wait.func = memcg_oom_wake_function;
+- owait.wait.private = current;
+- INIT_LIST_HEAD(&owait.wait.task_list);
++ if (!current->memcg_oom.may_oom)
++ return;
++
++ current->memcg_oom.in_memcg_oom = 1;
+
+ /*
+ * As with any blocking lock, a contender needs to start
+@@ -2210,12 +2210,8 @@ static bool mem_cgroup_handle_oom(struct
+ * otherwise it can miss the wakeup from the unlock and sleep
+ * indefinitely. This is just open-coded because our locking
+ * is so particular to memcg hierarchies.
+- *
+- * Even if signal_pending(), we can't quit charge() loop without
+- * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
+- * under OOM is always welcomed, use TASK_KILLABLE here.
+ */
+- prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
++ wakeups = atomic_read(&memcg->oom_wakeups);
+ mem_cgroup_mark_under_oom(memcg);
+
+ locked = mem_cgroup_oom_trylock(memcg);
+@@ -2225,15 +2221,95 @@ static bool mem_cgroup_handle_oom(struct
+
+ if (locked && !memcg->oom_kill_disable) {
+ mem_cgroup_unmark_under_oom(memcg);
+- finish_wait(&memcg_oom_waitq, &owait.wait);
+ mem_cgroup_out_of_memory(memcg, mask, order);
++ mem_cgroup_oom_unlock(memcg);
++ /*
++ * There is no guarantee that an OOM-lock contender
++ * sees the wakeups triggered by the OOM kill
++ * uncharges. Wake any sleepers explicitly.
++ */
++ memcg_oom_recover(memcg);
+ } else {
+- schedule();
+- mem_cgroup_unmark_under_oom(memcg);
+- finish_wait(&memcg_oom_waitq, &owait.wait);
++ /*
++ * A system call can just return -ENOMEM, but if this
++ * is a page fault and somebody else is handling the
++ * OOM already, we need to sleep on the OOM waitqueue
++ * for this memcg until the situation is resolved.
++ * Which can take some time because it might be
++ * handled by a userspace task.
++ *
++ * However, this is the charge context, which means
++ * that we may sit on a large call stack and hold
++ * various filesystem locks, the mmap_sem etc. and we
++ * don't want the OOM handler to deadlock on them
++ * while we sit here and wait. Store the current OOM
++ * context in the task_struct, then return -ENOMEM.
++ * At the end of the page fault handler, with the
++ * stack unwound, pagefault_out_of_memory() will check
++ * back with us by calling
++ * mem_cgroup_oom_synchronize(), possibly putting the
++ * task to sleep.
++ */
++ current->memcg_oom.oom_locked = locked;
++ current->memcg_oom.wakeups = wakeups;
++ css_get(&memcg->css);
++ current->memcg_oom.wait_on_memcg = memcg;
+ }
++}
+
+- if (locked) {
++/**
++ * mem_cgroup_oom_synchronize - complete memcg OOM handling
++ *
++ * This has to be called at the end of a page fault if the memcg
++ * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
++ *
++ * Memcg supports userspace OOM handling, so failed allocations must
++ * sleep on a waitqueue until the userspace task resolves the
++ * situation. Sleeping directly in the charge context with all kinds
++ * of locks held is not a good idea, instead we remember an OOM state
++ * in the task and mem_cgroup_oom_synchronize() has to be called at
++ * the end of the page fault to put the task to sleep and clean up the
++ * OOM state.
++ *
++ * Returns %true if an ongoing memcg OOM situation was detected and
++ * finalized, %false otherwise.
++ */
++bool mem_cgroup_oom_synchronize(void)
++{
++ struct oom_wait_info owait;
++ struct mem_cgroup *memcg;
++
++ /* OOM is global, do not handle */
++ if (!current->memcg_oom.in_memcg_oom)
++ return false;
++
++ /*
++ * We invoked the OOM killer but there is a chance that a kill
++ * did not free up any charges. Everybody else might already
++ * be sleeping, so restart the fault and keep the rampage
++ * going until some charges are released.
++ */
++ memcg = current->memcg_oom.wait_on_memcg;
++ if (!memcg)
++ goto out;
++
++ if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
++ goto out_memcg;
++
++ owait.memcg = memcg;
++ owait.wait.flags = 0;
++ owait.wait.func = memcg_oom_wake_function;
++ owait.wait.private = current;
++ INIT_LIST_HEAD(&owait.wait.task_list);
++
++ prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
++ /* Only sleep if we didn't miss any wakeups since OOM */
++ if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
++ schedule();
++ finish_wait(&memcg_oom_waitq, &owait.wait);
++out_memcg:
++ mem_cgroup_unmark_under_oom(memcg);
++ if (current->memcg_oom.oom_locked) {
+ mem_cgroup_oom_unlock(memcg);
+ /*
+ * There is no guarantee that an OOM-lock contender
+@@ -2242,11 +2318,10 @@ static bool mem_cgroup_handle_oom(struct
+ */
+ memcg_oom_recover(memcg);
+ }
+-
+- if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+- return false;
+- /* Give chance to dying process */
+- schedule_timeout_uninterruptible(1);
++ css_put(&memcg->css);
++ current->memcg_oom.wait_on_memcg = NULL;
++out:
++ current->memcg_oom.in_memcg_oom = 0;
+ return true;
+ }
+
+@@ -2559,12 +2634,11 @@ enum {
+ CHARGE_RETRY, /* need to retry but retry is not bad */
+ CHARGE_NOMEM, /* we can't do more. return -ENOMEM */
+ CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */
+- CHARGE_OOM_DIE, /* the current is killed because of OOM */
+ };
+
+ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
+ unsigned int nr_pages, unsigned int min_pages,
+- bool oom_check)
++ bool invoke_oom)
+ {
+ unsigned long csize = nr_pages * PAGE_SIZE;
+ struct mem_cgroup *mem_over_limit;
+@@ -2621,14 +2695,10 @@ static int mem_cgroup_do_charge(struct m
+ if (mem_cgroup_wait_acct_move(mem_over_limit))
+ return CHARGE_RETRY;
+
+- /* If we don't need to call oom-killer at el, return immediately */
+- if (!oom_check || !current->memcg_oom.may_oom)
+- return CHARGE_NOMEM;
+- /* check OOM */
+- if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
+- return CHARGE_OOM_DIE;
++ if (invoke_oom)
++ mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
+
+- return CHARGE_RETRY;
++ return CHARGE_NOMEM;
+ }
+
+ /*
+@@ -2731,7 +2801,7 @@ again:
+ }
+
+ do {
+- bool oom_check;
++ bool invoke_oom = oom && !nr_oom_retries;
+
+ /* If killed, bypass charge */
+ if (fatal_signal_pending(current)) {
+@@ -2739,14 +2809,8 @@ again:
+ goto bypass;
+ }
+
+- oom_check = false;
+- if (oom && !nr_oom_retries) {
+- oom_check = true;
+- nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
+- }
+-
+- ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
+- oom_check);
++ ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
++ nr_pages, invoke_oom);
+ switch (ret) {
+ case CHARGE_OK:
+ break;
+@@ -2759,16 +2823,12 @@ again:
+ css_put(&memcg->css);
+ goto nomem;
+ case CHARGE_NOMEM: /* OOM routine works */
+- if (!oom) {
++ if (!oom || invoke_oom) {
+ css_put(&memcg->css);
+ goto nomem;
+ }
+- /* If oom, we never return -ENOMEM */
+ nr_oom_retries--;
+ break;
+- case CHARGE_OOM_DIE: /* Killed by OOM Killer */
+- css_put(&memcg->css);
+- goto bypass;
+ }
+ } while (ret != CHARGE_OK);
+
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -3867,6 +3867,9 @@ int handle_mm_fault(struct mm_struct *mm
+ if (flags & FAULT_FLAG_USER)
+ mem_cgroup_disable_oom();
+
++ if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)))
++ mem_cgroup_oom_synchronize();
++
+ return ret;
+ }
+
+--- a/mm/oom_kill.c
++++ b/mm/oom_kill.c
+@@ -702,9 +702,12 @@ out:
+ */
+ void pagefault_out_of_memory(void)
+ {
+- struct zonelist *zonelist = node_zonelist(first_online_node,
+- GFP_KERNEL);
++ struct zonelist *zonelist;
+
++ if (mem_cgroup_oom_synchronize())
++ return;
++
++ zonelist = node_zonelist(first_online_node, GFP_KERNEL);
+ if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) {
+ out_of_memory(NULL, 0, 0, NULL, false);
+ clear_zonelist_oom(zonelist, GFP_KERNEL);
--- /dev/null
+From fb2a6fc56be66c169f8b80e07ed999ba453a2db2 Mon Sep 17 00:00:00 2001
+From: Johannes Weiner <hannes@cmpxchg.org>
+Date: Thu, 12 Sep 2013 15:13:43 -0700
+Subject: mm: memcg: rework and document OOM waiting and wakeup
+
+From: Johannes Weiner <hannes@cmpxchg.org>
+
+commit fb2a6fc56be66c169f8b80e07ed999ba453a2db2 upstream.
+
+The memcg OOM handler open-codes a sleeping lock for OOM serialization
+(trylock, wait, repeat) because the required locking is so specific to
+memcg hierarchies. However, it would be nice if this construct would be
+clearly recognizable and not be as obfuscated as it is right now. Clean
+up as follows:
+
+1. Remove the return value of mem_cgroup_oom_unlock()
+
+2. Rename mem_cgroup_oom_lock() to mem_cgroup_oom_trylock().
+
+3. Pull the prepare_to_wait() out of the memcg_oom_lock scope. This
+ makes it more obvious that the task has to be on the waitqueue
+ before attempting to OOM-trylock the hierarchy, to not miss any
+ wakeups before going to sleep. It just didn't matter until now
+ because it was all lumped together into the global memcg_oom_lock
+ spinlock section.
+
+4. Pull the mem_cgroup_oom_notify() out of the memcg_oom_lock scope.
+ It is protected by the hierarchical OOM-lock.
+
+5. The memcg_oom_lock spinlock is only required to propagate the OOM
+ lock in any given hierarchy atomically. Restrict its scope to
+ mem_cgroup_oom_(trylock|unlock).
+
+6. Do not wake up the waitqueue unconditionally at the end of the
+ function. Only the lockholder has to wake up the next in line
+ after releasing the lock.
+
+ Note that the lockholder kicks off the OOM-killer, which in turn
+ leads to wakeups from the uncharges of the exiting task. But a
+ contender is not guaranteed to see them if it enters the OOM path
+ after the OOM kills but before the lockholder releases the lock.
+ Thus there has to be an explicit wakeup after releasing the lock.
+
+7. Put the OOM task on the waitqueue before marking the hierarchy as
+ under OOM as that is the point where we start to receive wakeups.
+ No point in listening before being on the waitqueue.
+
+8. Likewise, unmark the hierarchy before finishing the sleep, for
+ symmetry.
+
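+In outline, mem_cgroup_handle_oom() now reads as follows (condensed
+from the hunk below, comments trimmed):
+
+  prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+  mem_cgroup_mark_under_oom(memcg);       /* start receiving wakeups */
+
+  locked = mem_cgroup_oom_trylock(memcg);
+  if (locked)
+          mem_cgroup_oom_notify(memcg);
+
+  if (locked && !memcg->oom_kill_disable) {
+          mem_cgroup_unmark_under_oom(memcg);
+          finish_wait(&memcg_oom_waitq, &owait.wait);
+          mem_cgroup_out_of_memory(memcg, mask, order);
+  } else {
+          schedule();                     /* woken by the lockholder */
+          mem_cgroup_unmark_under_oom(memcg);
+          finish_wait(&memcg_oom_waitq, &owait.wait);
+  }
+
+  if (locked) {
+          mem_cgroup_oom_unlock(memcg);
+          memcg_oom_recover(memcg);       /* explicit wakeup for contenders */
+  }
+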
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+Acked-by: Michal Hocko <mhocko@suse.cz>
+Cc: David Rientjes <rientjes@google.com>
+Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
+Cc: azurIt <azurit@pobox.sk>
+Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/memcontrol.c | 83 +++++++++++++++++++++++++++++++-------------------------
+ 1 file changed, 46 insertions(+), 37 deletions(-)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -2075,15 +2075,18 @@ static int mem_cgroup_soft_reclaim(struc
+ return total;
+ }
+
++static DEFINE_SPINLOCK(memcg_oom_lock);
++
+ /*
+ * Check OOM-Killer is already running under our hierarchy.
+ * If someone is running, return false.
+- * Has to be called with memcg_oom_lock
+ */
+-static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
++static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
+ {
+ struct mem_cgroup *iter, *failed = NULL;
+
++ spin_lock(&memcg_oom_lock);
++
+ for_each_mem_cgroup_tree(iter, memcg) {
+ if (iter->oom_lock) {
+ /*
+@@ -2097,33 +2100,33 @@ static bool mem_cgroup_oom_lock(struct m
+ iter->oom_lock = true;
+ }
+
+- if (!failed)
+- return true;
+-
+- /*
+- * OK, we failed to lock the whole subtree so we have to clean up
+- * what we set up to the failing subtree
+- */
+- for_each_mem_cgroup_tree(iter, memcg) {
+- if (iter == failed) {
+- mem_cgroup_iter_break(memcg, iter);
+- break;
++ if (failed) {
++ /*
++ * OK, we failed to lock the whole subtree so we have
++ * to clean up what we set up to the failing subtree
++ */
++ for_each_mem_cgroup_tree(iter, memcg) {
++ if (iter == failed) {
++ mem_cgroup_iter_break(memcg, iter);
++ break;
++ }
++ iter->oom_lock = false;
+ }
+- iter->oom_lock = false;
+ }
+- return false;
++
++ spin_unlock(&memcg_oom_lock);
++
++ return !failed;
+ }
+
+-/*
+- * Has to be called with memcg_oom_lock
+- */
+-static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
++static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
+ {
+ struct mem_cgroup *iter;
+
++ spin_lock(&memcg_oom_lock);
+ for_each_mem_cgroup_tree(iter, memcg)
+ iter->oom_lock = false;
+- return 0;
++ spin_unlock(&memcg_oom_lock);
+ }
+
+ static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
+@@ -2147,7 +2150,6 @@ static void mem_cgroup_unmark_under_oom(
+ atomic_add_unless(&iter->under_oom, -1, 0);
+ }
+
+-static DEFINE_SPINLOCK(memcg_oom_lock);
+ static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
+
+ struct oom_wait_info {
+@@ -2194,45 +2196,52 @@ static bool mem_cgroup_handle_oom(struct
+ int order)
+ {
+ struct oom_wait_info owait;
+- bool locked, need_to_kill;
++ bool locked;
+
+ owait.memcg = memcg;
+ owait.wait.flags = 0;
+ owait.wait.func = memcg_oom_wake_function;
+ owait.wait.private = current;
+ INIT_LIST_HEAD(&owait.wait.task_list);
+- need_to_kill = true;
+- mem_cgroup_mark_under_oom(memcg);
+
+- /* At first, try to OOM lock hierarchy under memcg.*/
+- spin_lock(&memcg_oom_lock);
+- locked = mem_cgroup_oom_lock(memcg);
+ /*
++ * As with any blocking lock, a contender needs to start
++ * listening for wakeups before attempting the trylock,
++ * otherwise it can miss the wakeup from the unlock and sleep
++ * indefinitely. This is just open-coded because our locking
++ * is so particular to memcg hierarchies.
++ *
+ * Even if signal_pending(), we can't quit charge() loop without
+ * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
+ * under OOM is always welcomed, use TASK_KILLABLE here.
+ */
+ prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+- if (!locked || memcg->oom_kill_disable)
+- need_to_kill = false;
++ mem_cgroup_mark_under_oom(memcg);
++
++ locked = mem_cgroup_oom_trylock(memcg);
++
+ if (locked)
+ mem_cgroup_oom_notify(memcg);
+- spin_unlock(&memcg_oom_lock);
+
+- if (need_to_kill) {
++ if (locked && !memcg->oom_kill_disable) {
++ mem_cgroup_unmark_under_oom(memcg);
+ finish_wait(&memcg_oom_waitq, &owait.wait);
+ mem_cgroup_out_of_memory(memcg, mask, order);
+ } else {
+ schedule();
++ mem_cgroup_unmark_under_oom(memcg);
+ finish_wait(&memcg_oom_waitq, &owait.wait);
+ }
+- spin_lock(&memcg_oom_lock);
+- if (locked)
+- mem_cgroup_oom_unlock(memcg);
+- memcg_wakeup_oom(memcg);
+- spin_unlock(&memcg_oom_lock);
+
+- mem_cgroup_unmark_under_oom(memcg);
++ if (locked) {
++ mem_cgroup_oom_unlock(memcg);
++ /*
++ * There is no guarantee that an OOM-lock contender
++ * sees the wakeups triggered by the OOM kill
++ * uncharges. Wake any sleepers explicitly.
++ */
++ memcg_oom_recover(memcg);
++ }
+
+ if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+ return false;