From d62f5c2cc111e40a038074dc0f1bbcadc941af8b Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Wed, 19 Nov 2014 10:48:46 -0800
Subject: [PATCH] 3.10-stable patches

added patches:
	mm-memcg-do-not-trap-chargers-with-full-callstack-on-oom.patch
	mm-memcg-rework-and-document-oom-waiting-and-wakeup.patch
---
 ...-chargers-with-full-callstack-on-oom.patch | 436 ++++++++++++++++++
 ...-and-document-oom-waiting-and-wakeup.patch | 217 +++++++++
 queue-3.10/series                             |   2 +
 3 files changed, 655 insertions(+)
 create mode 100644 queue-3.10/mm-memcg-do-not-trap-chargers-with-full-callstack-on-oom.patch
 create mode 100644 queue-3.10/mm-memcg-rework-and-document-oom-waiting-and-wakeup.patch
diff --git a/queue-3.10/mm-memcg-do-not-trap-chargers-with-full-callstack-on-oom.patch b/queue-3.10/mm-memcg-do-not-trap-chargers-with-full-callstack-on-oom.patch
new file mode 100644
index 00000000000..95e5f53b0d0
--- /dev/null
+++ b/queue-3.10/mm-memcg-do-not-trap-chargers-with-full-callstack-on-oom.patch
@@ -0,0 +1,436 @@
+From 3812c8c8f3953921ef18544110dafc3505c1ac62 Mon Sep 17 00:00:00 2001
+From: Johannes Weiner <hannes@cmpxchg.org>
+Date: Thu, 12 Sep 2013 15:13:44 -0700
+Subject: mm: memcg: do not trap chargers with full callstack on OOM
+
+From: Johannes Weiner <hannes@cmpxchg.org>
+
+commit 3812c8c8f3953921ef18544110dafc3505c1ac62 upstream.
+
+The memcg OOM handling is incredibly fragile and can deadlock.  When a
+task fails to charge memory, it invokes the OOM killer and loops right
+there in the charge code until it succeeds.  Comparably, any other task
+that enters the charge path at this point will go to a waitqueue right
+then and there and sleep until the OOM situation is resolved.  The problem
+is that these tasks may hold filesystem locks and the mmap_sem; locks that
+the selected OOM victim may need to exit.
+
+For example, in one reported case, the task invoking the OOM killer was
+about to charge a page cache page during a write(), which holds the
+i_mutex.  The OOM killer selected a task that was just entering truncate()
+and trying to acquire the i_mutex:
+
+OOM invoking task:
+  mem_cgroup_handle_oom+0x241/0x3b0
+  mem_cgroup_cache_charge+0xbe/0xe0
+  add_to_page_cache_locked+0x4c/0x140
+  add_to_page_cache_lru+0x22/0x50
+  grab_cache_page_write_begin+0x8b/0xe0
+  ext3_write_begin+0x88/0x270
+  generic_file_buffered_write+0x116/0x290
+  __generic_file_aio_write+0x27c/0x480
+  generic_file_aio_write+0x76/0xf0           # takes ->i_mutex
+  do_sync_write+0xea/0x130
+  vfs_write+0xf3/0x1f0
+  sys_write+0x51/0x90
+  system_call_fastpath+0x18/0x1d
+
+OOM kill victim:
+  do_truncate+0x58/0xa0              # takes i_mutex
+  do_last+0x250/0xa30
+  path_openat+0xd7/0x440
+  do_filp_open+0x49/0xa0
+  do_sys_open+0x106/0x240
+  sys_open+0x20/0x30
+  system_call_fastpath+0x18/0x1d
+
+The OOM handling task will retry the charge indefinitely while the OOM
+killed task is not releasing any resources.
+
+A similar scenario can happen when the kernel OOM killer for a memcg is
+disabled and a userspace task is in charge of resolving OOM situations.
+In this case, ALL tasks that enter the OOM path will be made to sleep on
+the OOM waitqueue and wait for userspace to free resources or increase
+the group's limit.  But a userspace OOM handler is prone to deadlock
+itself on the locks held by the waiting tasks.  For example one of the
+sleeping tasks may be stuck in a brk() call with the mmap_sem held for
+writing but the userspace handler, in order to pick an optimal victim,
+may need to read files from /proc/<pid>, which tries to acquire the same
+mmap_sem for reading and deadlocks.
+
+This patch changes the way tasks behave after detecting a memcg OOM and
+makes sure nobody loops or sleeps with locks held:
+
+1. When OOMing in a user fault, invoke the OOM killer and restart the
+   fault instead of looping on the charge attempt.  This way, the OOM
+   victim can not get stuck on locks the looping task may hold.
+
+2. When OOMing in a user fault but somebody else is handling it
+   (either the kernel OOM killer or a userspace handler), don't go to
+   sleep in the charge context.  Instead, remember the OOMing memcg in
+   the task struct and then fully unwind the page fault stack with
+   -ENOMEM.  pagefault_out_of_memory() will then call back into the
+   memcg code to check if the -ENOMEM came from the memcg, and then
+   either put the task to sleep on the memcg's OOM waitqueue or just
+   restart the fault.  The OOM victim can no longer get stuck on any
+   lock a sleeping task may hold.
+
+Debugged by Michal Hocko.
+
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+Reported-by: azurIt <azurit@pobox.sk>
+Acked-by: Michal Hocko <mhocko@suse.cz>
+Cc: David Rientjes <rientjes@google.com>
+Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
+Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/memcontrol.h |   21 ++++++
+ include/linux/sched.h      |    4 +
+ mm/memcontrol.c            |  154 +++++++++++++++++++++++++++++++--------------
+ mm/memory.c                |    3 
+ mm/oom_kill.c              |    7 +-
+ 5 files changed, 140 insertions(+), 49 deletions(-)
+
+--- a/include/linux/memcontrol.h
++++ b/include/linux/memcontrol.h
+@@ -130,6 +130,10 @@ extern void mem_cgroup_replace_page_cach
+  *
+  * Toggle whether a failed memcg charge should invoke the OOM killer
+  * or just return -ENOMEM.  Returns the previous toggle state.
++ *
++ * NOTE: Any path that enables the OOM killer before charging must
++ *       call mem_cgroup_oom_synchronize() afterward to finalize the
++ *       OOM handling and clean up.
+  */
+ static inline bool mem_cgroup_toggle_oom(bool new)
+ {
+@@ -155,6 +159,13 @@ static inline void mem_cgroup_disable_oo
+ 	WARN_ON(old == false);
+ }
+ 
++static inline bool task_in_memcg_oom(struct task_struct *p)
++{
++	return p->memcg_oom.in_memcg_oom;
++}
++
++bool mem_cgroup_oom_synchronize(void);
++
+ #ifdef CONFIG_MEMCG_SWAP
+ extern int do_swap_account;
+ #endif
+@@ -391,6 +402,16 @@ static inline void mem_cgroup_disable_oo
+ {
+ }
+ 
++static inline bool task_in_memcg_oom(struct task_struct *p)
++{
++	return false;
++}
++
++static inline bool mem_cgroup_oom_synchronize(void)
++{
++	return false;
++}
++
+ static inline void mem_cgroup_inc_page_stat(struct page *page,
+ 					    enum mem_cgroup_page_stat_item idx)
+ {
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -1413,6 +1413,10 @@ struct task_struct {
+ 	unsigned int memcg_kmem_skip_account;
+ 	struct memcg_oom_info {
+ 		unsigned int may_oom:1;
++		unsigned int in_memcg_oom:1;
++		unsigned int oom_locked:1;
++		int wakeups;
++		struct mem_cgroup *wait_on_memcg;
+ 	} memcg_oom;
+ #endif
+ #ifdef CONFIG_HAVE_HW_BREAKPOINT
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -302,6 +302,7 @@ struct mem_cgroup {
+ 
+ 	bool		oom_lock;
+ 	atomic_t	under_oom;
++	atomic_t	oom_wakeups;
+ 
+ 	atomic_t	refcnt;
+ 
+@@ -2179,6 +2180,7 @@ static int memcg_oom_wake_function(wait_
+ 
+ static void memcg_wakeup_oom(struct mem_cgroup *memcg)
+ {
++	atomic_inc(&memcg->oom_wakeups);
+ 	/* for filtering, pass "memcg" as argument. */
+ 	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
+ }
+@@ -2190,19 +2192,17 @@ static void memcg_oom_recover(struct mem
+ }
+ 
+ /*
+- * try to call OOM killer. returns false if we should exit memory-reclaim loop.
++ * try to call OOM killer
+  */
+-static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
+-				  int order)
++static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
+ {
+-	struct oom_wait_info owait;
+ 	bool locked;
++	int wakeups;
+ 
+-	owait.memcg = memcg;
+-	owait.wait.flags = 0;
+-	owait.wait.func = memcg_oom_wake_function;
+-	owait.wait.private = current;
+-	INIT_LIST_HEAD(&owait.wait.task_list);
++	if (!current->memcg_oom.may_oom)
++		return;
++
++	current->memcg_oom.in_memcg_oom = 1;
+ 
+ 	/*
+ 	 * As with any blocking lock, a contender needs to start
+@@ -2210,12 +2210,8 @@ static bool mem_cgroup_handle_oom(struct
+ 	 * otherwise it can miss the wakeup from the unlock and sleep
+ 	 * indefinitely.  This is just open-coded because our locking
+ 	 * is so particular to memcg hierarchies.
+-	 *
+-	 * Even if signal_pending(), we can't quit charge() loop without
+-	 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
+-	 * under OOM is always welcomed, use TASK_KILLABLE here.
+ 	 */
+-	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
++	wakeups = atomic_read(&memcg->oom_wakeups);
+ 	mem_cgroup_mark_under_oom(memcg);
+ 
+ 	locked = mem_cgroup_oom_trylock(memcg);
+@@ -2225,15 +2221,95 @@ static bool mem_cgroup_handle_oom(struct
+ 
+ 	if (locked && !memcg->oom_kill_disable) {
+ 		mem_cgroup_unmark_under_oom(memcg);
+-		finish_wait(&memcg_oom_waitq, &owait.wait);
+ 		mem_cgroup_out_of_memory(memcg, mask, order);
++		mem_cgroup_oom_unlock(memcg);
++		/*
++		 * There is no guarantee that an OOM-lock contender
++		 * sees the wakeups triggered by the OOM kill
++		 * uncharges.  Wake any sleepers explicitely.
++		 */
++		memcg_oom_recover(memcg);
+ 	} else {
+-		schedule();
+-		mem_cgroup_unmark_under_oom(memcg);
+-		finish_wait(&memcg_oom_waitq, &owait.wait);
++		/*
++		 * A system call can just return -ENOMEM, but if this
++		 * is a page fault and somebody else is handling the
++		 * OOM already, we need to sleep on the OOM waitqueue
++		 * for this memcg until the situation is resolved.
++		 * Which can take some time because it might be
++		 * handled by a userspace task.
++		 *
++		 * However, this is the charge context, which means
++		 * that we may sit on a large call stack and hold
++		 * various filesystem locks, the mmap_sem etc. and we
++		 * don't want the OOM handler to deadlock on them
++		 * while we sit here and wait.  Store the current OOM
++		 * context in the task_struct, then return -ENOMEM.
++		 * At the end of the page fault handler, with the
++		 * stack unwound, pagefault_out_of_memory() will check
++		 * back with us by calling
++		 * mem_cgroup_oom_synchronize(), possibly putting the
++		 * task to sleep.
++		 */
++		current->memcg_oom.oom_locked = locked;
++		current->memcg_oom.wakeups = wakeups;
++		css_get(&memcg->css);
++		current->memcg_oom.wait_on_memcg = memcg;
+ 	}
++}
+ 
+-	if (locked) {
++/**
++ * mem_cgroup_oom_synchronize - complete memcg OOM handling
++ *
++ * This has to be called at the end of a page fault if the the memcg
++ * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
++ *
++ * Memcg supports userspace OOM handling, so failed allocations must
++ * sleep on a waitqueue until the userspace task resolves the
++ * situation.  Sleeping directly in the charge context with all kinds
++ * of locks held is not a good idea, instead we remember an OOM state
++ * in the task and mem_cgroup_oom_synchronize() has to be called at
++ * the end of the page fault to put the task to sleep and clean up the
++ * OOM state.
++ *
++ * Returns %true if an ongoing memcg OOM situation was detected and
++ * finalized, %false otherwise.
++ */
++bool mem_cgroup_oom_synchronize(void)
++{
++	struct oom_wait_info owait;
++	struct mem_cgroup *memcg;
++
++	/* OOM is global, do not handle */
++	if (!current->memcg_oom.in_memcg_oom)
++		return false;
++
++	/*
++	 * We invoked the OOM killer but there is a chance that a kill
++	 * did not free up any charges.  Everybody else might already
++	 * be sleeping, so restart the fault and keep the rampage
++	 * going until some charges are released.
++	 */
++	memcg = current->memcg_oom.wait_on_memcg;
++	if (!memcg)
++		goto out;
++
++	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
++		goto out_memcg;
++
++	owait.memcg = memcg;
++	owait.wait.flags = 0;
++	owait.wait.func = memcg_oom_wake_function;
++	owait.wait.private = current;
++	INIT_LIST_HEAD(&owait.wait.task_list);
++
++	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
++	/* Only sleep if we didn't miss any wakeups since OOM */
++	if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
++		schedule();
++	finish_wait(&memcg_oom_waitq, &owait.wait);
++out_memcg:
++	mem_cgroup_unmark_under_oom(memcg);
++	if (current->memcg_oom.oom_locked) {
+ 		mem_cgroup_oom_unlock(memcg);
+ 		/*
+ 		 * There is no guarantee that an OOM-lock contender
+@@ -2242,11 +2318,10 @@ static bool mem_cgroup_handle_oom(struct
+ 		 */
+ 		memcg_oom_recover(memcg);
+ 	}
+-
+-	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+-		return false;
+-	/* Give chance to dying process */
+-	schedule_timeout_uninterruptible(1);
++	css_put(&memcg->css);
++	current->memcg_oom.wait_on_memcg = NULL;
++out:
++	current->memcg_oom.in_memcg_oom = 0;
+ 	return true;
+ }
+ 
+@@ -2559,12 +2634,11 @@ enum {
+ 	CHARGE_RETRY,		/* need to retry but retry is not bad */
+ 	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */
+ 	CHARGE_WOULDBLOCK,	/* GFP_WAIT wasn't set and no enough res. */
+-	CHARGE_OOM_DIE,		/* the current is killed because of OOM */
+ };
+ 
+ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
+ 				unsigned int nr_pages, unsigned int min_pages,
+-				bool oom_check)
++				bool invoke_oom)
+ {
+ 	unsigned long csize = nr_pages * PAGE_SIZE;
+ 	struct mem_cgroup *mem_over_limit;
+@@ -2621,14 +2695,10 @@ static int mem_cgroup_do_charge(struct m
+ 	if (mem_cgroup_wait_acct_move(mem_over_limit))
+ 		return CHARGE_RETRY;
+ 
+-	/* If we don't need to call oom-killer at el, return immediately */
+-	if (!oom_check || !current->memcg_oom.may_oom)
+-		return CHARGE_NOMEM;
+-	/* check OOM */
+-	if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
+-		return CHARGE_OOM_DIE;
++	if (invoke_oom)
++		mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
+ 
+-	return CHARGE_RETRY;
++	return CHARGE_NOMEM;
+ }
+ 
+ /*
+@@ -2731,7 +2801,7 @@ again:
+ 	}
+ 
+ 	do {
+-		bool oom_check;
++		bool invoke_oom = oom && !nr_oom_retries;
+ 
+ 		/* If killed, bypass charge */
+ 		if (fatal_signal_pending(current)) {
+@@ -2739,14 +2809,8 @@ again:
+ 			goto bypass;
+ 		}
+ 
+-		oom_check = false;
+-		if (oom && !nr_oom_retries) {
+-			oom_check = true;
+-			nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
+-		}
+-
+-		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
+-		    oom_check);
++		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
++					   nr_pages, invoke_oom);
+ 		switch (ret) {
+ 		case CHARGE_OK:
+ 			break;
+@@ -2759,16 +2823,12 @@ again:
+ 			css_put(&memcg->css);
+ 			goto nomem;
+ 		case CHARGE_NOMEM: /* OOM routine works */
+-			if (!oom) {
++			if (!oom || invoke_oom) {
+ 				css_put(&memcg->css);
+ 				goto nomem;
+ 			}
+-			/* If oom, we never return -ENOMEM */
+ 			nr_oom_retries--;
+ 			break;
+-		case CHARGE_OOM_DIE: /* Killed by OOM Killer */
+-			css_put(&memcg->css);
+-			goto bypass;
+ 		}
+ 	} while (ret != CHARGE_OK);
+ 
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -3867,6 +3867,9 @@ int handle_mm_fault(struct mm_struct *mm
+ 	if (flags & FAULT_FLAG_USER)
+ 		mem_cgroup_disable_oom();
+ 
++	if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)))
++		mem_cgroup_oom_synchronize();
++
+ 	return ret;
+ }
+ 
+--- a/mm/oom_kill.c
++++ b/mm/oom_kill.c
+@@ -702,9 +702,12 @@ out:
+  */
+ void pagefault_out_of_memory(void)
+ {
+-	struct zonelist *zonelist = node_zonelist(first_online_node,
+-						  GFP_KERNEL);
++	struct zonelist *zonelist;
+ 
++	if (mem_cgroup_oom_synchronize())
++		return;
++
++	zonelist = node_zonelist(first_online_node, GFP_KERNEL);
+ 	if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) {
+ 		out_of_memory(NULL, 0, 0, NULL, false);
+ 		clear_zonelist_oom(zonelist, GFP_KERNEL);
diff --git a/queue-3.10/mm-memcg-rework-and-document-oom-waiting-and-wakeup.patch b/queue-3.10/mm-memcg-rework-and-document-oom-waiting-and-wakeup.patch
new file mode 100644
index 00000000000..2709693909f
--- /dev/null
+++ b/queue-3.10/mm-memcg-rework-and-document-oom-waiting-and-wakeup.patch
@@ -0,0 +1,217 @@
+From fb2a6fc56be66c169f8b80e07ed999ba453a2db2 Mon Sep 17 00:00:00 2001
+From: Johannes Weiner <hannes@cmpxchg.org>
+Date: Thu, 12 Sep 2013 15:13:43 -0700
+Subject: mm: memcg: rework and document OOM waiting and wakeup
+
+From: Johannes Weiner <hannes@cmpxchg.org>
+
+commit fb2a6fc56be66c169f8b80e07ed999ba453a2db2 upstream.
+
+The memcg OOM handler open-codes a sleeping lock for OOM serialization
+(trylock, wait, repeat) because the required locking is so specific to
+memcg hierarchies.  However, it would be nice if this construct would be
+clearly recognizable and not be as obfuscated as it is right now.  Clean
+up as follows:
+
+1. Remove the return value of mem_cgroup_oom_unlock()
+
+2. Rename mem_cgroup_oom_lock() to mem_cgroup_oom_trylock().
+
+3. Pull the prepare_to_wait() out of the memcg_oom_lock scope.  This
+   makes it more obvious that the task has to be on the waitqueue
+   before attempting to OOM-trylock the hierarchy, to not miss any
+   wakeups before going to sleep.  It just didn't matter until now
+   because it was all lumped together into the global memcg_oom_lock
+   spinlock section.
+
+4. Pull the mem_cgroup_oom_notify() out of the memcg_oom_lock scope.
+   It is protected by the hierarchical OOM-lock.
+
+5. The memcg_oom_lock spinlock is only required to propagate the OOM
+   lock in any given hierarchy atomically.  Restrict its scope to
+   mem_cgroup_oom_(trylock|unlock).
+
+6. Do not wake up the waitqueue unconditionally at the end of the
+   function.  Only the lockholder has to wake up the next in line
+   after releasing the lock.
+
+   Note that the lockholder kicks off the OOM-killer, which in turn
+   leads to wakeups from the uncharges of the exiting task.  But a
+   contender is not guaranteed to see them if it enters the OOM path
+   after the OOM kills but before the lockholder releases the lock.
+   Thus there has to be an explicit wakeup after releasing the lock.
+
+7. Put the OOM task on the waitqueue before marking the hierarchy as
+   under OOM as that is the point where we start to receive wakeups.
+   No point in listening before being on the waitqueue.
+
+8. Likewise, unmark the hierarchy before finishing the sleep, for
+   symmetry.
+
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+Acked-by: Michal Hocko <mhocko@suse.cz>
+Cc: David Rientjes <rientjes@google.com>
+Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
+Cc: azurIt <azurit@pobox.sk>
+Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/memcontrol.c |   83 +++++++++++++++++++++++++++++++-------------------------
+ 1 file changed, 46 insertions(+), 37 deletions(-)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -2075,15 +2075,18 @@ static int mem_cgroup_soft_reclaim(struc
+ 	return total;
+ }
+ 
++static DEFINE_SPINLOCK(memcg_oom_lock);
++
+ /*
+  * Check OOM-Killer is already running under our hierarchy.
+  * If someone is running, return false.
+- * Has to be called with memcg_oom_lock
+  */
+-static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
++static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
+ {
+ 	struct mem_cgroup *iter, *failed = NULL;
+ 
++	spin_lock(&memcg_oom_lock);
++
+ 	for_each_mem_cgroup_tree(iter, memcg) {
+ 		if (iter->oom_lock) {
+ 			/*
+@@ -2097,33 +2100,33 @@ static bool mem_cgroup_oom_lock(struct m
+ 			iter->oom_lock = true;
+ 	}
+ 
+-	if (!failed)
+-		return true;
+-
+-	/*
+-	 * OK, we failed to lock the whole subtree so we have to clean up
+-	 * what we set up to the failing subtree
+-	 */
+-	for_each_mem_cgroup_tree(iter, memcg) {
+-		if (iter == failed) {
+-			mem_cgroup_iter_break(memcg, iter);
+-			break;
++	if (failed) {
++		/*
++		 * OK, we failed to lock the whole subtree so we have
++		 * to clean up what we set up to the failing subtree
++		 */
++		for_each_mem_cgroup_tree(iter, memcg) {
++			if (iter == failed) {
++				mem_cgroup_iter_break(memcg, iter);
++				break;
++			}
++			iter->oom_lock = false;
+ 		}
+-		iter->oom_lock = false;
+ 	}
+-	return false;
++
++	spin_unlock(&memcg_oom_lock);
++
++	return !failed;
+ }
+ 
+-/*
+- * Has to be called with memcg_oom_lock
+- */
+-static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
++static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
+ {
+ 	struct mem_cgroup *iter;
+ 
++	spin_lock(&memcg_oom_lock);
+ 	for_each_mem_cgroup_tree(iter, memcg)
+ 		iter->oom_lock = false;
+-	return 0;
++	spin_unlock(&memcg_oom_lock);
+ }
+ 
+ static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
+@@ -2147,7 +2150,6 @@ static void mem_cgroup_unmark_under_oom(
+ 		atomic_add_unless(&iter->under_oom, -1, 0);
+ }
+ 
+-static DEFINE_SPINLOCK(memcg_oom_lock);
+ static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
+ 
+ struct oom_wait_info {
+@@ -2194,45 +2196,52 @@ static bool mem_cgroup_handle_oom(struct
+ 				  int order)
+ {
+ 	struct oom_wait_info owait;
+-	bool locked, need_to_kill;
++	bool locked;
+ 
+ 	owait.memcg = memcg;
+ 	owait.wait.flags = 0;
+ 	owait.wait.func = memcg_oom_wake_function;
+ 	owait.wait.private = current;
+ 	INIT_LIST_HEAD(&owait.wait.task_list);
+-	need_to_kill = true;
+-	mem_cgroup_mark_under_oom(memcg);
+ 
+-	/* At first, try to OOM lock hierarchy under memcg.*/
+-	spin_lock(&memcg_oom_lock);
+-	locked = mem_cgroup_oom_lock(memcg);
+ 	/*
++	 * As with any blocking lock, a contender needs to start
++	 * listening for wakeups before attempting the trylock,
++	 * otherwise it can miss the wakeup from the unlock and sleep
++	 * indefinitely.  This is just open-coded because our locking
++	 * is so particular to memcg hierarchies.
++	 *
+ 	 * Even if signal_pending(), we can't quit charge() loop without
+ 	 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
+ 	 * under OOM is always welcomed, use TASK_KILLABLE here.
+ 	 */
+ 	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+-	if (!locked || memcg->oom_kill_disable)
+-		need_to_kill = false;
++	mem_cgroup_mark_under_oom(memcg);
++
++	locked = mem_cgroup_oom_trylock(memcg);
++
+ 	if (locked)
+ 		mem_cgroup_oom_notify(memcg);
+-	spin_unlock(&memcg_oom_lock);
+ 
+-	if (need_to_kill) {
++	if (locked && !memcg->oom_kill_disable) {
++		mem_cgroup_unmark_under_oom(memcg);
+ 		finish_wait(&memcg_oom_waitq, &owait.wait);
+ 		mem_cgroup_out_of_memory(memcg, mask, order);
+ 	} else {
+ 		schedule();
++		mem_cgroup_unmark_under_oom(memcg);
+ 		finish_wait(&memcg_oom_waitq, &owait.wait);
+ 	}
+-	spin_lock(&memcg_oom_lock);
+-	if (locked)
+-		mem_cgroup_oom_unlock(memcg);
+-	memcg_wakeup_oom(memcg);
+-	spin_unlock(&memcg_oom_lock);
+ 
+-	mem_cgroup_unmark_under_oom(memcg);
++	if (locked) {
++		mem_cgroup_oom_unlock(memcg);
++		/*
++		 * There is no guarantee that an OOM-lock contender
++		 * sees the wakeups triggered by the OOM kill
++		 * uncharges.  Wake any sleepers explicitely.
++		 */
++		memcg_oom_recover(memcg);
++	}
+ 
+ 	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+ 		return false;
diff --git a/queue-3.10/series b/queue-3.10/series
index 2bda20e3f48..5525e476713 100644
--- a/queue-3.10/series
+++ b/queue-3.10/series
@@ -65,3 +65,5 @@ arch-mm-do-not-invoke-oom-killer-on-kernel-fault-oom.patch
 arch-mm-pass-userspace-fault-flag-to-generic-fault-handler.patch
 x86-finish-user-fault-error-path-with-fatal-signal.patch
 mm-memcg-enable-memcg-oom-killer-only-for-user-faults.patch
+mm-memcg-rework-and-document-oom-waiting-and-wakeup.patch
+mm-memcg-do-not-trap-chargers-with-full-callstack-on-oom.patch
-- 
2.47.3