3.10-stable patches
author     Greg Kroah-Hartman <gregkh@linuxfoundation.org>
           Wed, 19 Nov 2014 18:48:46 +0000 (10:48 -0800)
committer  Greg Kroah-Hartman <gregkh@linuxfoundation.org>
           Wed, 19 Nov 2014 18:48:46 +0000 (10:48 -0800)
added patches:
mm-memcg-do-not-trap-chargers-with-full-callstack-on-oom.patch
mm-memcg-rework-and-document-oom-waiting-and-wakeup.patch

queue-3.10/mm-memcg-do-not-trap-chargers-with-full-callstack-on-oom.patch [new file with mode: 0644]
queue-3.10/mm-memcg-rework-and-document-oom-waiting-and-wakeup.patch [new file with mode: 0644]
queue-3.10/series

diff --git a/queue-3.10/mm-memcg-do-not-trap-chargers-with-full-callstack-on-oom.patch b/queue-3.10/mm-memcg-do-not-trap-chargers-with-full-callstack-on-oom.patch
new file mode 100644
index 0000000..95e5f53
--- /dev/null
+++ b/queue-3.10/mm-memcg-do-not-trap-chargers-with-full-callstack-on-oom.patch
@@ -0,0 +1,436 @@
+From 3812c8c8f3953921ef18544110dafc3505c1ac62 Mon Sep 17 00:00:00 2001
+From: Johannes Weiner <hannes@cmpxchg.org>
+Date: Thu, 12 Sep 2013 15:13:44 -0700
+Subject: mm: memcg: do not trap chargers with full callstack on OOM
+
+From: Johannes Weiner <hannes@cmpxchg.org>
+
+commit 3812c8c8f3953921ef18544110dafc3505c1ac62 upstream.
+
+The memcg OOM handling is incredibly fragile and can deadlock.  When a
+task fails to charge memory, it invokes the OOM killer and loops right
+there in the charge code until it succeeds.  Comparably, any other task
+that enters the charge path at this point will go to a waitqueue right
+then and there and sleep until the OOM situation is resolved.  The problem
+is that these tasks may hold filesystem locks and the mmap_sem; locks that
+the selected OOM victim may need to exit.
+
+For example, in one reported case, the task invoking the OOM killer was
+about to charge a page cache page during a write(), which holds the
+i_mutex.  The OOM killer selected a task that was just entering truncate()
+and trying to acquire the i_mutex:
+
+OOM invoking task:
+  mem_cgroup_handle_oom+0x241/0x3b0
+  mem_cgroup_cache_charge+0xbe/0xe0
+  add_to_page_cache_locked+0x4c/0x140
+  add_to_page_cache_lru+0x22/0x50
+  grab_cache_page_write_begin+0x8b/0xe0
+  ext3_write_begin+0x88/0x270
+  generic_file_buffered_write+0x116/0x290
+  __generic_file_aio_write+0x27c/0x480
+  generic_file_aio_write+0x76/0xf0           # takes ->i_mutex
+  do_sync_write+0xea/0x130
+  vfs_write+0xf3/0x1f0
+  sys_write+0x51/0x90
+  system_call_fastpath+0x18/0x1d
+
+OOM kill victim:
+  do_truncate+0x58/0xa0              # takes i_mutex
+  do_last+0x250/0xa30
+  path_openat+0xd7/0x440
+  do_filp_open+0x49/0xa0
+  do_sys_open+0x106/0x240
+  sys_open+0x20/0x30
+  system_call_fastpath+0x18/0x1d
+
+The OOM handling task will retry the charge indefinitely while the OOM
+killed task is not releasing any resources.
+
+A similar scenario can happen when the kernel OOM killer for a memcg is
+disabled and a userspace task is in charge of resolving OOM situations.
+In this case, ALL tasks that enter the OOM path will be made to sleep on
+the OOM waitqueue and wait for userspace to free resources or increase
+the group's limit.  But a userspace OOM handler is prone to deadlock
+itself on the locks held by the waiting tasks.  For example one of the
+sleeping tasks may be stuck in a brk() call with the mmap_sem held for
+writing but the userspace handler, in order to pick an optimal victim,
+may need to read files from /proc/<pid>, which tries to acquire the same
+mmap_sem for reading and deadlocks.
+
+This patch changes the way tasks behave after detecting a memcg OOM and
+makes sure nobody loops or sleeps with locks held:
+
+1. When OOMing in a user fault, invoke the OOM killer and restart the
+   fault instead of looping on the charge attempt.  This way, the OOM
+   victim can not get stuck on locks the looping task may hold.
+
+2. When OOMing in a user fault but somebody else is handling it
+   (either the kernel OOM killer or a userspace handler), don't go to
+   sleep in the charge context.  Instead, remember the OOMing memcg in
+   the task struct and then fully unwind the page fault stack with
+   -ENOMEM.  pagefault_out_of_memory() will then call back into the
+   memcg code to check if the -ENOMEM came from the memcg, and then
+   either put the task to sleep on the memcg's OOM waitqueue or just
+   restart the fault.  The OOM victim can no longer get stuck on any
+   lock a sleeping task may hold.
+
+Debugged by Michal Hocko.
+
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+Reported-by: azurIt <azurit@pobox.sk>
+Acked-by: Michal Hocko <mhocko@suse.cz>
+Cc: David Rientjes <rientjes@google.com>
+Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
+Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/memcontrol.h |   21 ++++++
+ include/linux/sched.h      |    4 +
+ mm/memcontrol.c            |  154 +++++++++++++++++++++++++++++++--------------
+ mm/memory.c                |    3 
+ mm/oom_kill.c              |    7 +-
+ 5 files changed, 140 insertions(+), 49 deletions(-)
+
+--- a/include/linux/memcontrol.h
++++ b/include/linux/memcontrol.h
+@@ -130,6 +130,10 @@ extern void mem_cgroup_replace_page_cach
+  *
+  * Toggle whether a failed memcg charge should invoke the OOM killer
+  * or just return -ENOMEM.  Returns the previous toggle state.
++ *
++ * NOTE: Any path that enables the OOM killer before charging must
++ *       call mem_cgroup_oom_synchronize() afterward to finalize the
++ *       OOM handling and clean up.
+  */
+ static inline bool mem_cgroup_toggle_oom(bool new)
+ {
+@@ -155,6 +159,13 @@ static inline void mem_cgroup_disable_oo
+       WARN_ON(old == false);
+ }
++static inline bool task_in_memcg_oom(struct task_struct *p)
++{
++      return p->memcg_oom.in_memcg_oom;
++}
++
++bool mem_cgroup_oom_synchronize(void);
++
+ #ifdef CONFIG_MEMCG_SWAP
+ extern int do_swap_account;
+ #endif
+@@ -391,6 +402,16 @@ static inline void mem_cgroup_disable_oo
+ {
+ }
++static inline bool task_in_memcg_oom(struct task_struct *p)
++{
++      return false;
++}
++
++static inline bool mem_cgroup_oom_synchronize(void)
++{
++      return false;
++}
++
+ static inline void mem_cgroup_inc_page_stat(struct page *page,
+                                           enum mem_cgroup_page_stat_item idx)
+ {
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -1413,6 +1413,10 @@ struct task_struct {
+       unsigned int memcg_kmem_skip_account;
+       struct memcg_oom_info {
+               unsigned int may_oom:1;
++              unsigned int in_memcg_oom:1;
++              unsigned int oom_locked:1;
++              int wakeups;
++              struct mem_cgroup *wait_on_memcg;
+       } memcg_oom;
+ #endif
+ #ifdef CONFIG_HAVE_HW_BREAKPOINT
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -302,6 +302,7 @@ struct mem_cgroup {
+       bool            oom_lock;
+       atomic_t        under_oom;
++      atomic_t        oom_wakeups;
+       atomic_t        refcnt;
+@@ -2179,6 +2180,7 @@ static int memcg_oom_wake_function(wait_
+ static void memcg_wakeup_oom(struct mem_cgroup *memcg)
+ {
++      atomic_inc(&memcg->oom_wakeups);
+       /* for filtering, pass "memcg" as argument. */
+       __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
+ }
+@@ -2190,19 +2192,17 @@ static void memcg_oom_recover(struct mem
+ }
+ /*
+- * try to call OOM killer. returns false if we should exit memory-reclaim loop.
++ * try to call OOM killer
+  */
+-static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
+-                                int order)
++static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
+ {
+-      struct oom_wait_info owait;
+       bool locked;
++      int wakeups;
+-      owait.memcg = memcg;
+-      owait.wait.flags = 0;
+-      owait.wait.func = memcg_oom_wake_function;
+-      owait.wait.private = current;
+-      INIT_LIST_HEAD(&owait.wait.task_list);
++      if (!current->memcg_oom.may_oom)
++              return;
++
++      current->memcg_oom.in_memcg_oom = 1;
+       /*
+        * As with any blocking lock, a contender needs to start
+@@ -2210,12 +2210,8 @@ static bool mem_cgroup_handle_oom(struct
+        * otherwise it can miss the wakeup from the unlock and sleep
+        * indefinitely.  This is just open-coded because our locking
+        * is so particular to memcg hierarchies.
+-       *
+-       * Even if signal_pending(), we can't quit charge() loop without
+-       * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
+-       * under OOM is always welcomed, use TASK_KILLABLE here.
+        */
+-      prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
++      wakeups = atomic_read(&memcg->oom_wakeups);
+       mem_cgroup_mark_under_oom(memcg);
+       locked = mem_cgroup_oom_trylock(memcg);
+@@ -2225,15 +2221,95 @@ static bool mem_cgroup_handle_oom(struct
+       if (locked && !memcg->oom_kill_disable) {
+               mem_cgroup_unmark_under_oom(memcg);
+-              finish_wait(&memcg_oom_waitq, &owait.wait);
+               mem_cgroup_out_of_memory(memcg, mask, order);
++              mem_cgroup_oom_unlock(memcg);
++              /*
++               * There is no guarantee that an OOM-lock contender
++               * sees the wakeups triggered by the OOM kill
++               * uncharges.  Wake any sleepers explicitly.
++               */
++              memcg_oom_recover(memcg);
+       } else {
+-              schedule();
+-              mem_cgroup_unmark_under_oom(memcg);
+-              finish_wait(&memcg_oom_waitq, &owait.wait);
++              /*
++               * A system call can just return -ENOMEM, but if this
++               * is a page fault and somebody else is handling the
++               * OOM already, we need to sleep on the OOM waitqueue
++               * for this memcg until the situation is resolved.
++               * Which can take some time because it might be
++               * handled by a userspace task.
++               *
++               * However, this is the charge context, which means
++               * that we may sit on a large call stack and hold
++               * various filesystem locks, the mmap_sem etc. and we
++               * don't want the OOM handler to deadlock on them
++               * while we sit here and wait.  Store the current OOM
++               * context in the task_struct, then return -ENOMEM.
++               * At the end of the page fault handler, with the
++               * stack unwound, pagefault_out_of_memory() will check
++               * back with us by calling
++               * mem_cgroup_oom_synchronize(), possibly putting the
++               * task to sleep.
++               */
++              current->memcg_oom.oom_locked = locked;
++              current->memcg_oom.wakeups = wakeups;
++              css_get(&memcg->css);
++              current->memcg_oom.wait_on_memcg = memcg;
+       }
++}
+-      if (locked) {
++/**
++ * mem_cgroup_oom_synchronize - complete memcg OOM handling
++ *
++ * This has to be called at the end of a page fault if the memcg
++ * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
++ *
++ * Memcg supports userspace OOM handling, so failed allocations must
++ * sleep on a waitqueue until the userspace task resolves the
++ * situation.  Sleeping directly in the charge context with all kinds
++ * of locks held is not a good idea, instead we remember an OOM state
++ * in the task and mem_cgroup_oom_synchronize() has to be called at
++ * the end of the page fault to put the task to sleep and clean up the
++ * OOM state.
++ *
++ * Returns %true if an ongoing memcg OOM situation was detected and
++ * finalized, %false otherwise.
++ */
++bool mem_cgroup_oom_synchronize(void)
++{
++      struct oom_wait_info owait;
++      struct mem_cgroup *memcg;
++
++      /* OOM is global, do not handle */
++      if (!current->memcg_oom.in_memcg_oom)
++              return false;
++
++      /*
++       * We invoked the OOM killer but there is a chance that a kill
++       * did not free up any charges.  Everybody else might already
++       * be sleeping, so restart the fault and keep the rampage
++       * going until some charges are released.
++       */
++      memcg = current->memcg_oom.wait_on_memcg;
++      if (!memcg)
++              goto out;
++
++      if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
++              goto out_memcg;
++
++      owait.memcg = memcg;
++      owait.wait.flags = 0;
++      owait.wait.func = memcg_oom_wake_function;
++      owait.wait.private = current;
++      INIT_LIST_HEAD(&owait.wait.task_list);
++
++      prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
++      /* Only sleep if we didn't miss any wakeups since OOM */
++      if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
++              schedule();
++      finish_wait(&memcg_oom_waitq, &owait.wait);
++out_memcg:
++      mem_cgroup_unmark_under_oom(memcg);
++      if (current->memcg_oom.oom_locked) {
+               mem_cgroup_oom_unlock(memcg);
+               /*
+                * There is no guarantee that an OOM-lock contender
+@@ -2242,11 +2318,10 @@ static bool mem_cgroup_handle_oom(struct
+                */
+               memcg_oom_recover(memcg);
+       }
+-
+-      if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+-              return false;
+-      /* Give chance to dying process */
+-      schedule_timeout_uninterruptible(1);
++      css_put(&memcg->css);
++      current->memcg_oom.wait_on_memcg = NULL;
++out:
++      current->memcg_oom.in_memcg_oom = 0;
+       return true;
+ }
+@@ -2559,12 +2634,11 @@ enum {
+       CHARGE_RETRY,           /* need to retry but retry is not bad */
+       CHARGE_NOMEM,           /* we can't do more. return -ENOMEM */
+       CHARGE_WOULDBLOCK,      /* GFP_WAIT wasn't set and no enough res. */
+-      CHARGE_OOM_DIE,         /* the current is killed because of OOM */
+ };
+ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
+                               unsigned int nr_pages, unsigned int min_pages,
+-                              bool oom_check)
++                              bool invoke_oom)
+ {
+       unsigned long csize = nr_pages * PAGE_SIZE;
+       struct mem_cgroup *mem_over_limit;
+@@ -2621,14 +2695,10 @@ static int mem_cgroup_do_charge(struct m
+       if (mem_cgroup_wait_acct_move(mem_over_limit))
+               return CHARGE_RETRY;
+-      /* If we don't need to call oom-killer at el, return immediately */
+-      if (!oom_check || !current->memcg_oom.may_oom)
+-              return CHARGE_NOMEM;
+-      /* check OOM */
+-      if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
+-              return CHARGE_OOM_DIE;
++      if (invoke_oom)
++              mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
+-      return CHARGE_RETRY;
++      return CHARGE_NOMEM;
+ }
+ /*
+@@ -2731,7 +2801,7 @@ again:
+       }
+       do {
+-              bool oom_check;
++              bool invoke_oom = oom && !nr_oom_retries;
+               /* If killed, bypass charge */
+               if (fatal_signal_pending(current)) {
+@@ -2739,14 +2809,8 @@ again:
+                       goto bypass;
+               }
+-              oom_check = false;
+-              if (oom && !nr_oom_retries) {
+-                      oom_check = true;
+-                      nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
+-              }
+-
+-              ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
+-                  oom_check);
++              ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
++                                         nr_pages, invoke_oom);
+               switch (ret) {
+               case CHARGE_OK:
+                       break;
+@@ -2759,16 +2823,12 @@ again:
+                       css_put(&memcg->css);
+                       goto nomem;
+               case CHARGE_NOMEM: /* OOM routine works */
+-                      if (!oom) {
++                      if (!oom || invoke_oom) {
+                               css_put(&memcg->css);
+                               goto nomem;
+                       }
+-                      /* If oom, we never return -ENOMEM */
+                       nr_oom_retries--;
+                       break;
+-              case CHARGE_OOM_DIE: /* Killed by OOM Killer */
+-                      css_put(&memcg->css);
+-                      goto bypass;
+               }
+       } while (ret != CHARGE_OK);
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -3867,6 +3867,9 @@ int handle_mm_fault(struct mm_struct *mm
+       if (flags & FAULT_FLAG_USER)
+               mem_cgroup_disable_oom();
++      if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)))
++              mem_cgroup_oom_synchronize();
++
+       return ret;
+ }
+--- a/mm/oom_kill.c
++++ b/mm/oom_kill.c
+@@ -702,9 +702,12 @@ out:
+  */
+ void pagefault_out_of_memory(void)
+ {
+-      struct zonelist *zonelist = node_zonelist(first_online_node,
+-                                                GFP_KERNEL);
++      struct zonelist *zonelist;
++      if (mem_cgroup_oom_synchronize())
++              return;
++
++      zonelist = node_zonelist(first_online_node, GFP_KERNEL);
+       if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) {
+               out_of_memory(NULL, 0, 0, NULL, false);
+               clear_zonelist_oom(zonelist, GFP_KERNEL);
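For orientation while reviewing the backport above, here is a minimal userspace model of the control flow the patch establishes. It is not kernel code: every type, field, and helper below (struct task, charge(), oom_synchronize()) is invented for illustration and only loosely mirrors task_struct::memcg_oom, mem_cgroup_oom(), and mem_cgroup_oom_synchronize(). The point it demonstrates is that the charge path never sleeps any more; it only records the OOM context and unwinds with -ENOMEM, and the tail of the fault path then decides whether to sleep on the memcg's OOM waitqueue or simply restart the fault.

/* Userspace sketch only; all names below are illustrative, not from the patch. */
#include <stdbool.h>
#include <stdio.h>

struct memcg {
    int oom_wakeups;               /* counts wakeups, like memcg->oom_wakeups */
};

struct task {
    bool may_oom;                  /* set only while handling a user fault */
    bool in_memcg_oom;             /* "task_in_memcg_oom()" in the patch */
    struct memcg *wait_on;         /* memcg to wait for, if somebody else OOMs */
    int wakeups_seen;              /* wakeup-count snapshot from charge time */
};

/*
 * Charge path: never sleeps.  It records the OOM context in the task and
 * unwinds with -ENOMEM; the fault handler finishes the job later.
 */
static int charge(struct task *t, struct memcg *m, bool handled_by_someone_else)
{
    if (!t->may_oom)
        return -1;                         /* syscalls just get -ENOMEM */

    t->in_memcg_oom = true;
    if (handled_by_someone_else) {
        t->wakeups_seen = m->oom_wakeups;  /* snapshot before going to wait */
        t->wait_on = m;
    }
    return -1;                             /* unwind the page fault stack */
}

/* Fault tail: the moral equivalent of mem_cgroup_oom_synchronize(). */
static bool oom_synchronize(struct task *t)
{
    struct memcg *m = t->wait_on;

    if (!t->in_memcg_oom)
        return false;                      /* global OOM, not handled here */

    if (!m)
        printf("invoked the OOM killer ourselves: restart the fault\n");
    else if (m->oom_wakeups == t->wakeups_seen)
        printf("no wakeup missed: safe to sleep on the OOM waitqueue\n");
    else
        printf("missed a wakeup: restart the fault instead of sleeping\n");

    t->in_memcg_oom = false;
    t->wait_on = NULL;
    return true;
}

int main(void)
{
    struct memcg m = { .oom_wakeups = 0 };
    struct task t = { .may_oom = true };

    if (charge(&t, &m, true) < 0 && oom_synchronize(&t))
        printf("fault unwound with all locks dropped, then handled\n");
    return 0;
}
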
diff --git a/queue-3.10/mm-memcg-rework-and-document-oom-waiting-and-wakeup.patch b/queue-3.10/mm-memcg-rework-and-document-oom-waiting-and-wakeup.patch
new file mode 100644
index 0000000..2709693
--- /dev/null
+++ b/queue-3.10/mm-memcg-rework-and-document-oom-waiting-and-wakeup.patch
@@ -0,0 +1,217 @@
+From fb2a6fc56be66c169f8b80e07ed999ba453a2db2 Mon Sep 17 00:00:00 2001
+From: Johannes Weiner <hannes@cmpxchg.org>
+Date: Thu, 12 Sep 2013 15:13:43 -0700
+Subject: mm: memcg: rework and document OOM waiting and wakeup
+
+From: Johannes Weiner <hannes@cmpxchg.org>
+
+commit fb2a6fc56be66c169f8b80e07ed999ba453a2db2 upstream.
+
+The memcg OOM handler open-codes a sleeping lock for OOM serialization
+(trylock, wait, repeat) because the required locking is so specific to
+memcg hierarchies.  However, it would be nice if this construct would be
+clearly recognizable and not be as obfuscated as it is right now.  Clean
+up as follows:
+
+1. Remove the return value of mem_cgroup_oom_unlock()
+
+2. Rename mem_cgroup_oom_lock() to mem_cgroup_oom_trylock().
+
+3. Pull the prepare_to_wait() out of the memcg_oom_lock scope.  This
+   makes it more obvious that the task has to be on the waitqueue
+   before attempting to OOM-trylock the hierarchy, to not miss any
+   wakeups before going to sleep.  It just didn't matter until now
+   because it was all lumped together into the global memcg_oom_lock
+   spinlock section.
+
+4. Pull the mem_cgroup_oom_notify() out of the memcg_oom_lock scope.
+   It is protected by the hierarchical OOM-lock.
+
+5. The memcg_oom_lock spinlock is only required to propagate the OOM
+   lock in any given hierarchy atomically.  Restrict its scope to
+   mem_cgroup_oom_(trylock|unlock).
+
+6. Do not wake up the waitqueue unconditionally at the end of the
+   function.  Only the lockholder has to wake up the next in line
+   after releasing the lock.
+
+   Note that the lockholder kicks off the OOM-killer, which in turn
+   leads to wakeups from the uncharges of the exiting task.  But a
+   contender is not guaranteed to see them if it enters the OOM path
+   after the OOM kills but before the lockholder releases the lock.
+   Thus there has to be an explicit wakeup after releasing the lock.
+
+7. Put the OOM task on the waitqueue before marking the hierarchy as
+   under OOM as that is the point where we start to receive wakeups.
+   No point in listening before being on the waitqueue.
+
+8. Likewise, unmark the hierarchy before finishing the sleep, for
+   symmetry.
+
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+Acked-by: Michal Hocko <mhocko@suse.cz>
+Cc: David Rientjes <rientjes@google.com>
+Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
+Cc: azurIt <azurit@pobox.sk>
+Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/memcontrol.c |   83 +++++++++++++++++++++++++++++++-------------------------
+ 1 file changed, 46 insertions(+), 37 deletions(-)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -2075,15 +2075,18 @@ static int mem_cgroup_soft_reclaim(struc
+       return total;
+ }
++static DEFINE_SPINLOCK(memcg_oom_lock);
++
+ /*
+  * Check OOM-Killer is already running under our hierarchy.
+  * If someone is running, return false.
+- * Has to be called with memcg_oom_lock
+  */
+-static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
++static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
+ {
+       struct mem_cgroup *iter, *failed = NULL;
++      spin_lock(&memcg_oom_lock);
++
+       for_each_mem_cgroup_tree(iter, memcg) {
+               if (iter->oom_lock) {
+                       /*
+@@ -2097,33 +2100,33 @@ static bool mem_cgroup_oom_lock(struct m
+                       iter->oom_lock = true;
+       }
+-      if (!failed)
+-              return true;
+-
+-      /*
+-       * OK, we failed to lock the whole subtree so we have to clean up
+-       * what we set up to the failing subtree
+-       */
+-      for_each_mem_cgroup_tree(iter, memcg) {
+-              if (iter == failed) {
+-                      mem_cgroup_iter_break(memcg, iter);
+-                      break;
++      if (failed) {
++              /*
++               * OK, we failed to lock the whole subtree so we have
++               * to clean up what we set up to the failing subtree
++               */
++              for_each_mem_cgroup_tree(iter, memcg) {
++                      if (iter == failed) {
++                              mem_cgroup_iter_break(memcg, iter);
++                              break;
++                      }
++                      iter->oom_lock = false;
+               }
+-              iter->oom_lock = false;
+       }
+-      return false;
++
++      spin_unlock(&memcg_oom_lock);
++
++      return !failed;
+ }
+-/*
+- * Has to be called with memcg_oom_lock
+- */
+-static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
++static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
+ {
+       struct mem_cgroup *iter;
++      spin_lock(&memcg_oom_lock);
+       for_each_mem_cgroup_tree(iter, memcg)
+               iter->oom_lock = false;
+-      return 0;
++      spin_unlock(&memcg_oom_lock);
+ }
+ static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
+@@ -2147,7 +2150,6 @@ static void mem_cgroup_unmark_under_oom(
+               atomic_add_unless(&iter->under_oom, -1, 0);
+ }
+-static DEFINE_SPINLOCK(memcg_oom_lock);
+ static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
+ struct oom_wait_info {
+@@ -2194,45 +2196,52 @@ static bool mem_cgroup_handle_oom(struct
+                                 int order)
+ {
+       struct oom_wait_info owait;
+-      bool locked, need_to_kill;
++      bool locked;
+       owait.memcg = memcg;
+       owait.wait.flags = 0;
+       owait.wait.func = memcg_oom_wake_function;
+       owait.wait.private = current;
+       INIT_LIST_HEAD(&owait.wait.task_list);
+-      need_to_kill = true;
+-      mem_cgroup_mark_under_oom(memcg);
+-      /* At first, try to OOM lock hierarchy under memcg.*/
+-      spin_lock(&memcg_oom_lock);
+-      locked = mem_cgroup_oom_lock(memcg);
+       /*
++       * As with any blocking lock, a contender needs to start
++       * listening for wakeups before attempting the trylock,
++       * otherwise it can miss the wakeup from the unlock and sleep
++       * indefinitely.  This is just open-coded because our locking
++       * is so particular to memcg hierarchies.
++       *
+        * Even if signal_pending(), we can't quit charge() loop without
+        * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
+        * under OOM is always welcomed, use TASK_KILLABLE here.
+        */
+       prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+-      if (!locked || memcg->oom_kill_disable)
+-              need_to_kill = false;
++      mem_cgroup_mark_under_oom(memcg);
++
++      locked = mem_cgroup_oom_trylock(memcg);
++
+       if (locked)
+               mem_cgroup_oom_notify(memcg);
+-      spin_unlock(&memcg_oom_lock);
+-      if (need_to_kill) {
++      if (locked && !memcg->oom_kill_disable) {
++              mem_cgroup_unmark_under_oom(memcg);
+               finish_wait(&memcg_oom_waitq, &owait.wait);
+               mem_cgroup_out_of_memory(memcg, mask, order);
+       } else {
+               schedule();
++              mem_cgroup_unmark_under_oom(memcg);
+               finish_wait(&memcg_oom_waitq, &owait.wait);
+       }
+-      spin_lock(&memcg_oom_lock);
+-      if (locked)
+-              mem_cgroup_oom_unlock(memcg);
+-      memcg_wakeup_oom(memcg);
+-      spin_unlock(&memcg_oom_lock);
+-      mem_cgroup_unmark_under_oom(memcg);
++      if (locked) {
++              mem_cgroup_oom_unlock(memcg);
++              /*
++               * There is no guarantee that an OOM-lock contender
++               * sees the wakeups triggered by the OOM kill
++               * uncharges.  Wake any sleepers explicitly.
++               */
++              memcg_oom_recover(memcg);
++      }
+       if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+               return false;
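Point 3 of the list in the commit message above (put yourself on the waitqueue before the trylock) is the part of this open-coded sleeping lock that is easiest to get wrong. Below is a generic sketch of that ordering written against the 3.10-era wait-queue API; the resource_* helpers are hypothetical stand-ins for mem_cgroup_oom_trylock()/mem_cgroup_oom_unlock() and are not taken from the patch.

/*
 * Sketch of the "register, then trylock, else sleep" ordering.  The
 * resource_* lock is a stand-in implemented as a spinlock-protected
 * flag, roughly how the hierarchical memcg OOM lock behaves.
 */
#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/types.h>

static DECLARE_WAIT_QUEUE_HEAD(resource_waitq);
static DEFINE_SPINLOCK(resource_guard);
static bool resource_locked;

static bool resource_trylock(void)
{
    bool taken;

    spin_lock(&resource_guard);
    taken = !resource_locked;
    if (taken)
        resource_locked = true;
    spin_unlock(&resource_guard);
    return taken;
}

static void resource_unlock(void)
{
    spin_lock(&resource_guard);
    resource_locked = false;
    spin_unlock(&resource_guard);
    /* Only the holder wakes the next contender, after dropping the lock. */
    wake_up(&resource_waitq);
}

static void handle_or_wait(void)
{
    DEFINE_WAIT(wait);

    /* Get onto the waitqueue BEFORE the trylock ... */
    prepare_to_wait(&resource_waitq, &wait, TASK_KILLABLE);

    if (resource_trylock()) {
        finish_wait(&resource_waitq, &wait);
        /* We own the resource: do the work, then unlock. */
        resource_unlock();
        return;
    }

    /*
     * ... so the wakeup from resource_unlock() cannot be lost between
     * the failed trylock and the schedule().  Like the memcg waiters,
     * we only need to wait for the holder to finish, not take over.
     */
    schedule();
    finish_wait(&resource_waitq, &wait);
}
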
diff --git a/queue-3.10/series b/queue-3.10/series
index 2bda20e3f483a4bdfa3444ffbbe6e46d782133be..5525e4767136cda43eb37374b92ff2a94f4a04ae 100644
--- a/queue-3.10/series
+++ b/queue-3.10/series
@@ -65,3 +65,5 @@ arch-mm-do-not-invoke-oom-killer-on-kernel-fault-oom.patch
 arch-mm-pass-userspace-fault-flag-to-generic-fault-handler.patch
 x86-finish-user-fault-error-path-with-fatal-signal.patch
 mm-memcg-enable-memcg-oom-killer-only-for-user-faults.patch
+mm-memcg-rework-and-document-oom-waiting-and-wakeup.patch
+mm-memcg-do-not-trap-chargers-with-full-callstack-on-oom.patch