git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.14-stable patches
author     Greg Kroah-Hartman <gregkh@linuxfoundation.org>
           Sat, 3 Apr 2021 08:59:46 +0000 (10:59 +0200)
committer  Greg Kroah-Hartman <gregkh@linuxfoundation.org>
           Sat, 3 Apr 2021 08:59:46 +0000 (10:59 +0200)
added patches:
mem_cgroup-make-sure-moving_account-move_lock_task-and-stat_cpu-in-the-same-cacheline.patch
mm-fix-oom_kill-event-handling.patch
mm-memcg-make-sure-memory.events-is-uptodate-when-waking-pollers.patch
mm-memcontrol-fix-nr_writeback-leak-in-memcg-and-system-stats.patch
mm-writeback-use-exact-memcg-dirty-counts.patch

queue-4.14/mem_cgroup-make-sure-moving_account-move_lock_task-and-stat_cpu-in-the-same-cacheline.patch [new file with mode: 0644]
queue-4.14/mm-fix-oom_kill-event-handling.patch [new file with mode: 0644]
queue-4.14/mm-memcg-make-sure-memory.events-is-uptodate-when-waking-pollers.patch [new file with mode: 0644]
queue-4.14/mm-memcontrol-fix-nr_writeback-leak-in-memcg-and-system-stats.patch [new file with mode: 0644]
queue-4.14/mm-writeback-use-exact-memcg-dirty-counts.patch [new file with mode: 0644]
queue-4.14/series

diff --git a/queue-4.14/mem_cgroup-make-sure-moving_account-move_lock_task-and-stat_cpu-in-the-same-cacheline.patch b/queue-4.14/mem_cgroup-make-sure-moving_account-move_lock_task-and-stat_cpu-in-the-same-cacheline.patch
new file mode 100644 (file)
index 0000000..39419c9
--- /dev/null
@@ -0,0 +1,140 @@
+From foo@baz Sat Apr  3 10:50:40 AM CEST 2021
+From: Frank van der Linden <fllinden@amazon.com>
+Date: Tue, 30 Mar 2021 18:19:08 +0000
+Subject: mem_cgroup: make sure moving_account, move_lock_task and stat_cpu in the same cacheline
+To: <stable@vger.kernel.org>
+Cc: <fllinden@amazon.com>
+Message-ID: <20210330181910.15378-4-fllinden@amazon.com>
+
+From: Frank van der Linden <fllinden@amazon.com>
+
+From: Aaron Lu <aaron.lu@intel.com>
+
+commit e81bf9793b1861d74953ef041b4f6c7faecc2dbd upstream.
+
+The LKP robot found a 27% will-it-scale/page_fault3 performance
+regression caused by commit e27be240df53 ("mm: memcg: make sure
+memory.events is uptodate when waking pollers").
+
+What the test does is:
+ 1 mkstemp() a 128M file on a tmpfs;
+ 2 start $nr_cpu processes, each to loop the following:
+   2.1 mmap() this file in shared write mode;
+   2.2 write 0 to this file in a PAGE_SIZE step till the end of the file;
+   2.3 unmap() this file and repeat this process.
+ 3 After 5 minutes, check how many loops they managed to complete; the
+   higher, the better.
+
+The commit itself looks innocent enough as it merely changed some event
+counting mechanism and this test didn't trigger those events at all.
+Perf shows increased cycles spent on accessing root_mem_cgroup->stat_cpu
+in count_memcg_event_mm() (called by handle_mm_fault()) and in
+__mod_memcg_state() (called by page_add_file_rmap()).  So it's likely due
+to the changed layout of 'struct mem_cgroup', which either makes stat_cpu
+fall into a constantly modified cacheline or causes some hot fields to
+stop sharing a cacheline.
+
+I verified this by moving memory_events[] back to where it was:
+
+: --- a/include/linux/memcontrol.h
+: +++ b/include/linux/memcontrol.h
+: @@ -205,7 +205,6 @@ struct mem_cgroup {
+:      int             oom_kill_disable;
+:
+:      /* memory.events */
+: -    atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
+:      struct cgroup_file events_file;
+:
+:      /* protect arrays of thresholds */
+: @@ -238,6 +237,7 @@ struct mem_cgroup {
+:      struct mem_cgroup_stat_cpu __percpu *stat_cpu;
+:      atomic_long_t           stat[MEMCG_NR_STAT];
+:      atomic_long_t           events[NR_VM_EVENT_ITEMS];
+: +    atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
+:
+:      unsigned long           socket_pressure;
+
+And performance was restored.
+
+Later investigation found that as long as the following 3 fields -
+moving_account, move_lock_task and stat_cpu - are in the same cacheline,
+performance will be good.  To avoid future performance surprises from
+other commits changing the layout of 'struct mem_cgroup', this patch
+makes sure the 3 fields stay in the same cacheline.
+
+One concern with this approach is that moving_account and move_lock_task
+can be modified when a process changes memory cgroup, while stat_cpu is
+an always-read field, so placing them in the same cacheline might hurt.
+I assume it is rare for a process to change memory cgroup, so this should
+be OK.
+
+Link: https://lkml.kernel.org/r/20180528114019.GF9904@yexl-desktop
+Link: http://lkml.kernel.org/r/20180601071115.GA27302@intel.com
+Signed-off-by: Aaron Lu <aaron.lu@intel.com>
+Reported-by: kernel test robot <xiaolong.ye@intel.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Tejun Heo <tj@kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/memcontrol.h |   23 +++++++++++++++++++----
+ 1 file changed, 19 insertions(+), 4 deletions(-)
+
+--- a/include/linux/memcontrol.h
++++ b/include/linux/memcontrol.h
+@@ -155,6 +155,15 @@ enum memcg_kmem_state {
+       KMEM_ONLINE,
+ };
++#if defined(CONFIG_SMP)
++struct memcg_padding {
++      char x[0];
++} ____cacheline_internodealigned_in_smp;
++#define MEMCG_PADDING(name)      struct memcg_padding name;
++#else
++#define MEMCG_PADDING(name)
++#endif
++
+ /*
+  * The memory controller data structure. The memory controller controls both
+  * page cache and RSS per cgroup. We would eventually like to provide
+@@ -202,7 +211,6 @@ struct mem_cgroup {
+       int             oom_kill_disable;
+       /* memory.events */
+-      atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
+       struct cgroup_file events_file;
+       /* protect arrays of thresholds */
+@@ -222,19 +230,26 @@ struct mem_cgroup {
+        * mem_cgroup ? And what type of charges should we move ?
+        */
+       unsigned long move_charge_at_immigrate;
++      /* taken only while moving_account > 0 */
++      spinlock_t              move_lock;
++      unsigned long           move_lock_flags;
++
++      MEMCG_PADDING(_pad1_);
++
+       /*
+        * set > 0 if pages under this cgroup are moving to other cgroup.
+        */
+       atomic_t                moving_account;
+-      /* taken only while moving_account > 0 */
+-      spinlock_t              move_lock;
+       struct task_struct      *move_lock_task;
+-      unsigned long           move_lock_flags;
+       /* memory.stat */
+       struct mem_cgroup_stat_cpu __percpu *stat_cpu;
++
++      MEMCG_PADDING(_pad2_);
++
+       atomic_long_t           stat[MEMCG_NR_STAT];
+       atomic_long_t           events[NR_VM_EVENT_ITEMS];
++      atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
+       unsigned long           socket_pressure;
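
The padding idiom the patch relies on is easy to see in isolation.  The sketch
below is a userspace approximation, not the kernel code: it assumes a 64-byte
cacheline, GCC/Clang zero-length arrays, and a plain aligned attribute standing
in for ____cacheline_internodealigned_in_smp, and it prints where the fields
land so the grouping of moving_account, move_lock_task and stat_cpu is visible.

  /*
   * Userspace approximation of the MEMCG_PADDING() idiom above.
   * Assumptions: 64-byte cachelines, GCC/Clang zero-length arrays, and a
   * plain aligned attribute in place of ____cacheline_internodealigned_in_smp.
   */
  #include <stdio.h>
  #include <stddef.h>

  #define CACHELINE 64

  struct pad { char x[0]; } __attribute__((aligned(CACHELINE)));
  #define PADDING(name)   struct pad name;

  struct demo {
          unsigned long move_charge_at_immigrate;
          PADDING(_pad1_)
          int  moving_account;    /* hot: checked on every charge/stat update */
          void *move_lock_task;
          void *stat_cpu;         /* hot: per-cpu stat base pointer */
          PADDING(_pad2_)
          long stat[8];           /* frequently dirtied counters */
  };

  int main(void)
  {
          /*
           * The zero-size pads occupy no storage themselves, but each one
           * forces the member that follows it onto a fresh cacheline, so
           * the three hot fields end up together and the dirtied stat[]
           * array starts on its own line.
           */
          printf("moving_account @ %zu\n", offsetof(struct demo, moving_account));
          printf("move_lock_task @ %zu\n", offsetof(struct demo, move_lock_task));
          printf("stat_cpu       @ %zu\n", offsetof(struct demo, stat_cpu));
          printf("stat[]         @ %zu\n", offsetof(struct demo, stat));
          return 0;
  }

On a typical LP64 build with those assumptions this prints offsets 64, 72, 80
and 128: the three fields share one 64-byte line, while stat[] begins on the
next one.
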
diff --git a/queue-4.14/mm-fix-oom_kill-event-handling.patch b/queue-4.14/mm-fix-oom_kill-event-handling.patch
new file mode 100644 (file)
index 0000000..bca6c2f
--- /dev/null
@@ -0,0 +1,134 @@
+From foo@baz Sat Apr  3 10:50:40 AM CEST 2021
+From: Frank van der Linden <fllinden@amazon.com>
+Date: Tue, 30 Mar 2021 18:19:09 +0000
+Subject: mm: fix oom_kill event handling
+To: <stable@vger.kernel.org>
+Cc: <fllinden@amazon.com>
+Message-ID: <20210330181910.15378-5-fllinden@amazon.com>
+
+From: Frank van der Linden <fllinden@amazon.com>
+
+From: Roman Gushchin <guro@fb.com>
+
+commit fe6bdfc8e1e131720abbe77a2eb990c94c9024cb upstream.
+
+Commit e27be240df53 ("mm: memcg: make sure memory.events is uptodate
+when waking pollers") converted most of the memcg event counters to
+per-memcg atomics, which made them less confusing for a user.  The
+"oom_kill" counter remained untouched, so now it behaves differently
+from other counters (including "oom").  This adds nothing but confusion.
+
+Let's fix this by adding the MEMCG_OOM_KILL event, following the
+MEMCG_OOM approach.
+
+This also removes a hack from count_memcg_event_mm(), introduced earlier
+specifically for the OOM_KILL counter.
+
+[akpm@linux-foundation.org: fix for droppage of memcg-replace-mm-owner-with-mm-memcg.patch]
+Link: http://lkml.kernel.org/r/20180508124637.29984-1-guro@fb.com
+Signed-off-by: Roman Gushchin <guro@fb.com>
+Acked-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+[fllinden@amazon.com: backport to 4.14, minor contextual changes]
+Signed-off-by: Frank van der Linden <fllinden@amazon.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/memcontrol.h |   26 ++++++++++++++++++++++----
+ mm/memcontrol.c            |    6 ++++--
+ mm/oom_kill.c              |    2 +-
+ 3 files changed, 27 insertions(+), 7 deletions(-)
+
+--- a/include/linux/memcontrol.h
++++ b/include/linux/memcontrol.h
+@@ -53,6 +53,7 @@ enum memcg_memory_event {
+       MEMCG_HIGH,
+       MEMCG_MAX,
+       MEMCG_OOM,
++      MEMCG_OOM_KILL,
+       MEMCG_NR_MEMORY_EVENTS,
+ };
+@@ -706,11 +707,8 @@ static inline void count_memcg_event_mm(
+       rcu_read_lock();
+       memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+-      if (likely(memcg)) {
++      if (likely(memcg))
+               count_memcg_events(memcg, idx, 1);
+-              if (idx == OOM_KILL)
+-                      cgroup_file_notify(&memcg->events_file);
+-      }
+       rcu_read_unlock();
+ }
+@@ -721,6 +719,21 @@ static inline void memcg_memory_event(st
+       cgroup_file_notify(&memcg->events_file);
+ }
++static inline void memcg_memory_event_mm(struct mm_struct *mm,
++                                       enum memcg_memory_event event)
++{
++      struct mem_cgroup *memcg;
++
++      if (mem_cgroup_disabled())
++              return;
++
++      rcu_read_lock();
++      memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
++      if (likely(memcg))
++              memcg_memory_event(memcg, event);
++      rcu_read_unlock();
++}
++
+ #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ void mem_cgroup_split_huge_fixup(struct page *head);
+ #endif
+@@ -742,6 +755,11 @@ static inline void memcg_memory_event(st
+ {
+ }
++static inline void memcg_memory_event_mm(struct mm_struct *mm,
++                                       enum memcg_memory_event event)
++{
++}
++
+ static inline bool mem_cgroup_low(struct mem_cgroup *root,
+                                 struct mem_cgroup *memcg)
+ {
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -3648,7 +3648,8 @@ static int mem_cgroup_oom_control_read(s
+       seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
+       seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
+-      seq_printf(sf, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL));
++      seq_printf(sf, "oom_kill %lu\n",
++                 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
+       return 0;
+ }
+@@ -5320,7 +5321,8 @@ static int memory_events_show(struct seq
+                  atomic_long_read(&memcg->memory_events[MEMCG_MAX]));
+       seq_printf(m, "oom %lu\n",
+                  atomic_long_read(&memcg->memory_events[MEMCG_OOM]));
+-      seq_printf(m, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL));
++      seq_printf(m, "oom_kill %lu\n",
++                 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
+       return 0;
+ }
+--- a/mm/oom_kill.c
++++ b/mm/oom_kill.c
+@@ -917,7 +917,7 @@ static void oom_kill_process(struct oom_
+       /* Raise event before sending signal: task reaper must see this */
+       count_vm_event(OOM_KILL);
+-      count_memcg_event_mm(mm, OOM_KILL);
++      memcg_memory_event_mm(mm, MEMCG_OOM_KILL);
+       /*
+        * We should send SIGKILL before granting access to memory reserves
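
In effect the patch routes "oom_kill" through the same bump-an-atomic-and-notify
helper as the other memory.events entries, instead of special-casing it in
count_memcg_event_mm().  The following is a simplified standalone model of that
scheme, not the kernel implementation; notify_events_file() is a stub standing
in for cgroup_file_notify().

  /*
   * Simplified model (not kernel code) of the memory.events scheme:
   * one unbuffered atomic per event, incremented and then followed by a
   * file notification so a poller always finds an updated count.
   */
  #include <stdatomic.h>
  #include <stdio.h>

  enum memcg_memory_event {
          MEMCG_LOW, MEMCG_HIGH, MEMCG_MAX, MEMCG_OOM,
          MEMCG_OOM_KILL,                 /* the event this patch adds */
          MEMCG_NR_MEMORY_EVENTS,
  };

  struct memcg {
          atomic_long memory_events[MEMCG_NR_MEMORY_EVENTS];
  };

  static void notify_events_file(struct memcg *mc)
  {
          (void)mc;   /* stand-in for cgroup_file_notify(&memcg->events_file) */
  }

  static void memcg_memory_event(struct memcg *mc, enum memcg_memory_event ev)
  {
          atomic_fetch_add(&mc->memory_events[ev], 1);
          notify_events_file(mc);
  }

  int main(void)
  {
          static struct memcg mc;         /* zero-initialized counters */

          memcg_memory_event(&mc, MEMCG_OOM);
          memcg_memory_event(&mc, MEMCG_OOM_KILL);
          printf("oom %ld\n",      atomic_load(&mc.memory_events[MEMCG_OOM]));
          printf("oom_kill %ld\n", atomic_load(&mc.memory_events[MEMCG_OOM_KILL]));
          return 0;
  }
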
diff --git a/queue-4.14/mm-memcg-make-sure-memory.events-is-uptodate-when-waking-pollers.patch b/queue-4.14/mm-memcg-make-sure-memory.events-is-uptodate-when-waking-pollers.patch
new file mode 100644 (file)
index 0000000..c6490fa
--- /dev/null
@@ -0,0 +1,260 @@
+From foo@baz Sat Apr  3 10:50:40 AM CEST 2021
+From: Frank van der Linden <fllinden@amazon.com>
+Date: Tue, 30 Mar 2021 18:19:07 +0000
+Subject: mm: memcg: make sure memory.events is uptodate when waking pollers
+To: <stable@vger.kernel.org>
+Cc: <fllinden@amazon.com>
+Message-ID: <20210330181910.15378-3-fllinden@amazon.com>
+
+From: Frank van der Linden <fllinden@amazon.com>
+
+From: Johannes Weiner <hannes@cmpxchg.org>
+
+commit e27be240df53f1a20c659168e722b5d9f16cc7f4 upstream.
+
+Commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
+memory.stat reporting") added per-cpu drift to all memory cgroup stats
+and events shown in memory.stat and memory.events.
+
+For memory.stat this is acceptable.  But memory.events issues file
+notifications, and somebody polling the file for changes will be
+confused when the counters in it are unchanged after a wakeup.
+
+Luckily, the events in memory.events - MEMCG_LOW, MEMCG_HIGH, MEMCG_MAX,
+MEMCG_OOM - are sufficiently rare and high-level that we don't need
+per-cpu buffering for them: MEMCG_HIGH and MEMCG_MAX would be the most
+frequent, but they're counting invocations of reclaim, which is a
+complex operation that touches many shared cachelines.
+
+This splits memory.events from the generic VM events and tracks them in
+their own, unbuffered atomic counters.  That's also cleaner, as it
+eliminates the ugly enum nesting of VM and cgroup events.
+
+[hannes@cmpxchg.org: "array subscript is above array bounds"]
+  Link: http://lkml.kernel.org/r/20180406155441.GA20806@cmpxchg.org
+Link: http://lkml.kernel.org/r/20180405175507.GA24817@cmpxchg.org
+Fixes: a983b5ebee57 ("mm: memcontrol: fix excessive complexity in memory.stat reporting")
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+Reported-by: Tejun Heo <tj@kernel.org>
+Acked-by: Tejun Heo <tj@kernel.org>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Roman Gushchin <guro@fb.com>
+Cc: Rik van Riel <riel@surriel.com>
+Cc: Stephen Rothwell <sfr@canb.auug.org.au>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/memcontrol.h |   35 ++++++++++++++++++-----------------
+ mm/memcontrol.c            |   28 ++++++++++++++++------------
+ mm/vmscan.c                |    2 +-
+ 3 files changed, 35 insertions(+), 30 deletions(-)
+
+--- a/include/linux/memcontrol.h
++++ b/include/linux/memcontrol.h
+@@ -48,13 +48,12 @@ enum memcg_stat_item {
+       MEMCG_NR_STAT,
+ };
+-/* Cgroup-specific events, on top of universal VM events */
+-enum memcg_event_item {
+-      MEMCG_LOW = NR_VM_EVENT_ITEMS,
++enum memcg_memory_event {
++      MEMCG_LOW,
+       MEMCG_HIGH,
+       MEMCG_MAX,
+       MEMCG_OOM,
+-      MEMCG_NR_EVENTS,
++      MEMCG_NR_MEMORY_EVENTS,
+ };
+ struct mem_cgroup_reclaim_cookie {
+@@ -88,7 +87,7 @@ enum mem_cgroup_events_target {
+ struct mem_cgroup_stat_cpu {
+       long count[MEMCG_NR_STAT];
+-      unsigned long events[MEMCG_NR_EVENTS];
++      unsigned long events[NR_VM_EVENT_ITEMS];
+       unsigned long nr_page_events;
+       unsigned long targets[MEM_CGROUP_NTARGETS];
+ };
+@@ -202,7 +201,8 @@ struct mem_cgroup {
+       /* OOM-Killer disable */
+       int             oom_kill_disable;
+-      /* handle for "memory.events" */
++      /* memory.events */
++      atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
+       struct cgroup_file events_file;
+       /* protect arrays of thresholds */
+@@ -231,9 +231,10 @@ struct mem_cgroup {
+       struct task_struct      *move_lock_task;
+       unsigned long           move_lock_flags;
++      /* memory.stat */
+       struct mem_cgroup_stat_cpu __percpu *stat_cpu;
+       atomic_long_t           stat[MEMCG_NR_STAT];
+-      atomic_long_t           events[MEMCG_NR_EVENTS];
++      atomic_long_t           events[NR_VM_EVENT_ITEMS];
+       unsigned long           socket_pressure;
+@@ -645,9 +646,9 @@ unsigned long mem_cgroup_soft_limit_recl
+                                               gfp_t gfp_mask,
+                                               unsigned long *total_scanned);
+-/* idx can be of type enum memcg_event_item or vm_event_item */
+ static inline void __count_memcg_events(struct mem_cgroup *memcg,
+-                                      int idx, unsigned long count)
++                                      enum vm_event_item idx,
++                                      unsigned long count)
+ {
+       unsigned long x;
+@@ -663,7 +664,8 @@ static inline void __count_memcg_events(
+ }
+ static inline void count_memcg_events(struct mem_cgroup *memcg,
+-                                    int idx, unsigned long count)
++                                    enum vm_event_item idx,
++                                    unsigned long count)
+ {
+       unsigned long flags;
+@@ -672,9 +674,8 @@ static inline void count_memcg_events(st
+       local_irq_restore(flags);
+ }
+-/* idx can be of type enum memcg_event_item or vm_event_item */
+ static inline void count_memcg_page_event(struct page *page,
+-                                        int idx)
++                                        enum vm_event_item idx)
+ {
+       if (page->mem_cgroup)
+               count_memcg_events(page->mem_cgroup, idx, 1);
+@@ -698,10 +699,10 @@ static inline void count_memcg_event_mm(
+       rcu_read_unlock();
+ }
+-static inline void mem_cgroup_event(struct mem_cgroup *memcg,
+-                                  enum memcg_event_item event)
++static inline void memcg_memory_event(struct mem_cgroup *memcg,
++                                    enum memcg_memory_event event)
+ {
+-      count_memcg_events(memcg, event, 1);
++      atomic_long_inc(&memcg->memory_events[event]);
+       cgroup_file_notify(&memcg->events_file);
+ }
+@@ -721,8 +722,8 @@ static inline bool mem_cgroup_disabled(v
+       return true;
+ }
+-static inline void mem_cgroup_event(struct mem_cgroup *memcg,
+-                                  enum memcg_event_item event)
++static inline void memcg_memory_event(struct mem_cgroup *memcg,
++                                    enum memcg_memory_event event)
+ {
+ }
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -1872,7 +1872,7 @@ static int memcg_hotplug_cpu_dead(unsign
+                       }
+               }
+-              for (i = 0; i < MEMCG_NR_EVENTS; i++) {
++              for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
+                       long x;
+                       x = this_cpu_xchg(memcg->stat_cpu->events[i], 0);
+@@ -1891,7 +1891,7 @@ static void reclaim_high(struct mem_cgro
+       do {
+               if (page_counter_read(&memcg->memory) <= memcg->high)
+                       continue;
+-              mem_cgroup_event(memcg, MEMCG_HIGH);
++              memcg_memory_event(memcg, MEMCG_HIGH);
+               try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
+       } while ((memcg = parent_mem_cgroup(memcg)));
+ }
+@@ -1982,7 +1982,7 @@ retry:
+       if (!gfpflags_allow_blocking(gfp_mask))
+               goto nomem;
+-      mem_cgroup_event(mem_over_limit, MEMCG_MAX);
++      memcg_memory_event(mem_over_limit, MEMCG_MAX);
+       nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
+                                                   gfp_mask, may_swap);
+@@ -2025,7 +2025,7 @@ retry:
+       if (fatal_signal_pending(current))
+               goto force;
+-      mem_cgroup_event(mem_over_limit, MEMCG_OOM);
++      memcg_memory_event(mem_over_limit, MEMCG_OOM);
+       mem_cgroup_oom(mem_over_limit, gfp_mask,
+                      get_order(nr_pages * PAGE_SIZE));
+@@ -2790,10 +2790,10 @@ static void tree_events(struct mem_cgrou
+       struct mem_cgroup *iter;
+       int i;
+-      memset(events, 0, sizeof(*events) * MEMCG_NR_EVENTS);
++      memset(events, 0, sizeof(*events) * NR_VM_EVENT_ITEMS);
+       for_each_mem_cgroup_tree(iter, memcg) {
+-              for (i = 0; i < MEMCG_NR_EVENTS; i++)
++              for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+                       events[i] += memcg_sum_events(iter, i);
+       }
+ }
+@@ -5299,7 +5299,7 @@ static ssize_t memory_max_write(struct k
+                       continue;
+               }
+-              mem_cgroup_event(memcg, MEMCG_OOM);
++              memcg_memory_event(memcg, MEMCG_OOM);
+               if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
+                       break;
+       }
+@@ -5312,10 +5312,14 @@ static int memory_events_show(struct seq
+ {
+       struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+-      seq_printf(m, "low %lu\n", memcg_sum_events(memcg, MEMCG_LOW));
+-      seq_printf(m, "high %lu\n", memcg_sum_events(memcg, MEMCG_HIGH));
+-      seq_printf(m, "max %lu\n", memcg_sum_events(memcg, MEMCG_MAX));
+-      seq_printf(m, "oom %lu\n", memcg_sum_events(memcg, MEMCG_OOM));
++      seq_printf(m, "low %lu\n",
++                 atomic_long_read(&memcg->memory_events[MEMCG_LOW]));
++      seq_printf(m, "high %lu\n",
++                 atomic_long_read(&memcg->memory_events[MEMCG_HIGH]));
++      seq_printf(m, "max %lu\n",
++                 atomic_long_read(&memcg->memory_events[MEMCG_MAX]));
++      seq_printf(m, "oom %lu\n",
++                 atomic_long_read(&memcg->memory_events[MEMCG_OOM]));
+       seq_printf(m, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL));
+       return 0;
+@@ -5325,7 +5329,7 @@ static int memory_stat_show(struct seq_f
+ {
+       struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+       unsigned long stat[MEMCG_NR_STAT];
+-      unsigned long events[MEMCG_NR_EVENTS];
++      unsigned long events[NR_VM_EVENT_ITEMS];
+       int i;
+       /*
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2628,7 +2628,7 @@ static bool shrink_node(pg_data_t *pgdat
+                                       sc->memcg_low_skipped = 1;
+                                       continue;
+                               }
+-                              mem_cgroup_event(memcg, MEMCG_LOW);
++                              memcg_memory_event(memcg, MEMCG_LOW);
+                       }
+                       reclaimed = sc->nr_reclaimed;
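
The core of the problem is the difference between a batched per-cpu counter,
whose flushed atomic can lag behind recent events, and an unbuffered atomic
that is always exact.  Below is a standalone sketch of the two update styles,
not the kernel code; the flush threshold of 32 mirrors the [-32..32] per-cpu
batching described in the later dirty-counts changelog.

  /*
   * Sketch (not kernel code) of why memory.events left the batched scheme:
   * with a per-cpu buffer and a flush threshold, a reader who has just been
   * woken can still see an unchanged atomic.  The unbuffered variant cannot
   * lag.  Only one CPU's buffer is modeled here.
   */
  #include <stdatomic.h>
  #include <stdio.h>

  #define BATCH 32

  static long        percpu_buf;   /* per-cpu cached part (one CPU shown) */
  static atomic_long batched;      /* what a reader of the batched counter sees */
  static atomic_long unbuffered;   /* what memory.events uses after this patch */

  static void count_batched(void)
  {
          if (++percpu_buf > BATCH) {     /* flush only past the threshold */
                  atomic_fetch_add(&batched, percpu_buf);
                  percpu_buf = 0;
          }
  }

  static void count_unbuffered(void)
  {
          atomic_fetch_add(&unbuffered, 1);
  }

  int main(void)
  {
          for (int i = 0; i < 10; i++) {  /* ten rare events occur... */
                  count_batched();
                  count_unbuffered();
          }
          /* ...a poller of the batched view still reads 0 after its wakeup */
          printf("batched view:    %ld\n", atomic_load(&batched));
          printf("unbuffered view: %ld\n", atomic_load(&unbuffered));
          return 0;
  }
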
diff --git a/queue-4.14/mm-memcontrol-fix-nr_writeback-leak-in-memcg-and-system-stats.patch b/queue-4.14/mm-memcontrol-fix-nr_writeback-leak-in-memcg-and-system-stats.patch
new file mode 100644 (file)
index 0000000..561d412
--- /dev/null
@@ -0,0 +1,107 @@
+From foo@baz Sat Apr  3 10:50:40 AM CEST 2021
+From: Frank van der Linden <fllinden@amazon.com>
+Date: Tue, 30 Mar 2021 18:19:06 +0000
+Subject: mm: memcontrol: fix NR_WRITEBACK leak in memcg and system stats
+To: <stable@vger.kernel.org>
+Cc: <fllinden@amazon.com>
+Message-ID: <20210330181910.15378-2-fllinden@amazon.com>
+
+From: Frank van der Linden <fllinden@amazon.com>
+
+From: Johannes Weiner <hannes@cmpxchg.org>
+
+commit c3cc39118c3610eb6ab4711bc624af7fc48a35fe upstream.
+
+After commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
+memory.stat reporting"), we observed NR_WRITEBACK counts creeping slowly
+upward over the course of several days, in both the per-memcg stats and
+the system counter in e.g.  /proc/meminfo.
+
+The conversion from full per-cpu stat counts to per-cpu cached atomic
+stat counts introduced an irq-unsafe RMW operation into the updates.
+
+Most stat updates come from process context, but one notable exception
+is the NR_WRITEBACK counter.  While writebacks are issued from process
+context, they are retired from (soft)irq context.
+
+When writeback completions interrupt the RMW counter updates of new
+writebacks being issued, the decs from the completions are lost.
+
+Since the global updates are routed through the joint lruvec API, both
+the memcg counters as well as the system counters are affected.
+
+This patch makes the joint stat and event API irq safe.
+
+Link: http://lkml.kernel.org/r/20180203082353.17284-1-hannes@cmpxchg.org
+Fixes: a983b5ebee57 ("mm: memcontrol: fix excessive complexity in memory.stat reporting")
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+Debugged-by: Tejun Heo <tj@kernel.org>
+Reviewed-by: Rik van Riel <riel@surriel.com>
+Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/memcontrol.h |   24 ++++++++++++++++--------
+ 1 file changed, 16 insertions(+), 8 deletions(-)
+
+--- a/include/linux/memcontrol.h
++++ b/include/linux/memcontrol.h
+@@ -523,9 +523,11 @@ static inline void __mod_memcg_state(str
+ static inline void mod_memcg_state(struct mem_cgroup *memcg,
+                                  int idx, int val)
+ {
+-      preempt_disable();
++      unsigned long flags;
++
++      local_irq_save(flags);
+       __mod_memcg_state(memcg, idx, val);
+-      preempt_enable();
++      local_irq_restore(flags);
+ }
+ /**
+@@ -606,9 +608,11 @@ static inline void __mod_lruvec_state(st
+ static inline void mod_lruvec_state(struct lruvec *lruvec,
+                                   enum node_stat_item idx, int val)
+ {
+-      preempt_disable();
++      unsigned long flags;
++
++      local_irq_save(flags);
+       __mod_lruvec_state(lruvec, idx, val);
+-      preempt_enable();
++      local_irq_restore(flags);
+ }
+ static inline void __mod_lruvec_page_state(struct page *page,
+@@ -630,9 +634,11 @@ static inline void __mod_lruvec_page_sta
+ static inline void mod_lruvec_page_state(struct page *page,
+                                        enum node_stat_item idx, int val)
+ {
+-      preempt_disable();
++      unsigned long flags;
++
++      local_irq_save(flags);
+       __mod_lruvec_page_state(page, idx, val);
+-      preempt_enable();
++      local_irq_restore(flags);
+ }
+ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
+@@ -659,9 +665,11 @@ static inline void __count_memcg_events(
+ static inline void count_memcg_events(struct mem_cgroup *memcg,
+                                     int idx, unsigned long count)
+ {
+-      preempt_disable();
++      unsigned long flags;
++
++      local_irq_save(flags);
+       __count_memcg_events(memcg, idx, count);
+-      preempt_enable();
++      local_irq_restore(flags);
+ }
+ /* idx can be of type enum memcg_event_item or vm_event_item */
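
The fix swaps preempt_disable()/preempt_enable() for local_irq_save()/
local_irq_restore() because the racing updater is an interrupt on the same
CPU, which preemption control does not keep out.  A rough userspace analogue
(an illustration only, not the kernel code) uses a signal handler as the
"completion interrupt" and blocks the signal around the read-modify-write,
playing the role of local_irq_save().

  /*
   * Userspace analogue (illustration only, not kernel code) of the race
   * fixed above: writeback completions retire from (soft)irq context and
   * can interrupt the non-atomic read-modify-write done in process context.
   * Blocking the signal around the RMW plays the role of local_irq_save()/
   * local_irq_restore(); disabling preemption would not keep the handler out.
   */
  #include <signal.h>
  #include <stdio.h>
  #include <unistd.h>

  static volatile long nr_writeback;          /* the shared counter */

  static void writeback_done(int sig)         /* "(soft)irq context" */
  {
          (void)sig;
          nr_writeback = nr_writeback - 1;    /* RMW that could be lost */
  }

  static void mod_counter_irqsafe(long delta) /* "process context" */
  {
          sigset_t block, old;

          sigemptyset(&block);
          sigaddset(&block, SIGALRM);
          sigprocmask(SIG_BLOCK, &block, &old);   /* ~ local_irq_save()    */
          nr_writeback = nr_writeback + delta;    /* the RMW can no longer be
                                                   * interrupted by the handler */
          sigprocmask(SIG_SETMASK, &old, NULL);   /* ~ local_irq_restore() */
  }

  int main(void)
  {
          signal(SIGALRM, writeback_done);
          alarm(1);                       /* one "completion interrupt" pending */
          for (int i = 0; i < 1000; i++)
                  mod_counter_irqsafe(1); /* issue 1000 writebacks, irq-safely */
          pause();                        /* let the completion handler run */
          printf("nr_writeback = %ld\n", nr_writeback);   /* 999, nothing lost */
          return 0;
  }
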
diff --git a/queue-4.14/mm-writeback-use-exact-memcg-dirty-counts.patch b/queue-4.14/mm-writeback-use-exact-memcg-dirty-counts.patch
new file mode 100644 (file)
index 0000000..97643c0
--- /dev/null
@@ -0,0 +1,227 @@
+From foo@baz Sat Apr  3 10:50:40 AM CEST 2021
+From: Frank van der Linden <fllinden@amazon.com>
+Date: Tue, 30 Mar 2021 18:19:10 +0000
+Subject: mm: writeback: use exact memcg dirty counts
+To: <stable@vger.kernel.org>
+Cc: <fllinden@amazon.com>
+Message-ID: <20210330181910.15378-6-fllinden@amazon.com>
+
+From: Frank van der Linden <fllinden@amazon.com>
+
+From: Greg Thelen <gthelen@google.com>
+
+commit 0b3d6e6f2dd0a7b697b1aa8c167265908940624b upstream.
+
+Since commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
+memory.stat reporting") memcg dirty and writeback counters are managed
+as:
+
+ 1) per-memcg per-cpu values in range of [-32..32]
+
+ 2) per-memcg atomic counter
+
+When a per-cpu counter cannot fit in [-32..32] it's flushed to the
+atomic.  Stat readers only check the atomic.  Thus readers such as
+balance_dirty_pages() may see a nontrivial error margin: 32 pages per
+cpu.
+
+Assuming 100 cpus:
+   4k x86 page_size:  13 MiB error per memcg
+  64k ppc page_size: 200 MiB error per memcg
+
+Considering that dirty+writeback are used together for some decisions,
+the errors double.
+
+This inaccuracy can lead to undeserved oom kills.  One nasty case is
+when all per-cpu counters hold positive values offsetting an atomic
+negative value (i.e.  per_cpu[*]=32, atomic=n_cpu*-32).
+balance_dirty_pages() only consults the atomic and does not consider
+throttling the next n_cpu*32 dirty pages.  If the file_lru is in the
+13..200 MiB range then there's absolutely no dirty throttling, which
+burdens vmscan with only dirty+writeback pages thus resorting to oom
+kill.
+
+It could be argued that tiny containers are not supported, but it's more
+subtle.  It's the amount of space available for the file lru that
+matters.  If a container has memory.max - 200MiB of non-reclaimable
+memory, then it will also suffer such oom kills on a 100 cpu machine.
+
+The following test reliably ooms without this patch.  This patch avoids
+oom kills.
+
+  $ cat test
+  mount -t cgroup2 none /dev/cgroup
+  cd /dev/cgroup
+  echo +io +memory > cgroup.subtree_control
+  mkdir test
+  cd test
+  echo 10M > memory.max
+  (echo $BASHPID > cgroup.procs && exec /memcg-writeback-stress /foo)
+  (echo $BASHPID > cgroup.procs && exec dd if=/dev/zero of=/foo bs=2M count=100)
+
+  $ cat memcg-writeback-stress.c
+  /*
+   * Dirty pages from all but one cpu.
+   * Clean pages from the non dirtying cpu.
+   * This is to stress per cpu counter imbalance.
+   * On a 100 cpu machine:
+   * - per memcg per cpu dirty count is 32 pages for each of 99 cpus
+   * - per memcg atomic is -99*32 pages
+   * - thus the complete dirty limit: sum of all counters 0
+   * - balance_dirty_pages() only sees atomic count -99*32 pages, which
+   *   it max()s to 0.
+   * - So a workload can dirty -99*32 pages before balance_dirty_pages()
+   *   cares.
+   */
+  #define _GNU_SOURCE
+  #include <err.h>
+  #include <fcntl.h>
+  #include <sched.h>
+  #include <stdlib.h>
+  #include <stdio.h>
+  #include <sys/stat.h>
+  #include <sys/sysinfo.h>
+  #include <sys/types.h>
+  #include <unistd.h>
+
+  static char *buf;
+  static int bufSize;
+
+  static void set_affinity(int cpu)
+  {
+       cpu_set_t affinity;
+
+       CPU_ZERO(&affinity);
+       CPU_SET(cpu, &affinity);
+       if (sched_setaffinity(0, sizeof(affinity), &affinity))
+               err(1, "sched_setaffinity");
+  }
+
+  static void dirty_on(int output_fd, int cpu)
+  {
+       int i, wrote;
+
+       set_affinity(cpu);
+       for (i = 0; i < 32; i++) {
+               for (wrote = 0; wrote < bufSize; ) {
+                       int ret = write(output_fd, buf+wrote, bufSize-wrote);
+                       if (ret == -1)
+                               err(1, "write");
+                       wrote += ret;
+               }
+       }
+  }
+
+  int main(int argc, char **argv)
+  {
+       int cpu, flush_cpu = 1, output_fd;
+       const char *output;
+
+       if (argc != 2)
+               errx(1, "usage: output_file");
+
+       output = argv[1];
+       bufSize = getpagesize();
+       buf = malloc(getpagesize());
+       if (buf == NULL)
+               errx(1, "malloc failed");
+
+       output_fd = open(output, O_CREAT|O_RDWR);
+       if (output_fd == -1)
+               err(1, "open(%s)", output);
+
+       for (cpu = 0; cpu < get_nprocs(); cpu++) {
+               if (cpu != flush_cpu)
+                       dirty_on(output_fd, cpu);
+       }
+
+       set_affinity(flush_cpu);
+       if (fsync(output_fd))
+               err(1, "fsync(%s)", output);
+       if (close(output_fd))
+               err(1, "close(%s)", output);
+       free(buf);
+  }
+
+Make balance_dirty_pages() and wb_over_bg_thresh() work harder to
+collect exact per memcg counters.  This avoids the aforementioned oom
+kills.
+
+This does not affect the overhead of memory.stat, which still reads the
+single atomic counter.
+
+Why not use percpu_counter? memcg already handles cpus going offline, so
+no need for that overhead from percpu_counter.  And the percpu_counter
+spinlocks are more heavyweight than is required.
+
+It probably also makes sense to use exact dirty and writeback counters
+in memcg oom reports.  But that is saved for later.
+
+Link: http://lkml.kernel.org/r/20190329174609.164344-1-gthelen@google.com
+Signed-off-by: Greg Thelen <gthelen@google.com>
+Reviewed-by: Roman Gushchin <guro@fb.com>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: <stable@vger.kernel.org>   [4.16+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/memcontrol.h |    5 ++++-
+ mm/memcontrol.c            |   20 ++++++++++++++++++--
+ 2 files changed, 22 insertions(+), 3 deletions(-)
+
+--- a/include/linux/memcontrol.h
++++ b/include/linux/memcontrol.h
+@@ -507,7 +507,10 @@ struct mem_cgroup *lock_page_memcg(struc
+ void __unlock_page_memcg(struct mem_cgroup *memcg);
+ void unlock_page_memcg(struct page *page);
+-/* idx can be of type enum memcg_stat_item or node_stat_item */
++/*
++ * idx can be of type enum memcg_stat_item or node_stat_item.
++ * Keep in sync with memcg_exact_page_state().
++ */
+ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
+                                            int idx)
+ {
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -3701,6 +3701,22 @@ struct wb_domain *mem_cgroup_wb_domain(s
+       return &memcg->cgwb_domain;
+ }
++/*
++ * idx can be of type enum memcg_stat_item or node_stat_item.
++ * Keep in sync with memcg_exact_page().
++ */
++static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
++{
++      long x = atomic_long_read(&memcg->stat[idx]);
++      int cpu;
++
++      for_each_online_cpu(cpu)
++              x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx];
++      if (x < 0)
++              x = 0;
++      return x;
++}
++
+ /**
+  * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
+  * @wb: bdi_writeback in question
+@@ -3726,10 +3742,10 @@ void mem_cgroup_wb_stats(struct bdi_writ
+       struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+       struct mem_cgroup *parent;
+-      *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
++      *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
+       /* this should eventually include NR_UNSTABLE_NFS */
+-      *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
++      *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
+       *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
+                                                    (1 << LRU_ACTIVE_FILE));
+       *pheadroom = PAGE_COUNTER_MAX;
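
The helper added above is just "flushed atomic plus every CPU's unflushed
delta, clamped at zero".  Below is a standalone sketch of the difference it
makes, not the kernel code; the 100 CPUs and the 32-page per-cpu batch are
taken from the changelog's own example, everything else is illustrative.

  /*
   * Sketch (not kernel code) of approximate vs. exact memcg page state.
   * 100 CPUs and the [-32..32] per-cpu batch match the numbers used in the
   * changelog above.
   */
  #include <stdio.h>

  #define NCPU 100

  struct counter {
          long atomic_part;       /* flushed total; can lag by 32 pages per CPU */
          long percpu[NCPU];      /* unflushed deltas, each within [-32..32] */
  };

  /* what memcg_page_state() reads: cheap, but ignores the per-cpu deltas */
  static long approx_state(const struct counter *c)
  {
          long x = c->atomic_part;

          return x < 0 ? 0 : x;
  }

  /* what memcg_exact_page_state() reads: folds in every CPU's delta */
  static long exact_state(const struct counter *c)
  {
          long x = c->atomic_part;

          for (int cpu = 0; cpu < NCPU; cpu++)
                  x += c->percpu[cpu];
          return x < 0 ? 0 : x;
  }

  int main(void)
  {
          static struct counter dirty;    /* atomic_part == 0 */

          /* every CPU holds the maximum unflushed delta of 32 dirty pages */
          for (int cpu = 0; cpu < NCPU; cpu++)
                  dirty.percpu[cpu] = 32;

          /* approx: 0 pages; exact: 3200 pages (~13 MiB with 4k pages) */
          printf("approx: %ld pages\n", approx_state(&dirty));
          printf("exact:  %ld pages\n",  exact_state(&dirty));
          return 0;
  }
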
diff --git a/queue-4.14/series b/queue-4.14/series
index a165ff5d45078c27227e0a6909334a28df43042a..4179322da8073b05aa142d442c0371f16fc824e6 100644
--- a/queue-4.14/series
+++ b/queue-4.14/series
@@ -29,3 +29,8 @@ mm-fix-race-by-making-init_zero_pfn-early_initcall.patch
 drm-amdgpu-fix-offset-calculation-in-amdgpu_vm_bo_clear_mappings.patch
 drm-amdgpu-check-alignment-on-cpu-page-for-bo-map.patch
 reiserfs-update-reiserfs_xattrs_initialized-condition.patch
+mm-memcontrol-fix-nr_writeback-leak-in-memcg-and-system-stats.patch
+mm-memcg-make-sure-memory.events-is-uptodate-when-waking-pollers.patch
+mem_cgroup-make-sure-moving_account-move_lock_task-and-stat_cpu-in-the-same-cacheline.patch
+mm-fix-oom_kill-event-handling.patch
+mm-writeback-use-exact-memcg-dirty-counts.patch