--- /dev/null
+From foo@baz Sat Apr 3 10:50:40 AM CEST 2021
+From: Frank van der Linden <fllinden@amazon.com>
+Date: Tue, 30 Mar 2021 18:19:08 +0000
+Subject: mem_cgroup: make sure moving_account, move_lock_task and stat_cpu in the same cacheline
+To: <stable@vger.kernel.org>
+Cc: <fllinden@amazon.com>
+Message-ID: <20210330181910.15378-4-fllinden@amazon.com>
+
+From: Frank van der Linden <fllinden@amazon.com>
+
+From: Aaron Lu <aaron.lu@intel.com>
+
+commit e81bf9793b1861d74953ef041b4f6c7faecc2dbd upstream.
+
+The LKP robot found a 27% will-it-scale/page_fault3 performance
+regression caused by commit e27be240df53 ("mm: memcg: make sure
+memory.events is uptodate when waking pollers").
+
+What the test does is:
+ 1 mkstemp() a 128M file on a tmpfs;
+ 2 start $nr_cpu processes, each to loop the following:
+ 2.1 mmap() this file in shared write mode;
+ 2.2 write 0 to this file in a PAGE_SIZE step till the end of the file;
+ 2.3 unmap() this file and repeat this process.
+ 3 After 5 minutes, check how many loops they managed to complete, the
+ higher the better.
+
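+For reference, a minimal userspace sketch of the loop in steps 2.1-2.3
+(not the actual will-it-scale harness; the loop counting, error handling
+and 5-minute timer are simplified away, and /tmp is assumed to be a
+tmpfs):
+
+ #include <stdlib.h>
+ #include <sys/mman.h>
+ #include <unistd.h>
+
+ #define FILE_SIZE (128UL << 20)         /* the 128M tmpfs file */
+
+ int main(void)
+ {
+         char path[] = "/tmp/page_fault3_XXXXXX";
+         int fd = mkstemp(path);
+         size_t off;
+
+         if (fd < 0 || ftruncate(fd, FILE_SIZE))
+                 return 1;
+
+         for (;;) {      /* each of the $nr_cpu processes loops like this */
+                 char *p = mmap(NULL, FILE_SIZE, PROT_READ | PROT_WRITE,
+                                MAP_SHARED, fd, 0);
+
+                 if (p == MAP_FAILED)
+                         return 1;
+                 /* each store below is a shared write fault, exercising
+                  * handle_mm_fault() and page_add_file_rmap() */
+                 for (off = 0; off < FILE_SIZE; off += getpagesize())
+                         p[off] = 0;
+                 munmap(p, FILE_SIZE);
+         }
+ }
+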
+The commit itself looks innocent enough, as it merely changed some event
+counting mechanism and this test didn't trigger those events at all.
+Perf shows increased cycles spent on accessing root_mem_cgroup->stat_cpu
+in count_memcg_event_mm() (called by handle_mm_fault()) and in
+__mod_memcg_state() (called by page_add_file_rmap()). So the regression
+is likely due to the changed layout of 'struct mem_cgroup': either
+stat_cpu now falls into a constantly modified cacheline, or some hot
+fields no longer share a cacheline.
+
+I verified this by moving memory_events[] back to where it was:
+
+: --- a/include/linux/memcontrol.h
+: +++ b/include/linux/memcontrol.h
+: @@ -205,7 +205,6 @@ struct mem_cgroup {
+: int oom_kill_disable;
+:
+: /* memory.events */
+: - atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
+: struct cgroup_file events_file;
+:
+: /* protect arrays of thresholds */
+: @@ -238,6 +237,7 @@ struct mem_cgroup {
+: struct mem_cgroup_stat_cpu __percpu *stat_cpu;
+: atomic_long_t stat[MEMCG_NR_STAT];
+: atomic_long_t events[NR_VM_EVENT_ITEMS];
+: + atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
+:
+: unsigned long socket_pressure;
+
+And performance restored.
+
+Later investigation found that as long as the three fields
+moving_account, move_lock_task and stat_cpu are in the same cacheline,
+performance is good. To avoid future performance surprises from other
+commits changing the layout of 'struct mem_cgroup', this patch makes
+sure these three fields stay in the same cacheline.
+
+One concern with this approach: moving_account and move_lock_task can
+be modified when a process changes memory cgroup, while stat_cpu is an
+always-read field, so placing them in the same cacheline might hurt. I
+assume it is rare for a process to change memory cgroup, so this should
+be OK.
+
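+As an illustration of the padding technique, here is a minimal userspace
+sketch (not the kernel code): it assumes a 64-byte cacheline and uses a
+plain GCC alignment attribute where the kernel uses
+____cacheline_internodealigned_in_smp.
+
+ #include <stdio.h>
+ #include <stddef.h>
+
+ /* zero-size, cacheline-aligned member: occupies no space itself but
+  * forces whatever follows it onto a new cacheline */
+ struct pad { char x[0]; } __attribute__((aligned(64)));
+
+ struct demo {
+         long hot_write_counter;         /* frequently modified */
+         struct pad _pad1_;              /* next field starts a new line */
+         long read_mostly_field;         /* read-only in the fast path */
+ };
+
+ int main(void)
+ {
+         printf("hot_write_counter at offset %zu\n",
+                offsetof(struct demo, hot_write_counter));
+         printf("read_mostly_field at offset %zu\n",
+                offsetof(struct demo, read_mostly_field));
+         return 0;
+ }
+
+Printing the offsets shows the second field starting at byte 64, i.e. on
+its own cacheline. The patch below uses the same trick twice: _pad1_
+keeps unrelated write traffic off the moving_account/move_lock_task/
+stat_cpu group, and _pad2_ keeps that group's line separate from the
+constantly modified stat[] and events[] atomics.
+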
+Link: https://lkml.kernel.org/r/20180528114019.GF9904@yexl-desktop
+Link: http://lkml.kernel.org/r/20180601071115.GA27302@intel.com
+Signed-off-by: Aaron Lu <aaron.lu@intel.com>
+Reported-by: kernel test robot <xiaolong.ye@intel.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Tejun Heo <tj@kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/memcontrol.h | 23 +++++++++++++++++++----
+ 1 file changed, 19 insertions(+), 4 deletions(-)
+
+--- a/include/linux/memcontrol.h
++++ b/include/linux/memcontrol.h
+@@ -155,6 +155,15 @@ enum memcg_kmem_state {
+ KMEM_ONLINE,
+ };
+
++#if defined(CONFIG_SMP)
++struct memcg_padding {
++ char x[0];
++} ____cacheline_internodealigned_in_smp;
++#define MEMCG_PADDING(name) struct memcg_padding name;
++#else
++#define MEMCG_PADDING(name)
++#endif
++
+ /*
+ * The memory controller data structure. The memory controller controls both
+ * page cache and RSS per cgroup. We would eventually like to provide
+@@ -202,7 +211,6 @@ struct mem_cgroup {
+ int oom_kill_disable;
+
+ /* memory.events */
+- atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
+ struct cgroup_file events_file;
+
+ /* protect arrays of thresholds */
+@@ -222,19 +230,26 @@ struct mem_cgroup {
+ * mem_cgroup ? And what type of charges should we move ?
+ */
+ unsigned long move_charge_at_immigrate;
++ /* taken only while moving_account > 0 */
++ spinlock_t move_lock;
++ unsigned long move_lock_flags;
++
++ MEMCG_PADDING(_pad1_);
++
+ /*
+ * set > 0 if pages under this cgroup are moving to other cgroup.
+ */
+ atomic_t moving_account;
+- /* taken only while moving_account > 0 */
+- spinlock_t move_lock;
+ struct task_struct *move_lock_task;
+- unsigned long move_lock_flags;
+
+ /* memory.stat */
+ struct mem_cgroup_stat_cpu __percpu *stat_cpu;
++
++ MEMCG_PADDING(_pad2_);
++
+ atomic_long_t stat[MEMCG_NR_STAT];
+ atomic_long_t events[NR_VM_EVENT_ITEMS];
++ atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
+
+ unsigned long socket_pressure;
+
--- /dev/null
+From foo@baz Sat Apr 3 10:50:40 AM CEST 2021
+From: Frank van der Linden <fllinden@amazon.com>
+Date: Tue, 30 Mar 2021 18:19:09 +0000
+Subject: mm: fix oom_kill event handling
+To: <stable@vger.kernel.org>
+Cc: <fllinden@amazon.com>
+Message-ID: <20210330181910.15378-5-fllinden@amazon.com>
+
+From: Frank van der Linden <fllinden@amazon.com>
+
+From: Roman Gushchin <guro@fb.com>
+
+commit fe6bdfc8e1e131720abbe77a2eb990c94c9024cb upstream.
+
+Commit e27be240df53 ("mm: memcg: make sure memory.events is uptodate
+when waking pollers") converted most of the memcg event counters to
+per-memcg atomics, which made them less confusing for a user. The
+"oom_kill" counter remained untouched, so it now behaves differently
+from the other counters (including "oom"). This adds nothing but
+confusion.
+
+Let's fix this by adding the MEMCG_OOM_KILL event, and follow the
+MEMCG_OOM approach.
+
+This also removes the hack from count_memcg_event_mm() that was
+introduced earlier specifically for the OOM_KILL counter.
+
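+A rough userspace model of the MEMCG_OOM-style accounting that oom_kill
+now follows (illustrative only: the type and function names below are
+made up, and cgroup_file_notify() is stubbed out):
+
+ #include <stdatomic.h>
+ #include <stdio.h>
+
+ enum memcg_memory_event {
+         MEMCG_LOW, MEMCG_HIGH, MEMCG_MAX, MEMCG_OOM,
+         MEMCG_OOM_KILL,                 /* added by this patch */
+         MEMCG_NR_MEMORY_EVENTS,
+ };
+
+ struct fake_memcg {
+         atomic_long memory_events[MEMCG_NR_MEMORY_EVENTS];
+ };
+
+ static void notify_pollers(struct fake_memcg *memcg)
+ {
+         (void)memcg;    /* stands in for cgroup_file_notify() */
+ }
+
+ static void record_memory_event(struct fake_memcg *memcg,
+                                 enum memcg_memory_event event)
+ {
+         /* one unbuffered atomic per event: bump it, then wake pollers,
+          * so a woken reader always sees the new count */
+         atomic_fetch_add(&memcg->memory_events[event], 1);
+         notify_pollers(memcg);
+ }
+
+ int main(void)
+ {
+         struct fake_memcg m = { 0 };
+
+         record_memory_event(&m, MEMCG_OOM_KILL);
+         printf("oom_kill %ld\n",
+                atomic_load(&m.memory_events[MEMCG_OOM_KILL]));
+         return 0;
+ }
+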
+[akpm@linux-foundation.org: fix for droppage of memcg-replace-mm-owner-with-mm-memcg.patch]
+Link: http://lkml.kernel.org/r/20180508124637.29984-1-guro@fb.com
+Signed-off-by: Roman Gushchin <guro@fb.com>
+Acked-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+[fllinden@amazon.com: backport to 4.14, minor contextual changes]
+Signed-off-by: Frank van der Linden <fllinden@amazon.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/memcontrol.h | 26 ++++++++++++++++++++++----
+ mm/memcontrol.c | 6 ++++--
+ mm/oom_kill.c | 2 +-
+ 3 files changed, 27 insertions(+), 7 deletions(-)
+
+--- a/include/linux/memcontrol.h
++++ b/include/linux/memcontrol.h
+@@ -53,6 +53,7 @@ enum memcg_memory_event {
+ MEMCG_HIGH,
+ MEMCG_MAX,
+ MEMCG_OOM,
++ MEMCG_OOM_KILL,
+ MEMCG_NR_MEMORY_EVENTS,
+ };
+
+@@ -706,11 +707,8 @@ static inline void count_memcg_event_mm(
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+- if (likely(memcg)) {
++ if (likely(memcg))
+ count_memcg_events(memcg, idx, 1);
+- if (idx == OOM_KILL)
+- cgroup_file_notify(&memcg->events_file);
+- }
+ rcu_read_unlock();
+ }
+
+@@ -721,6 +719,21 @@ static inline void memcg_memory_event(st
+ cgroup_file_notify(&memcg->events_file);
+ }
+
++static inline void memcg_memory_event_mm(struct mm_struct *mm,
++ enum memcg_memory_event event)
++{
++ struct mem_cgroup *memcg;
++
++ if (mem_cgroup_disabled())
++ return;
++
++ rcu_read_lock();
++ memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
++ if (likely(memcg))
++ memcg_memory_event(memcg, event);
++ rcu_read_unlock();
++}
++
+ #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ void mem_cgroup_split_huge_fixup(struct page *head);
+ #endif
+@@ -742,6 +755,11 @@ static inline void memcg_memory_event(st
+ {
+ }
+
++static inline void memcg_memory_event_mm(struct mm_struct *mm,
++ enum memcg_memory_event event)
++{
++}
++
+ static inline bool mem_cgroup_low(struct mem_cgroup *root,
+ struct mem_cgroup *memcg)
+ {
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -3648,7 +3648,8 @@ static int mem_cgroup_oom_control_read(s
+
+ seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
+ seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
+- seq_printf(sf, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL));
++ seq_printf(sf, "oom_kill %lu\n",
++ atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
+ return 0;
+ }
+
+@@ -5320,7 +5321,8 @@ static int memory_events_show(struct seq
+ atomic_long_read(&memcg->memory_events[MEMCG_MAX]));
+ seq_printf(m, "oom %lu\n",
+ atomic_long_read(&memcg->memory_events[MEMCG_OOM]));
+- seq_printf(m, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL));
++ seq_printf(m, "oom_kill %lu\n",
++ atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
+
+ return 0;
+ }
+--- a/mm/oom_kill.c
++++ b/mm/oom_kill.c
+@@ -917,7 +917,7 @@ static void oom_kill_process(struct oom_
+
+ /* Raise event before sending signal: task reaper must see this */
+ count_vm_event(OOM_KILL);
+- count_memcg_event_mm(mm, OOM_KILL);
++ memcg_memory_event_mm(mm, MEMCG_OOM_KILL);
+
+ /*
+ * We should send SIGKILL before granting access to memory reserves
--- /dev/null
+From foo@baz Sat Apr 3 10:50:40 AM CEST 2021
+From: Frank van der Linden <fllinden@amazon.com>
+Date: Tue, 30 Mar 2021 18:19:07 +0000
+Subject: mm: memcg: make sure memory.events is uptodate when waking pollers
+To: <stable@vger.kernel.org>
+Cc: <fllinden@amazon.com>
+Message-ID: <20210330181910.15378-3-fllinden@amazon.com>
+
+From: Frank van der Linden <fllinden@amazon.com>
+
+From: Johannes Weiner <hannes@cmpxchg.org>
+
+commit e27be240df53f1a20c659168e722b5d9f16cc7f4 upstream.
+
+Commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
+memory.stat reporting") added per-cpu drift to all memory cgroup stats
+and events shown in memory.stat and memory.events.
+
+For memory.stat this is acceptable. But memory.events issues file
+notifications, and somebody polling the file for changes will be
+confused when the counters in it are unchanged after a wakeup.
+
+Luckily, the events in memory.events - MEMCG_LOW, MEMCG_HIGH, MEMCG_MAX,
+MEMCG_OOM - are sufficiently rare and high-level that we don't need
+per-cpu buffering for them: MEMCG_HIGH and MEMCG_MAX would be the most
+frequent, but they're counting invocations of reclaim, which is a
+complex operation that touches many shared cachelines.
+
+This splits memory.events from the generic VM events and tracks them in
+their own, unbuffered atomic counters. That's also cleaner, as it
+eliminates the ugly enum nesting of VM and cgroup events.
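+
+The difference can be sketched as follows (a userspace model with
+made-up names; the kernel's per-cpu batch of this era, 32 events, is
+approximated by a plain threshold):
+
+ #include <stdatomic.h>
+ #include <stdio.h>
+
+ #define NCPUS 4
+ #define BATCH 32                /* per-cpu batch before flushing */
+
+ static atomic_long total;       /* shared atomic that readers see */
+ static long percpu[NCPUS];      /* per-cpu buffers, hidden from readers */
+
+ /* batched scheme: cheap, but readers lag by up to NCPUS * BATCH events */
+ static void count_batched(int cpu, long n)
+ {
+         percpu[cpu] += n;
+         if (percpu[cpu] > BATCH) {
+                 atomic_fetch_add(&total, percpu[cpu]);
+                 percpu[cpu] = 0;
+         }
+ }
+
+ /* unbuffered scheme used for memory.events after this patch: every
+  * event hits the shared atomic, so a poller woken right after an
+  * event always sees it */
+ static void count_unbuffered(long n)
+ {
+         atomic_fetch_add(&total, n);
+ }
+
+ int main(void)
+ {
+         int cpu;
+
+         for (cpu = 0; cpu < NCPUS; cpu++)
+                 count_batched(cpu, 10);
+         printf("batched:    reader sees %ld of 40 events\n",
+                atomic_load(&total));
+
+         atomic_store(&total, 0);
+         for (cpu = 0; cpu < NCPUS; cpu++)
+                 count_unbuffered(10);
+         printf("unbuffered: reader sees %ld of 40 events\n",
+                atomic_load(&total));
+         return 0;
+ }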
+
+[hannes@cmpxchg.org: "array subscript is above array bounds"]
+ Link: http://lkml.kernel.org/r/20180406155441.GA20806@cmpxchg.org
+Link: http://lkml.kernel.org/r/20180405175507.GA24817@cmpxchg.org
+Fixes: a983b5ebee57 ("mm: memcontrol: fix excessive complexity in memory.stat reporting")
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+Reported-by: Tejun Heo <tj@kernel.org>
+Acked-by: Tejun Heo <tj@kernel.org>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Roman Gushchin <guro@fb.com>
+Cc: Rik van Riel <riel@surriel.com>
+Cc: Stephen Rothwell <sfr@canb.auug.org.au>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/memcontrol.h | 35 ++++++++++++++++++-----------------
+ mm/memcontrol.c | 28 ++++++++++++++++------------
+ mm/vmscan.c | 2 +-
+ 3 files changed, 35 insertions(+), 30 deletions(-)
+
+--- a/include/linux/memcontrol.h
++++ b/include/linux/memcontrol.h
+@@ -48,13 +48,12 @@ enum memcg_stat_item {
+ MEMCG_NR_STAT,
+ };
+
+-/* Cgroup-specific events, on top of universal VM events */
+-enum memcg_event_item {
+- MEMCG_LOW = NR_VM_EVENT_ITEMS,
++enum memcg_memory_event {
++ MEMCG_LOW,
+ MEMCG_HIGH,
+ MEMCG_MAX,
+ MEMCG_OOM,
+- MEMCG_NR_EVENTS,
++ MEMCG_NR_MEMORY_EVENTS,
+ };
+
+ struct mem_cgroup_reclaim_cookie {
+@@ -88,7 +87,7 @@ enum mem_cgroup_events_target {
+
+ struct mem_cgroup_stat_cpu {
+ long count[MEMCG_NR_STAT];
+- unsigned long events[MEMCG_NR_EVENTS];
++ unsigned long events[NR_VM_EVENT_ITEMS];
+ unsigned long nr_page_events;
+ unsigned long targets[MEM_CGROUP_NTARGETS];
+ };
+@@ -202,7 +201,8 @@ struct mem_cgroup {
+ /* OOM-Killer disable */
+ int oom_kill_disable;
+
+- /* handle for "memory.events" */
++ /* memory.events */
++ atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
+ struct cgroup_file events_file;
+
+ /* protect arrays of thresholds */
+@@ -231,9 +231,10 @@ struct mem_cgroup {
+ struct task_struct *move_lock_task;
+ unsigned long move_lock_flags;
+
++ /* memory.stat */
+ struct mem_cgroup_stat_cpu __percpu *stat_cpu;
+ atomic_long_t stat[MEMCG_NR_STAT];
+- atomic_long_t events[MEMCG_NR_EVENTS];
++ atomic_long_t events[NR_VM_EVENT_ITEMS];
+
+ unsigned long socket_pressure;
+
+@@ -645,9 +646,9 @@ unsigned long mem_cgroup_soft_limit_recl
+ gfp_t gfp_mask,
+ unsigned long *total_scanned);
+
+-/* idx can be of type enum memcg_event_item or vm_event_item */
+ static inline void __count_memcg_events(struct mem_cgroup *memcg,
+- int idx, unsigned long count)
++ enum vm_event_item idx,
++ unsigned long count)
+ {
+ unsigned long x;
+
+@@ -663,7 +664,8 @@ static inline void __count_memcg_events(
+ }
+
+ static inline void count_memcg_events(struct mem_cgroup *memcg,
+- int idx, unsigned long count)
++ enum vm_event_item idx,
++ unsigned long count)
+ {
+ unsigned long flags;
+
+@@ -672,9 +674,8 @@ static inline void count_memcg_events(st
+ local_irq_restore(flags);
+ }
+
+-/* idx can be of type enum memcg_event_item or vm_event_item */
+ static inline void count_memcg_page_event(struct page *page,
+- int idx)
++ enum vm_event_item idx)
+ {
+ if (page->mem_cgroup)
+ count_memcg_events(page->mem_cgroup, idx, 1);
+@@ -698,10 +699,10 @@ static inline void count_memcg_event_mm(
+ rcu_read_unlock();
+ }
+
+-static inline void mem_cgroup_event(struct mem_cgroup *memcg,
+- enum memcg_event_item event)
++static inline void memcg_memory_event(struct mem_cgroup *memcg,
++ enum memcg_memory_event event)
+ {
+- count_memcg_events(memcg, event, 1);
++ atomic_long_inc(&memcg->memory_events[event]);
+ cgroup_file_notify(&memcg->events_file);
+ }
+
+@@ -721,8 +722,8 @@ static inline bool mem_cgroup_disabled(v
+ return true;
+ }
+
+-static inline void mem_cgroup_event(struct mem_cgroup *memcg,
+- enum memcg_event_item event)
++static inline void memcg_memory_event(struct mem_cgroup *memcg,
++ enum memcg_memory_event event)
+ {
+ }
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -1872,7 +1872,7 @@ static int memcg_hotplug_cpu_dead(unsign
+ }
+ }
+
+- for (i = 0; i < MEMCG_NR_EVENTS; i++) {
++ for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
+ long x;
+
+ x = this_cpu_xchg(memcg->stat_cpu->events[i], 0);
+@@ -1891,7 +1891,7 @@ static void reclaim_high(struct mem_cgro
+ do {
+ if (page_counter_read(&memcg->memory) <= memcg->high)
+ continue;
+- mem_cgroup_event(memcg, MEMCG_HIGH);
++ memcg_memory_event(memcg, MEMCG_HIGH);
+ try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
+ } while ((memcg = parent_mem_cgroup(memcg)));
+ }
+@@ -1982,7 +1982,7 @@ retry:
+ if (!gfpflags_allow_blocking(gfp_mask))
+ goto nomem;
+
+- mem_cgroup_event(mem_over_limit, MEMCG_MAX);
++ memcg_memory_event(mem_over_limit, MEMCG_MAX);
+
+ nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
+ gfp_mask, may_swap);
+@@ -2025,7 +2025,7 @@ retry:
+ if (fatal_signal_pending(current))
+ goto force;
+
+- mem_cgroup_event(mem_over_limit, MEMCG_OOM);
++ memcg_memory_event(mem_over_limit, MEMCG_OOM);
+
+ mem_cgroup_oom(mem_over_limit, gfp_mask,
+ get_order(nr_pages * PAGE_SIZE));
+@@ -2790,10 +2790,10 @@ static void tree_events(struct mem_cgrou
+ struct mem_cgroup *iter;
+ int i;
+
+- memset(events, 0, sizeof(*events) * MEMCG_NR_EVENTS);
++ memset(events, 0, sizeof(*events) * NR_VM_EVENT_ITEMS);
+
+ for_each_mem_cgroup_tree(iter, memcg) {
+- for (i = 0; i < MEMCG_NR_EVENTS; i++)
++ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ events[i] += memcg_sum_events(iter, i);
+ }
+ }
+@@ -5299,7 +5299,7 @@ static ssize_t memory_max_write(struct k
+ continue;
+ }
+
+- mem_cgroup_event(memcg, MEMCG_OOM);
++ memcg_memory_event(memcg, MEMCG_OOM);
+ if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
+ break;
+ }
+@@ -5312,10 +5312,14 @@ static int memory_events_show(struct seq
+ {
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+
+- seq_printf(m, "low %lu\n", memcg_sum_events(memcg, MEMCG_LOW));
+- seq_printf(m, "high %lu\n", memcg_sum_events(memcg, MEMCG_HIGH));
+- seq_printf(m, "max %lu\n", memcg_sum_events(memcg, MEMCG_MAX));
+- seq_printf(m, "oom %lu\n", memcg_sum_events(memcg, MEMCG_OOM));
++ seq_printf(m, "low %lu\n",
++ atomic_long_read(&memcg->memory_events[MEMCG_LOW]));
++ seq_printf(m, "high %lu\n",
++ atomic_long_read(&memcg->memory_events[MEMCG_HIGH]));
++ seq_printf(m, "max %lu\n",
++ atomic_long_read(&memcg->memory_events[MEMCG_MAX]));
++ seq_printf(m, "oom %lu\n",
++ atomic_long_read(&memcg->memory_events[MEMCG_OOM]));
+ seq_printf(m, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL));
+
+ return 0;
+@@ -5325,7 +5329,7 @@ static int memory_stat_show(struct seq_f
+ {
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ unsigned long stat[MEMCG_NR_STAT];
+- unsigned long events[MEMCG_NR_EVENTS];
++ unsigned long events[NR_VM_EVENT_ITEMS];
+ int i;
+
+ /*
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2628,7 +2628,7 @@ static bool shrink_node(pg_data_t *pgdat
+ sc->memcg_low_skipped = 1;
+ continue;
+ }
+- mem_cgroup_event(memcg, MEMCG_LOW);
++ memcg_memory_event(memcg, MEMCG_LOW);
+ }
+
+ reclaimed = sc->nr_reclaimed;
--- /dev/null
+From foo@baz Sat Apr 3 10:50:40 AM CEST 2021
+From: Frank van der Linden <fllinden@amazon.com>
+Date: Tue, 30 Mar 2021 18:19:06 +0000
+Subject: mm: memcontrol: fix NR_WRITEBACK leak in memcg and system stats
+To: <stable@vger.kernel.org>
+Cc: <fllinden@amazon.com>
+Message-ID: <20210330181910.15378-2-fllinden@amazon.com>
+
+From: Frank van der Linden <fllinden@amazon.com>
+
+From: Johannes Weiner <hannes@cmpxchg.org>
+
+commit c3cc39118c3610eb6ab4711bc624af7fc48a35fe upstream.
+
+After commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
+memory.stat reporting"), we observed slowly upward creeping NR_WRITEBACK
+counts over the course of several days, both the per-memcg stats as well
+as the system counter in e.g. /proc/meminfo.
+
+The conversion from full per-cpu stat counts to per-cpu cached atomic
+stat counts introduced an irq-unsafe RMW operation into the updates.
+
+Most stat updates come from process context, but one notable exception
+is the NR_WRITEBACK counter. While writebacks are issued from process
+context, they are retired from (soft)irq context.
+
+When writeback completions interrupt the RMW counter updates of new
+writebacks being issued, the decs from the completions are lost.
+
+Since the global updates are routed through the joint lruvec API, both
+the memcg counters as well as the system counters are affected.
+
+This patch makes the joint stat and event API irq safe.
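+
+The lost update can be written out step by step (a userspace sketch;
+this_cpu_read/this_cpu_write stand in for the kernel's per-cpu
+accessors, and the "interrupt" is simulated inline rather than raced
+for real):
+
+ #include <stdio.h>
+
+ static long cached_nr_writeback;        /* per-cpu cached counter */
+
+ static long this_cpu_read(void)   { return cached_nr_writeback; }
+ static void this_cpu_write(long v) { cached_nr_writeback = v; }
+
+ int main(void)
+ {
+         /* process context accounts a new writeback page (+1); with
+          * only preempt_disable() this read-modify-write cannot be
+          * preempted, but it can still be interrupted */
+         long x = this_cpu_read() + 1;
+
+         /* ...(soft)irq fires here: a writeback completes on the same
+          * CPU and retires its page (-1) */
+         this_cpu_write(this_cpu_read() - 1);    /* cached value: -1 */
+
+         /* process context resumes and stores its stale result,
+          * overwriting the decrement: the completion is lost */
+         this_cpu_write(x);
+
+         printf("counter = %ld (expected 0)\n", this_cpu_read());
+         return 0;
+ }
+
+Disabling interrupts around the read-modify-write, as mod_memcg_state()
+and friends do below with local_irq_save()/local_irq_restore(), closes
+this window.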
+
+Link: http://lkml.kernel.org/r/20180203082353.17284-1-hannes@cmpxchg.org
+Fixes: a983b5ebee57 ("mm: memcontrol: fix excessive complexity in memory.stat reporting")
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+Debugged-by: Tejun Heo <tj@kernel.org>
+Reviewed-by: Rik van Riel <riel@surriel.com>
+Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/memcontrol.h | 24 ++++++++++++++++--------
+ 1 file changed, 16 insertions(+), 8 deletions(-)
+
+--- a/include/linux/memcontrol.h
++++ b/include/linux/memcontrol.h
+@@ -523,9 +523,11 @@ static inline void __mod_memcg_state(str
+ static inline void mod_memcg_state(struct mem_cgroup *memcg,
+ int idx, int val)
+ {
+- preempt_disable();
++ unsigned long flags;
++
++ local_irq_save(flags);
+ __mod_memcg_state(memcg, idx, val);
+- preempt_enable();
++ local_irq_restore(flags);
+ }
+
+ /**
+@@ -606,9 +608,11 @@ static inline void __mod_lruvec_state(st
+ static inline void mod_lruvec_state(struct lruvec *lruvec,
+ enum node_stat_item idx, int val)
+ {
+- preempt_disable();
++ unsigned long flags;
++
++ local_irq_save(flags);
+ __mod_lruvec_state(lruvec, idx, val);
+- preempt_enable();
++ local_irq_restore(flags);
+ }
+
+ static inline void __mod_lruvec_page_state(struct page *page,
+@@ -630,9 +634,11 @@ static inline void __mod_lruvec_page_sta
+ static inline void mod_lruvec_page_state(struct page *page,
+ enum node_stat_item idx, int val)
+ {
+- preempt_disable();
++ unsigned long flags;
++
++ local_irq_save(flags);
+ __mod_lruvec_page_state(page, idx, val);
+- preempt_enable();
++ local_irq_restore(flags);
+ }
+
+ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
+@@ -659,9 +665,11 @@ static inline void __count_memcg_events(
+ static inline void count_memcg_events(struct mem_cgroup *memcg,
+ int idx, unsigned long count)
+ {
+- preempt_disable();
++ unsigned long flags;
++
++ local_irq_save(flags);
+ __count_memcg_events(memcg, idx, count);
+- preempt_enable();
++ local_irq_restore(flags);
+ }
+
+ /* idx can be of type enum memcg_event_item or vm_event_item */
--- /dev/null
+From foo@baz Sat Apr 3 10:50:40 AM CEST 2021
+From: Frank van der Linden <fllinden@amazon.com>
+Date: Tue, 30 Mar 2021 18:19:10 +0000
+Subject: mm: writeback: use exact memcg dirty counts
+To: <stable@vger.kernel.org>
+Cc: <fllinden@amazon.com>
+Message-ID: <20210330181910.15378-6-fllinden@amazon.com>
+
+From: Frank van der Linden <fllinden@amazon.com>
+
+From: Greg Thelen <gthelen@google.com>
+
+commit 0b3d6e6f2dd0a7b697b1aa8c167265908940624b upstream.
+
+Since commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
+memory.stat reporting") memcg dirty and writeback counters are managed
+as:
+
+ 1) per-memcg per-cpu values in range of [-32..32]
+
+ 2) per-memcg atomic counter
+
+When a per-cpu counter cannot fit in [-32..32] it's flushed to the
+atomic. Stat readers only check the atomic. Thus readers such as
+balance_dirty_pages() may see a nontrivial error margin: 32 pages per
+cpu.
+
+Assuming 100 cpus:
+ 4k x86 page_size: 13 MiB error per memcg
+ 64k ppc page_size: 200 MiB error per memcg
+
+Considering that dirty+writeback are used together for some decisions,
+the errors double.
+
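+Spelled out (a quick check of the figures above, assuming the 32-page
+per-cpu batch):
+
+ #include <stdio.h>
+
+ int main(void)
+ {
+         long cpus = 100, batch = 32;
+         long err_4k  = cpus * batch * 4096;     /* x86, 4k pages  */
+         long err_64k = cpus * batch * 65536;    /* ppc, 64k pages */
+
+         /* prints ~12.5 MiB and 200 MiB, roughly the figures above */
+         printf("4k:  %.1f MiB per memcg\n", err_4k / 1048576.0);
+         printf("64k: %.1f MiB per memcg\n", err_64k / 1048576.0);
+         return 0;
+ }
+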
+This inaccuracy can lead to undeserved oom kills. One nasty case is
+when all per-cpu counters hold positive values offsetting an atomic
+negative value (i.e. per_cpu[*]=32, atomic=n_cpu*-32).
+balance_dirty_pages() only consults the atomic and does not consider
+throttling the next n_cpu*32 dirty pages. If the file_lru is in the
+13..200 MiB range then there's absolutely no dirty throttling, which
+leaves vmscan with only dirty+writeback pages and eventually resorts to
+oom kill.
+
+It could be argued that tiny containers are not supported, but it's more
+subtle. It's the amount of space available for the file lru that
+matters. If a container has memory.max - 200 MiB of non-reclaimable
+memory, then it will also suffer such oom kills on a 100-cpu machine.
+
+The following test reliably ooms without this patch. This patch avoids
+oom kills.
+
+ $ cat test
+ mount -t cgroup2 none /dev/cgroup
+ cd /dev/cgroup
+ echo +io +memory > cgroup.subtree_control
+ mkdir test
+ cd test
+ echo 10M > memory.max
+ (echo $BASHPID > cgroup.procs && exec /memcg-writeback-stress /foo)
+ (echo $BASHPID > cgroup.procs && exec dd if=/dev/zero of=/foo bs=2M count=100)
+
+ $ cat memcg-writeback-stress.c
+ /*
+ * Dirty pages from all but one cpu.
+ * Clean pages from the non dirtying cpu.
+ * This is to stress per cpu counter imbalance.
+ * On a 100 cpu machine:
+ * - per memcg per cpu dirty count is 32 pages for each of 99 cpus
+ * - per memcg atomic is -99*32 pages
+ * - thus the complete dirty limit: sum of all counters 0
+ * - balance_dirty_pages() only sees atomic count -99*32 pages, which
+ * it max()s to 0.
+ * - So a workload can dirty -99*32 pages before balance_dirty_pages()
+ * cares.
+ */
+ #define _GNU_SOURCE
+ #include <err.h>
+ #include <fcntl.h>
+ #include <sched.h>
+ #include <stdlib.h>
+ #include <stdio.h>
+ #include <sys/stat.h>
+ #include <sys/sysinfo.h>
+ #include <sys/types.h>
+ #include <unistd.h>
+
+ static char *buf;
+ static int bufSize;
+
+ static void set_affinity(int cpu)
+ {
+ cpu_set_t affinity;
+
+ CPU_ZERO(&affinity);
+ CPU_SET(cpu, &affinity);
+ if (sched_setaffinity(0, sizeof(affinity), &affinity))
+ err(1, "sched_setaffinity");
+ }
+
+ static void dirty_on(int output_fd, int cpu)
+ {
+ int i, wrote;
+
+ set_affinity(cpu);
+ for (i = 0; i < 32; i++) {
+ for (wrote = 0; wrote < bufSize; ) {
+ int ret = write(output_fd, buf+wrote, bufSize-wrote);
+ if (ret == -1)
+ err(1, "write");
+ wrote += ret;
+ }
+ }
+ }
+
+ int main(int argc, char **argv)
+ {
+ int cpu, flush_cpu = 1, output_fd;
+ const char *output;
+
+ if (argc != 2)
+ errx(1, "usage: output_file");
+
+ output = argv[1];
+ bufSize = getpagesize();
+ buf = malloc(getpagesize());
+ if (buf == NULL)
+ errx(1, "malloc failed");
+
+ output_fd = open(output, O_CREAT|O_RDWR, 0600);
+ if (output_fd == -1)
+ err(1, "open(%s)", output);
+
+ for (cpu = 0; cpu < get_nprocs(); cpu++) {
+ if (cpu != flush_cpu)
+ dirty_on(output_fd, cpu);
+ }
+
+ set_affinity(flush_cpu);
+ if (fsync(output_fd))
+ err(1, "fsync(%s)", output);
+ if (close(output_fd))
+ err(1, "close(%s)", output);
+ free(buf);
+ }
+
+Make balance_dirty_pages() and wb_over_bg_thresh() work harder to
+collect exact per memcg counters. This avoids the aforementioned oom
+kills.
+
+This does not affect the overhead of memory.stat, which still reads the
+single atomic counter.
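+
+A userspace model of the two read paths (made-up names; the kernel-side
+exact read added by this patch is memcg_exact_page_state() in the diff
+below):
+
+ #include <stdatomic.h>
+ #include <stdio.h>
+
+ #define NCPUS 4
+
+ struct counter {
+         atomic_long flushed;    /* the single shared atomic */
+         long residue[NCPUS];    /* unflushed per-cpu values, [-32..32] */
+ };
+
+ /* cheap read, as still used by memory.stat: only the flushed atomic */
+ static long approx_read(struct counter *c)
+ {
+         return atomic_load(&c->flushed);
+ }
+
+ /* exact read, as used by the writeback path after this patch: fold in
+  * the per-cpu residues and clamp negative results to zero */
+ static long exact_read(struct counter *c)
+ {
+         long x = atomic_load(&c->flushed);
+         int cpu;
+
+         for (cpu = 0; cpu < NCPUS; cpu++)
+                 x += c->residue[cpu];
+         return x < 0 ? 0 : x;
+ }
+
+ int main(void)
+ {
+         /* the nasty case above: positive per-cpu residues offsetting
+          * a negative atomic */
+         struct counter dirty = {
+                 .flushed = -96,
+                 .residue = { 32, 32, 32, 0 },
+         };
+
+         printf("approx %ld, exact %ld\n",
+                approx_read(&dirty), exact_read(&dirty));
+         return 0;
+ }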
+
+Why not use percpu_counter? memcg already handles cpus going offline, so
+no need for that overhead from percpu_counter. And the percpu_counter
+spinlocks are more heavyweight than is required.
+
+It probably also makes sense to use exact dirty and writeback counters
+in memcg oom reports. But that is saved for later.
+
+Link: http://lkml.kernel.org/r/20190329174609.164344-1-gthelen@google.com
+Signed-off-by: Greg Thelen <gthelen@google.com>
+Reviewed-by: Roman Gushchin <guro@fb.com>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: <stable@vger.kernel.org> [4.16+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/memcontrol.h | 5 ++++-
+ mm/memcontrol.c | 20 ++++++++++++++++++--
+ 2 files changed, 22 insertions(+), 3 deletions(-)
+
+--- a/include/linux/memcontrol.h
++++ b/include/linux/memcontrol.h
+@@ -507,7 +507,10 @@ struct mem_cgroup *lock_page_memcg(struc
+ void __unlock_page_memcg(struct mem_cgroup *memcg);
+ void unlock_page_memcg(struct page *page);
+
+-/* idx can be of type enum memcg_stat_item or node_stat_item */
++/*
++ * idx can be of type enum memcg_stat_item or node_stat_item.
++ * Keep in sync with memcg_exact_page_state().
++ */
+ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
+ int idx)
+ {
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -3701,6 +3701,22 @@ struct wb_domain *mem_cgroup_wb_domain(s
+ return &memcg->cgwb_domain;
+ }
+
++/*
++ * idx can be of type enum memcg_stat_item or node_stat_item.
++ * Keep in sync with memcg_exact_page().
++ */
++static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
++{
++ long x = atomic_long_read(&memcg->stat[idx]);
++ int cpu;
++
++ for_each_online_cpu(cpu)
++ x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx];
++ if (x < 0)
++ x = 0;
++ return x;
++}
++
+ /**
+ * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
+ * @wb: bdi_writeback in question
+@@ -3726,10 +3742,10 @@ void mem_cgroup_wb_stats(struct bdi_writ
+ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+ struct mem_cgroup *parent;
+
+- *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
++ *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
+
+ /* this should eventually include NR_UNSTABLE_NFS */
+- *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
++ *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
+ *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
+ (1 << LRU_ACTIVE_FILE));
+ *pheadroom = PAGE_COUNTER_MAX;
drm-amdgpu-fix-offset-calculation-in-amdgpu_vm_bo_clear_mappings.patch
drm-amdgpu-check-alignment-on-cpu-page-for-bo-map.patch
reiserfs-update-reiserfs_xattrs_initialized-condition.patch
+mm-memcontrol-fix-nr_writeback-leak-in-memcg-and-system-stats.patch
+mm-memcg-make-sure-memory.events-is-uptodate-when-waking-pollers.patch
+mem_cgroup-make-sure-moving_account-move_lock_task-and-stat_cpu-in-the-same-cacheline.patch
+mm-fix-oom_kill-event-handling.patch
+mm-writeback-use-exact-memcg-dirty-counts.patch