+From 0b3d6e6f2dd0a7b697b1aa8c167265908940624b Mon Sep 17 00:00:00 2001
+From: Greg Thelen <gthelen@google.com>
+Date: Fri, 5 Apr 2019 18:39:18 -0700
+Subject: mm: writeback: use exact memcg dirty counts
+
+From: Greg Thelen <gthelen@google.com>
+
+commit 0b3d6e6f2dd0a7b697b1aa8c167265908940624b upstream.
+
+Since commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
+memory.stat reporting") memcg dirty and writeback counters are managed
+as:
+
+ 1) per-memcg per-cpu values in range of [-32..32]
+
+ 2) per-memcg atomic counter
+
+When a per-cpu counter cannot fit in [-32..32] it's flushed to the
+atomic. Stat readers only check the atomic. Thus readers such as
+balance_dirty_pages() may see a nontrivial error margin: 32 pages per
+cpu.
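+
+To make that error bound concrete, the batching scheme can be mimicked
+with a small userspace sketch (illustration only, not the kernel code;
+the counter names and the 100-cpu count here are assumptions made for
+the example):
+
+ #include <stdio.h>
+ #include <stdlib.h>
+
+ #define N_CPUS 100
+ #define BATCH  32
+
+ static long atomic_count;    /* the per-memcg atomic counter */
+ static long percpu[N_CPUS];  /* the per-memcg per-cpu deltas */
+
+ static void mod_stat(int cpu, long delta)
+ {
+         long x = percpu[cpu] + delta;
+
+         if (labs(x) > BATCH) {  /* flush when out of [-32..32] */
+                 atomic_count += x;
+                 x = 0;
+         }
+         percpu[cpu] = x;
+ }
+
+ /* atomic plus per-cpu deltas, clamped at 0, as the patch below does */
+ static long exact_read(void)
+ {
+         long x = atomic_count;
+         int cpu;
+
+         for (cpu = 0; cpu < N_CPUS; cpu++)
+                 x += percpu[cpu];
+         return x < 0 ? 0 : x;
+ }
+
+ int main(void)
+ {
+         int cpu;
+
+         /* dirty 32 pages on each cpu: no per-cpu counter ever flushes */
+         for (cpu = 0; cpu < N_CPUS; cpu++)
+                 mod_stat(cpu, 32);
+         /* atomic-only reader sees 0 dirty pages, exact reader sees 3200 */
+         printf("atomic: %ld exact: %ld\n",
+                atomic_count < 0 ? 0 : atomic_count, exact_read());
+         return 0;
+ }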
+
+Assuming 100 cpus:
+ 4k x86 page_size:  100 cpus * 32 pages * 4k  ~= 13 MiB error per memcg
+ 64k ppc page_size: 100 cpus * 32 pages * 64k  = 200 MiB error per memcg
+
+Considering that dirty+writeback are used together for some decisions,
+the errors double.
+
+This inaccuracy can lead to undeserved oom kills. One nasty case is
+when all per-cpu counters hold positive values offsetting an atomic
+negative value (i.e. per_cpu[*]=32, atomic=n_cpu*-32).
+balance_dirty_pages() only consults the atomic and does not consider
+throttling the next n_cpu*32 dirty pages. If the file_lru is in the
+13..200 MiB range then there's absolutely no dirty throttling, so
+vmscan is left with nothing but dirty+writeback pages and resorts to
+oom kill.
+
+It could be argued that tiny containers are not supported, but it's more
+subtle. It's the amount of space available for the file lru that matters.
+If a container has memory.max - 200 MiB of non-reclaimable memory, then it
+will also suffer such oom kills on a 100 cpu machine.
+
+The following test reliably ooms without this patch. This patch avoids
+oom kills.
+
+ $ cat test
+ mount -t cgroup2 none /dev/cgroup
+ cd /dev/cgroup
+ echo +io +memory > cgroup.subtree_control
+ mkdir test
+ cd test
+ echo 10M > memory.max
+ (echo $BASHPID > cgroup.procs && exec /memcg-writeback-stress /foo)
+ (echo $BASHPID > cgroup.procs && exec dd if=/dev/zero of=/foo bs=2M count=100)
+
+ $ cat memcg-writeback-stress.c
+ /*
+ * Dirty pages from all but one cpu.
+ * Clean pages from the non-dirtying cpu.
+ * This is to stress per-cpu counter imbalance.
+ * On a 100 cpu machine:
+ * - per-memcg per-cpu dirty count is 32 pages for each of 99 cpus
+ * - per-memcg atomic is -99*32 pages
+ * - thus the true dirty total (the sum of all counters) is 0
+ * - balance_dirty_pages() only sees the atomic count of -99*32 pages,
+ *   which it max()s to 0.
+ * - So a workload can dirty 99*32 more pages before
+ *   balance_dirty_pages() cares.
+ */
+ #define _GNU_SOURCE
+ #include <err.h>
+ #include <fcntl.h>
+ #include <sched.h>
+ #include <stdlib.h>
+ #include <stdio.h>
+ #include <sys/stat.h>
+ #include <sys/sysinfo.h>
+ #include <sys/types.h>
+ #include <unistd.h>
+
+ static char *buf;
+ static int bufSize;
+
+ static void set_affinity(int cpu)
+ {
+ cpu_set_t affinity;
+
+ CPU_ZERO(&affinity);
+ CPU_SET(cpu, &affinity);
+ if (sched_setaffinity(0, sizeof(affinity), &affinity))
+ err(1, "sched_setaffinity");
+ }
+
+ static void dirty_on(int output_fd, int cpu)
+ {
+ int i, wrote;
+
+ set_affinity(cpu);
+ for (i = 0; i < 32; i++) {
+ for (wrote = 0; wrote < bufSize; ) {
+ int ret = write(output_fd, buf+wrote, bufSize-wrote);
+ if (ret == -1)
+ err(1, "write");
+ wrote += ret;
+ }
+ }
+ }
+
+ int main(int argc, char **argv)
+ {
+ int cpu, flush_cpu = 1, output_fd;
+ const char *output;
+
+ if (argc != 2)
+ errx(1, "usage: output_file");
+
+ output = argv[1];
+ bufSize = getpagesize();
+ buf = malloc(getpagesize());
+ if (buf == NULL)
+ errx(1, "malloc failed");
+
+ output_fd = open(output, O_CREAT|O_RDWR, 0666);
+ if (output_fd == -1)
+ err(1, "open(%s)", output);
+
+ for (cpu = 0; cpu < get_nprocs(); cpu++) {
+ if (cpu != flush_cpu)
+ dirty_on(output_fd, cpu);
+ }
+
+ set_affinity(flush_cpu);
+ if (fsync(output_fd))
+ err(1, "fsync(%s)", output);
+ if (close(output_fd))
+ err(1, "close(%s)", output);
+ free(buf);
+ }
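+
+For reference, the stress program is plain C with no external library
+dependencies; it can be built with e.g.:
+
+ $ gcc -O2 -o memcg-writeback-stress memcg-writeback-stress.c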
+
+Make balance_dirty_pages() and wb_over_bg_thresh() work harder to
+collect exact per memcg counters. This avoids the aforementioned oom
+kills.
+
+This does not affect the overhead of memory.stat, which still reads the
+single atomic counter.
+
+Why not use percpu_counter? memcg already handles cpus going offline, so
+no need for that overhead from percpu_counter. And the percpu_counter
+spinlocks are more heavyweight than is required.
+
+It probably also makes sense to use exact dirty and writeback counters
+in memcg oom reports. But that is saved for later.
+
+Link: http://lkml.kernel.org/r/20190329174609.164344-1-gthelen@google.com
+Signed-off-by: Greg Thelen <gthelen@google.com>
+Reviewed-by: Roman Gushchin <guro@fb.com>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: <stable@vger.kernel.org> [4.16+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/memcontrol.h | 5 ++++-
+ mm/memcontrol.c | 20 ++++++++++++++++++--
+ 2 files changed, 22 insertions(+), 3 deletions(-)
+
+--- a/include/linux/memcontrol.h
++++ b/include/linux/memcontrol.h
+@@ -488,7 +488,10 @@ struct mem_cgroup *lock_page_memcg(struc
+ void __unlock_page_memcg(struct mem_cgroup *memcg);
+ void unlock_page_memcg(struct page *page);
+
+-/* idx can be of type enum memcg_stat_item or node_stat_item */
++/*
++ * idx can be of type enum memcg_stat_item or node_stat_item.
++ * Keep in sync with memcg_exact_page_state().
++ */
+ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
+ int idx)
+ {
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -3671,6 +3671,22 @@ struct wb_domain *mem_cgroup_wb_domain(s
+ return &memcg->cgwb_domain;
+ }
+
++/*
++ * idx can be of type enum memcg_stat_item or node_stat_item.
++ * Keep in sync with memcg_page_state().
++ */
++static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
++{
++ long x = atomic_long_read(&memcg->stat[idx]);
++ int cpu;
++
++ for_each_online_cpu(cpu)
++ x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx];
++ if (x < 0)
++ x = 0;
++ return x;
++}
++
+ /**
+ * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
+ * @wb: bdi_writeback in question
+@@ -3696,10 +3712,10 @@ void mem_cgroup_wb_stats(struct bdi_writ
+ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+ struct mem_cgroup *parent;
+
+- *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
++ *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
+
+ /* this should eventually include NR_UNSTABLE_NFS */
+- *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
++ *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
+ *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
+ (1 << LRU_ACTIVE_FILE));
+ *pheadroom = PAGE_COUNTER_MAX;