+++ /dev/null
-From 0b3d6e6f2dd0a7b697b1aa8c167265908940624b Mon Sep 17 00:00:00 2001
-From: Greg Thelen <gthelen@google.com>
-Date: Fri, 5 Apr 2019 18:39:18 -0700
-Subject: mm: writeback: use exact memcg dirty counts
-
-From: Greg Thelen <gthelen@google.com>
-
-commit 0b3d6e6f2dd0a7b697b1aa8c167265908940624b upstream.
-
-Since commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
-memory.stat reporting") memcg dirty and writeback counters are managed
-as:
-
- 1) per-memcg per-cpu values in the range [-32..32]
-
- 2) a per-memcg atomic counter
-
-When a per-cpu counter cannot fit in [-32..32] it's flushed to the
-atomic. Stat readers only check the atomic. Thus readers such as
-balance_dirty_pages() may see a nontrivial error margin: 32 pages per
-cpu.
-
-Assuming 100 cpus:
-   4k x86 page_size:  100 cpus * 32 pages * 4k  = ~13 MiB error per memcg
-   64k ppc page_size: 100 cpus * 32 pages * 64k = 200 MiB error per memcg
-
-Considering that dirty and writeback counts are used together for some
-decisions, the errors double.
-
-This inaccuracy can lead to undeserved oom kills.  One nasty case is
-when all per-cpu counters hold positive values offsetting an atomic
-negative value (i.e. per_cpu[*]=32, atomic=n_cpu*-32).
-balance_dirty_pages() only consults the atomic and so does not consider
-throttling the next n_cpu*32 dirty pages.  If the file_lru is in the
-13..200 MiB range then there is no dirty throttling at all, which
-leaves vmscan with nothing but dirty+writeback pages to reclaim, so it
-resorts to oom kill.
-
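-As an illustration of the failure mode, the userspace-only simulation
-below models the counting scheme described above (the file name, the
-helper names, and the plain array standing in for real per-cpu storage
-are all made up for this sketch; this is not the kernel's
-implementation).  It builds the nasty case by hand and contrasts the
-atomic-only read that balance_dirty_pages() effectively performs with
-an exact read of the kind this patch introduces:
-
- $ cat counter-skew-demo.c
- /* Split counter model: per-cpu deltas batched into one atomic,
-  * flushed whenever a delta exceeds +/-32 (MEMCG_CHARGE_BATCH). */
- #include <stdatomic.h>
- #include <stdio.h>
- #include <stdlib.h>
-
- #define BATCH 32
- #define NCPUS 100
-
- static atomic_long stat;   /* per-memcg atomic counter */
- static long pcpu[NCPUS];   /* per-memcg per-cpu deltas */
-
- /* writer: accumulate in the per-cpu delta, flush at +/-BATCH */
- static void mod_state(int cpu, long val)
- {
-         long x = pcpu[cpu] + val;
-
-         if (labs(x) > BATCH) {
-                 atomic_fetch_add(&stat, x);
-                 x = 0;
-         }
-         pcpu[cpu] = x;
- }
-
- /* pre-patch reader: the atomic only, negative values clamped to 0 */
- static long approx_read(void)
- {
-         long x = atomic_load(&stat);
-
-         return x < 0 ? 0 : x;
- }
-
- /* post-patch reader: atomic plus every per-cpu delta, then clamp */
- static long exact_read(void)
- {
-         long x = atomic_load(&stat);
-         int cpu;
-
-         for (cpu = 0; cpu < NCPUS; cpu++)
-                 x += pcpu[cpu];
-         return x < 0 ? 0 : x;
- }
-
- int main(void)
- {
-         int cpu, i;
-
-         /* the nasty case: per_cpu[*]=32, atomic=-99*32, true count 0 */
-         for (cpu = 1; cpu < NCPUS; cpu++)
-                 pcpu[cpu] = BATCH;
-         atomic_store(&stat, -(NCPUS - 1) * BATCH);
-         printf("approx=%ld exact=%ld\n", approx_read(), exact_read());
-
-         /* dirty almost 99*32 more pages from cpu 0 */
-         for (i = 0; i < (NCPUS - 1) * BATCH - 1; i++)
-                 mod_state(0, 1);
-
-         /* approx still reads 0; exact tracks the real dirty count */
-         printf("approx=%ld exact=%ld\n", approx_read(), exact_read());
-         return 0;
- }
-
-With NCPUS=100 the second line prints approx=0 even though 99*32-1 =
-3167 pages were just dirtied; exact reports all of them.
-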
-It could be argued that tiny containers are not supported, but it's more
-subtle than that: it's the amount of space available for the file lru
-that matters.  If a container has memory.max-200MiB of non-reclaimable
-memory, then it will also suffer such oom kills on a 100 cpu machine.
-
-The following test reliably ooms without this patch. This patch avoids
-oom kills.
-
- $ cat test
- mount -t cgroup2 none /dev/cgroup
- cd /dev/cgroup
- echo +io +memory > cgroup.subtree_control
- mkdir test
- cd test
- echo 10M > memory.max
- (echo $BASHPID > cgroup.procs && exec /memcg-writeback-stress /foo)
- (echo $BASHPID > cgroup.procs && exec dd if=/dev/zero of=/foo bs=2M count=100)
-
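-(The /memcg-writeback-stress binary used by the test is assumed to be
-built from the source below, e.g. with
-"gcc -O2 -o memcg-writeback-stress memcg-writeback-stress.c".)
-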
- $ cat memcg-writeback-stress.c
- /*
-  * Dirty pages from all but one cpu.
-  * Clean pages from the non-dirtying cpu.
-  * This is to stress per cpu counter imbalance.
-  * On a 100 cpu machine:
-  * - per memcg per cpu dirty count is 32 pages for each of 99 cpus
-  * - per memcg atomic is -99*32 pages
-  * - thus the true dirty count (the sum of all counters) is 0
-  * - balance_dirty_pages() only sees the atomic count of -99*32 pages,
-  *   which it max()s to 0
-  * - so a workload can dirty 99*32 more pages before
-  *   balance_dirty_pages() cares
-  */
- #define _GNU_SOURCE
- #include <err.h>
- #include <fcntl.h>
- #include <sched.h>
- #include <stdlib.h>
- #include <stdio.h>
- #include <sys/stat.h>
- #include <sys/sysinfo.h>
- #include <sys/types.h>
- #include <unistd.h>
-
- static char *buf;
- static int bufSize;
-
- static void set_affinity(int cpu)
- {
- cpu_set_t affinity;
-
- CPU_ZERO(&affinity);
- CPU_SET(cpu, &affinity);
- if (sched_setaffinity(0, sizeof(affinity), &affinity))
- err(1, "sched_setaffinity");
- }
-
- static void dirty_on(int output_fd, int cpu)
- {
- int i, wrote;
-
- set_affinity(cpu);
- for (i = 0; i < 32; i++) {
- for (wrote = 0; wrote < bufSize; ) {
- int ret = write(output_fd, buf+wrote, bufSize-wrote);
- if (ret == -1)
- err(1, "write");
- wrote += ret;
- }
- }
- }
-
- int main(int argc, char **argv)
- {
- int cpu, flush_cpu = 1, output_fd;
- const char *output;
-
- if (argc != 2)
- errx(1, "usage: output_file");
-
- output = argv[1];
- bufSize = getpagesize();
- buf = malloc(getpagesize());
- if (buf == NULL)
- errx(1, "malloc failed");
-
- output_fd = open(output, O_CREAT|O_RDWR, 0644); /* O_CREAT requires a mode */
- if (output_fd == -1)
- err(1, "open(%s)", output);
-
- for (cpu = 0; cpu < get_nprocs(); cpu++) {
- if (cpu != flush_cpu)
- dirty_on(output_fd, cpu);
- }
-
- set_affinity(flush_cpu);
- if (fsync(output_fd))
- err(1, "fsync(%s)", output);
- if (close(output_fd))
- err(1, "close(%s)", output);
- free(buf);
- }
-
-Make balance_dirty_pages() and wb_over_bg_thresh() work harder to
-collect exact per memcg counters. This avoids the aforementioned oom
-kills.
-
-This does not affect the overhead of memory.stat, which still reads the
-single atomic counter.
-
-Why not use percpu_counter?  memcg already handles cpus going offline,
-so there is no need for that overhead from percpu_counter.  And the
-percpu_counter spinlocks are more heavyweight than is required here.
-
-It probably also makes sense to use exact dirty and writeback counters
-in memcg oom reports. But that is saved for later.
-
-Link: http://lkml.kernel.org/r/20190329174609.164344-1-gthelen@google.com
-Signed-off-by: Greg Thelen <gthelen@google.com>
-Reviewed-by: Roman Gushchin <guro@fb.com>
-Acked-by: Johannes Weiner <hannes@cmpxchg.org>
-Cc: Michal Hocko <mhocko@kernel.org>
-Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
-Cc: Tejun Heo <tj@kernel.org>
-Cc: <stable@vger.kernel.org> [4.16+]
-Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- include/linux/memcontrol.h | 5 ++++-
- mm/memcontrol.c | 20 ++++++++++++++++++--
- 2 files changed, 22 insertions(+), 3 deletions(-)
-
---- a/include/linux/memcontrol.h
-+++ b/include/linux/memcontrol.h
-@@ -488,7 +488,10 @@ struct mem_cgroup *lock_page_memcg(struc
- void __unlock_page_memcg(struct mem_cgroup *memcg);
- void unlock_page_memcg(struct page *page);
-
--/* idx can be of type enum memcg_stat_item or node_stat_item */
-+/*
-+ * idx can be of type enum memcg_stat_item or node_stat_item.
-+ * Keep in sync with memcg_exact_page_state().
-+ */
- static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
- int idx)
- {
---- a/mm/memcontrol.c
-+++ b/mm/memcontrol.c
-@@ -3671,6 +3671,22 @@ struct wb_domain *mem_cgroup_wb_domain(s
- return &memcg->cgwb_domain;
- }
-
-+/*
-+ * idx can be of type enum memcg_stat_item or node_stat_item.
-+ * Keep in sync with memcg_page_state().
-+ */
-+static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
-+{
-+ long x = atomic_long_read(&memcg->stat[idx]);
-+ int cpu;
-+
-+ for_each_online_cpu(cpu)
-+ x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx];
-+ if (x < 0)
-+ x = 0;
-+ return x;
-+}
-+
- /**
- * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
- * @wb: bdi_writeback in question
-@@ -3696,10 +3712,10 @@ void mem_cgroup_wb_stats(struct bdi_writ
- struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
- struct mem_cgroup *parent;
-
-- *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
-+ *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
-
- /* this should eventually include NR_UNSTABLE_NFS */
-- *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
-+ *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
- *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
- (1 << LRU_ACTIVE_FILE));
- *pheadroom = PAGE_COUNTER_MAX;