releases/4.19.35/mm-writeback-use-exact-memcg-dirty-counts.patch

   1 From 0b3d6e6f2dd0a7b697b1aa8c167265908940624b Mon Sep 17 00:00:00 2001
   2 From: Greg Thelen <gthelen@google.com>
   3 Date: Fri, 5 Apr 2019 18:39:18 -0700
   4 Subject: mm: writeback: use exact memcg dirty counts
   5
   6 From: Greg Thelen <gthelen@google.com>
   7
   8 commit 0b3d6e6f2dd0a7b697b1aa8c167265908940624b upstream.
   9
  10 Since commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
  11 memory.stat reporting") memcg dirty and writeback counters are managed
  12 as:
  13
  14  1) per-memcg per-cpu values in range of [-32..32]
  15
  16  2) per-memcg atomic counter
  17
  18 When a per-cpu counter cannot fit in [-32..32] it's flushed to the
  19 atomic.  Stat readers only check the atomic.  Thus readers such as
  20 balance_dirty_pages() may see a nontrivial error margin: 32 pages per
  21 cpu.
  22
  23 Assuming 100 cpus:
  24    4k x86 page_size:  13 MiB error per memcg
  25   64k ppc page_size: 200 MiB error per memcg
  26
  27 Considering that dirty+writeback are used together for some decisions the
  28 errors double.
  29
  30 This inaccuracy can lead to undeserved oom kills.  One nasty case is
  31 when all per-cpu counters hold positive values offsetting an atomic
  32 negative value (i.e.  per_cpu[*]=32, atomic=n_cpu*-32).
  33 balance_dirty_pages() only consults the atomic and does not consider
  34 throttling the next n_cpu*32 dirty pages.  If the file_lru is in the
  35 13..200 MiB range then there's absolutely no dirty throttling, which
  36 burdens vmscan with only dirty+writeback pages thus resorting to oom
  37 kill.
  38
  39 It could be argued that tiny containers are not supported, but it's more
  40 subtle.  It's the amount the space available for file lru that matters.
  41 If a container has memory.max-200MiB of non reclaimable memory, then it
  42 will also suffer such oom kills on a 100 cpu machine.
  43
  44 The following test reliably ooms without this patch.  This patch avoids
  45 oom kills.
  46
  47   $ cat test
  48   mount -t cgroup2 none /dev/cgroup
  49   cd /dev/cgroup
  50   echo +io +memory > cgroup.subtree_control
  51   mkdir test
  52   cd test
  53   echo 10M > memory.max
  54   (echo $BASHPID > cgroup.procs && exec /memcg-writeback-stress /foo)
  55   (echo $BASHPID > cgroup.procs && exec dd if=/dev/zero of=/foo bs=2M count=100)
  56
  57   $ cat memcg-writeback-stress.c
  58   /*
  59    * Dirty pages from all but one cpu.
  60    * Clean pages from the non dirtying cpu.
  61    * This is to stress per cpu counter imbalance.
  62    * On a 100 cpu machine:
  63    * - per memcg per cpu dirty count is 32 pages for each of 99 cpus
  64    * - per memcg atomic is -99*32 pages
  65    * - thus the complete dirty limit: sum of all counters 0
  66    * - balance_dirty_pages() only sees atomic count -99*32 pages, which
  67    *   it max()s to 0.
  68    * - So a workload can dirty -99*32 pages before balance_dirty_pages()
  69    *   cares.
  70    */
  71   #define _GNU_SOURCE
  72   #include <err.h>
  73   #include <fcntl.h>
  74   #include <sched.h>
  75   #include <stdlib.h>
  76   #include <stdio.h>
  77   #include <sys/stat.h>
  78   #include <sys/sysinfo.h>
  79   #include <sys/types.h>
  80   #include <unistd.h>
  81
  82   static char *buf;
  83   static int bufSize;
  84
  85   static void set_affinity(int cpu)
  86   {
  87         cpu_set_t affinity;
  88
  89         CPU_ZERO(&affinity);
  90         CPU_SET(cpu, &affinity);
  91         if (sched_setaffinity(0, sizeof(affinity), &affinity))
  92                 err(1, "sched_setaffinity");
  93   }
  94
  95   static void dirty_on(int output_fd, int cpu)
  96   {
  97         int i, wrote;
  98
  99         set_affinity(cpu);
 100         for (i = 0; i < 32; i++) {
 101                 for (wrote = 0; wrote < bufSize; ) {
 102                         int ret = write(output_fd, buf+wrote, bufSize-wrote);
 103                         if (ret == -1)
 104                                 err(1, "write");
 105                         wrote += ret;
 106                 }
 107         }
 108   }
 109
 110   int main(int argc, char **argv)
 111   {
 112         int cpu, flush_cpu = 1, output_fd;
 113         const char *output;
 114
 115         if (argc != 2)
 116                 errx(1, "usage: output_file");
 117
 118         output = argv[1];
 119         bufSize = getpagesize();
 120         buf = malloc(getpagesize());
 121         if (buf == NULL)
 122                 errx(1, "malloc failed");
 123
 124         output_fd = open(output, O_CREAT|O_RDWR);
 125         if (output_fd == -1)
 126                 err(1, "open(%s)", output);
 127
 128         for (cpu = 0; cpu < get_nprocs(); cpu++) {
 129                 if (cpu != flush_cpu)
 130                         dirty_on(output_fd, cpu);
 131         }
 132
 133         set_affinity(flush_cpu);
 134         if (fsync(output_fd))
 135                 err(1, "fsync(%s)", output);
 136         if (close(output_fd))
 137                 err(1, "close(%s)", output);
 138         free(buf);
 139   }
 140
 141 Make balance_dirty_pages() and wb_over_bg_thresh() work harder to
 142 collect exact per memcg counters.  This avoids the aforementioned oom
 143 kills.
 144
 145 This does not affect the overhead of memory.stat, which still reads the
 146 single atomic counter.
 147
 148 Why not use percpu_counter? memcg already handles cpus going offline, so
 149 no need for that overhead from percpu_counter.  And the percpu_counter
 150 spinlocks are more heavyweight than is required.
 151
 152 It probably also makes sense to use exact dirty and writeback counters
 153 in memcg oom reports.  But that is saved for later.
 154
 155 Link: http://lkml.kernel.org/r/20190329174609.164344-1-gthelen@google.com
 156 Signed-off-by: Greg Thelen <gthelen@google.com>
 157 Reviewed-by: Roman Gushchin <guro@fb.com>
 158 Acked-by: Johannes Weiner <hannes@cmpxchg.org>
 159 Cc: Michal Hocko <mhocko@kernel.org>
 160 Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
 161 Cc: Tejun Heo <tj@kernel.org>
 162 Cc: <stable@vger.kernel.org>    [4.16+]
 163 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
 164 Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
 165 Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
 166
 167 ---
 168  include/linux/memcontrol.h |    5 ++++-
 169  mm/memcontrol.c            |   20 ++++++++++++++++++--
 170  2 files changed, 22 insertions(+), 3 deletions(-)
 171
 172 --- a/include/linux/memcontrol.h
 173 +++ b/include/linux/memcontrol.h
 174 @@ -559,7 +559,10 @@ struct mem_cgroup *lock_page_memcg(struc
 175  void __unlock_page_memcg(struct mem_cgroup *memcg);
 176  void unlock_page_memcg(struct page *page);
 177
 178 -/* idx can be of type enum memcg_stat_item or node_stat_item */
 179 +/*
 180 + * idx can be of type enum memcg_stat_item or node_stat_item.
 181 + * Keep in sync with memcg_exact_page_state().
 182 + */
 183  static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
 184                                              int idx)
 185  {
 186 --- a/mm/memcontrol.c
 187 +++ b/mm/memcontrol.c
 188 @@ -3897,6 +3897,22 @@ struct wb_domain *mem_cgroup_wb_domain(s
 189         return &memcg->cgwb_domain;
 190  }
 191
 192 +/*
 193 + * idx can be of type enum memcg_stat_item or node_stat_item.
 194 + * Keep in sync with memcg_exact_page().
 195 + */
 196 +static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
 197 +{
 198 +       long x = atomic_long_read(&memcg->stat[idx]);
 199 +       int cpu;
 200 +
 201 +       for_each_online_cpu(cpu)
 202 +               x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx];
 203 +       if (x < 0)
 204 +               x = 0;
 205 +       return x;
 206 +}
 207 +
 208  /**
 209   * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
 210   * @wb: bdi_writeback in question
 211 @@ -3922,10 +3938,10 @@ void mem_cgroup_wb_stats(struct bdi_writ
 212         struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
 213         struct mem_cgroup *parent;
 214
 215 -       *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
 216 +       *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
 217
 218         /* this should eventually include NR_UNSTABLE_NFS */
 219 -       *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
 220 +       *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
 221         *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
 222                                                      (1 << LRU_ACTIVE_FILE));
 223         *pheadroom = PAGE_COUNTER_MAX;