]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/blame - releases/4.19.35/mm-writeback-use-exact-memcg-dirty-counts.patch
Linux 4.19.35
[thirdparty/kernel/stable-queue.git] / releases / 4.19.35 / mm-writeback-use-exact-memcg-dirty-counts.patch
CommitLineData
8d82f256
GKH
1From 0b3d6e6f2dd0a7b697b1aa8c167265908940624b Mon Sep 17 00:00:00 2001
2From: Greg Thelen <gthelen@google.com>
3Date: Fri, 5 Apr 2019 18:39:18 -0700
4Subject: mm: writeback: use exact memcg dirty counts
5
6From: Greg Thelen <gthelen@google.com>
7
8commit 0b3d6e6f2dd0a7b697b1aa8c167265908940624b upstream.
9
10Since commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
11memory.stat reporting") memcg dirty and writeback counters are managed
12as:
13
14 1) per-memcg per-cpu values in range of [-32..32]
15
16 2) per-memcg atomic counter
17
18When a per-cpu counter cannot fit in [-32..32] it's flushed to the
19atomic. Stat readers only check the atomic. Thus readers such as
20balance_dirty_pages() may see a nontrivial error margin: 32 pages per
21cpu.
22
23Assuming 100 cpus:
24 4k x86 page_size: 13 MiB error per memcg
25 64k ppc page_size: 200 MiB error per memcg
26
27Considering that dirty+writeback are used together for some decisions the
28errors double.
29
30This inaccuracy can lead to undeserved oom kills. One nasty case is
31when all per-cpu counters hold positive values offsetting an atomic
32negative value (i.e. per_cpu[*]=32, atomic=n_cpu*-32).
33balance_dirty_pages() only consults the atomic and does not consider
34throttling the next n_cpu*32 dirty pages. If the file_lru is in the
3513..200 MiB range then there's absolutely no dirty throttling, which
36burdens vmscan with only dirty+writeback pages thus resorting to oom
37kill.
38
39It could be argued that tiny containers are not supported, but it's more
40subtle. It's the amount the space available for file lru that matters.
41If a container has memory.max-200MiB of non reclaimable memory, then it
42will also suffer such oom kills on a 100 cpu machine.
43
44The following test reliably ooms without this patch. This patch avoids
45oom kills.
46
47 $ cat test
48 mount -t cgroup2 none /dev/cgroup
49 cd /dev/cgroup
50 echo +io +memory > cgroup.subtree_control
51 mkdir test
52 cd test
53 echo 10M > memory.max
54 (echo $BASHPID > cgroup.procs && exec /memcg-writeback-stress /foo)
55 (echo $BASHPID > cgroup.procs && exec dd if=/dev/zero of=/foo bs=2M count=100)
56
57 $ cat memcg-writeback-stress.c
58 /*
59 * Dirty pages from all but one cpu.
60 * Clean pages from the non dirtying cpu.
61 * This is to stress per cpu counter imbalance.
62 * On a 100 cpu machine:
63 * - per memcg per cpu dirty count is 32 pages for each of 99 cpus
64 * - per memcg atomic is -99*32 pages
65 * - thus the complete dirty limit: sum of all counters 0
66 * - balance_dirty_pages() only sees atomic count -99*32 pages, which
67 * it max()s to 0.
68 * - So a workload can dirty -99*32 pages before balance_dirty_pages()
69 * cares.
70 */
71 #define _GNU_SOURCE
72 #include <err.h>
73 #include <fcntl.h>
74 #include <sched.h>
75 #include <stdlib.h>
76 #include <stdio.h>
77 #include <sys/stat.h>
78 #include <sys/sysinfo.h>
79 #include <sys/types.h>
80 #include <unistd.h>
81
82 static char *buf;
83 static int bufSize;
84
85 static void set_affinity(int cpu)
86 {
87 cpu_set_t affinity;
88
89 CPU_ZERO(&affinity);
90 CPU_SET(cpu, &affinity);
91 if (sched_setaffinity(0, sizeof(affinity), &affinity))
92 err(1, "sched_setaffinity");
93 }
94
95 static void dirty_on(int output_fd, int cpu)
96 {
97 int i, wrote;
98
99 set_affinity(cpu);
100 for (i = 0; i < 32; i++) {
101 for (wrote = 0; wrote < bufSize; ) {
102 int ret = write(output_fd, buf+wrote, bufSize-wrote);
103 if (ret == -1)
104 err(1, "write");
105 wrote += ret;
106 }
107 }
108 }
109
110 int main(int argc, char **argv)
111 {
112 int cpu, flush_cpu = 1, output_fd;
113 const char *output;
114
115 if (argc != 2)
116 errx(1, "usage: output_file");
117
118 output = argv[1];
119 bufSize = getpagesize();
120 buf = malloc(getpagesize());
121 if (buf == NULL)
122 errx(1, "malloc failed");
123
124 output_fd = open(output, O_CREAT|O_RDWR);
125 if (output_fd == -1)
126 err(1, "open(%s)", output);
127
128 for (cpu = 0; cpu < get_nprocs(); cpu++) {
129 if (cpu != flush_cpu)
130 dirty_on(output_fd, cpu);
131 }
132
133 set_affinity(flush_cpu);
134 if (fsync(output_fd))
135 err(1, "fsync(%s)", output);
136 if (close(output_fd))
137 err(1, "close(%s)", output);
138 free(buf);
139 }
140
141Make balance_dirty_pages() and wb_over_bg_thresh() work harder to
142collect exact per memcg counters. This avoids the aforementioned oom
143kills.
144
145This does not affect the overhead of memory.stat, which still reads the
146single atomic counter.
147
148Why not use percpu_counter? memcg already handles cpus going offline, so
149no need for that overhead from percpu_counter. And the percpu_counter
150spinlocks are more heavyweight than is required.
151
152It probably also makes sense to use exact dirty and writeback counters
153in memcg oom reports. But that is saved for later.
154
155Link: http://lkml.kernel.org/r/20190329174609.164344-1-gthelen@google.com
156Signed-off-by: Greg Thelen <gthelen@google.com>
157Reviewed-by: Roman Gushchin <guro@fb.com>
158Acked-by: Johannes Weiner <hannes@cmpxchg.org>
159Cc: Michal Hocko <mhocko@kernel.org>
160Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
161Cc: Tejun Heo <tj@kernel.org>
162Cc: <stable@vger.kernel.org> [4.16+]
163Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
164Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
165Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
166
167---
168 include/linux/memcontrol.h | 5 ++++-
169 mm/memcontrol.c | 20 ++++++++++++++++++--
170 2 files changed, 22 insertions(+), 3 deletions(-)
171
172--- a/include/linux/memcontrol.h
173+++ b/include/linux/memcontrol.h
174@@ -559,7 +559,10 @@ struct mem_cgroup *lock_page_memcg(struc
175 void __unlock_page_memcg(struct mem_cgroup *memcg);
176 void unlock_page_memcg(struct page *page);
177
178-/* idx can be of type enum memcg_stat_item or node_stat_item */
179+/*
180+ * idx can be of type enum memcg_stat_item or node_stat_item.
181+ * Keep in sync with memcg_exact_page_state().
182+ */
183 static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
184 int idx)
185 {
186--- a/mm/memcontrol.c
187+++ b/mm/memcontrol.c
188@@ -3897,6 +3897,22 @@ struct wb_domain *mem_cgroup_wb_domain(s
189 return &memcg->cgwb_domain;
190 }
191
192+/*
193+ * idx can be of type enum memcg_stat_item or node_stat_item.
194+ * Keep in sync with memcg_exact_page().
195+ */
196+static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
197+{
198+ long x = atomic_long_read(&memcg->stat[idx]);
199+ int cpu;
200+
201+ for_each_online_cpu(cpu)
202+ x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx];
203+ if (x < 0)
204+ x = 0;
205+ return x;
206+}
207+
208 /**
209 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
210 * @wb: bdi_writeback in question
211@@ -3922,10 +3938,10 @@ void mem_cgroup_wb_stats(struct bdi_writ
212 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
213 struct mem_cgroup *parent;
214
215- *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
216+ *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
217
218 /* this should eventually include NR_UNSTABLE_NFS */
219- *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
220+ *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
221 *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
222 (1 << LRU_ACTIVE_FILE));
223 *pheadroom = PAGE_COUNTER_MAX;