From 0b3d6e6f2dd0a7b697b1aa8c167265908940624b Mon Sep 17 00:00:00 2001
From: Greg Thelen <gthelen@google.com>
Date: Fri, 5 Apr 2019 18:39:18 -0700
Subject: mm: writeback: use exact memcg dirty counts

From: Greg Thelen <gthelen@google.com>

commit 0b3d6e6f2dd0a7b697b1aa8c167265908940624b upstream.

Since commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
memory.stat reporting") memcg dirty and writeback counters are managed
as:

1) per-memcg per-cpu values in range of [-32..32]

2) per-memcg atomic counter

When a per-cpu counter cannot fit in [-32..32] it's flushed to the
atomic. Stat readers only check the atomic. Thus readers such as
balance_dirty_pages() may see a nontrivial error margin: 32 pages per
cpu.
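
To make the failure mode concrete, the scheme can be modeled in a few
lines of userspace C. This is only a sketch: the names, the 4-cpu
setup, and the flush threshold constant are illustrative, not the
kernel implementation. Each "cpu" batches deltas locally and folds
them into the shared atomic only when the batch leaves [-32..32], so a
reader that consults just the atomic can trail the true sum by 32
pages per cpu.

$ cat split-counter-sketch.c
/*
 * Userspace model of the per-memcg split counter: small per-cpu
 * batches plus one shared atomic.  Approximate readers see only
 * the atomic; exact readers also fold in the unflushed batches.
 */
#include <stdatomic.h>
#include <stdio.h>

#define NCPU	4
#define THRESH	32

static atomic_long shared;	/* the per-memcg atomic counter */
static long local[NCPU];	/* per-cpu batches in [-THRESH..THRESH] */

static void mod_counter(int cpu, long delta)
{
	long x = local[cpu] + delta;

	if (x > THRESH || x < -THRESH) {
		/* batch no longer fits: flush it to the atomic */
		atomic_fetch_add(&shared, x);
		x = 0;
	}
	local[cpu] = x;
}

/* What an approximate reader (e.g. balance_dirty_pages()) sees. */
static long read_approx(void)
{
	return atomic_load(&shared);
}

/* Exact read: the atomic plus every unflushed per-cpu batch. */
static long read_exact(void)
{
	long x = atomic_load(&shared);
	int cpu;

	for (cpu = 0; cpu < NCPU; cpu++)
		x += local[cpu];
	return x;
}

int main(void)
{
	int cpu;

	/* Strand +32 in every batch; nothing reaches the atomic. */
	for (cpu = 0; cpu < NCPU; cpu++)
		mod_counter(cpu, 32);
	printf("approx=%ld exact=%ld\n", read_approx(), read_exact());
	return 0;
}

This prints "approx=0 exact=128": the approximate reader is already
4*32 pages behind, and the gap grows linearly with cpu count.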

Assuming 100 cpus:
   4k x86 page_size: 13 MiB error per memcg
  64k ppc page_size: 200 MiB error per memcg

Considering that dirty+writeback are used together for some decisions
the errors double.
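
For concreteness, the worst-case error is just n_cpu * batch * page_size:

  100 cpus * 32 pages *  4 KiB = 12.5 MiB (rounded to 13 MiB above)
  100 cpus * 32 pages * 64 KiB = 200 MiB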

This inaccuracy can lead to undeserved oom kills. One nasty case is
when all per-cpu counters hold positive values offsetting an atomic
negative value (i.e. per_cpu[*]=32, atomic=n_cpu*-32).
balance_dirty_pages() only consults the atomic and does not consider
throttling the next n_cpu*32 dirty pages. If the file_lru is in the
13..200 MiB range then there's absolutely no dirty throttling, which
burdens vmscan with only dirty+writeback pages thus resorting to oom
kill.

It could be argued that tiny containers are not supported, but it's more
subtle. It's the amount of space available for the file lru that
matters. If a container has memory.max-200MiB of non-reclaimable
memory, then it will also suffer such oom kills on a 100 cpu machine.

The following test reliably ooms without this patch. This patch avoids
oom kills.

$ cat test
mount -t cgroup2 none /dev/cgroup
cd /dev/cgroup
echo +io +memory > cgroup.subtree_control
mkdir test
cd test
echo 10M > memory.max
(echo $BASHPID > cgroup.procs && exec /memcg-writeback-stress /foo)
(echo $BASHPID > cgroup.procs && exec dd if=/dev/zero of=/foo bs=2M count=100)

$ cat memcg-writeback-stress.c
/*
 * Dirty pages from all but one cpu.
 * Clean pages from the non dirtying cpu.
 * This is to stress per cpu counter imbalance.
 * On a 100 cpu machine:
 * - per memcg per cpu dirty count is 32 pages for each of 99 cpus
 * - per memcg atomic is -99*32 pages
 * - thus the complete dirty count (sum of all counters) is 0
 * - balance_dirty_pages() only sees atomic count -99*32 pages, which
 *   it max()s to 0.
 * - So a workload can dirty 99*32 pages before balance_dirty_pages()
 *   cares.
 */
#define _GNU_SOURCE
#include <err.h>
#include <fcntl.h>
#include <sched.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/sysinfo.h>
#include <sys/types.h>
#include <unistd.h>

static char *buf;
static int bufSize;

static void set_affinity(int cpu)
{
	cpu_set_t affinity;

	CPU_ZERO(&affinity);
	CPU_SET(cpu, &affinity);
	if (sched_setaffinity(0, sizeof(affinity), &affinity))
		err(1, "sched_setaffinity");
}

static void dirty_on(int output_fd, int cpu)
{
	int i, wrote;

	set_affinity(cpu);
	for (i = 0; i < 32; i++) {
		for (wrote = 0; wrote < bufSize; ) {
			int ret = write(output_fd, buf+wrote, bufSize-wrote);
			if (ret == -1)
				err(1, "write");
			wrote += ret;
		}
	}
}

int main(int argc, char **argv)
{
	int cpu, flush_cpu = 1, output_fd;
	const char *output;

	if (argc != 2)
		errx(1, "usage: output_file");

	output = argv[1];
	bufSize = getpagesize();
	buf = malloc(getpagesize());
	if (buf == NULL)
		errx(1, "malloc failed");

	/* O_CREAT requires a mode argument */
	output_fd = open(output, O_CREAT|O_RDWR, 0644);
	if (output_fd == -1)
		err(1, "open(%s)", output);

	/* dirty 32 pages from every cpu except flush_cpu */
	for (cpu = 0; cpu < get_nprocs(); cpu++) {
		if (cpu != flush_cpu)
			dirty_on(output_fd, cpu);
	}

	/* clean the pages from the one cpu that never dirtied */
	set_affinity(flush_cpu);
	if (fsync(output_fd))
		err(1, "fsync(%s)", output);
	if (close(output_fd))
		err(1, "close(%s)", output);
	free(buf);
	return 0;
}
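
For reference, the reproducer is an ordinary userspace C program;
something like the following (install path chosen to match the test
script above) should build it:

$ gcc -O2 -o /memcg-writeback-stress memcg-writeback-stress.c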

Make balance_dirty_pages() and wb_over_bg_thresh() work harder to
collect exact per memcg counters. This avoids the aforementioned oom
kills.

This does not affect the overhead of memory.stat, which still reads the
single atomic counter.

Why not use percpu_counter? memcg already handles cpus going offline, so
no need for that overhead from percpu_counter. And the percpu_counter
spinlocks are more heavyweight than is required.

It probably also makes sense to use exact dirty and writeback counters
in memcg oom reports. But that is saved for later.

Link: http://lkml.kernel.org/r/20190329174609.164344-1-gthelen@google.com
Signed-off-by: Greg Thelen <gthelen@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: <stable@vger.kernel.org> [4.16+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

---
 include/linux/memcontrol.h |    5 ++++-
 mm/memcontrol.c            |   20 ++++++++++++++++++--
 2 files changed, 22 insertions(+), 3 deletions(-)

--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -559,7 +559,10 @@ struct mem_cgroup *lock_page_memcg(struc
 void __unlock_page_memcg(struct mem_cgroup *memcg);
 void unlock_page_memcg(struct page *page);
 
-/* idx can be of type enum memcg_stat_item or node_stat_item */
+/*
+ * idx can be of type enum memcg_stat_item or node_stat_item.
+ * Keep in sync with memcg_exact_page_state().
+ */
 static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
 					     int idx)
 {
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3897,6 +3897,22 @@ struct wb_domain *mem_cgroup_wb_domain(s
 	return &memcg->cgwb_domain;
 }
 
+/*
+ * idx can be of type enum memcg_stat_item or node_stat_item.
+ * Keep in sync with memcg_page_state().
+ */
+static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
+{
+	long x = atomic_long_read(&memcg->stat[idx]);
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx];
+	if (x < 0)
+		x = 0;
+	return x;
+}
+
 /**
  * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
  * @wb: bdi_writeback in question
@@ -3922,10 +3938,10 @@ void mem_cgroup_wb_stats(struct bdi_writ
 	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
 	struct mem_cgroup *parent;
 
-	*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
+	*pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
 
 	/* this should eventually include NR_UNSTABLE_NFS */
-	*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
+	*pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
 	*pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
 					      (1 << LRU_ACTIVE_FILE));
 	*pheadroom = PAGE_COUNTER_MAX;