From 0b3d6e6f2dd0a7b697b1aa8c167265908940624b Mon Sep 17 00:00:00 2001
From: Greg Thelen <gthelen@google.com>
Date: Fri, 5 Apr 2019 18:39:18 -0700
Subject: mm: writeback: use exact memcg dirty counts

From: Greg Thelen <gthelen@google.com>

commit 0b3d6e6f2dd0a7b697b1aa8c167265908940624b upstream.

Since commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
memory.stat reporting") memcg dirty and writeback counters are managed
as:

1) per-memcg per-cpu values in range of [-32..32]

2) per-memcg atomic counter

When a per-cpu counter cannot fit in [-32..32] it's flushed to the
atomic. Stat readers only check the atomic. Thus readers such as
balance_dirty_pages() may see a nontrivial error margin: 32 pages per
cpu.
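
To make the failure mode concrete, the scheme can be modeled in a few
lines of userspace C. This is only a sketch: the names, the 4-cpu
setup, and the flush threshold constant are illustrative, not the
kernel implementation. Each "cpu" batches deltas locally and folds
them into the shared atomic only when the batch leaves [-32..32], so a
reader that consults just the atomic can trail the true sum by 32
pages per cpu.

$ cat split-counter-sketch.c
/*
 * Userspace model of the per-memcg split counter: small per-cpu
 * batches plus one shared atomic.  Approximate readers see only
 * the atomic; exact readers also fold in the unflushed batches.
 */
#include <stdatomic.h>
#include <stdio.h>

#define NCPU	4
#define THRESH	32

static atomic_long shared;	/* the per-memcg atomic counter */
static long local[NCPU];	/* per-cpu batches in [-THRESH..THRESH] */

static void mod_counter(int cpu, long delta)
{
	long x = local[cpu] + delta;

	if (x > THRESH || x < -THRESH) {
		/* batch no longer fits: flush it to the atomic */
		atomic_fetch_add(&shared, x);
		x = 0;
	}
	local[cpu] = x;
}

/* What an approximate reader (e.g. balance_dirty_pages()) sees. */
static long read_approx(void)
{
	return atomic_load(&shared);
}

/* Exact read: the atomic plus every unflushed per-cpu batch. */
static long read_exact(void)
{
	long x = atomic_load(&shared);
	int cpu;

	for (cpu = 0; cpu < NCPU; cpu++)
		x += local[cpu];
	return x;
}

int main(void)
{
	int cpu;

	/* Strand +32 in every batch; nothing reaches the atomic. */
	for (cpu = 0; cpu < NCPU; cpu++)
		mod_counter(cpu, 32);
	printf("approx=%ld exact=%ld\n", read_approx(), read_exact());
	return 0;
}

This prints "approx=0 exact=128": the approximate reader is already
4*32 pages behind, and the gap grows linearly with cpu count.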

Assuming 100 cpus:
   4k x86 page_size: 13 MiB error per memcg
  64k ppc page_size: 200 MiB error per memcg

Considering that dirty+writeback are used together for some decisions
the errors double.
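
For concreteness, the worst-case error is just n_cpu * batch * page_size:

  100 cpus * 32 pages *  4 KiB = 12.5 MiB (rounded to 13 MiB above)
  100 cpus * 32 pages * 64 KiB = 200 MiB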

This inaccuracy can lead to undeserved oom kills. One nasty case is
when all per-cpu counters hold positive values offsetting an atomic
negative value (i.e. per_cpu[*]=32, atomic=n_cpu*-32).
balance_dirty_pages() only consults the atomic and does not consider
throttling the next n_cpu*32 dirty pages. If the file_lru is in the
13..200 MiB range then there's absolutely no dirty throttling, which
burdens vmscan with only dirty+writeback pages thus resorting to oom
kill.

It could be argued that tiny containers are not supported, but it's more
subtle. It's the amount of space available for the file lru that
matters. If a container has memory.max-200MiB of non-reclaimable
memory, then it will also suffer such oom kills on a 100 cpu machine.

The following test reliably ooms without this patch. This patch avoids
oom kills.

$ cat test
mount -t cgroup2 none /dev/cgroup
cd /dev/cgroup
echo +io +memory > cgroup.subtree_control
mkdir test
cd test
echo 10M > memory.max
(echo $BASHPID > cgroup.procs && exec /memcg-writeback-stress /foo)
(echo $BASHPID > cgroup.procs && exec dd if=/dev/zero of=/foo bs=2M count=100)

$ cat memcg-writeback-stress.c
/*
 * Dirty pages from all but one cpu.
 * Clean pages from the non dirtying cpu.
 * This is to stress per cpu counter imbalance.
 * On a 100 cpu machine:
 * - per memcg per cpu dirty count is 32 pages for each of 99 cpus
 * - per memcg atomic is -99*32 pages
 * - thus the complete dirty count (sum of all counters) is 0
 * - balance_dirty_pages() only sees atomic count -99*32 pages, which
 *   it max()s to 0.
 * - So a workload can dirty 99*32 pages before balance_dirty_pages()
 *   cares.
 */
#define _GNU_SOURCE
#include <err.h>
#include <fcntl.h>
#include <sched.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/sysinfo.h>
#include <sys/types.h>
#include <unistd.h>

static char *buf;
static int bufSize;

static void set_affinity(int cpu)
{
	cpu_set_t affinity;

	CPU_ZERO(&affinity);
	CPU_SET(cpu, &affinity);
	if (sched_setaffinity(0, sizeof(affinity), &affinity))
		err(1, "sched_setaffinity");
}

static void dirty_on(int output_fd, int cpu)
{
	int i, wrote;

	set_affinity(cpu);
	for (i = 0; i < 32; i++) {
		for (wrote = 0; wrote < bufSize; ) {
			int ret = write(output_fd, buf+wrote, bufSize-wrote);
			if (ret == -1)
				err(1, "write");
			wrote += ret;
		}
	}
}

int main(int argc, char **argv)
{
	int cpu, flush_cpu = 1, output_fd;
	const char *output;

	if (argc != 2)
		errx(1, "usage: output_file");

	output = argv[1];
	bufSize = getpagesize();
	buf = malloc(getpagesize());
	if (buf == NULL)
		errx(1, "malloc failed");

	/* O_CREAT requires a mode argument */
	output_fd = open(output, O_CREAT|O_RDWR, 0644);
	if (output_fd == -1)
		err(1, "open(%s)", output);

	/* dirty 32 pages from every cpu except flush_cpu */
	for (cpu = 0; cpu < get_nprocs(); cpu++) {
		if (cpu != flush_cpu)
			dirty_on(output_fd, cpu);
	}

	/* clean the pages from the one cpu that never dirtied */
	set_affinity(flush_cpu);
	if (fsync(output_fd))
		err(1, "fsync(%s)", output);
	if (close(output_fd))
		err(1, "close(%s)", output);
	free(buf);
	return 0;
}
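
For reference, the reproducer is an ordinary userspace C program;
something like the following (install path chosen to match the test
script above) should build it:

$ gcc -O2 -o /memcg-writeback-stress memcg-writeback-stress.c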

Make balance_dirty_pages() and wb_over_bg_thresh() work harder to
collect exact per memcg counters. This avoids the aforementioned oom
kills.

This does not affect the overhead of memory.stat, which still reads the
single atomic counter.

Why not use percpu_counter? memcg already handles cpus going offline, so
no need for that overhead from percpu_counter. And the percpu_counter
spinlocks are more heavyweight than is required.

It probably also makes sense to use exact dirty and writeback counters
in memcg oom reports. But that is saved for later.

Link: http://lkml.kernel.org/r/20190329174609.164344-1-gthelen@google.com
Signed-off-by: Greg Thelen <gthelen@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: <stable@vger.kernel.org> [4.16+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

---
 include/linux/memcontrol.h |    5 ++++-
 mm/memcontrol.c            |   20 ++++++++++++++++++--
 2 files changed, 22 insertions(+), 3 deletions(-)

--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -559,7 +559,10 @@ struct mem_cgroup *lock_page_memcg(struc
 void __unlock_page_memcg(struct mem_cgroup *memcg);
 void unlock_page_memcg(struct page *page);
 
-/* idx can be of type enum memcg_stat_item or node_stat_item */
+/*
+ * idx can be of type enum memcg_stat_item or node_stat_item.
+ * Keep in sync with memcg_exact_page_state().
+ */
 static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
 					     int idx)
 {
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3897,6 +3897,22 @@ struct wb_domain *mem_cgroup_wb_domain(s
 	return &memcg->cgwb_domain;
 }
 
+/*
+ * idx can be of type enum memcg_stat_item or node_stat_item.
+ * Keep in sync with memcg_page_state().
+ */
+static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
+{
+	long x = atomic_long_read(&memcg->stat[idx]);
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx];
+	if (x < 0)
+		x = 0;
+	return x;
+}
+
 /**
  * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
  * @wb: bdi_writeback in question
@@ -3922,10 +3938,10 @@ void mem_cgroup_wb_stats(struct bdi_writ
 	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
 	struct mem_cgroup *parent;
 
-	*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
+	*pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
 
 	/* this should eventually include NR_UNSTABLE_NFS */
-	*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
+	*pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
 	*pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
 					      (1 << LRU_ACTIVE_FILE));
 	*pheadroom = PAGE_COUNTER_MAX;