Patch: dm snapshot: Fix excessive memory usage and workqueue stalls
(stable backport of upstream commit 721b1d98fb517ae99ab3b757021cf81db41e67be)
---|---|---|
d3e91060 SL |
1 | From b13166f8edc5a196d17c06efaa6ab9bfa47ffe0c Mon Sep 17 00:00:00 2001 |
2 | From: Nikos Tsironis <ntsironis@arrikto.com> | |
3 | Date: Wed, 31 Oct 2018 17:53:08 -0400 | |
4 | Subject: dm snapshot: Fix excessive memory usage and workqueue stalls | |
5 | ||
6 | [ Upstream commit 721b1d98fb517ae99ab3b757021cf81db41e67be ] | |
7 | ||
8 | kcopyd has no upper limit to the number of jobs one can allocate and | |
9 | issue. Under certain workloads this can lead to excessive memory usage | |
10 | and workqueue stalls. For example, when creating multiple dm-snapshot | |
11 | targets with a 4K chunk size and then writing to the origin through the | |
12 | page cache. Syncing the page cache causes a large number of BIOs to be | |
13 | issued to the dm-snapshot origin target, which itself issues an even | |
14 | larger (because of the BIO splitting taking place) number of kcopyd | |
15 | jobs. | |
16 | ||
17 | Running the following test, from the device mapper test suite [1], | |
18 | ||
19 | dmtest run --suite snapshot -n many_snapshots_of_same_volume_N | |
20 | ||
21 | with 8 active snapshots, results in the kcopyd job slab cache growing | |
22 | to 10G. Depending on the available system RAM this can lead to the OOM | |
23 | killer killing user processes: | |
24 | ||
25 | [463.492878] kthreadd invoked oom-killer: gfp_mask=0x6040c0(GFP_KERNEL|__GFP_COMP), | |
26 | nodemask=(null), order=1, oom_score_adj=0 | |
27 | [463.492894] kthreadd cpuset=/ mems_allowed=0 | |
28 | [463.492948] CPU: 7 PID: 2 Comm: kthreadd Not tainted 4.19.0-rc7 #3 | |
29 | [463.492950] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 | |
30 | [463.492952] Call Trace: | |
31 | [463.492964] dump_stack+0x7d/0xbb | |
32 | [463.492973] dump_header+0x6b/0x2fc | |
33 | [463.492987] ? lockdep_hardirqs_on+0xee/0x190 | |
34 | [463.493012] oom_kill_process+0x302/0x370 | |
35 | [463.493021] out_of_memory+0x113/0x560 | |
36 | [463.493030] __alloc_pages_slowpath+0xf40/0x1020 | |
37 | [463.493055] __alloc_pages_nodemask+0x348/0x3c0 | |
38 | [463.493067] cache_grow_begin+0x81/0x8b0 | |
39 | [463.493072] ? cache_grow_begin+0x874/0x8b0 | |
40 | [463.493078] fallback_alloc+0x1e4/0x280 | |
41 | [463.493092] kmem_cache_alloc_node+0xd6/0x370 | |
42 | [463.493098] ? copy_process.part.31+0x1c5/0x20d0 | |
43 | [463.493105] copy_process.part.31+0x1c5/0x20d0 | |
44 | [463.493115] ? __lock_acquire+0x3cc/0x1550 | |
45 | [463.493121] ? __switch_to_asm+0x34/0x70 | |
46 | [463.493129] ? kthread_create_worker_on_cpu+0x70/0x70 | |
47 | [463.493135] ? finish_task_switch+0x90/0x280 | |
48 | [463.493165] _do_fork+0xe0/0x6d0 | |
49 | [463.493191] ? kthreadd+0x19f/0x220 | |
50 | [463.493233] kernel_thread+0x25/0x30 | |
51 | [463.493235] kthreadd+0x1bf/0x220 | |
52 | [463.493242] ? kthread_create_on_cpu+0x90/0x90 | |
53 | [463.493248] ret_from_fork+0x3a/0x50 | |
54 | [463.493279] Mem-Info: | |
55 | [463.493285] active_anon:20631 inactive_anon:4831 isolated_anon:0 | |
56 | [463.493285] active_file:80216 inactive_file:80107 isolated_file:435 | |
57 | [463.493285] unevictable:0 dirty:51266 writeback:109372 unstable:0 | |
58 | [463.493285] slab_reclaimable:31191 slab_unreclaimable:3483521 | |
59 | [463.493285] mapped:526 shmem:4903 pagetables:1759 bounce:0 | |
60 | [463.493285] free:33623 free_pcp:2392 free_cma:0 | |
61 | ... | |
62 | [463.493489] Unreclaimable slab info: | |
63 | [463.493513] Name Used Total | |
64 | [463.493522] bio-6 1028KB 1028KB | |
65 | [463.493525] bio-5 1028KB 1028KB | |
66 | [463.493528] dm_snap_pending_exception 236783KB 243789KB | |
67 | [463.493531] dm_exception 41KB 42KB | |
68 | [463.493534] bio-4 1216KB 1216KB | |
69 | [463.493537] bio-3 439396KB 439396KB | |
70 | [463.493539] kcopyd_job 6973427KB 6973427KB | |
71 | ... | |
72 | [463.494340] Out of memory: Kill process 1298 (ruby2.3) score 1 or sacrifice child | |
73 | [463.494673] Killed process 1298 (ruby2.3) total-vm:435740kB, anon-rss:20180kB, file-rss:4kB, shmem-rss:0kB | |
74 | [463.506437] oom_reaper: reaped process 1298 (ruby2.3), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB | |
75 | ||
76 | Moreover, issuing a large number of kcopyd jobs results in kcopyd | |
77 | hogging the CPU, while processing them. As a result, processing of work | |
78 | items, queued for execution on the same CPU as the currently running | |
79 | kcopyd thread, is stalled for long periods of time, hurting performance. | |
80 | Running the aforementioned test we get, in dmesg, messages like the | |
81 | following: | |
82 | ||
83 | [67501.194592] BUG: workqueue lockup - pool cpus=4 node=0 flags=0x0 nice=0 stuck for 27s! | |
84 | [67501.195586] Showing busy workqueues and worker pools: | |
85 | [67501.195591] workqueue events: flags=0x0 | |
86 | [67501.195597] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256 | |
87 | [67501.195611] pending: cache_reap | |
88 | [67501.195641] workqueue mm_percpu_wq: flags=0x8 | |
89 | [67501.195645] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256 | |
90 | [67501.195656] pending: vmstat_update | |
91 | [67501.195682] workqueue kblockd: flags=0x18 | |
92 | [67501.195687] pwq 5: cpus=2 node=0 flags=0x0 nice=-20 active=1/256 | |
93 | [67501.195698] pending: blk_timeout_work | |
94 | [67501.195753] workqueue kcopyd: flags=0x8 | |
95 | [67501.195757] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256 | |
96 | [67501.195768] pending: do_work [dm_mod] | |
97 | [67501.195802] workqueue kcopyd: flags=0x8 | |
98 | [67501.195806] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256 | |
99 | [67501.195817] pending: do_work [dm_mod] | |
100 | [67501.195834] workqueue kcopyd: flags=0x8 | |
101 | [67501.195838] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256 | |
102 | [67501.195848] pending: do_work [dm_mod] | |
103 | [67501.195881] workqueue kcopyd: flags=0x8 | |
104 | [67501.195885] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256 | |
105 | [67501.195896] pending: do_work [dm_mod] | |
106 | [67501.195920] workqueue kcopyd: flags=0x8 | |
107 | [67501.195924] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=2/256 | |
108 | [67501.195935] in-flight: 67:do_work [dm_mod] | |
109 | [67501.195945] pending: do_work [dm_mod] | |
110 | [67501.195961] pool 8: cpus=4 node=0 flags=0x0 nice=0 hung=27s workers=3 idle: 129 23765 | |
111 | ||
112 | The root cause for these issues is the way dm-snapshot uses kcopyd. In | |
113 | particular, the lack of an explicit or implicit limit to the maximum | |
114 | number of in-flight COW jobs. The merging path is not affected because | |
115 | it implicitly limits the in-flight kcopyd jobs to one. | |
116 | ||
117 | Fix these issues by using a semaphore to limit the maximum number of | |
118 | in-flight kcopyd jobs. We grab the semaphore before allocating a new | |
119 | kcopyd job in start_copy() and start_full_bio() and release it after the | |
120 | job finishes in copy_callback(). | |
121 | ||
122 | The initial semaphore value is configurable through a module parameter, | |
123 | to allow fine tuning the maximum number of in-flight COW jobs. Setting | |
124 | this parameter to zero initializes the semaphore to INT_MAX. | |
125 | ||
126 | A default value of 2048 maximum in-flight kcopyd jobs was chosen. This | |
127 | value was decided experimentally as a trade-off between memory | |
128 | consumption, stalling the kernel's workqueues and maintaining a high | |
129 | enough throughput. | |
130 | ||
131 | Re-running the aforementioned test: | |
132 | ||
133 | * Workqueue stalls are eliminated | |
134 | * kcopyd's job slab cache uses a maximum of 130MB | |
135 | * The time taken by the test to write to the snapshot-origin target is | |
136 | reduced from 05m20.48s to 03m26.38s | |
137 | ||
138 | [1] https://github.com/jthornber/device-mapper-test-suite | |
139 | ||
140 | Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com> | |
141 | Signed-off-by: Ilias Tsitsimpis <iliastsi@arrikto.com> | |
142 | Signed-off-by: Mike Snitzer <snitzer@redhat.com> | |
143 | Signed-off-by: Sasha Levin <sashal@kernel.org> | |
144 | --- | |
145 | drivers/md/dm-snap.c | 22 ++++++++++++++++++++++ | |
146 | 1 file changed, 22 insertions(+) | |
147 | ||
148 | diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c | |
149 | index e108deebbaaa..5d3797728b9c 100644 | |
150 | --- a/drivers/md/dm-snap.c | |
151 | +++ b/drivers/md/dm-snap.c | |
152 | @@ -19,6 +19,7 @@ | |
153 | #include <linux/vmalloc.h> | |
154 | #include <linux/log2.h> | |
155 | #include <linux/dm-kcopyd.h> | |
156 | +#include <linux/semaphore.h> | |
157 | ||
158 | #include "dm.h" | |
159 | ||
160 | @@ -105,6 +106,9 @@ struct dm_snapshot { | |
161 | /* The on disk metadata handler */ | |
162 | struct dm_exception_store *store; | |
163 | ||
164 | + /* Maximum number of in-flight COW jobs. */ | |
165 | + struct semaphore cow_count; | |
166 | + | |
167 | struct dm_kcopyd_client *kcopyd_client; | |
168 | ||
169 | /* Wait for events based on state_bits */ | |
170 | @@ -145,6 +149,19 @@ struct dm_snapshot { | |
171 | #define RUNNING_MERGE 0 | |
172 | #define SHUTDOWN_MERGE 1 | |
173 | ||
174 | +/* | |
175 | + * Maximum number of chunks being copied on write. | |
176 | + * | |
177 | + * The value was decided experimentally as a trade-off between memory | |
178 | + * consumption, stalling the kernel's workqueues and maintaining a high enough | |
179 | + * throughput. | |
180 | + */ | |
181 | +#define DEFAULT_COW_THRESHOLD 2048 | |
182 | + | |
183 | +static int cow_threshold = DEFAULT_COW_THRESHOLD; | |
184 | +module_param_named(snapshot_cow_threshold, cow_threshold, int, 0644); | |
185 | +MODULE_PARM_DESC(snapshot_cow_threshold, "Maximum number of chunks being copied on write"); | |
186 | + | |
187 | DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle, | |
188 | "A percentage of time allocated for copy on write"); | |
189 | ||
190 | @@ -1190,6 +1207,8 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |
191 | goto bad_hash_tables; | |
192 | } | |
193 | ||
194 | + sema_init(&s->cow_count, (cow_threshold > 0) ? cow_threshold : INT_MAX); | |
195 | + | |
196 | s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle); | |
197 | if (IS_ERR(s->kcopyd_client)) { | |
198 | r = PTR_ERR(s->kcopyd_client); | |
199 | @@ -1563,6 +1582,7 @@ static void copy_callback(int read_err, unsigned long write_err, void *context) | |
200 | } | |
201 | list_add(&pe->out_of_order_entry, lh); | |
202 | } | |
203 | + up(&s->cow_count); | |
204 | } | |
205 | ||
206 | /* | |
207 | @@ -1586,6 +1606,7 @@ static void start_copy(struct dm_snap_pending_exception *pe) | |
208 | dest.count = src.count; | |
209 | ||
210 | /* Hand over to kcopyd */ | |
211 | + down(&s->cow_count); | |
212 | dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe); | |
213 | } | |
214 | ||
215 | @@ -1606,6 +1627,7 @@ static void start_full_bio(struct dm_snap_pending_exception *pe, | |
216 | pe->full_bio_end_io = bio->bi_end_io; | |
217 | pe->full_bio_private = bio->bi_private; | |
218 | ||
219 | + down(&s->cow_count); | |
220 | callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client, | |
221 | copy_callback, pe); | |
222 | ||
223 | -- | |
224 | 2.19.1 | |
225 |