From b13166f8edc5a196d17c06efaa6ab9bfa47ffe0c Mon Sep 17 00:00:00 2001
From: Nikos Tsironis <ntsironis@arrikto.com>
Date: Wed, 31 Oct 2018 17:53:08 -0400
Subject: dm snapshot: Fix excessive memory usage and workqueue stalls

[ Upstream commit 721b1d98fb517ae99ab3b757021cf81db41e67be ]

kcopyd has no upper limit to the number of jobs one can allocate and
issue. Under certain workloads this can lead to excessive memory usage
and workqueue stalls. For example, when creating multiple dm-snapshot
targets with a 4K chunk size and then writing to the origin through the
page cache. Syncing the page cache causes a large number of BIOs to be
issued to the dm-snapshot origin target, which itself issues an even
larger (because of the BIO splitting taking place) number of kcopyd
jobs.

Running the following test, from the device mapper test suite [1],

  dmtest run --suite snapshot -n many_snapshots_of_same_volume_N

, with 8 active snapshots, results in the kcopyd job slab cache growing
to 10G. Depending on the available system RAM this can lead to the OOM
killer killing user processes:

[463.492878] kthreadd invoked oom-killer: gfp_mask=0x6040c0(GFP_KERNEL|__GFP_COMP),
             nodemask=(null), order=1, oom_score_adj=0
[463.492894] kthreadd cpuset=/ mems_allowed=0
[463.492948] CPU: 7 PID: 2 Comm: kthreadd Not tainted 4.19.0-rc7 #3
[463.492950] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
[463.492952] Call Trace:
[463.492964]  dump_stack+0x7d/0xbb
[463.492973]  dump_header+0x6b/0x2fc
[463.492987]  ? lockdep_hardirqs_on+0xee/0x190
[463.493012]  oom_kill_process+0x302/0x370
[463.493021]  out_of_memory+0x113/0x560
[463.493030]  __alloc_pages_slowpath+0xf40/0x1020
[463.493055]  __alloc_pages_nodemask+0x348/0x3c0
[463.493067]  cache_grow_begin+0x81/0x8b0
[463.493072]  ? cache_grow_begin+0x874/0x8b0
[463.493078]  fallback_alloc+0x1e4/0x280
[463.493092]  kmem_cache_alloc_node+0xd6/0x370
[463.493098]  ? copy_process.part.31+0x1c5/0x20d0
[463.493105]  copy_process.part.31+0x1c5/0x20d0
[463.493115]  ? __lock_acquire+0x3cc/0x1550
[463.493121]  ? __switch_to_asm+0x34/0x70
[463.493129]  ? kthread_create_worker_on_cpu+0x70/0x70
[463.493135]  ? finish_task_switch+0x90/0x280
[463.493165]  _do_fork+0xe0/0x6d0
[463.493191]  ? kthreadd+0x19f/0x220
[463.493233]  kernel_thread+0x25/0x30
[463.493235]  kthreadd+0x1bf/0x220
[463.493242]  ? kthread_create_on_cpu+0x90/0x90
[463.493248]  ret_from_fork+0x3a/0x50
[463.493279] Mem-Info:
[463.493285] active_anon:20631 inactive_anon:4831 isolated_anon:0
[463.493285]  active_file:80216 inactive_file:80107 isolated_file:435
[463.493285]  unevictable:0 dirty:51266 writeback:109372 unstable:0
[463.493285]  slab_reclaimable:31191 slab_unreclaimable:3483521
[463.493285]  mapped:526 shmem:4903 pagetables:1759 bounce:0
[463.493285]  free:33623 free_pcp:2392 free_cma:0
...
[463.493489] Unreclaimable slab info:
[463.493513] Name                        Used        Total
[463.493522] bio-6                       1028KB      1028KB
[463.493525] bio-5                       1028KB      1028KB
[463.493528] dm_snap_pending_exception   236783KB    243789KB
[463.493531] dm_exception                41KB        42KB
[463.493534] bio-4                       1216KB      1216KB
[463.493537] bio-3                       439396KB    439396KB
[463.493539] kcopyd_job                  6973427KB   6973427KB
...
[463.494340] Out of memory: Kill process 1298 (ruby2.3) score 1 or sacrifice child
[463.494673] Killed process 1298 (ruby2.3) total-vm:435740kB, anon-rss:20180kB, file-rss:4kB, shmem-rss:0kB
[463.506437] oom_reaper: reaped process 1298 (ruby2.3), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB

Moreover, issuing a large number of kcopyd jobs results in kcopyd
hogging the CPU, while processing them. As a result, processing of work
items, queued for execution on the same CPU as the currently running
kcopyd thread, is stalled for long periods of time, hurting performance.
Running the aforementioned test we get, in dmesg, messages like the
following:

[67501.194592] BUG: workqueue lockup - pool cpus=4 node=0 flags=0x0 nice=0 stuck for 27s!
[67501.195586] Showing busy workqueues and worker pools:
[67501.195591] workqueue events: flags=0x0
[67501.195597]   pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256
[67501.195611]     pending: cache_reap
[67501.195641] workqueue mm_percpu_wq: flags=0x8
[67501.195645]   pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256
[67501.195656]     pending: vmstat_update
[67501.195682] workqueue kblockd: flags=0x18
[67501.195687]   pwq 5: cpus=2 node=0 flags=0x0 nice=-20 active=1/256
[67501.195698]     pending: blk_timeout_work
[67501.195753] workqueue kcopyd: flags=0x8
[67501.195757]   pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256
[67501.195768]     pending: do_work [dm_mod]
[67501.195802] workqueue kcopyd: flags=0x8
[67501.195806]   pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256
[67501.195817]     pending: do_work [dm_mod]
[67501.195834] workqueue kcopyd: flags=0x8
[67501.195838]   pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256
[67501.195848]     pending: do_work [dm_mod]
[67501.195881] workqueue kcopyd: flags=0x8
[67501.195885]   pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256
[67501.195896]     pending: do_work [dm_mod]
[67501.195920] workqueue kcopyd: flags=0x8
[67501.195924]   pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=2/256
[67501.195935]     in-flight: 67:do_work [dm_mod]
[67501.195945]     pending: do_work [dm_mod]
[67501.195961] pool 8: cpus=4 node=0 flags=0x0 nice=0 hung=27s workers=3 idle: 129 23765

The root cause for these issues is the way dm-snapshot uses kcopyd. In
particular, the lack of an explicit or implicit limit to the maximum
number of in-flight COW jobs. The merging path is not affected because
it implicitly limits the in-flight kcopyd jobs to one.

Fix these issues by using a semaphore to limit the maximum number of
in-flight kcopyd jobs. We grab the semaphore before allocating a new
kcopyd job in start_copy() and start_full_bio() and release it after the
job finishes in copy_callback().

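The hunks below add exactly this. As a rough, self-contained illustration
of the pattern (not part of the kernel change itself), the same idea can
be written in userspace C with a POSIX counting semaphore; the names
MAX_IN_FLIGHT, NUM_JOBS, in_flight and job are illustrative only:

  /*
   * Userspace sketch: a counting semaphore bounds the number of jobs in
   * flight.  sem_wait() plays the role of down() in start_copy() and
   * start_full_bio(); sem_post() plays the role of up() in copy_callback().
   */
  #include <pthread.h>
  #include <semaphore.h>
  #include <stdio.h>

  #define MAX_IN_FLIGHT 4     /* stands in for the in-flight job limit */
  #define NUM_JOBS      32

  static sem_t in_flight;     /* stands in for s->cow_count */

  static void *job(void *arg) /* the "kcopyd" side: complete one job */
  {
      printf("job %ld done\n", (long)arg);
      sem_post(&in_flight);   /* like up() when the copy has finished */
      return NULL;
  }

  int main(void)              /* the "dm-snapshot" side: issue jobs */
  {
      pthread_t t[NUM_JOBS];
      long i;

      sem_init(&in_flight, 0, MAX_IN_FLIGHT);

      for (i = 0; i < NUM_JOBS; i++) {
          sem_wait(&in_flight);   /* like down() before issuing a job */
          pthread_create(&t[i], NULL, job, (void *)i);
      }
      for (i = 0; i < NUM_JOBS; i++)
          pthread_join(t[i], NULL);

      return 0;
  }

Once MAX_IN_FLIGHT jobs are outstanding, the submitting loop blocks until
one of them completes, which is how the semaphore keeps the number of
outstanding kcopyd jobs (and hence the job slab cache) bounded.
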
The initial semaphore value is configurable through a module parameter,
to allow fine tuning the maximum number of in-flight COW jobs. Setting
this parameter to zero initializes the semaphore to INT_MAX.

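For reference, the zero-means-unlimited behaviour boils down to a single
initialization decision, mirroring the sema_init() call added in the hunk
below (a sketch only; cow_sem_initial_count() is an illustrative helper,
not part of the patch):

  #include <limits.h>

  /* A non-positive threshold means "effectively unlimited" (INT_MAX). */
  static inline int cow_sem_initial_count(int threshold)
  {
      return (threshold > 0) ? threshold : INT_MAX;
  }
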
A default value of 2048 maximum in-flight kcopyd jobs was chosen. This
value was decided experimentally as a trade-off between memory
consumption, stalling the kernel's workqueues and maintaining a high
enough throughput.

Re-running the aforementioned test:

  * Workqueue stalls are eliminated
  * kcopyd's job slab cache uses a maximum of 130MB
  * The time taken by the test to write to the snapshot-origin target is
    reduced from 05m20.48s to 03m26.38s

[1] https://github.com/jthornber/device-mapper-test-suite

Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Signed-off-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/md/dm-snap.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index e108deebbaaa..5d3797728b9c 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -19,6 +19,7 @@
 #include <linux/vmalloc.h>
 #include <linux/log2.h>
 #include <linux/dm-kcopyd.h>
+#include <linux/semaphore.h>
 
 #include "dm.h"
 
@@ -105,6 +106,9 @@ struct dm_snapshot {
 	/* The on disk metadata handler */
 	struct dm_exception_store *store;
 
+	/* Maximum number of in-flight COW jobs. */
+	struct semaphore cow_count;
+
 	struct dm_kcopyd_client *kcopyd_client;
 
 	/* Wait for events based on state_bits */
@@ -145,6 +149,19 @@ struct dm_snapshot {
 #define RUNNING_MERGE 0
 #define SHUTDOWN_MERGE 1
 
+/*
+ * Maximum number of chunks being copied on write.
+ *
+ * The value was decided experimentally as a trade-off between memory
+ * consumption, stalling the kernel's workqueues and maintaining a high enough
+ * throughput.
+ */
+#define DEFAULT_COW_THRESHOLD 2048
+
+static int cow_threshold = DEFAULT_COW_THRESHOLD;
+module_param_named(snapshot_cow_threshold, cow_threshold, int, 0644);
+MODULE_PARM_DESC(snapshot_cow_threshold, "Maximum number of chunks being copied on write");
+
 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
 		"A percentage of time allocated for copy on write");
 
@@ -1190,6 +1207,8 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad_hash_tables;
 	}
 
+	sema_init(&s->cow_count, (cow_threshold > 0) ? cow_threshold : INT_MAX);
+
 	s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
 	if (IS_ERR(s->kcopyd_client)) {
 		r = PTR_ERR(s->kcopyd_client);
@@ -1563,6 +1582,7 @@ static void copy_callback(int read_err, unsigned long write_err, void *context)
 		}
 		list_add(&pe->out_of_order_entry, lh);
 	}
+	up(&s->cow_count);
 }
 
 /*
@@ -1586,6 +1606,7 @@ static void start_copy(struct dm_snap_pending_exception *pe)
 	dest.count = src.count;
 
 	/* Hand over to kcopyd */
+	down(&s->cow_count);
 	dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe);
 }
 
@@ -1606,6 +1627,7 @@ static void start_full_bio(struct dm_snap_pending_exception *pe,
 	pe->full_bio_end_io = bio->bi_end_io;
 	pe->full_bio_private = bio->bi_private;
 
+	down(&s->cow_count);
 	callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client,
 						   copy_callback, pe);
 
-- 
2.19.1
