Patch: dm snapshot: Fix excessive memory usage and workqueue stalls
(stable backport of upstream commit 721b1d98fb517ae99ab3b757021cf81db41e67be)
---|---|---|
d3e91060 SL |
1 | From b13166f8edc5a196d17c06efaa6ab9bfa47ffe0c Mon Sep 17 00:00:00 2001 |
2 | From: Nikos Tsironis <ntsironis@arrikto.com> | |
3 | Date: Wed, 31 Oct 2018 17:53:08 -0400 | |
4 | Subject: dm snapshot: Fix excessive memory usage and workqueue stalls | |
5 | ||
6 | [ Upstream commit 721b1d98fb517ae99ab3b757021cf81db41e67be ] | |
7 | ||
8 | kcopyd has no upper limit to the number of jobs one can allocate and | |
9 | issue. Under certain workloads this can lead to excessive memory usage | |
10 | and workqueue stalls. For example, when creating multiple dm-snapshot | |
11 | targets with a 4K chunk size and then writing to the origin through the | |
12 | page cache. Syncing the page cache causes a large number of BIOs to be | |
13 | issued to the dm-snapshot origin target, which itself issues an even | |
14 | larger (because of the BIO splitting taking place) number of kcopyd | |
15 | jobs. | |
16 | ||
17 | Running the following test, from the device mapper test suite [1], | |
18 | ||
19 | dmtest run --suite snapshot -n many_snapshots_of_same_volume_N | |
20 | ||
21 | with 8 active snapshots, results in the kcopyd job slab cache growing | |
22 | to 10G. Depending on the available system RAM this can lead to the OOM | |
23 | killer killing user processes: | |
24 | ||
25 | [463.492878] kthreadd invoked oom-killer: gfp_mask=0x6040c0(GFP_KERNEL|__GFP_COMP), | |
26 | nodemask=(null), order=1, oom_score_adj=0 | |
27 | [463.492894] kthreadd cpuset=/ mems_allowed=0 | |
28 | [463.492948] CPU: 7 PID: 2 Comm: kthreadd Not tainted 4.19.0-rc7 #3 | |
29 | [463.492950] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 | |
30 | [463.492952] Call Trace: | |
31 | [463.492964] dump_stack+0x7d/0xbb | |
32 | [463.492973] dump_header+0x6b/0x2fc | |
33 | [463.492987] ? lockdep_hardirqs_on+0xee/0x190 | |
34 | [463.493012] oom_kill_process+0x302/0x370 | |
35 | [463.493021] out_of_memory+0x113/0x560 | |
36 | [463.493030] __alloc_pages_slowpath+0xf40/0x1020 | |
37 | [463.493055] __alloc_pages_nodemask+0x348/0x3c0 | |
38 | [463.493067] cache_grow_begin+0x81/0x8b0 | |
39 | [463.493072] ? cache_grow_begin+0x874/0x8b0 | |
40 | [463.493078] fallback_alloc+0x1e4/0x280 | |
41 | [463.493092] kmem_cache_alloc_node+0xd6/0x370 | |
42 | [463.493098] ? copy_process.part.31+0x1c5/0x20d0 | |
43 | [463.493105] copy_process.part.31+0x1c5/0x20d0 | |
44 | [463.493115] ? __lock_acquire+0x3cc/0x1550 | |
45 | [463.493121] ? __switch_to_asm+0x34/0x70 | |
46 | [463.493129] ? kthread_create_worker_on_cpu+0x70/0x70 | |
47 | [463.493135] ? finish_task_switch+0x90/0x280 | |
48 | [463.493165] _do_fork+0xe0/0x6d0 | |
49 | [463.493191] ? kthreadd+0x19f/0x220 | |
50 | [463.493233] kernel_thread+0x25/0x30 | |
51 | [463.493235] kthreadd+0x1bf/0x220 | |
52 | [463.493242] ? kthread_create_on_cpu+0x90/0x90 | |
53 | [463.493248] ret_from_fork+0x3a/0x50 | |
54 | [463.493279] Mem-Info: | |
55 | [463.493285] active_anon:20631 inactive_anon:4831 isolated_anon:0 | |
56 | [463.493285] active_file:80216 inactive_file:80107 isolated_file:435 | |
57 | [463.493285] unevictable:0 dirty:51266 writeback:109372 unstable:0 | |
58 | [463.493285] slab_reclaimable:31191 slab_unreclaimable:3483521 | |
59 | [463.493285] mapped:526 shmem:4903 pagetables:1759 bounce:0 | |
60 | [463.493285] free:33623 free_pcp:2392 free_cma:0 | |
61 | ... | |
62 | [463.493489] Unreclaimable slab info: | |
63 | [463.493513] Name Used Total | |
64 | [463.493522] bio-6 1028KB 1028KB | |
65 | [463.493525] bio-5 1028KB 1028KB | |
66 | [463.493528] dm_snap_pending_exception 236783KB 243789KB | |
67 | [463.493531] dm_exception 41KB 42KB | |
68 | [463.493534] bio-4 1216KB 1216KB | |
69 | [463.493537] bio-3 439396KB 439396KB | |
70 | [463.493539] kcopyd_job 6973427KB 6973427KB | |
71 | ... | |
72 | [463.494340] Out of memory: Kill process 1298 (ruby2.3) score 1 or sacrifice child | |
73 | [463.494673] Killed process 1298 (ruby2.3) total-vm:435740kB, anon-rss:20180kB, file-rss:4kB, shmem-rss:0kB | |
74 | [463.506437] oom_reaper: reaped process 1298 (ruby2.3), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB | |
75 | ||
76 | Moreover, issuing a large number of kcopyd jobs results in kcopyd | |
77 | hogging the CPU, while processing them. As a result, processing of work | |
78 | items, queued for execution on the same CPU as the currently running | |
79 | kcopyd thread, is stalled for long periods of time, hurting performance. | |
80 | Running the aforementioned test we get, in dmesg, messages like the | |
81 | following: | |
82 | ||
83 | [67501.194592] BUG: workqueue lockup - pool cpus=4 node=0 flags=0x0 nice=0 stuck for 27s! | |
84 | [67501.195586] Showing busy workqueues and worker pools: | |
85 | [67501.195591] workqueue events: flags=0x0 | |
86 | [67501.195597] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256 | |
87 | [67501.195611] pending: cache_reap | |
88 | [67501.195641] workqueue mm_percpu_wq: flags=0x8 | |
89 | [67501.195645] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256 | |
90 | [67501.195656] pending: vmstat_update | |
91 | [67501.195682] workqueue kblockd: flags=0x18 | |
92 | [67501.195687] pwq 5: cpus=2 node=0 flags=0x0 nice=-20 active=1/256 | |
93 | [67501.195698] pending: blk_timeout_work | |
94 | [67501.195753] workqueue kcopyd: flags=0x8 | |
95 | [67501.195757] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256 | |
96 | [67501.195768] pending: do_work [dm_mod] | |
97 | [67501.195802] workqueue kcopyd: flags=0x8 | |
98 | [67501.195806] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256 | |
99 | [67501.195817] pending: do_work [dm_mod] | |
100 | [67501.195834] workqueue kcopyd: flags=0x8 | |
101 | [67501.195838] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256 | |
102 | [67501.195848] pending: do_work [dm_mod] | |
103 | [67501.195881] workqueue kcopyd: flags=0x8 | |
104 | [67501.195885] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256 | |
105 | [67501.195896] pending: do_work [dm_mod] | |
106 | [67501.195920] workqueue kcopyd: flags=0x8 | |
107 | [67501.195924] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=2/256 | |
108 | [67501.195935] in-flight: 67:do_work [dm_mod] | |
109 | [67501.195945] pending: do_work [dm_mod] | |
110 | [67501.195961] pool 8: cpus=4 node=0 flags=0x0 nice=0 hung=27s workers=3 idle: 129 23765 | |
111 | ||
112 | The root cause for these issues is the way dm-snapshot uses kcopyd. In | |
113 | particular, the lack of an explicit or implicit limit to the maximum | |
114 | number of in-flight COW jobs. The merging path is not affected because | |
115 | it implicitly limits the in-flight kcopyd jobs to one. | |
116 | ||
117 | Fix these issues by using a semaphore to limit the maximum number of | |
118 | in-flight kcopyd jobs. We grab the semaphore before allocating a new | |
119 | kcopyd job in start_copy() and start_full_bio() and release it after the | |
120 | job finishes in copy_callback(). | |
121 | ||
122 | The initial semaphore value is configurable through a module parameter, | |
123 | to allow fine tuning the maximum number of in-flight COW jobs. Setting | |
124 | this parameter to zero initializes the semaphore to INT_MAX. | |
125 | ||
126 | A default value of 2048 maximum in-flight kcopyd jobs was chosen. This | |
127 | value was decided experimentally as a trade-off between memory | |
128 | consumption, stalling the kernel's workqueues and maintaining a high | |
129 | enough throughput. | |
130 | ||
131 | Re-running the aforementioned test: | |
132 | ||
133 | * Workqueue stalls are eliminated | |
134 | * kcopyd's job slab cache uses a maximum of 130MB | |
135 | * The time taken by the test to write to the snapshot-origin target is | |
136 | reduced from 05m20.48s to 03m26.38s | |
137 | ||
138 | [1] https://github.com/jthornber/device-mapper-test-suite | |
139 | ||
140 | Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com> | |
141 | Signed-off-by: Ilias Tsitsimpis <iliastsi@arrikto.com> | |
142 | Signed-off-by: Mike Snitzer <snitzer@redhat.com> | |
143 | Signed-off-by: Sasha Levin <sashal@kernel.org> | |
144 | --- | |
145 | drivers/md/dm-snap.c | 22 ++++++++++++++++++++++ | |
146 | 1 file changed, 22 insertions(+) | |
147 | ||
148 | diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c | |
149 | index e108deebbaaa..5d3797728b9c 100644 | |
150 | --- a/drivers/md/dm-snap.c | |
151 | +++ b/drivers/md/dm-snap.c | |
152 | @@ -19,6 +19,7 @@ | |
153 | #include <linux/vmalloc.h> | |
154 | #include <linux/log2.h> | |
155 | #include <linux/dm-kcopyd.h> | |
156 | +#include <linux/semaphore.h> | |
157 | ||
158 | #include "dm.h" | |
159 | ||
160 | @@ -105,6 +106,9 @@ struct dm_snapshot { | |
161 | /* The on disk metadata handler */ | |
162 | struct dm_exception_store *store; | |
163 | ||
164 | + /* Maximum number of in-flight COW jobs. */ | |
165 | + struct semaphore cow_count; | |
166 | + | |
167 | struct dm_kcopyd_client *kcopyd_client; | |
168 | ||
169 | /* Wait for events based on state_bits */ | |
170 | @@ -145,6 +149,19 @@ struct dm_snapshot { | |
171 | #define RUNNING_MERGE 0 | |
172 | #define SHUTDOWN_MERGE 1 | |
173 | ||
174 | +/* | |
175 | + * Maximum number of chunks being copied on write. | |
176 | + * | |
177 | + * The value was decided experimentally as a trade-off between memory | |
178 | + * consumption, stalling the kernel's workqueues and maintaining a high enough | |
179 | + * throughput. | |
180 | + */ | |
181 | +#define DEFAULT_COW_THRESHOLD 2048 | |
182 | + | |
183 | +static int cow_threshold = DEFAULT_COW_THRESHOLD; | |
184 | +module_param_named(snapshot_cow_threshold, cow_threshold, int, 0644); | |
185 | +MODULE_PARM_DESC(snapshot_cow_threshold, "Maximum number of chunks being copied on write"); | |
186 | + | |
187 | DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle, | |
188 | "A percentage of time allocated for copy on write"); | |
189 | ||
190 | @@ -1190,6 +1207,8 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |
191 | goto bad_hash_tables; | |
192 | } | |
193 | ||
194 | + sema_init(&s->cow_count, (cow_threshold > 0) ? cow_threshold : INT_MAX); | |
195 | + | |
196 | s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle); | |
197 | if (IS_ERR(s->kcopyd_client)) { | |
198 | r = PTR_ERR(s->kcopyd_client); | |
199 | @@ -1563,6 +1582,7 @@ static void copy_callback(int read_err, unsigned long write_err, void *context) | |
200 | } | |
201 | list_add(&pe->out_of_order_entry, lh); | |
202 | } | |
203 | + up(&s->cow_count); | |
204 | } | |
205 | ||
206 | /* | |
207 | @@ -1586,6 +1606,7 @@ static void start_copy(struct dm_snap_pending_exception *pe) | |
208 | dest.count = src.count; | |
209 | ||
210 | /* Hand over to kcopyd */ | |
211 | + down(&s->cow_count); | |
212 | dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe); | |
213 | } | |
214 | ||
215 | @@ -1606,6 +1627,7 @@ static void start_full_bio(struct dm_snap_pending_exception *pe, | |
216 | pe->full_bio_end_io = bio->bi_end_io; | |
217 | pe->full_bio_private = bio->bi_private; | |
218 | ||
219 | + down(&s->cow_count); | |
220 | callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client, | |
221 | copy_callback, pe); | |
222 | ||
223 | -- | |
224 | 2.19.1 | |
225 |