/*
 * block_copy API
 *
 * Copyright (C) 2013 Proxmox Server Solutions
 * Copyright (c) 2019 Virtuozzo International GmbH.
 *
 * Authors:
 *  Dietmar Maurer (dietmar@proxmox.com)
 *  Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"

#include "trace.h"
#include "qapi/error.h"
#include "block/block-copy.h"
#include "sysemu/block-backend.h"
#include "qemu/units.h"

#define BLOCK_COPY_MAX_COPY_RANGE (16 * MiB)
#define BLOCK_COPY_MAX_BUFFER (1 * MiB)
#define BLOCK_COPY_MAX_MEM (128 * MiB)

typedef struct BlockCopyInFlightReq {
    int64_t offset;
    int64_t bytes;
    QLIST_ENTRY(BlockCopyInFlightReq) list;
    CoQueue wait_queue; /* coroutines blocked on this request */
} BlockCopyInFlightReq;

typedef struct BlockCopyState {
    /*
     * BdrvChild objects are not owned or managed by block-copy. They are
     * provided by the block-copy user, who is responsible for appropriate
     * permissions on these children.
     */
    BdrvChild *source;
    BdrvChild *target;
    BdrvDirtyBitmap *copy_bitmap;
    int64_t in_flight_bytes;
    int64_t cluster_size;
    bool use_copy_range;
    int64_t copy_size;
    uint64_t len;
    QLIST_HEAD(, BlockCopyInFlightReq) inflight_reqs;

    BdrvRequestFlags write_flags;

    /*
     * skip_unallocated:
     *
     * Used by sync=top jobs, which first scan the source node for unallocated
     * areas and clear them in the copy_bitmap. During this process, the bitmap
     * is thus not fully initialized: It may still have bits set for areas that
     * are unallocated and should actually not be copied.
     *
     * This is indicated by skip_unallocated.
     *
     * In this case, block_copy() will query the source's allocation status,
     * skip unallocated regions, clear them in the copy_bitmap, and invoke
     * block_copy_reset_unallocated() every time it does.
     */
    bool skip_unallocated;

    ProgressMeter *progress;
    /* progress_bytes_callback: called when some copying progress is done. */
    ProgressBytesCallbackFunc progress_bytes_callback;
    void *progress_opaque;

    SharedResource *mem;
} BlockCopyState;

static BlockCopyInFlightReq *find_conflicting_inflight_req(BlockCopyState *s,
                                                           int64_t offset,
                                                           int64_t bytes)
{
    BlockCopyInFlightReq *req;

    QLIST_FOREACH(req, &s->inflight_reqs, list) {
        if (offset + bytes > req->offset && offset < req->offset + req->bytes) {
            return req;
        }
    }

    return NULL;
}

/*
 * If there are no intersecting requests return false. Otherwise, wait for the
 * first found intersecting request to finish and return true.
 */
static bool coroutine_fn block_copy_wait_one(BlockCopyState *s, int64_t offset,
                                             int64_t bytes)
{
    BlockCopyInFlightReq *req = find_conflicting_inflight_req(s, offset, bytes);

    if (!req) {
        return false;
    }

    qemu_co_queue_wait(&req->wait_queue, NULL);

    return true;
}

/* Called only on full-dirty region */
static void block_copy_inflight_req_begin(BlockCopyState *s,
                                          BlockCopyInFlightReq *req,
                                          int64_t offset, int64_t bytes)
{
    assert(!find_conflicting_inflight_req(s, offset, bytes));

    bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
    s->in_flight_bytes += bytes;

    req->offset = offset;
    req->bytes = bytes;
    qemu_co_queue_init(&req->wait_queue);
    QLIST_INSERT_HEAD(&s->inflight_reqs, req, list);
}

/*
 * block_copy_inflight_req_shrink
 *
 * Drop the tail of the request to be handled later. Set dirty bits back and
 * wake up all requests waiting for us (some of them may no longer intersect
 * the shrunk request).
 */
static void coroutine_fn block_copy_inflight_req_shrink(BlockCopyState *s,
        BlockCopyInFlightReq *req, int64_t new_bytes)
{
    if (new_bytes == req->bytes) {
        return;
    }

    assert(new_bytes > 0 && new_bytes < req->bytes);

    s->in_flight_bytes -= req->bytes - new_bytes;
    bdrv_set_dirty_bitmap(s->copy_bitmap,
                          req->offset + new_bytes, req->bytes - new_bytes);

    req->bytes = new_bytes;
    qemu_co_queue_restart_all(&req->wait_queue);
}

static void coroutine_fn block_copy_inflight_req_end(BlockCopyState *s,
                                                     BlockCopyInFlightReq *req,
                                                     int ret)
{
    s->in_flight_bytes -= req->bytes;
    if (ret < 0) {
        bdrv_set_dirty_bitmap(s->copy_bitmap, req->offset, req->bytes);
    }
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}
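
/*
 * Illustrative sketch (not part of the API): the three helpers above are meant
 * to be used as a begin/shrink/end lifecycle around a single chunk copy,
 * roughly as block_copy_dirty_clusters() does below. Names such as
 * "chunk_bytes" and "status_bytes" are placeholders for this sketch only.
 *
 *     BlockCopyInFlightReq req;
 *
 *     block_copy_inflight_req_begin(s, &req, offset, chunk_bytes);
 *     chunk_bytes = MIN(chunk_bytes, status_bytes);    // work may shrink
 *     block_copy_inflight_req_shrink(s, &req, chunk_bytes);
 *     ret = block_copy_do_copy(s, offset, chunk_bytes, zeroes, error_is_read);
 *     block_copy_inflight_req_end(s, &req, ret);       // re-dirties on error
 */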

void block_copy_state_free(BlockCopyState *s)
{
    if (!s) {
        return;
    }

    bdrv_release_dirty_bitmap(s->copy_bitmap);
    shres_destroy(s->mem);
    g_free(s);
}

static uint32_t block_copy_max_transfer(BdrvChild *source, BdrvChild *target)
{
    return MIN_NON_ZERO(INT_MAX,
                        MIN_NON_ZERO(source->bs->bl.max_transfer,
                                     target->bs->bl.max_transfer));
}

BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
                                     int64_t cluster_size,
                                     BdrvRequestFlags write_flags, Error **errp)
{
    BlockCopyState *s;
    BdrvDirtyBitmap *copy_bitmap;

    copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL,
                                           errp);
    if (!copy_bitmap) {
        return NULL;
    }
    bdrv_disable_dirty_bitmap(copy_bitmap);

    s = g_new(BlockCopyState, 1);
    *s = (BlockCopyState) {
        .source = source,
        .target = target,
        .copy_bitmap = copy_bitmap,
        .cluster_size = cluster_size,
        .len = bdrv_dirty_bitmap_size(copy_bitmap),
        .write_flags = write_flags,
        .mem = shres_create(BLOCK_COPY_MAX_MEM),
    };

    if (block_copy_max_transfer(source, target) < cluster_size) {
        /*
         * copy_range does not respect max_transfer. We don't want to bother
         * with requests smaller than the block-copy cluster size, so fall back
         * to buffered copying (read and write respect max_transfer on their
         * own).
         */
        s->use_copy_range = false;
        s->copy_size = cluster_size;
    } else if (write_flags & BDRV_REQ_WRITE_COMPRESSED) {
        /* Compression supports only cluster-size writes and no copy-range. */
        s->use_copy_range = false;
        s->copy_size = cluster_size;
    } else {
        /*
         * We enable copy-range, but keep a small copy_size until the first
         * successful copy_range (see block_copy_do_copy).
         */
        s->use_copy_range = true;
        s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
    }

    QLIST_INIT(&s->inflight_reqs);

    return s;
}
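
/*
 * Illustrative sketch (not a definitive reference): a typical user, for
 * example a backup-style job, creates the state from two BdrvChild objects it
 * already holds appropriate permissions for and then wires up progress
 * reporting. The names source_child, target_child, job_pm, my_progress_cb and
 * my_opaque are placeholders for this sketch only.
 *
 *     BlockCopyState *bcs;
 *
 *     bcs = block_copy_state_new(source_child, target_child, cluster_size,
 *                                write_flags, errp);
 *     if (!bcs) {
 *         return;                          // errp is already set
 *     }
 *     block_copy_set_progress_meter(bcs, job_pm);
 *     block_copy_set_progress_callback(bcs, my_progress_cb, my_opaque);
 *     ...
 *     block_copy_state_free(bcs);
 */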

void block_copy_set_progress_callback(
        BlockCopyState *s,
        ProgressBytesCallbackFunc progress_bytes_callback,
        void *progress_opaque)
{
    s->progress_bytes_callback = progress_bytes_callback;
    s->progress_opaque = progress_opaque;
}

void block_copy_set_progress_meter(BlockCopyState *s, ProgressMeter *pm)
{
    s->progress = pm;
}

/*
 * block_copy_do_copy
 *
 * Copy a cluster-aligned chunk. The requested region is allowed to exceed
 * s->len only to cover the last cluster when s->len is not aligned to
 * clusters.
 *
 * No synchronization here: neither bitmap nor intersecting-request handling,
 * only the copy itself.
 *
 * Returns 0 on success.
 */
static int coroutine_fn block_copy_do_copy(BlockCopyState *s,
                                           int64_t offset, int64_t bytes,
                                           bool zeroes, bool *error_is_read)
{
    int ret;
    int64_t nbytes = MIN(offset + bytes, s->len) - offset;
    void *bounce_buffer = NULL;

    assert(offset >= 0 && bytes > 0 && INT64_MAX - offset >= bytes);
    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));
    assert(offset < s->len);
    assert(offset + bytes <= s->len ||
           offset + bytes == QEMU_ALIGN_UP(s->len, s->cluster_size));
    assert(nbytes < INT_MAX);

    if (zeroes) {
        ret = bdrv_co_pwrite_zeroes(s->target, offset, nbytes, s->write_flags &
                                    ~BDRV_REQ_WRITE_COMPRESSED);
        if (ret < 0) {
            trace_block_copy_write_zeroes_fail(s, offset, ret);
            if (error_is_read) {
                *error_is_read = false;
            }
        }
        return ret;
    }

    if (s->use_copy_range) {
        ret = bdrv_co_copy_range(s->source, offset, s->target, offset, nbytes,
                                 0, s->write_flags);
        if (ret < 0) {
            trace_block_copy_copy_range_fail(s, offset, ret);
            s->use_copy_range = false;
            s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
            /* Fall back to read+write with an allocated buffer */
        } else {
            if (s->use_copy_range) {
                /*
                 * Successful copy-range. Now increase copy_size. copy_range
                 * does not respect max_transfer (it's a TODO), so we factor
                 * that in here.
                 *
                 * Note: we double-check s->use_copy_range for the case when a
                 * parallel block-copy request unset it during the
                 * bdrv_co_copy_range() call above.
                 */
                s->copy_size =
                        MIN(MAX(s->cluster_size, BLOCK_COPY_MAX_COPY_RANGE),
                            QEMU_ALIGN_DOWN(block_copy_max_transfer(s->source,
                                                                    s->target),
                                            s->cluster_size));
            }
            goto out;
        }
    }

    /*
     * In case of a failed copy_range request above, we may proceed with a
     * buffered request larger than BLOCK_COPY_MAX_BUFFER. Still, further
     * requests will be properly limited, so this is not a big concern.
     * Moreover, the most likely case (copy_range is unsupported for the
     * configuration, so the very first copy_range request fails) is handled by
     * enlarging copy_size only after the first successful copy_range.
     */

    bounce_buffer = qemu_blockalign(s->source->bs, nbytes);

    ret = bdrv_co_pread(s->source, offset, nbytes, bounce_buffer, 0);
    if (ret < 0) {
        trace_block_copy_read_fail(s, offset, ret);
        if (error_is_read) {
            *error_is_read = true;
        }
        goto out;
    }

    ret = bdrv_co_pwrite(s->target, offset, nbytes, bounce_buffer,
                         s->write_flags);
    if (ret < 0) {
        trace_block_copy_write_fail(s, offset, ret);
        if (error_is_read) {
            *error_is_read = false;
        }
        goto out;
    }

out:
    qemu_vfree(bounce_buffer);

    return ret;
}

static int block_copy_block_status(BlockCopyState *s, int64_t offset,
                                   int64_t bytes, int64_t *pnum)
{
    int64_t num;
    BlockDriverState *base;
    int ret;

    if (s->skip_unallocated && s->source->bs->backing) {
        base = s->source->bs->backing->bs;
    } else {
        base = NULL;
    }

    ret = bdrv_block_status_above(s->source->bs, base, offset, bytes, &num,
                                  NULL, NULL);
    if (ret < 0 || num < s->cluster_size) {
        /*
         * On error, or if we failed to obtain a large enough chunk, just fall
         * back to copying one cluster.
         */
        num = s->cluster_size;
        ret = BDRV_BLOCK_ALLOCATED | BDRV_BLOCK_DATA;
    } else if (offset + num == s->len) {
        num = QEMU_ALIGN_UP(num, s->cluster_size);
    } else {
        num = QEMU_ALIGN_DOWN(num, s->cluster_size);
    }

    *pnum = num;
    return ret;
}
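
/*
 * Worked example of the rounding above (numbers chosen for illustration only):
 * with cluster_size = 64 KiB and bdrv_block_status_above() reporting
 * num = 100 KiB of one status, *pnum is rounded down to 64 KiB so the chunk
 * stays cluster-aligned; the remaining 36 KiB are queried again on the next
 * iteration. Only at the end of the image (offset + num == s->len) is num
 * rounded up, so the final partial cluster is still covered.
 */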

/*
 * Check if the cluster starting at offset is allocated or not.
 * Return via pnum the number of contiguous clusters sharing this allocation.
 */
static int block_copy_is_cluster_allocated(BlockCopyState *s, int64_t offset,
                                           int64_t *pnum)
{
    BlockDriverState *bs = s->source->bs;
    int64_t count, total_count = 0;
    int64_t bytes = s->len - offset;
    int ret;

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));

    while (true) {
        ret = bdrv_is_allocated(bs, offset, bytes, &count);
        if (ret < 0) {
            return ret;
        }

        total_count += count;

        if (ret || count == 0) {
            /*
             * ret: partial segment(s) are considered allocated.
             * otherwise: unallocated tail is treated as an entire segment.
             */
            *pnum = DIV_ROUND_UP(total_count, s->cluster_size);
            return ret;
        }

        /* Unallocated segment(s) with uncertain following segment(s) */
        if (total_count >= s->cluster_size) {
            *pnum = total_count / s->cluster_size;
            return 0;
        }

        offset += count;
        bytes -= count;
    }
}

/*
 * Reset bits in copy_bitmap starting at offset if they represent unallocated
 * data in the image. May reset subsequent contiguous bits.
 * @return 0 when the cluster at @offset was unallocated,
 *         1 otherwise, and -ret on error.
 */
int64_t block_copy_reset_unallocated(BlockCopyState *s,
                                     int64_t offset, int64_t *count)
{
    int ret;
    int64_t clusters, bytes;

    ret = block_copy_is_cluster_allocated(s, offset, &clusters);
    if (ret < 0) {
        return ret;
    }

    bytes = clusters * s->cluster_size;

    if (!ret) {
        bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
        progress_set_remaining(s->progress,
                               bdrv_get_dirty_count(s->copy_bitmap) +
                               s->in_flight_bytes);
    }

    *count = bytes;
    return ret;
}
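
/*
 * Illustrative sketch (placeholder loop, not taken from a real caller): a
 * sync=top job that enabled skip_unallocated would typically pre-scan the
 * source with this helper, so that unallocated areas are dropped from the
 * copy_bitmap and from the progress total before copying starts. "image_len"
 * stands for the caller-known source length.
 *
 *     int64_t offset = 0, count;
 *
 *     while (offset < image_len) {
 *         ret = block_copy_reset_unallocated(s, offset, &count);
 *         if (ret < 0) {
 *             break;                       // error handling is caller-specific
 *         }
 *         offset += count;
 *     }
 */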

/*
 * block_copy_dirty_clusters
 *
 * Copy dirty clusters in the @offset/@bytes range.
 * Returns 1 if dirty clusters were found and successfully copied, 0 if no
 * dirty clusters were found, and -errno on failure.
 */
static int coroutine_fn block_copy_dirty_clusters(BlockCopyState *s,
                                                  int64_t offset, int64_t bytes,
                                                  bool *error_is_read)
{
    int ret = 0;
    bool found_dirty = false;

    /*
     * The block_copy() user is responsible for keeping source and target in
     * the same aio context.
     */
    assert(bdrv_get_aio_context(s->source->bs) ==
           bdrv_get_aio_context(s->target->bs));

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));

    while (bytes) {
        BlockCopyInFlightReq req;
        int64_t next_zero, cur_bytes, status_bytes;

        if (!bdrv_dirty_bitmap_get(s->copy_bitmap, offset)) {
            trace_block_copy_skip(s, offset);
            offset += s->cluster_size;
            bytes -= s->cluster_size;
            continue; /* already copied */
        }

        found_dirty = true;

        cur_bytes = MIN(bytes, s->copy_size);

        next_zero = bdrv_dirty_bitmap_next_zero(s->copy_bitmap, offset,
                                                cur_bytes);
        if (next_zero >= 0) {
            assert(next_zero > offset); /* offset is dirty */
            assert(next_zero < offset + cur_bytes); /* no need to do MIN() */
            cur_bytes = next_zero - offset;
        }
        block_copy_inflight_req_begin(s, &req, offset, cur_bytes);

        ret = block_copy_block_status(s, offset, cur_bytes, &status_bytes);
        assert(ret >= 0); /* never fails */
        cur_bytes = MIN(cur_bytes, status_bytes);
        block_copy_inflight_req_shrink(s, &req, cur_bytes);
        if (s->skip_unallocated && !(ret & BDRV_BLOCK_ALLOCATED)) {
            block_copy_inflight_req_end(s, &req, 0);
            progress_set_remaining(s->progress,
                                   bdrv_get_dirty_count(s->copy_bitmap) +
                                   s->in_flight_bytes);
            trace_block_copy_skip_range(s, offset, status_bytes);
            offset += status_bytes;
            bytes -= status_bytes;
            continue;
        }

        trace_block_copy_process(s, offset);

        co_get_from_shres(s->mem, cur_bytes);
        ret = block_copy_do_copy(s, offset, cur_bytes, ret & BDRV_BLOCK_ZERO,
                                 error_is_read);
        co_put_to_shres(s->mem, cur_bytes);
        block_copy_inflight_req_end(s, &req, ret);
        if (ret < 0) {
            return ret;
        }

        progress_work_done(s->progress, cur_bytes);
        s->progress_bytes_callback(cur_bytes, s->progress_opaque);
        offset += cur_bytes;
        bytes -= cur_bytes;
    }

    return found_dirty;
}

/*
 * block_copy
 *
 * Copy the requested region according to the dirty bitmap.
 * Collaborate with parallel block_copy requests: if they succeed, it will help
 * us. If they fail, we will retry not-copied regions. So, if we return an
 * error, it means that some I/O operation failed in the context of _this_
 * block_copy call, not of some parallel operation.
 */
int coroutine_fn block_copy(BlockCopyState *s, int64_t offset, int64_t bytes,
                            bool *error_is_read)
{
    int ret;

    do {
        ret = block_copy_dirty_clusters(s, offset, bytes, error_is_read);

        if (ret == 0) {
            ret = block_copy_wait_one(s, offset, bytes);
        }

        /*
         * We retry in two cases:
         * 1. Some progress was done.
         *    Something was copied, which means that there were yield points
         *    and some new dirty bits may have appeared (due to failed parallel
         *    block-copy requests).
         * 2. We have waited for some intersecting block-copy request.
         *    It may have failed and produced new dirty bits.
         */
    } while (ret > 0);

    return ret;
}
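
/*
 * Illustrative sketch (a hypothetical coroutine caller, not code from this
 * file): block_copy() must run in coroutine context, with offset and bytes
 * aligned to the cluster size chosen at block_copy_state_new() time. With
 * "bcs" created as shown earlier and "image_len" standing for the source
 * length, a minimal whole-image invocation could look like this:
 *
 *     bool error_is_read = false;
 *     int ret;
 *
 *     ret = block_copy(bcs, 0, QEMU_ALIGN_UP(image_len, cluster_size),
 *                      &error_is_read);
 *     if (ret < 0) {
 *         // error_is_read tells whether the failing I/O was the read side
 *     }
 */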

BdrvDirtyBitmap *block_copy_dirty_bitmap(BlockCopyState *s)
{
    return s->copy_bitmap;
}

void block_copy_set_skip_unallocated(BlockCopyState *s, bool skip)
{
    s->skip_unallocated = skip;
}