/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor/monitor.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/module.h"
#include "qapi/qmp/qjson.h"
#include "sysemu/sysemu.h"
#include "qemu/notify.h"
#include "block/coroutine.h"
#include "block/qapi.h"
#include "qmp-commands.h"
#include "qemu/timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

struct BdrvDirtyBitmap {
    HBitmap *bitmap;
    QLIST_ENTRY(BdrvDirtyBitmap) list;
};

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        ThrottleConfig *cfg)
{
    int i;

    throttle_config(&bs->throttle_state, cfg);

    for (i = 0; i < 2; i++) {
        qemu_co_enter_next(&bs->throttled_reqs[i]);
    }
}

/* this function drains all the throttled IOs */
static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
{
    bool drained = false;
    bool enabled = bs->io_limits_enabled;
    int i;

    bs->io_limits_enabled = false;

    for (i = 0; i < 2; i++) {
        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
            drained = true;
        }
    }

    bs->io_limits_enabled = enabled;

    return drained;
}

void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    bdrv_start_throttled_reqs(bs);

    throttle_destroy(&bs->throttle_state);
}

static void bdrv_throttle_read_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[0]);
}

static void bdrv_throttle_write_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    qemu_co_enter_next(&bs->throttled_reqs[1]);
}

/* should be called before bdrv_set_io_limits if a limit is set */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    assert(!bs->io_limits_enabled);
    throttle_init(&bs->throttle_state,
                  QEMU_CLOCK_VIRTUAL,
                  bdrv_throttle_read_timer_cb,
                  bdrv_throttle_write_timer_cb,
                  bs);
    bs->io_limits_enabled = true;
}
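
/* Illustrative call sequence (a sketch, not code from this file): a caller
 * that wants to throttle a drive enables limits first and then applies a
 * ThrottleConfig filled in by the caller, matching the ordering constraint
 * documented above:
 *
 *     ThrottleConfig cfg = { ... };   // bucket sizes/rates chosen by caller
 *     bdrv_io_limits_enable(bs);      // must come before bdrv_set_io_limits
 *     bdrv_set_io_limits(bs, &cfg);
 */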

/* This function makes an IO wait if needed
 *
 * @bytes: the number of bytes of this IO
 * @is_write: is the IO a write
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     unsigned int bytes,
                                     bool is_write)
{
    /* does this IO need to wait? */
    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);

    /* if it must wait, or any request of this type is already throttled,
     * queue this IO */
    if (must_wait ||
        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
    }

    /* the IO will be executed, do the accounting */
    throttle_account(&bs->throttle_state, is_write, bytes);

    /* if the next request must wait -> do nothing */
    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
        return;
    }

    /* else queue next request for execution */
    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
}

size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
    if (!bs || !bs->drv) {
        /* 4k should be on the safe side */
        return 4096;
    }

    return bs->bl.opt_mem_alignment;
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
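
/* Example (illustrative, hypothetical paths): with
 * base_path = "/images/base.qcow2" and filename = "backing.qcow2",
 * path_combine() yields "/images/backing.qcow2"; an absolute filename such
 * as "/other/backing.qcow2" is copied through unchanged.
 */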

void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    QLIST_INIT(&bs->dirty_bitmaps);
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
    }
    bdrv_iostatus_disable(bs);
    notifier_list_init(&bs->close_notifiers);
    notifier_with_return_list_init(&bs->before_write_notifiers);
    qemu_co_queue_init(&bs->throttled_reqs[0]);
    qemu_co_queue_init(&bs->throttled_reqs[1]);
    bs->refcnt = 1;

    return bs;
}

void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
{
    notifier_list_add(&bs->close_notifiers, notify);
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
{
    static const char *whitelist_rw[] = {
        CONFIG_BDRV_RW_WHITELIST
    };
    static const char *whitelist_ro[] = {
        CONFIG_BDRV_RO_WHITELIST
    };
    const char **p;

    if (!whitelist_rw[0] && !whitelist_ro[0]) {
        return 1;               /* no whitelist, anything goes */
    }

    for (p = whitelist_rw; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    if (read_only) {
        for (p = whitelist_ro; *p; p++) {
            if (!strcmp(drv->format_name, *p)) {
                return 1;
            }
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
                                          bool read_only)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
    Error *err;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    Error *local_err = NULL;
    int ret;

    CreateCo *cco = opaque;
    assert(cco->drv);

    ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
    if (local_err) {
        error_propagate(&cco->err, local_err);
    }
    cco->ret = ret;
}

int bdrv_create(BlockDriver *drv, const char* filename,
                QEMUOptionParameter *options, Error **errp)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
        .err = NULL,
    };

    if (!drv->bdrv_create) {
        error_setg(errp, "Driver '%s' does not support image creation",
                   drv->format_name);
        ret = -ENOTSUP;
        goto out;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;
    if (ret < 0) {
        if (cco.err) {
            error_propagate(errp, cco.err);
        } else {
            error_setg_errno(errp, -ret, "Could not create image");
        }
    }

out:
    g_free(cco.filename);
    return ret;
}

int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
                     Error **errp)
{
    BlockDriver *drv;
    Error *local_err = NULL;
    int ret;

    drv = bdrv_find_protocol(filename, true);
    if (drv == NULL) {
        error_setg(errp, "Could not find protocol for file '%s'", filename);
        return -ENOENT;
    }

    ret = bdrv_create(drv, filename, options, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}

int bdrv_refresh_limits(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return 0;
    }

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file);
        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
    } else {
        bs->bl.opt_mem_alignment = 512;
    }

    if (bs->backing_hd) {
        bdrv_refresh_limits(bs->backing_hd);
        bs->bl.opt_transfer_length =
            MAX(bs->bl.opt_transfer_length,
                bs->backing_hd->bl.opt_transfer_length);
        bs->bl.opt_mem_alignment =
            MAX(bs->bl.opt_mem_alignment,
                bs->backing_hd->bl.opt_mem_alignment);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        return drv->bdrv_refresh_limits(bs);
    }

    return 0;
}

/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater. */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
        return bdrv_find_format("file");
    }

    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
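
/* Example (illustrative): a filename like "nbd:localhost:10809" has a
 * protocol prefix, so the prefix before the first ':' is matched against
 * each driver's protocol_name and selects the "nbd" protocol driver; a plain
 * relative path like "disk.img" has no prefix and falls back to the "file"
 * driver; a host device path may be claimed earlier by find_hdev_driver()
 * as described in the comment above.
 */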

static int find_image_format(BlockDriverState *bs, const char *filename,
                             BlockDriver **pdrv, Error **errp)
{
    int score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    int ret = 0;

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
        drv = bdrv_find_format("raw");
        if (!drv) {
            error_setg(errp, "Could not find raw image format");
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read image for determining its "
                         "format");
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        error_setg(errp, "Could not determine image format: No compatible "
                   "driver found");
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given discard mode
 *
 * Return 0 on success, -1 if the discard mode was invalid.
 */
int bdrv_parse_discard_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_UNMAP;

    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
        /* do nothing */
    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
        *flags |= BDRV_O_UNMAP;
    } else {
        return -1;
    }

    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
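
/* Summary of the cache-mode mapping implemented above:
 *
 *   mode          BDRV_O_NOCACHE  BDRV_O_CACHE_WB  BDRV_O_NO_FLUSH
 *   off/none      yes             yes              no
 *   directsync    yes             no               no
 *   writeback     no              yes              no
 *   unsafe        no              yes              yes
 *   writethrough  no              no               no   (the default)
 */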

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

static int bdrv_open_flags(BlockDriverState *bs, int flags)
{
    int open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    return open_flags;
}

static int bdrv_assign_node_name(BlockDriverState *bs,
                                 const char *node_name,
                                 Error **errp)
{
    if (!node_name) {
        return 0;
    }

    /* an empty string node name is invalid */
    if (node_name[0] == '\0') {
        error_setg(errp, "Empty node name");
        return -EINVAL;
    }

    /* takes care of avoiding namespace collisions */
    if (bdrv_find(node_name)) {
        error_setg(errp, "node-name=%s is conflicting with a device id",
                   node_name);
        return -EINVAL;
    }

    /* takes care of avoiding duplicate node names */
    if (bdrv_find_node(node_name)) {
        error_setg(errp, "Duplicate node name");
        return -EINVAL;
    }

    /* copy node name into the bs and insert it into the graph list */
    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);

    return 0;
}

/*
 * Common part for opening disk images and files
 *
 * Removes all processed options from *options.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
    QDict *options, int flags, BlockDriver *drv, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *node_name = NULL;
    Error *local_err = NULL;

    assert(drv != NULL);
    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    if (file != NULL) {
        filename = file->filename;
    } else {
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && !filename) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        return -EINVAL;
    }

    trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);

    node_name = qdict_get_try_str(options, "node-name");
    ret = bdrv_assign_node_name(bs, node_name, errp);
    if (ret < 0) {
        return ret;
    }
    qdict_del(options, "node-name");

    /* bdrv_open() was called directly with a protocol driver as drv. That
     * layer is already opened, so assign it to bs (while file becomes a
     * closed BlockDriverState) and return immediately. */
    if (file != NULL && drv->bdrv_file_open) {
        bdrv_swap(file, bs);
        return 0;
    }

    bs->open_flags = flags;
    bs->guest_block_size = 512;
    bs->request_alignment = 512;
    bs->zero_beyond_eof = true;
    open_flags = bdrv_open_flags(bs, flags);
    bs->read_only = !(open_flags & BDRV_O_RDWR);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        error_setg(errp,
                   !bs->read_only && bdrv_is_whitelisted(drv, true)
                        ? "Driver '%s' can only be used for read-only devices"
                        : "Driver '%s' is not whitelisted",
                   drv->format_name);
        return -ENOTSUP;
    }

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if (flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            return -EINVAL;
        }
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        assert(file == NULL);
        assert(!drv->bdrv_needs_filename || filename != NULL);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else {
        if (file == NULL) {
            error_setg(errp, "Can't use '%s' as a block driver for the "
                       "protocol level", drv->format_name);
            ret = -EINVAL;
            goto free_and_fail;
        }
        bs->file = file;
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    }

    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        goto free_and_fail;
    }

    bdrv_refresh_limits(bs);
    assert(bdrv_opt_mem_align(bs) != 0);
    assert(bs->request_alignment != 0);

#ifndef _WIN32
    if (bs->is_temporary) {
        assert(bs->filename[0] != '\0');
        unlink(bs->filename);
    }
#endif
    return 0;

free_and_fail:
    bs->file = NULL;
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_file_open.
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename,
                   const char *reference, QDict *options, int flags,
                   Error **errp)
{
    BlockDriverState *bs = NULL;
    BlockDriver *drv;
    const char *drvname;
    bool allow_protocol_prefix = false;
    Error *local_err = NULL;
    int ret;

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    if (reference) {
        if (filename || qdict_size(options)) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }
        QDECREF(options);

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    bs = bdrv_new("");
    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Fetch the file name from the options QDict if necessary */
    if (!filename) {
        filename = qdict_get_try_str(options, "filename");
    } else if (filename && !qdict_haskey(options, "filename")) {
        qdict_put(options, "filename", qstring_from_str(filename));
        allow_protocol_prefix = true;
    } else {
        error_setg(errp, "Can't specify 'file' and 'filename' options at the "
                   "same time");
        ret = -EINVAL;
        goto fail;
    }

    /* Find the right block driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        if (!drv) {
            error_setg(errp, "Unknown driver '%s'", drvname);
        }
        qdict_del(options, "driver");
    } else if (filename) {
        drv = bdrv_find_protocol(filename, allow_protocol_prefix);
        if (!drv) {
            error_setg(errp, "Unknown protocol");
        }
    } else {
        error_setg(errp, "Must specify either driver or file");
        drv = NULL;
    }

    if (!drv) {
        /* errp has been set already */
        ret = -ENOENT;
        goto fail;
    }

    /* Parse the filename and open it */
    if (drv->bdrv_parse_filename && filename) {
        drv->bdrv_parse_filename(filename, options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            ret = -EINVAL;
            goto fail;
        }
        qdict_del(options, "filename");
    }

    if (!drv->bdrv_file_open) {
        ret = bdrv_open(&bs, filename, options, flags, drv, &local_err);
        options = NULL;
    } else {
        ret = bdrv_open_common(bs, NULL, options, flags, drv, &local_err);
    }
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto fail;
    }

    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        error_setg(errp, "Block protocol '%s' doesn't support the option '%s'",
                   drv->format_name, entry->key);
        ret = -EINVAL;
        goto fail;
    }
    QDECREF(options);

    bs->growable = 1;
    *pbs = bs;
    return 0;

fail:
    QDECREF(options);
    if (!bs->drv) {
        QDECREF(bs->options);
    }
    bdrv_unref(bs);
    return ret;
}
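
/* Minimal usage sketch (illustrative, error handling elided): open an image
 * at the protocol level only, so no format probing happens here:
 *
 *     BlockDriverState *bs = NULL;
 *     Error *err = NULL;
 *     if (bdrv_file_open(&bs, "disk.img", NULL, NULL, BDRV_O_RDWR, &err) < 0) {
 *         ...report err...
 *     }
 *
 * "disk.img" is a hypothetical path; passing NULL for options stands for an
 * empty QDict, as documented above.
 */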

/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict is transferred to this
 * function (even on failure), so if the caller intends to reuse the dictionary,
 * it needs to use QINCREF() before calling bdrv_open_backing_file.
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char backing_filename[PATH_MAX];
    int back_flags, ret;
    BlockDriver *back_drv = NULL;
    Error *local_err = NULL;

    if (bs->backing_hd != NULL) {
        QDECREF(options);
        return 0;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        QDECREF(options);
        return 0;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename,
                                       sizeof(backing_filename));
    }

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    /* backing files are always opened read-only */
    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT |
                                    BDRV_O_COPY_ON_READ);

    assert(bs->backing_hd == NULL);
    ret = bdrv_open(&bs->backing_hd,
                    *backing_filename ? backing_filename : NULL, options,
                    back_flags, back_drv, &local_err);
    if (ret < 0) {
        bs->backing_hd = NULL;
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        return ret;
    }

    if (bs->backing_hd->file) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file),
                bs->backing_hd->file->filename);
    }

    /* Recalculate the BlockLimits with the backing file */
    bdrv_refresh_limits(bs);

    return 0;
}

/*
 * Opens a disk image whose options are given as BlockdevRef in another block
 * device's options.
 *
 * If force_raw is true, bdrv_file_open() will be used, thereby preventing any
 * image format auto-detection. If it is false and a filename is given,
 * bdrv_open() will be used for auto-detection.
 *
 * If allow_none is true, no image will be opened if filename is NULL and no
 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * The BlockdevRef will be removed from the options QDict.
 *
 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
 */
int bdrv_open_image(BlockDriverState **pbs, const char *filename,
                    QDict *options, const char *bdref_key, int flags,
                    bool force_raw, bool allow_none, Error **errp)
{
    QDict *image_options;
    int ret;
    char *bdref_key_dot;
    const char *reference;

    assert(pbs);
    assert(*pbs == NULL);

    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
    g_free(bdref_key_dot);

    reference = qdict_get_try_str(options, bdref_key);
    if (!filename && !reference && !qdict_size(image_options)) {
        if (allow_none) {
            ret = 0;
        } else {
            error_setg(errp, "A block device must be specified for \"%s\"",
                       bdref_key);
            ret = -EINVAL;
        }
        goto done;
    }

    if (filename && !force_raw) {
        /* If a filename is given and the block driver should be detected
           automatically (instead of using none), use bdrv_open() in order to do
           that auto-detection. */
        if (reference) {
            error_setg(errp, "Cannot reference an existing block device while "
                       "giving a filename");
            ret = -EINVAL;
            goto done;
        }

        ret = bdrv_open(pbs, filename, image_options, flags, NULL, errp);
    } else {
        ret = bdrv_file_open(pbs, filename, reference, image_options, flags,
                             errp);
    }

done:
    qdict_del(options, bdref_key);
    return ret;
}
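
/* Example (illustrative, hypothetical values): with bdref_key = "file" and a
 * flattened options QDict containing
 * { "file.driver": "file", "file.filename": "disk.img" }, the "file."
 * sub-options form the BlockdevRef and are consumed here; alternatively
 * { "file": "node0" } would reference an already-existing block device.
 */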

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use QINCREF() before calling bdrv_open.
 *
 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
 * If it is not NULL, the referenced BDS will be reused.
 */
int bdrv_open(BlockDriverState **pbs, const char *filename, QDict *options,
              int flags, BlockDriver *drv, Error **errp)
{
    int ret;
    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
    char tmp_filename[PATH_MAX + 1];
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;

    assert(pbs);

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new("");
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->options = options;
    options = qdict_clone_shallow(options);

    /* For snapshot=on, create a temporary qcow2 overlay */
    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *create_options;
        QDict *snapshot_options;

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* Get the required size from the image */
        QINCREF(options);
        bs1 = NULL;
        ret = bdrv_open(&bs1, filename, options, BDRV_O_NO_BACKING,
                        drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        bdrv_unref(bs1);

        /* Create the temporary image */
        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not get temporary filename");
            goto fail;
        }

        bdrv_qcow2 = bdrv_find_format("qcow2");
        create_options = parse_option_parameters("", bdrv_qcow2->create_options,
                                                 NULL);

        set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);

        ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
        free_option_parameters(create_options);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not create temporary overlay "
                             "'%s': %s", tmp_filename,
                             error_get_pretty(local_err));
            error_free(local_err);
            local_err = NULL;
            goto fail;
        }

        /* Prepare a new options QDict for the temporary file, where user
         * options refer to the backing file */
        if (filename) {
            qdict_put(options, "file.filename", qstring_from_str(filename));
        }
        if (drv) {
            qdict_put(options, "driver", qstring_from_str(drv->format_name));
        }

        snapshot_options = qdict_new();
        qdict_put(snapshot_options, "backing", options);
        qdict_flatten(snapshot_options);

        bs->options = snapshot_options;
        options = qdict_clone_shallow(bs->options);

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Open image file without format layer */
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }

    assert(file == NULL);
    ret = bdrv_open_image(&file, filename, options, "file",
                          bdrv_open_flags(bs, flags | BDRV_O_UNMAP), true, true,
                          &local_err);
    if (ret < 0) {
        goto fail;
    }

    /* Find the right image format driver */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Invalid driver: '%s'", drvname);
            ret = -EINVAL;
            goto unlink_and_fail;
        }
    }

    if (!drv) {
        if (file) {
            ret = find_image_format(file, filename, &drv, &local_err);
        } else {
            error_setg(errp, "Must specify either driver or file");
            ret = -EINVAL;
            goto unlink_and_fail;
        }
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    /* Check if any unknown options were used */
    if (qdict_size(options) != 0) {
        const QDictEntry *entry = qdict_first(options);
        error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                   "support the option '%s'", drv->format_name, bs->device_name,
                   entry->key);

        ret = -EINVAL;
        goto close_and_fail;
    }
    QDECREF(options);

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    *pbs = bs;
    return 0;

unlink_and_fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    if (bs->is_temporary) {
        unlink(filename);
    }
fail:
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
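
/* Minimal usage sketch (illustrative, error handling elided): open an image
 * with format probing, matching the contract above (*pbs == NULL asks for a
 * fresh BDS; drv == NULL lets find_image_format() probe the format):
 *
 *     BlockDriverState *bs = NULL;
 *     Error *err = NULL;
 *     if (bdrv_open(&bs, "disk.qcow2", NULL, BDRV_O_RDWR, NULL, &err) < 0) {
 *         ...report err...
 *     }
 *
 * "disk.qcow2" is a hypothetical path.
 */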

typedef struct BlockReopenQueueEntry {
    bool prepared;
    BDRVReopenState state;
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;

/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
 * already performed, or alternatively may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue.
 *
 * flags contains the open flags for the associated bs
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 */
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, int flags)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QSIMPLEQ_INIT(bs_queue);
    }

    if (bs->file) {
        bdrv_reopen_queue(bs_queue, bs->file, flags);
    }

    bs_entry = g_new0(BlockReopenQueueEntry, 1);
    QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);

    bs_entry->state.bs = bs;
    bs_entry->state.flags = flags;

    return bs_queue;
}

/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags. All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    Error *local_err = NULL;

    assert(bs_queue != NULL);

    bdrv_drain_all();

    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
            error_propagate(errp, local_err);
            goto cleanup;
        }
        bs_entry->prepared = true;
    }

    /* If we reach this point, we have success and just need to apply the
     * changes
     */
    QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
        bdrv_reopen_commit(&bs_entry->state);
    }

    ret = 0;

cleanup:
    QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (ret && bs_entry->prepared) {
            bdrv_reopen_abort(&bs_entry->state);
        }
        g_free(bs_entry);
    }
    g_free(bs_queue);
    return ret;
}

/* Reopen a single BlockDriverState with the specified flags. */
int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);

    ret = bdrv_reopen_multiple(queue, &local_err);
    if (local_err != NULL) {
        error_propagate(errp, local_err);
    }
    return ret;
}
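
/* Illustrative multi-device sketch (not code from this file): queue several
 * BDSes and reopen them as one atomic set, generalizing what bdrv_reopen()
 * does above for a single device:
 *
 *     BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs_a, flags_a);
 *     queue = bdrv_reopen_queue(queue, bs_b, flags_b);
 *     ret = bdrv_reopen_multiple(queue, &err);
 *
 * bs_a/bs_b and the flag variables are hypothetical; note that
 * bdrv_reopen_multiple() frees both the queue entries and the queue itself
 * in its cleanup path, so the caller must not reuse the queue afterwards.
 */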

/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver's .bdrv_reopen_prepare() implementation
 *
 * bs is the BlockDriverState to reopen
 * flags are the new open flags
 * queue is the reopen queue
 *
 * Returns 0 on success, non-zero on error. On error errp will be set
 * as well.
 *
 * On failure, bdrv_reopen_abort() will be called to clean up any data.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 *
 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  reopen_state->bs->device_name);
        goto error;
    }

    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, reopen_state->bs->device_name,
                  "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}

/*
 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
 * makes them final by swapping the staging BlockDriverState contents into
 * the active BlockDriverState contents.
 */
void bdrv_reopen_commit(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    /* If there are any driver level actions to take */
    if (drv->bdrv_reopen_commit) {
        drv->bdrv_reopen_commit(reopen_state);
    }

    /* set BDS specific flags now */
    reopen_state->bs->open_flags = reopen_state->flags;
    reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                              BDRV_O_CACHE_WB);
    reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);

    bdrv_refresh_limits(reopen_state->bs);
}

/*
 * Abort the reopen, and delete and free the staged changes in
 * reopen_state
 */
void bdrv_reopen_abort(BDRVReopenState *reopen_state)
{
    BlockDriver *drv;

    assert(reopen_state != NULL);
    drv = reopen_state->bs->drv;
    assert(drv != NULL);

    if (drv->bdrv_reopen_abort) {
        drv->bdrv_reopen_abort(reopen_state);
    }
}

void bdrv_close(BlockDriverState *bs)
{
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        if (bs->backing_hd) {
            bdrv_unref(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;

        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bdrv_close(bs);
    }
}

/* Check if any requests are in-flight (including throttled requests) */
static bool bdrv_requests_pending(BlockDriverState *bs)
{
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
        return true;
    }
    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
        return true;
    }
    if (bs->file && bdrv_requests_pending(bs->file)) {
        return true;
    }
    if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
        return true;
    }
    return false;
}

static bool bdrv_requests_pending_all(void)
{
    BlockDriverState *bs;
    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        if (bdrv_requests_pending(bs)) {
            return true;
        }
    }
    return false;
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete. Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool busy = true;
    BlockDriverState *bs;

    while (busy) {
        QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
            bdrv_start_throttled_reqs(bs);
        }

        busy = bdrv_requests_pending_all();
        busy |= aio_poll(qemu_get_aio_context(), busy);
    }
}

/* Make a BlockDriverState anonymous by removing it from the bdrv_states and
 * graph_bdrv_states lists. Also, NUL-terminate the device_name to prevent
 * double removal. */
void bdrv_make_anon(BlockDriverState *bs)
{
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, device_list);
    }
    bs->device_name[0] = '\0';
    if (bs->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
    }
    bs->node_name[0] = '\0';
}

static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}

static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */
    bs_dest->open_flags = bs_src->open_flags;

    /* dev info */
    bs_dest->dev_ops = bs_src->dev_ops;
    bs_dest->dev_opaque = bs_src->dev_opaque;
    bs_dest->dev = bs_src->dev;
    bs_dest->guest_block_size = bs_src->guest_block_size;
    bs_dest->copy_on_read = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o throttled req */
    memcpy(&bs_dest->throttle_state,
           &bs_src->throttle_state,
           sizeof(ThrottleState));
    bs_dest->throttled_reqs[0] = bs_src->throttled_reqs[0];
    bs_dest->throttled_reqs[1] = bs_src->throttled_reqs[1];
    bs_dest->io_limits_enabled = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error = bs_src->on_read_error;
    bs_dest->on_write_error = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
    bs_dest->iostatus = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmaps = bs_src->dirty_bitmaps;

    /* reference count */
    bs_dest->refcnt = bs_src->refcnt;

    /* job */
    bs_dest->in_use = bs_src->in_use;
    bs_dest->job = bs_src->job;

    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->device_list = bs_src->device_list;

    /* keep the same entry in graph_bdrv_states
     * We do want to swap name but don't want to swap linked list entries
     */
    bs_dest->node_list = bs_src->node_list;
}

/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
    assert(bs_new->device_name[0] == '\0');
    assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
    assert(bs_new->job == NULL);
    assert(bs_new->dev == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new shouldn't be in bdrv_states even after the swap! */
    assert(bs_new->device_name[0] == '\0');

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->dev == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}

/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    bs_top->backing_hd = bs_new;
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
            bs_new->filename);
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
            bs_new->drv ? bs_new->drv->format_name : "");
}
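
/* Illustrative effect (a sketch, not code from this file): after
 * bdrv_append(overlay, top), the device still points at the same
 * BlockDriverState object as before, but that object now holds the overlay's
 * contents, and its backing_hd holds the former top-of-chain contents, since
 * bdrv_swap() keeps the "feature fields" on the attached BDS. The resulting
 * chain reads: base <- old top <- overlay.
 */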

static void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);
    assert(!bs->refcnt);
    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    bdrv_close(bs);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    g_free(bs);
}

int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}

/* TODO qdevified devices don't use this, remove when devices are qdevified */
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
{
    if (bdrv_attach_dev(bs, dev) < 0) {
        abort();
    }
}

void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->guest_block_size = 512;
}

/* TODO change to return DeviceState * when all users are qdevified */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}

void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
}

void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
                               enum MonitorEvent ev,
                               BlockErrorAction action, bool is_read)
{
    QObject *data;
    const char *action_str;

    switch (action) {
    case BDRV_ACTION_REPORT:
        action_str = "report";
        break;
    case BDRV_ACTION_IGNORE:
        action_str = "ignore";
        break;
    case BDRV_ACTION_STOP:
        action_str = "stop";
        break;
    default:
        abort();
    }

    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
                              bdrv->device_name,
                              action_str,
                              is_read ? "read" : "write");
    monitor_protocol_event(ev, data);

    qobject_decref(data);
}

static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *data;

    data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                              bdrv_get_device_name(bs), ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);

    qobject_decref(data);
}

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
        if (tray_was_closed) {
            /* tray open */
            bdrv_emit_qmp_eject_event(bs, true);
        }
        if (load) {
            /* tray close */
            bdrv_emit_qmp_eject_event(bs, false);
        }
    }
}

bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
}

void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
    }
}

bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
    }
    return false;
}

static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
        bs->dev_ops->resize_cb(bs->dev_opaque);
    }
}

bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
    }
    return false;
}
2090
2091/*
2092 * Run consistency checks on an image
2093 *
2094 * Returns 0 if the check could be completed (it doesn't mean that the image is
2095 * free of errors) or -errno when an internal error occurred. The results of the
2096 * check are stored in res.
2097 */
2098int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2099{
2100 if (bs->drv->bdrv_check == NULL) {
2101 return -ENOTSUP;
2102 }
2103
2104 memset(res, 0, sizeof(*res));
2105 return bs->drv->bdrv_check(bs, res, fix);
2106}
2107
2108#define COMMIT_BUF_SECTORS 2048
2109
2110/* commit COW file into the backing image */
2111int bdrv_commit(BlockDriverState *bs)
2112{
2113 BlockDriver *drv = bs->drv;
2114 int64_t sector, total_sectors, length, backing_length;
2115 int n, ro, open_flags;
2116 int ret = 0;
2117 uint8_t *buf = NULL;
2118 char filename[PATH_MAX];
2119
2120 if (!drv)
2121 return -ENOMEDIUM;
2122
2123 if (!bs->backing_hd) {
2124 return -ENOTSUP;
2125 }
2126
2127 if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
2128 return -EBUSY;
2129 }
2130
2131 ro = bs->backing_hd->read_only;
2132 /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
2133 pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
2134 open_flags = bs->backing_hd->open_flags;
2135
2136 if (ro) {
2137 if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
2138 return -EACCES;
2139 }
2140 }
2141
2142 length = bdrv_getlength(bs);
2143 if (length < 0) {
2144 ret = length;
2145 goto ro_cleanup;
2146 }
2147
2148 backing_length = bdrv_getlength(bs->backing_hd);
2149 if (backing_length < 0) {
2150 ret = backing_length;
2151 goto ro_cleanup;
2152 }
2153
2154 /* If our top snapshot is larger than the backing file image,
2155 * grow the backing file image if possible. If not possible,
2156 * we must return an error */
2157 if (length > backing_length) {
2158 ret = bdrv_truncate(bs->backing_hd, length);
2159 if (ret < 0) {
2160 goto ro_cleanup;
2161 }
2162 }
2163
2164 total_sectors = length >> BDRV_SECTOR_BITS;
2165 buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
2166
2167 for (sector = 0; sector < total_sectors; sector += n) {
2168 ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
2169 if (ret < 0) {
2170 goto ro_cleanup;
2171 }
2172 if (ret) {
2173 ret = bdrv_read(bs, sector, buf, n);
2174 if (ret < 0) {
2175 goto ro_cleanup;
2176 }
2177
2178 ret = bdrv_write(bs->backing_hd, sector, buf, n);
2179 if (ret < 0) {
2180 goto ro_cleanup;
2181 }
2182 }
2183 }
2184
2185 if (drv->bdrv_make_empty) {
2186 ret = drv->bdrv_make_empty(bs);
2187 if (ret < 0) {
2188 goto ro_cleanup;
2189 }
2190 bdrv_flush(bs);
2191 }
2192
2193 /*
2194 * Make sure all data we wrote to the backing device is actually
2195 * stable on disk.
2196 */
2197 if (bs->backing_hd) {
2198 bdrv_flush(bs->backing_hd);
2199 }
2200
2201 ret = 0;
2202ro_cleanup:
2203 g_free(buf);
2204
2205 if (ro) {
2206 /* ignoring error return here */
2207 bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
2208 }
2209
2210 return ret;
2211}
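/*
 * Worked example: for the two-image chain
 *
 *     base.img <- top.qcow2        (bs opened on top.qcow2)
 *
 * bdrv_commit(bs) copies every allocated sector range of top.qcow2 into
 * base.img, empties top.qcow2 if the driver implements bdrv_make_empty, and
 * flushes the backing file, so both layers present identical data afterwards.
 */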
2212
2213int bdrv_commit_all(void)
2214{
2215 BlockDriverState *bs;
2216
2217 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2218 if (bs->drv && bs->backing_hd) {
2219 int ret = bdrv_commit(bs);
2220 if (ret < 0) {
2221 return ret;
2222 }
2223 }
2224 }
2225 return 0;
2226}
2227
2228/**
2229 * Remove an active request from the tracked requests list
2230 *
2231 * This function should be called when a tracked request is completing.
2232 */
2233static void tracked_request_end(BdrvTrackedRequest *req)
2234{
2235 if (req->serialising) {
2236 req->bs->serialising_in_flight--;
2237 }
2238
2239 QLIST_REMOVE(req, list);
2240 qemu_co_queue_restart_all(&req->wait_queue);
2241}
2242
2243/**
2244 * Add an active request to the tracked requests list
2245 */
2246static void tracked_request_begin(BdrvTrackedRequest *req,
2247 BlockDriverState *bs,
2248 int64_t offset,
2249 unsigned int bytes, bool is_write)
2250{
2251 *req = (BdrvTrackedRequest){
2252 .bs = bs,
2253 .offset = offset,
2254 .bytes = bytes,
2255 .is_write = is_write,
2256 .co = qemu_coroutine_self(),
2257 .serialising = false,
2258 .overlap_offset = offset,
2259 .overlap_bytes = bytes,
2260 };
2261
2262 qemu_co_queue_init(&req->wait_queue);
2263
2264 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2265}
2266
2267static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2268{
2269 int64_t overlap_offset = req->offset & ~(align - 1);
2270 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2271 - overlap_offset;
2272
2273 if (!req->serialising) {
2274 req->bs->serialising_in_flight++;
2275 req->serialising = true;
2276 }
2277
2278 req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2279 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2280}
2281
2282/**
2283 * Round a region to cluster boundaries
2284 */
2285void bdrv_round_to_clusters(BlockDriverState *bs,
2286 int64_t sector_num, int nb_sectors,
2287 int64_t *cluster_sector_num,
2288 int *cluster_nb_sectors)
2289{
2290 BlockDriverInfo bdi;
2291
2292 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2293 *cluster_sector_num = sector_num;
2294 *cluster_nb_sectors = nb_sectors;
2295 } else {
2296 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2297 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2298 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2299 nb_sectors, c);
2300 }
2301}
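/*
 * Worked example (hypothetical geometry): with 64 KiB clusters, i.e. 128
 * sectors per cluster, a request for sectors [100, 150) is widened to
 * [0, 256): cluster_sector_num = 0 and cluster_nb_sectors = 256, so every
 * touched cluster is fully covered.
 */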
2302
2303static int bdrv_get_cluster_size(BlockDriverState *bs)
2304{
2305 BlockDriverInfo bdi;
2306 int ret;
2307
2308 ret = bdrv_get_info(bs, &bdi);
2309 if (ret < 0 || bdi.cluster_size == 0) {
2310 return bs->request_alignment;
2311 } else {
2312 return bdi.cluster_size;
2313 }
2314}
2315
2316static bool tracked_request_overlaps(BdrvTrackedRequest *req,
2317 int64_t offset, unsigned int bytes)
2318{
2319 /* aaaa bbbb */
2320 if (offset >= req->overlap_offset + req->overlap_bytes) {
2321 return false;
2322 }
2323 /* bbbb aaaa */
2324 if (req->overlap_offset >= offset + bytes) {
2325 return false;
2326 }
2327 return true;
2328}
2329
2330static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
2331{
2332 BlockDriverState *bs = self->bs;
2333 BdrvTrackedRequest *req;
2334 bool retry;
2335 bool waited = false;
2336
2337 if (!bs->serialising_in_flight) {
2338 return false;
2339 }
2340
2341 do {
2342 retry = false;
2343 QLIST_FOREACH(req, &bs->tracked_requests, list) {
2344 if (req == self || (!req->serialising && !self->serialising)) {
2345 continue;
2346 }
2347 if (tracked_request_overlaps(req, self->overlap_offset,
2348 self->overlap_bytes))
2349 {
2350 /* Hitting this means there was a reentrant request, for
2351 * example, a block driver issuing nested requests. This must
2352 * never happen since it means deadlock.
2353 */
2354 assert(qemu_coroutine_self() != req->co);
2355
2356 /* If the request is already (indirectly) waiting for us, or
2357 * will wait for us as soon as it wakes up, then just go on
2358 * (instead of producing a deadlock in the former case). */
2359 if (!req->waiting_for) {
2360 self->waiting_for = req;
2361 qemu_co_queue_wait(&req->wait_queue);
2362 self->waiting_for = NULL;
2363 retry = true;
2364 waited = true;
2365 break;
2366 }
2367 }
2368 }
2369 } while (retry);
2370
2371 return waited;
2372}
2373
2374/*
2375 * Return values:
2376 * 0 - success
2377 * -EINVAL - backing format specified, but no file
2378 * -ENOSPC - can't update the backing file because no space is left in the
2379 * image file header
2380 * -ENOTSUP - format driver doesn't support changing the backing file
2381 */
2382int bdrv_change_backing_file(BlockDriverState *bs,
2383 const char *backing_file, const char *backing_fmt)
2384{
2385 BlockDriver *drv = bs->drv;
2386 int ret;
2387
2388 /* Backing file format doesn't make sense without a backing file */
2389 if (backing_fmt && !backing_file) {
2390 return -EINVAL;
2391 }
2392
2393 if (drv->bdrv_change_backing_file != NULL) {
2394 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
2395 } else {
2396 ret = -ENOTSUP;
2397 }
2398
2399 if (ret == 0) {
2400 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2401 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2402 }
2403 return ret;
2404}
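/*
 * Usage sketch (hypothetical caller): after streaming or committing, the
 * overlay can be repointed at a different backing image:
 *
 *     ret = bdrv_change_backing_file(overlay_bs, "base.qcow2", "qcow2");
 *     if (ret == -ENOTSUP) {
 *         // format driver cannot rewrite its backing-file header fields
 *     }
 */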
2405
2406/*
2407 * Finds the image layer in the chain that has 'bs' as its backing file.
2408 *
2409 * active is the current topmost image.
2410 *
2411 * Returns NULL if bs is not found in active's image chain,
2412 * or if active == bs.
2413 */
2414BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2415 BlockDriverState *bs)
2416{
2417 BlockDriverState *overlay = NULL;
2418 BlockDriverState *intermediate;
2419
2420 assert(active != NULL);
2421 assert(bs != NULL);
2422
2423 /* if bs is the same as active, then by definition it has no overlay
2424 */
2425 if (active == bs) {
2426 return NULL;
2427 }
2428
2429 intermediate = active;
2430 while (intermediate->backing_hd) {
2431 if (intermediate->backing_hd == bs) {
2432 overlay = intermediate;
2433 break;
2434 }
2435 intermediate = intermediate->backing_hd;
2436 }
2437
2438 return overlay;
2439}
2440
2441typedef struct BlkIntermediateStates {
2442 BlockDriverState *bs;
2443 QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
2444} BlkIntermediateStates;
2445
2446
2447/*
2448 * Drops images above 'base' up to and including 'top', and sets the image
2449 * above 'top' to have base as its backing file.
2450 *
2451 * Requires that the overlay to 'top' is opened r/w, so that the backing file
2452 * information in 'bs' can be properly updated.
2453 *
2454 * E.g., this will convert the following chain:
2455 * bottom <- base <- intermediate <- top <- active
2456 *
2457 * to
2458 *
2459 * bottom <- base <- active
2460 *
2461 * It is allowed for bottom==base, in which case it converts:
2462 *
2463 * base <- intermediate <- top <- active
2464 *
2465 * to
2466 *
2467 * base <- active
2468 *
2469 * Error conditions:
2470 * if active == top, that is considered an error
2471 *
2472 */
2473int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
2474 BlockDriverState *base)
2475{
2476 BlockDriverState *intermediate;
2477 BlockDriverState *base_bs = NULL;
2478 BlockDriverState *new_top_bs = NULL;
2479 BlkIntermediateStates *intermediate_state, *next;
2480 int ret = -EIO;
2481
2482 QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
2483 QSIMPLEQ_INIT(&states_to_delete);
2484
2485 if (!top->drv || !base->drv) {
2486 goto exit;
2487 }
2488
2489 new_top_bs = bdrv_find_overlay(active, top);
2490
2491 if (new_top_bs == NULL) {
2492 /* we could not find the image above 'top'; this is an error */
2493 goto exit;
2494 }
2495
2496 /* special case of new_top_bs->backing_hd already pointing to base - nothing
2497 * to do, no intermediate images */
2498 if (new_top_bs->backing_hd == base) {
2499 ret = 0;
2500 goto exit;
2501 }
2502
2503 intermediate = top;
2504
2505 /* now we will go down through the list, and add each BDS we find
2506 * into our deletion queue, until we hit the 'base'
2507 */
2508 while (intermediate) {
2509 intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
2510 intermediate_state->bs = intermediate;
2511 QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
2512
2513 if (intermediate->backing_hd == base) {
2514 base_bs = intermediate->backing_hd;
2515 break;
2516 }
2517 intermediate = intermediate->backing_hd;
2518 }
2519 if (base_bs == NULL) {
2520 /* something went wrong; we did not end at the base. Safely
2521 * unravel everything, and exit with an error */
2522 goto exit;
2523 }
2524
2525 /* success - we can delete the intermediate states, and link top->base */
2526 ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
2527 base_bs->drv ? base_bs->drv->format_name : "");
2528 if (ret) {
2529 goto exit;
2530 }
2531 new_top_bs->backing_hd = base_bs;
2532
2533 bdrv_refresh_limits(new_top_bs);
2534
2535 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2536 /* so that bdrv_close() does not recursively close the chain */
2537 intermediate_state->bs->backing_hd = NULL;
2538 bdrv_unref(intermediate_state->bs);
2539 }
2540 ret = 0;
2541
2542exit:
2543 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
2544 g_free(intermediate_state);
2545 }
2546 return ret;
2547}
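/*
 * Usage sketch for the chain documented above: collapsing everything between
 * 'base' and 'top' (inclusive of 'top') is a single call:
 *
 *     ret = bdrv_drop_intermediate(active_bs, top_bs, base_bs);
 *
 * On success, the overlay of 'top' is backed directly by 'base' and every
 * dropped intermediate state has been unreferenced.
 */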
2548
2549
2550static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2551 size_t size)
2552{
2553 int64_t len;
2554
2555 if (!bdrv_is_inserted(bs))
2556 return -ENOMEDIUM;
2557
2558 if (bs->growable)
2559 return 0;
2560
2561 len = bdrv_getlength(bs);
2562
2563 if (offset < 0)
2564 return -EIO;
2565
2566 if ((offset > len) || (len - offset < size))
2567 return -EIO;
2568
2569 return 0;
2570}
2571
2572static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
2573 int nb_sectors)
2574{
2575 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
2576 nb_sectors * BDRV_SECTOR_SIZE);
2577}
2578
2579typedef struct RwCo {
2580 BlockDriverState *bs;
2581 int64_t offset;
2582 QEMUIOVector *qiov;
2583 bool is_write;
2584 int ret;
2585 BdrvRequestFlags flags;
2586} RwCo;
2587
2588static void coroutine_fn bdrv_rw_co_entry(void *opaque)
2589{
2590 RwCo *rwco = opaque;
2591
2592 if (!rwco->is_write) {
2593 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2594 rwco->qiov->size, rwco->qiov,
2595 rwco->flags);
2596 } else {
2597 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2598 rwco->qiov->size, rwco->qiov,
2599 rwco->flags);
2600 }
2601}
2602
2603/*
2604 * Process a vectored synchronous request using coroutines
2605 */
2606static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2607 QEMUIOVector *qiov, bool is_write,
2608 BdrvRequestFlags flags)
2609{
2610 Coroutine *co;
2611 RwCo rwco = {
2612 .bs = bs,
2613 .offset = offset,
2614 .qiov = qiov,
2615 .is_write = is_write,
2616 .ret = NOT_DONE,
2617 .flags = flags,
2618 };
2619
2620 /**
2621 * In a synchronous call context the vcpu is blocked, so the throttling
2622 * timer can never fire; I/O throttling therefore has to be disabled here
2623 * if it has been enabled.
2624 */
2625 if (bs->io_limits_enabled) {
2626 fprintf(stderr, "Disabling I/O throttling on '%s' due "
2627 "to synchronous I/O.\n", bdrv_get_device_name(bs));
2628 bdrv_io_limits_disable(bs);
2629 }
2630
2631 if (qemu_in_coroutine()) {
2632 /* Fast-path if already in coroutine context */
2633 bdrv_rw_co_entry(&rwco);
2634 } else {
2635 co = qemu_coroutine_create(bdrv_rw_co_entry);
2636 qemu_coroutine_enter(co, &rwco);
2637 while (rwco.ret == NOT_DONE) {
2638 qemu_aio_wait();
2639 }
2640 }
2641 return rwco.ret;
2642}
2643
2644/*
2645 * Process a synchronous request using coroutines
2646 */
2647static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
2648 int nb_sectors, bool is_write, BdrvRequestFlags flags)
2649{
2650 QEMUIOVector qiov;
2651 struct iovec iov = {
2652 .iov_base = (void *)buf,
2653 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2654 };
2655
2656 qemu_iovec_init_external(&qiov, &iov, 1);
2657 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2658 &qiov, is_write, flags);
2659}
2660
2661/* return < 0 if error. See bdrv_write() for the return codes */
2662int bdrv_read(BlockDriverState *bs, int64_t sector_num,
2663 uint8_t *buf, int nb_sectors)
2664{
2665 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
2666}
2667
2668/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2669int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2670 uint8_t *buf, int nb_sectors)
2671{
2672 bool enabled;
2673 int ret;
2674
2675 enabled = bs->io_limits_enabled;
2676 bs->io_limits_enabled = false;
2677 ret = bdrv_read(bs, sector_num, buf, nb_sectors);
2678 bs->io_limits_enabled = enabled;
2679 return ret;
2680}
2681
2682/* Return < 0 if error. Important errors are:
2683 -EIO generic I/O error (may happen for all errors)
2684 -ENOMEDIUM No media inserted.
2685 -EINVAL Invalid sector number or nb_sectors
2686 -EACCES Trying to write a read-only device
2687*/
2688int bdrv_write(BlockDriverState *bs, int64_t sector_num,
2689 const uint8_t *buf, int nb_sectors)
2690{
2691 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
2692}
2693
2694int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
2695 int nb_sectors, BdrvRequestFlags flags)
2696{
2697 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
2698 BDRV_REQ_ZERO_WRITE | flags);
2699}
2700
2701/*
2702 * Completely zero out a block device with the help of bdrv_write_zeroes.
2703 * The operation is sped up by checking the block status and only writing
2704 * zeroes to the device if they currently do not return zeroes. Optional
2705 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
2706 *
2707 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
2708 */
2709int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
2710{
2711 int64_t target_size = bdrv_getlength(bs) / BDRV_SECTOR_SIZE;
2712 int64_t ret, nb_sectors, sector_num = 0;
2713 int n;
2714
2715 for (;;) {
2716 nb_sectors = target_size - sector_num;
2717 if (nb_sectors <= 0) {
2718 return 0;
2719 }
2720 if (nb_sectors > INT_MAX) {
2721 nb_sectors = INT_MAX;
2722 }
2723 ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
2724 if (ret < 0) {
2725 error_report("error getting block status at sector %" PRId64 ": %s",
2726 sector_num, strerror(-ret));
2727 return ret;
2728 }
2729 if (ret & BDRV_BLOCK_ZERO) {
2730 sector_num += n;
2731 continue;
2732 }
2733 ret = bdrv_write_zeroes(bs, sector_num, n, flags);
2734 if (ret < 0) {
2735 error_report("error writing zeroes at sector %" PRId64 ": %s",
2736 sector_num, strerror(-ret));
2737 return ret;
2738 }
2739 sector_num += n;
2740 }
2741}
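/*
 * Example (hypothetical caller): pre-zeroing a freshly opened destination
 * image, letting the driver unmap instead of writing where it can:
 *
 *     ret = bdrv_make_zero(target_bs, BDRV_REQ_MAY_UNMAP);
 */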
2742
2743int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
2744{
2745 QEMUIOVector qiov;
2746 struct iovec iov = {
2747 .iov_base = (void *)buf,
2748 .iov_len = bytes,
2749 };
2750 int ret;
2751
2752 if (bytes < 0) {
2753 return -EINVAL;
2754 }
2755
2756 qemu_iovec_init_external(&qiov, &iov, 1);
2757 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2758 if (ret < 0) {
2759 return ret;
2760 }
2761
2762 return bytes;
2763}
2764
2765int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
2766{
2767 int ret;
2768
2769 ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2770 if (ret < 0) {
2771 return ret;
2772 }
2773
2774 return qiov->size;
2775}
2776
2777int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2778 const void *buf, int bytes)
2779{
2780 QEMUIOVector qiov;
2781 struct iovec iov = {
2782 .iov_base = (void *) buf,
2783 .iov_len = bytes,
2784 };
2785
2786 if (bytes < 0) {
2787 return -EINVAL;
2788 }
2789
2790 qemu_iovec_init_external(&qiov, &iov, 1);
2791 return bdrv_pwritev(bs, offset, &qiov);
2792}
2793
2794/*
2795 * Writes to the file and ensures that no writes are reordered across this
2796 * request (acts as a barrier)
2797 *
2798 * Returns 0 on success, -errno in error cases.
2799 */
2800int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2801 const void *buf, int count)
2802{
2803 int ret;
2804
2805 ret = bdrv_pwrite(bs, offset, buf, count);
2806 if (ret < 0) {
2807 return ret;
2808 }
2809
2810 /* No flush needed for cache modes that already do it */
2811 if (bs->enable_write_cache) {
2812 bdrv_flush(bs);
2813 }
2814
2815 return 0;
2816}
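/*
 * Illustrative use (names hypothetical): persisting an image-header field
 * that later writes depend on, so the update cannot be reordered after them:
 *
 *     uint64_t new_table_offset = cpu_to_be64(offset);
 *     ret = bdrv_pwrite_sync(bs->file, header_offset,
 *                            &new_table_offset, sizeof(new_table_offset));
 */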
2817
2818static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
2819 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2820{
2821 /* Perform I/O through a temporary buffer so that users who scribble over
2822 * their read buffer while the operation is in progress do not end up
2823 * modifying the image file. This is critical for zero-copy guest I/O
2824 * where anything might happen inside guest memory.
2825 */
2826 void *bounce_buffer;
2827
2828 BlockDriver *drv = bs->drv;
2829 struct iovec iov;
2830 QEMUIOVector bounce_qiov;
2831 int64_t cluster_sector_num;
2832 int cluster_nb_sectors;
2833 size_t skip_bytes;
2834 int ret;
2835
2836 /* Cover the entire cluster so that no additional backing file I/O is
2837 * required when allocating a cluster in the image file.
2838 */
2839 bdrv_round_to_clusters(bs, sector_num, nb_sectors,
2840 &cluster_sector_num, &cluster_nb_sectors);
2841
2842 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2843 cluster_sector_num, cluster_nb_sectors);
2844
2845 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2846 iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
2847 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2848
2849 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2850 &bounce_qiov);
2851 if (ret < 0) {
2852 goto err;
2853 }
2854
2855 if (drv->bdrv_co_write_zeroes &&
2856 buffer_is_zero(bounce_buffer, iov.iov_len)) {
2857 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2858 cluster_nb_sectors, 0);
2859 } else {
2860 /* This does not change the data on the disk, it is not necessary
2861 * to flush even in cache=writethrough mode.
2862 */
2863 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
2864 &bounce_qiov);
2865 }
2866
2867 if (ret < 0) {
2868 /* It might be okay to ignore write errors for guest requests. If this
2869 * is a deliberate copy-on-read then we don't want to ignore the error.
2870 * Simply report it in all cases.
2871 */
2872 goto err;
2873 }
2874
2875 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
2876 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
2877 nb_sectors * BDRV_SECTOR_SIZE);
2878
2879err:
2880 qemu_vfree(bounce_buffer);
2881 return ret;
2882}
2883
2884/*
2885 * Forwards an already correctly aligned request to the BlockDriver. This
2886 * handles copy on read and zeroing after EOF; any other features must be
2887 * implemented by the caller.
2888 */
2889static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
2890 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
2891 int64_t align, QEMUIOVector *qiov, int flags)
2892{
2893 BlockDriver *drv = bs->drv;
2894 int ret;
2895
2896 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
2897 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
2898
2899 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
2900 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
2901
2902 /* Handle Copy on Read and associated serialisation */
2903 if (flags & BDRV_REQ_COPY_ON_READ) {
2904 /* If we touch the same cluster it counts as an overlap. This
2905 * guarantees that allocating writes will be serialized and not race
2906 * with each other for the same cluster. For example, in copy-on-read
2907 * it ensures that the CoR read and write operations are atomic and
2908 * guest writes cannot interleave between them. */
2909 mark_request_serialising(req, bdrv_get_cluster_size(bs));
2910 }
2911
2912 wait_serialising_requests(req);
2913
2914 if (flags & BDRV_REQ_COPY_ON_READ) {
2915 int pnum;
2916
2917 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
2918 if (ret < 0) {
2919 goto out;
2920 }
2921
2922 if (!ret || pnum != nb_sectors) {
2923 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
2924 goto out;
2925 }
2926 }
2927
2928 /* Forward the request to the BlockDriver */
2929 if (!(bs->zero_beyond_eof && bs->growable)) {
2930 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
2931 } else {
2932 /* Read zeroes after EOF of growable BDSes */
2933 int64_t len, total_sectors, max_nb_sectors;
2934
2935 len = bdrv_getlength(bs);
2936 if (len < 0) {
2937 ret = len;
2938 goto out;
2939 }
2940
2941 total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
2942 max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
2943 align >> BDRV_SECTOR_BITS);
2944 if (max_nb_sectors > 0) {
2945 ret = drv->bdrv_co_readv(bs, sector_num,
2946 MIN(nb_sectors, max_nb_sectors), qiov);
2947 } else {
2948 ret = 0;
2949 }
2950
2951 /* Reading beyond end of file is supposed to produce zeroes */
2952 if (ret == 0 && total_sectors < sector_num + nb_sectors) {
2953 uint64_t offset = MAX(0, total_sectors - sector_num);
2954 uint64_t bytes = (sector_num + nb_sectors - offset) *
2955 BDRV_SECTOR_SIZE;
2956 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
2957 }
2958 }
2959
2960out:
2961 return ret;
2962}
2963
2964/*
2965 * Handle a read request in coroutine context
2966 */
2967static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
2968 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
2969 BdrvRequestFlags flags)
2970{
2971 BlockDriver *drv = bs->drv;
2972 BdrvTrackedRequest req;
2973
2974 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
2975 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
2976 uint8_t *head_buf = NULL;
2977 uint8_t *tail_buf = NULL;
2978 QEMUIOVector local_qiov;
2979 bool use_local_qiov = false;
2980 int ret;
2981
2982 if (!drv) {
2983 return -ENOMEDIUM;
2984 }
2985 if (bdrv_check_byte_request(bs, offset, bytes)) {
2986 return -EIO;
2987 }
2988
2989 if (bs->copy_on_read) {
2990 flags |= BDRV_REQ_COPY_ON_READ;
2991 }
2992
2993 /* throttling disk I/O */
2994 if (bs->io_limits_enabled) {
2995 bdrv_io_limits_intercept(bs, bytes, false);
2996 }
2997
2998 /* Align read if necessary by padding qiov */
2999 if (offset & (align - 1)) {
3000 head_buf = qemu_blockalign(bs, align);
3001 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3002 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3003 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3004 use_local_qiov = true;
3005
3006 bytes += offset & (align - 1);
3007 offset = offset & ~(align - 1);
3008 }
3009
3010 if ((offset + bytes) & (align - 1)) {
3011 if (!use_local_qiov) {
3012 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3013 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3014 use_local_qiov = true;
3015 }
3016 tail_buf = qemu_blockalign(bs, align);
3017 qemu_iovec_add(&local_qiov, tail_buf,
3018 align - ((offset + bytes) & (align - 1)));
3019
3020 bytes = ROUND_UP(bytes, align);
3021 }
3022
3023 tracked_request_begin(&req, bs, offset, bytes, false);
3024 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
3025 use_local_qiov ? &local_qiov : qiov,
3026 flags);
3027 tracked_request_end(&req);
3028
3029 if (use_local_qiov) {
3030 qemu_iovec_destroy(&local_qiov);
3031 qemu_vfree(head_buf);
3032 qemu_vfree(tail_buf);
3033 }
3034
3035 return ret;
3036}
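/*
 * Worked example: with request_alignment == 4096, a read of bytes
 * [1000, 6000) is widened to the aligned range [0, 8192). The first 1000
 * bytes land in head_buf, the last 2192 in tail_buf, and the caller's qiov
 * receives exactly the 5000 bytes it asked for.
 */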
3037
3038static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3039 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3040 BdrvRequestFlags flags)
3041{
3042 if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3043 return -EINVAL;
3044 }
3045
3046 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3047 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3048}
3049
3050int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
3051 int nb_sectors, QEMUIOVector *qiov)
3052{
3053 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
3054
3055 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3056}
3057
3058int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3059 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3060{
3061 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3062
3063 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3064 BDRV_REQ_COPY_ON_READ);
3065}
3066
3067/* If no limit is specified in the BlockLimits, use a default
3068 * of 32768 512-byte sectors (16 MiB) per request.
3069 */
3070#define MAX_WRITE_ZEROES_DEFAULT 32768
3071
3072static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
3073 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
3074{
3075 BlockDriver *drv = bs->drv;
3076 QEMUIOVector qiov;
3077 struct iovec iov = {0};
3078 int ret = 0;
3079
3080 int max_write_zeroes = bs->bl.max_write_zeroes ?
3081 bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
3082
3083 while (nb_sectors > 0 && !ret) {
3084 int num = nb_sectors;
3085
3086 /* Align request. Block drivers can expect the "bulk" of the request
3087 * to be aligned.
3088 */
3089 if (bs->bl.write_zeroes_alignment
3090 && num > bs->bl.write_zeroes_alignment) {
3091 if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3092 /* Make a small request up to the first aligned sector. */
3093 num = bs->bl.write_zeroes_alignment;
3094 num -= sector_num % bs->bl.write_zeroes_alignment;
3095 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3096 /* Shorten the request to the last aligned sector. num cannot
3097 * underflow because num > bs->bl.write_zeroes_alignment.
3098 */
3099 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
3100 }
3101 }
3102
3103 /* limit request size */
3104 if (num > max_write_zeroes) {
3105 num = max_write_zeroes;
3106 }
3107
3108 ret = -ENOTSUP;
3109 /* First try the efficient write zeroes operation */
3110 if (drv->bdrv_co_write_zeroes) {
3111 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3112 }
3113
3114 if (ret == -ENOTSUP) {
3115 /* Fall back to bounce buffer if write zeroes is unsupported */
3116 iov.iov_len = num * BDRV_SECTOR_SIZE;
3117 if (iov.iov_base == NULL) {
3118 iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
3119 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
3120 }
3121 qemu_iovec_init_external(&qiov, &iov, 1);
3122
3123 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
3124
3125 /* Keep bounce buffer around if it is big enough for all
3126 * future requests.
3127 */
3128 if (num < max_write_zeroes) {
3129 qemu_vfree(iov.iov_base);
3130 iov.iov_base = NULL;
3131 }
3132 }
3133
3134 sector_num += num;
3135 nb_sectors -= num;
3136 }
3137
3138 qemu_vfree(iov.iov_base);
3139 return ret;
3140}
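/*
 * Worked example: with write_zeroes_alignment == 8, a request for sectors
 * [5, 29) is issued in three pieces: [5, 8) to reach alignment, [8, 24) as
 * the aligned bulk, and [24, 29) as the unaligned tail.
 */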
3141
3142/*
3143 * Forwards an already correctly aligned write request to the BlockDriver.
3144 */
3145static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
3146 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3147 QEMUIOVector *qiov, int flags)
3148{
3149 BlockDriver *drv = bs->drv;
3150 bool waited;
3151 int ret;
3152
3153 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3154 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
3155
3156 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3157 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
3158
3159 waited = wait_serialising_requests(req);
3160 assert(!waited || !req->serialising);
3161 assert(req->overlap_offset <= offset);
3162 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
3163
3164 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
3165
3166 if (ret < 0) {
3167 /* Do nothing, write notifier decided to fail this request */
3168 } else if (flags & BDRV_REQ_ZERO_WRITE) {
3169 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
3170 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
3171 } else {
3172 BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
3173 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3174 }
3175 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
3176
3177 if (ret == 0 && !bs->enable_write_cache) {
3178 ret = bdrv_co_flush(bs);
3179 }
3180
3181 bdrv_set_dirty(bs, sector_num, nb_sectors);
3182
3183 if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
3184 bs->wr_highest_sector = sector_num + nb_sectors - 1;
3185 }
3186 if (bs->growable && ret >= 0) {
3187 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3188 }
3189
3190 return ret;
3191}
3192
3193/*
3194 * Handle a write request in coroutine context
3195 */
3196static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3197 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
3198 BdrvRequestFlags flags)
3199{
3200 BdrvTrackedRequest req;
3201 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3202 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3203 uint8_t *head_buf = NULL;
3204 uint8_t *tail_buf = NULL;
3205 QEMUIOVector local_qiov;
3206 bool use_local_qiov = false;
3207 int ret;
3208
3209 if (!bs->drv) {
3210 return -ENOMEDIUM;
3211 }
3212 if (bs->read_only) {
3213 return -EACCES;
3214 }
3215 if (bdrv_check_byte_request(bs, offset, bytes)) {
3216 return -EIO;
3217 }
3218
3219 /* throttling disk I/O */
3220 if (bs->io_limits_enabled) {
3221 bdrv_io_limits_intercept(bs, bytes, true);
3222 }
3223
3224 /*
3225 * Align write if necessary by performing a read-modify-write cycle.
3226 * Pad qiov with the read parts and be sure to have a tracked request not
3227 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3228 */
3229 tracked_request_begin(&req, bs, offset, bytes, true);
3230
3231 if (offset & (align - 1)) {
3232 QEMUIOVector head_qiov;
3233 struct iovec head_iov;
3234
3235 mark_request_serialising(&req, align);
3236 wait_serialising_requests(&req);
3237
3238 head_buf = qemu_blockalign(bs, align);
3239 head_iov = (struct iovec) {
3240 .iov_base = head_buf,
3241 .iov_len = align,
3242 };
3243 qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3244
3245 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3246 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3247 align, &head_qiov, 0);
3248 if (ret < 0) {
3249 goto fail;
3250 }
3251 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3252
3253 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3254 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3255 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3256 use_local_qiov = true;
3257
3258 bytes += offset & (align - 1);
3259 offset = offset & ~(align - 1);
3260 }
3261
3262 if ((offset + bytes) & (align - 1)) {
3263 QEMUIOVector tail_qiov;
3264 struct iovec tail_iov;
3265 size_t tail_bytes;
3266 bool waited;
3267
3268 mark_request_serialising(&req, align);
3269 waited = wait_serialising_requests(&req);
3270 assert(!waited || !use_local_qiov);
3271
3272 tail_buf = qemu_blockalign(bs, align);
3273 tail_iov = (struct iovec) {
3274 .iov_base = tail_buf,
3275 .iov_len = align,
3276 };
3277 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3278
3279 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3280 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3281 align, &tail_qiov, 0);
3282 if (ret < 0) {
3283 goto fail;
3284 }
3285 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3286
3287 if (!use_local_qiov) {
3288 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3289 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3290 use_local_qiov = true;
3291 }
3292
3293 tail_bytes = (offset + bytes) & (align - 1);
3294 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3295
3296 bytes = ROUND_UP(bytes, align);
3297 }
3298
3299 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3300 use_local_qiov ? &local_qiov : qiov,
3301 flags);
3302
3303fail:
3304 tracked_request_end(&req);
3305
3306 if (use_local_qiov) {
3307 qemu_iovec_destroy(&local_qiov);
3308 }
3309 qemu_vfree(head_buf);
3310 qemu_vfree(tail_buf);
3311
3312 return ret;
3313}
3314
3315static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3316 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3317 BdrvRequestFlags flags)
3318{
3319 if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3320 return -EINVAL;
3321 }
3322
3323 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3324 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3325}
3326
3327int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3328 int nb_sectors, QEMUIOVector *qiov)
3329{
3330 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3331
3332 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3333}
3334
3335int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
3336 int64_t sector_num, int nb_sectors,
3337 BdrvRequestFlags flags)
3338{
3339 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
3340
3341 if (!(bs->open_flags & BDRV_O_UNMAP)) {
3342 flags &= ~BDRV_REQ_MAY_UNMAP;
3343 }
3344
3345 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
3346 BDRV_REQ_ZERO_WRITE | flags);
3347}
3348
3349/**
3350 * Truncate file to 'offset' bytes (needed only for file protocols)
3351 */
3352int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3353{
3354 BlockDriver *drv = bs->drv;
3355 int ret;
3356 if (!drv)
3357 return -ENOMEDIUM;
3358 if (!drv->bdrv_truncate)
3359 return -ENOTSUP;
3360 if (bs->read_only)
3361 return -EACCES;
3362 if (bdrv_in_use(bs))
3363 return -EBUSY;
3364 ret = drv->bdrv_truncate(bs, offset);
3365 if (ret == 0) {
3366 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3367 bdrv_dev_resize_cb(bs);
3368 }
3369 return ret;
3370}
3371
3372/**
3373 * Length of an allocated file in bytes. Sparse files are counted by actual
3374 * allocated space. Return < 0 if error or unknown.
3375 */
3376int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3377{
3378 BlockDriver *drv = bs->drv;
3379 if (!drv) {
3380 return -ENOMEDIUM;
3381 }
3382 if (drv->bdrv_get_allocated_file_size) {
3383 return drv->bdrv_get_allocated_file_size(bs);
3384 }
3385 if (bs->file) {
3386 return bdrv_get_allocated_file_size(bs->file);
3387 }
3388 return -ENOTSUP;
3389}
3390
3391/**
3392 * Length of a file in bytes. Return < 0 if error or unknown.
3393 */
3394int64_t bdrv_getlength(BlockDriverState *bs)
3395{
3396 BlockDriver *drv = bs->drv;
3397 if (!drv)
3398 return -ENOMEDIUM;
3399
3400 if (drv->has_variable_length) {
3401 int ret = refresh_total_sectors(bs, bs->total_sectors);
3402 if (ret < 0) {
3403 return ret;
3404 }
3405 }
3406 return bs->total_sectors * BDRV_SECTOR_SIZE;
3407}
3408
3409/* return 0 as number of sectors if no device present or error */
3410void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
3411{
3412 int64_t length;
3413 length = bdrv_getlength(bs);
3414 if (length < 0)
3415 length = 0;
3416 else
3417 length = length >> BDRV_SECTOR_BITS;
3418 *nb_sectors_ptr = length;
3419}
3420
3421void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3422 BlockdevOnError on_write_error)
3423{
3424 bs->on_read_error = on_read_error;
3425 bs->on_write_error = on_write_error;
3426}
3427
3428BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
3429{
3430 return is_read ? bs->on_read_error : bs->on_write_error;
3431}
3432
3433BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3434{
3435 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3436
3437 switch (on_err) {
3438 case BLOCKDEV_ON_ERROR_ENOSPC:
3439 return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
3440 case BLOCKDEV_ON_ERROR_STOP:
3441 return BDRV_ACTION_STOP;
3442 case BLOCKDEV_ON_ERROR_REPORT:
3443 return BDRV_ACTION_REPORT;
3444 case BLOCKDEV_ON_ERROR_IGNORE:
3445 return BDRV_ACTION_IGNORE;
3446 default:
3447 abort();
3448 }
3449}
3450
3451/* This is done by device models because, while the block layer knows
3452 * about the error, it does not know whether an operation comes from
3453 * the device or the block layer (from a job, for example).
3454 */
3455void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3456 bool is_read, int error)
3457{
3458 assert(error >= 0);
3459 bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
3460 if (action == BDRV_ACTION_STOP) {
3461 vm_stop(RUN_STATE_IO_ERROR);
3462 bdrv_iostatus_set_err(bs, error);
3463 }
3464}
3465
3466int bdrv_is_read_only(BlockDriverState *bs)
3467{
3468 return bs->read_only;
3469}
3470
3471int bdrv_is_sg(BlockDriverState *bs)
3472{
3473 return bs->sg;
3474}
3475
3476int bdrv_enable_write_cache(BlockDriverState *bs)
3477{
3478 return bs->enable_write_cache;
3479}
3480
3481void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3482{
3483 bs->enable_write_cache = wce;
3484
3485 /* so a reopen() will preserve wce */
3486 if (wce) {
3487 bs->open_flags |= BDRV_O_CACHE_WB;
3488 } else {
3489 bs->open_flags &= ~BDRV_O_CACHE_WB;
3490 }
3491}
3492
3493int bdrv_is_encrypted(BlockDriverState *bs)
3494{
3495 if (bs->backing_hd && bs->backing_hd->encrypted)
3496 return 1;
3497 return bs->encrypted;
3498}
3499
3500int bdrv_key_required(BlockDriverState *bs)
3501{
3502 BlockDriverState *backing_hd = bs->backing_hd;
3503
3504 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3505 return 1;
3506 return (bs->encrypted && !bs->valid_key);
3507}
3508
3509int bdrv_set_key(BlockDriverState *bs, const char *key)
3510{
3511 int ret;
3512 if (bs->backing_hd && bs->backing_hd->encrypted) {
3513 ret = bdrv_set_key(bs->backing_hd, key);
3514 if (ret < 0)
3515 return ret;
3516 if (!bs->encrypted)
3517 return 0;
3518 }
3519 if (!bs->encrypted) {
3520 return -EINVAL;
3521 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3522 return -ENOMEDIUM;
3523 }
3524 ret = bs->drv->bdrv_set_key(bs, key);
3525 if (ret < 0) {
3526 bs->valid_key = 0;
3527 } else if (!bs->valid_key) {
3528 bs->valid_key = 1;
3529 /* call the change callback now, we skipped it on open */
3530 bdrv_dev_change_media_cb(bs, true);
3531 }
3532 return ret;
3533}
3534
3535const char *bdrv_get_format_name(BlockDriverState *bs)
3536{
3537 return bs->drv ? bs->drv->format_name : NULL;
3538}
3539
3540void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
3541 void *opaque)
3542{
3543 BlockDriver *drv;
3544
3545 QLIST_FOREACH(drv, &bdrv_drivers, list) {
3546 it(opaque, drv->format_name);
3547 }
3548}
3549
3550/* Find a block backend BDS by its device name */
3551BlockDriverState *bdrv_find(const char *name)
3552{
3553 BlockDriverState *bs;
3554
3555 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3556 if (!strcmp(name, bs->device_name)) {
3557 return bs;
3558 }
3559 }
3560 return NULL;
3561}
3562
3563/* Find a named node in the BDS graph */
3564BlockDriverState *bdrv_find_node(const char *node_name)
3565{
3566 BlockDriverState *bs;
3567
3568 assert(node_name);
3569
3570 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3571 if (!strcmp(node_name, bs->node_name)) {
3572 return bs;
3573 }
3574 }
3575 return NULL;
3576}
3577
3578/* Put this QMP function here so it can access the static graph_bdrv_states. */
3579BlockDeviceInfoList *bdrv_named_nodes_list(void)
3580{
3581 BlockDeviceInfoList *list, *entry;
3582 BlockDriverState *bs;
3583
3584 list = NULL;
3585 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3586 entry = g_malloc0(sizeof(*entry));
3587 entry->value = bdrv_block_device_info(bs);
3588 entry->next = list;
3589 list = entry;
3590 }
3591
3592 return list;
3593}
3594
3595BlockDriverState *bdrv_lookup_bs(const char *device,
3596 const char *node_name,
3597 Error **errp)
3598{
3599 BlockDriverState *bs = NULL;
3600
3601 if (device) {
3602 bs = bdrv_find(device);
3603
3604 if (bs) {
3605 return bs;
3606 }
3607 }
3608
3609 if (node_name) {
3610 bs = bdrv_find_node(node_name);
3611
3612 if (bs) {
3613 return bs;
3614 }
3615 }
3616
3617 error_setg(errp, "Cannot find device=%s nor node_name=%s",
3618 device ? device : "",
3619 node_name ? node_name : "");
3620 return NULL;
3621}
3622
3623BlockDriverState *bdrv_next(BlockDriverState *bs)
3624{
3625 if (!bs) {
3626 return QTAILQ_FIRST(&bdrv_states);
3627 }
3628 return QTAILQ_NEXT(bs, device_list);
3629}
3630
3631void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
3632{
3633 BlockDriverState *bs;
3634
3635 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3636 it(opaque, bs);
3637 }
3638}
3639
3640const char *bdrv_get_device_name(BlockDriverState *bs)
3641{
3642 return bs->device_name;
3643}
3644
3645int bdrv_get_flags(BlockDriverState *bs)
3646{
3647 return bs->open_flags;
3648}
3649
3650int bdrv_flush_all(void)
3651{
3652 BlockDriverState *bs;
3653 int result = 0;
3654
3655 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
3656 int ret = bdrv_flush(bs);
3657 if (ret < 0 && !result) {
3658 result = ret;
3659 }
3660 }
3661
3662 return result;
3663}
3664
3665int bdrv_has_zero_init_1(BlockDriverState *bs)
3666{
3667 return 1;
3668}
3669
3670int bdrv_has_zero_init(BlockDriverState *bs)
3671{
3672 assert(bs->drv);
3673
3674 /* If BS is a copy-on-write image, it is initialized to
3675 the contents of the base image, which may not be zeroes. */
3676 if (bs->backing_hd) {
3677 return 0;
3678 }
3679 if (bs->drv->bdrv_has_zero_init) {
3680 return bs->drv->bdrv_has_zero_init(bs);
3681 }
3682
3683 /* safe default */
3684 return 0;
3685}
3686
3687bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3688{
3689 BlockDriverInfo bdi;
3690
3691 if (bs->backing_hd) {
3692 return false;
3693 }
3694
3695 if (bdrv_get_info(bs, &bdi) == 0) {
3696 return bdi.unallocated_blocks_are_zero;
3697 }
3698
3699 return false;
3700}
3701
3702bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3703{
3704 BlockDriverInfo bdi;
3705
3706 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3707 return false;
3708 }
3709
3710 if (bdrv_get_info(bs, &bdi) == 0) {
3711 return bdi.can_write_zeroes_with_unmap;
3712 }
3713
3714 return false;
3715}
3716
3717typedef struct BdrvCoGetBlockStatusData {
3718 BlockDriverState *bs;
3719 BlockDriverState *base;
3720 int64_t sector_num;
3721 int nb_sectors;
3722 int *pnum;
3723 int64_t ret;
3724 bool done;
3725} BdrvCoGetBlockStatusData;
3726
3727/*
3728 * Returns true iff the specified sector is present in the disk image. Drivers
3729 * not implementing the functionality are assumed to not support backing files,
3730 * hence all their sectors are reported as allocated.
3731 *
3732 * If 'sector_num' is beyond the end of the disk image the return value is 0
3733 * and 'pnum' is set to 0.
3734 *
3735 * 'pnum' is set to the number of sectors (including and immediately following
3736 * the specified sector) that are known to be in the same
3737 * allocated/unallocated state.
3738 *
3739 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
3740 * beyond the end of the disk image it will be clamped.
3741 */
3742static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3743 int64_t sector_num,
3744 int nb_sectors, int *pnum)
3745{
3746 int64_t length;
3747 int64_t n;
3748 int64_t ret, ret2;
3749
3750 length = bdrv_getlength(bs);
3751 if (length < 0) {
3752 return length;
3753 }
3754
3755 if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
3756 *pnum = 0;
3757 return 0;
3758 }
3759
3760 n = bs->total_sectors - sector_num;
3761 if (n < nb_sectors) {
3762 nb_sectors = n;
3763 }
3764
3765 if (!bs->drv->bdrv_co_get_block_status) {
3766 *pnum = nb_sectors;
3767 ret = BDRV_BLOCK_DATA;
3768 if (bs->drv->protocol_name) {
3769 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3770 }
3771 return ret;
3772 }
3773
3774 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3775 if (ret < 0) {
3776 *pnum = 0;
3777 return ret;
3778 }
3779
3780 if (ret & BDRV_BLOCK_RAW) {
3781 assert(ret & BDRV_BLOCK_OFFSET_VALID);
3782 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3783 *pnum, pnum);
3784 }
3785
3786 if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
3787 if (bdrv_unallocated_blocks_are_zero(bs)) {
3788 ret |= BDRV_BLOCK_ZERO;
3789 } else if (bs->backing_hd) {
3790 BlockDriverState *bs2 = bs->backing_hd;
3791 int64_t length2 = bdrv_getlength(bs2);
3792 if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
3793 ret |= BDRV_BLOCK_ZERO;
3794 }
3795 }
3796 }
3797
3798 if (bs->file &&
3799 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
3800 (ret & BDRV_BLOCK_OFFSET_VALID)) {
3801 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3802 *pnum, pnum);
3803 if (ret2 >= 0) {
3804 /* Ignore errors. This is just providing extra information, it
3805 * is useful but not necessary.
3806 */
3807 ret |= (ret2 & BDRV_BLOCK_ZERO);
3808 }
3809 }
3810
3811 return ret;
3812}
3813
3814/* Coroutine wrapper for bdrv_get_block_status() */
3815static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
3816{
3817 BdrvCoGetBlockStatusData *data = opaque;
3818 BlockDriverState *bs = data->bs;
3819
3820 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
3821 data->pnum);
3822 data->done = true;
3823}
3824
3825/*
3826 * Synchronous wrapper around bdrv_co_get_block_status().
3827 *
3828 * See bdrv_co_get_block_status() for details.
3829 */
3830int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
3831 int nb_sectors, int *pnum)
3832{
3833 Coroutine *co;
3834 BdrvCoGetBlockStatusData data = {
3835 .bs = bs,
3836 .sector_num = sector_num,
3837 .nb_sectors = nb_sectors,
3838 .pnum = pnum,
3839 .done = false,
3840 };
3841
3842 if (qemu_in_coroutine()) {
3843 /* Fast-path if already in coroutine context */
3844 bdrv_get_block_status_co_entry(&data);
3845 } else {
3846 co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
3847 qemu_coroutine_enter(co, &data);
3848 while (!data.done) {
3849 qemu_aio_wait();
3850 }
3851 }
3852 return data.ret;
3853}
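/*
 * Illustrative loop (hypothetical caller, qemu-img map style): walk an image
 * and classify each extent:
 *
 *     int64_t sector = 0;
 *     int64_t total = bdrv_getlength(bs) / BDRV_SECTOR_SIZE;
 *     while (sector < total) {
 *         int n;
 *         int64_t ret = bdrv_get_block_status(bs, sector,
 *                                             MIN(total - sector, INT_MAX),
 *                                             &n);
 *         if (ret < 0) {
 *             break;
 *         }
 *         printf("[%" PRId64 ", %" PRId64 "): %s\n", sector, sector + n,
 *                (ret & BDRV_BLOCK_DATA) ? "data" : "zero/unallocated");
 *         sector += n;
 *     }
 */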
3854
3855int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
3856 int nb_sectors, int *pnum)
3857{
3858 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
3859 if (ret < 0) {
3860 return ret;
3861 }
3862 return
3863 (ret & BDRV_BLOCK_DATA) ||
3864 ((ret & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs));
3865}
3866
3867/*
3868 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
3869 *
3870 * Return true if the given sector is allocated in any image between
3871 * BASE and TOP (inclusive). BASE can be NULL to check if the given
3872 * sector is allocated in any image of the chain. Return false otherwise.
3873 *
3874 * 'pnum' is set to the number of sectors (including and immediately following
3875 * the specified sector) that are known to be in the same
3876 * allocated/unallocated state.
3877 *
3878 */
3879int bdrv_is_allocated_above(BlockDriverState *top,
3880 BlockDriverState *base,
3881 int64_t sector_num,
3882 int nb_sectors, int *pnum)
3883{
3884 BlockDriverState *intermediate;
3885 int ret, n = nb_sectors;
3886
3887 intermediate = top;
3888 while (intermediate && intermediate != base) {
3889 int pnum_inter;
3890 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
3891 &pnum_inter);
3892 if (ret < 0) {
3893 return ret;
3894 } else if (ret) {
3895 *pnum = pnum_inter;
3896 return 1;
3897 }
3898
3899 /*
3900 * [sector_num, nb_sectors] is unallocated on top but intermediate
3901 * might have
3902 *
3903 * [sector_num+x, nb_sectors] allocated.
3904 */
3905 if (n > pnum_inter &&
3906 (intermediate == top ||
3907 sector_num + pnum_inter < intermediate->total_sectors)) {
3908 n = pnum_inter;
3909 }
3910
3911 intermediate = intermediate->backing_hd;
3912 }
3913
3914 *pnum = n;
3915 return 0;
3916}
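/*
 * Example: for the chain base <- mid <- top, asking whether anything above
 * 'base' provides the data the guest sees for a range:
 *
 *     ret = bdrv_is_allocated_above(top_bs, base_bs, sector_num, nb, &pnum);
 *
 * ret == 1 means the first 'pnum' sectors come from 'mid' or 'top';
 * ret == 0 means they fall through to 'base' (or are unallocated).
 */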
3917
3918const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
3919{
3920 if (bs->backing_hd && bs->backing_hd->encrypted)
3921 return bs->backing_file;
3922 else if (bs->encrypted)
3923 return bs->filename;
3924 else
3925 return NULL;
3926}
3927
3928void bdrv_get_backing_filename(BlockDriverState *bs,
3929 char *filename, int filename_size)
3930{
3931 pstrcpy(filename, filename_size, bs->backing_file);
3932}
3933
3934int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
3935 const uint8_t *buf, int nb_sectors)
3936{
3937 BlockDriver *drv = bs->drv;
3938 if (!drv)
3939 return -ENOMEDIUM;
3940 if (!drv->bdrv_write_compressed)
3941 return -ENOTSUP;
3942 if (bdrv_check_request(bs, sector_num, nb_sectors))
3943 return -EIO;
3944
3945 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
3946
3947 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
3948}
3949
3950int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
3951{
3952 BlockDriver *drv = bs->drv;
3953 if (!drv)
3954 return -ENOMEDIUM;
3955 if (!drv->bdrv_get_info)
3956 return -ENOTSUP;
3957 memset(bdi, 0, sizeof(*bdi));
3958 return drv->bdrv_get_info(bs, bdi);
3959}
3960
3961ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
3962{
3963 BlockDriver *drv = bs->drv;
3964 if (drv && drv->bdrv_get_specific_info) {
3965 return drv->bdrv_get_specific_info(bs);
3966 }
3967 return NULL;
3968}
3969
3970int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
3971 int64_t pos, int size)
3972{
3973 QEMUIOVector qiov;
3974 struct iovec iov = {
3975 .iov_base = (void *) buf,
3976 .iov_len = size,
3977 };
3978
3979 qemu_iovec_init_external(&qiov, &iov, 1);
3980 return bdrv_writev_vmstate(bs, &qiov, pos);
3981}
3982
3983int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
3984{
3985 BlockDriver *drv = bs->drv;
3986
3987 if (!drv) {
3988 return -ENOMEDIUM;
3989 } else if (drv->bdrv_save_vmstate) {
3990 return drv->bdrv_save_vmstate(bs, qiov, pos);
3991 } else if (bs->file) {
3992 return bdrv_writev_vmstate(bs->file, qiov, pos);
3993 }
3994
3995 return -ENOTSUP;
3996}
3997
3998int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
3999 int64_t pos, int size)
4000{
4001 BlockDriver *drv = bs->drv;
4002 if (!drv)
4003 return -ENOMEDIUM;
4004 if (drv->bdrv_load_vmstate)
4005 return drv->bdrv_load_vmstate(bs, buf, pos, size);
4006 if (bs->file)
4007 return bdrv_load_vmstate(bs->file, buf, pos, size);
4008 return -ENOTSUP;
4009}
4010
4011void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
4012{
4013 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
4014 return;
4015 }
4016
4017 bs->drv->bdrv_debug_event(bs, event);
4018}
4019
4020int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
4021 const char *tag)
4022{
4023 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
4024 bs = bs->file;
4025 }
4026
4027 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
4028 return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
4029 }
4030
4031 return -ENOTSUP;
4032}
4033
4034int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
4035{
4036 while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
4037 bs = bs->file;
4038 }
4039
4040 if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
4041 return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
4042 }
4043
4044 return -ENOTSUP;
4045}
4046
4047int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
4048{
4049 while (bs && bs->drv && !bs->drv->bdrv_debug_resume) {
4050 bs = bs->file;
4051 }
4052
4053 if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
4054 return bs->drv->bdrv_debug_resume(bs, tag);
4055 }
4056
4057 return -ENOTSUP;
4058}
4059
4060bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
4061{
4062 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
4063 bs = bs->file;
4064 }
4065
4066 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
4067 return bs->drv->bdrv_debug_is_suspended(bs, tag);
4068 }
4069
4070 return false;
4071}
4072
4073int bdrv_is_snapshot(BlockDriverState *bs)
4074{
4075 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4076}
4077
4078/* backing_file can be relative, absolute, or a protocol. If it is
4079 * relative, it must be relative to the chain. So bs->filename should not be
4080 * passed in as backing_file, since it may be relative to the CWD rather than
4081 * to the chain. */
4082BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
4083 const char *backing_file)
4084{
4085 char *filename_full = NULL;
4086 char *backing_file_full = NULL;
4087 char *filename_tmp = NULL;
4088 int is_protocol = 0;
4089 BlockDriverState *curr_bs = NULL;
4090 BlockDriverState *retval = NULL;
4091
4092 if (!bs || !bs->drv || !backing_file) {
4093 return NULL;
4094 }
4095
4096 filename_full = g_malloc(PATH_MAX);
4097 backing_file_full = g_malloc(PATH_MAX);
4098 filename_tmp = g_malloc(PATH_MAX);
4099
4100 is_protocol = path_has_protocol(backing_file);
4101
4102 for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {
4103
4104 /* If either of the filename paths is actually a protocol, then
4105 * compare unmodified paths; otherwise make paths relative */
4106 if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
4107 if (strcmp(backing_file, curr_bs->backing_file) == 0) {
4108 retval = curr_bs->backing_hd;
4109 break;
4110 }
4111 } else {
4112 /* If not an absolute filename path, make it relative to the current
4113 * image's filename path */
4114 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4115 backing_file);
4116
4117 /* We are going to compare absolute pathnames */
4118 if (!realpath(filename_tmp, filename_full)) {
4119 continue;
4120 }
4121
4122 /* We need to make sure the backing filename we are comparing against
4123 * is relative to the current image filename (or absolute) */
4124 path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
4125 curr_bs->backing_file);
4126
4127 if (!realpath(filename_tmp, backing_file_full)) {
4128 continue;
4129 }
4130
4131 if (strcmp(backing_file_full, filename_full) == 0) {
4132 retval = curr_bs->backing_hd;
4133 break;
4134 }
4135 }
4136 }
4137
4138 g_free(filename_full);
4139 g_free(backing_file_full);
4140 g_free(filename_tmp);
4141 return retval;
4142}
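
/*
 * Worked example for the lookup above, assuming a hypothetical chain
 * /vm/base.qcow2 <- /vm/mid.qcow2 <- /vm/top.qcow2 whose backing_file
 * entries are the relative names "base.qcow2" and "mid.qcow2":
 *
 *     BlockDriverState *mid = bdrv_find_backing_image(top, "mid.qcow2");
 *
 * "mid.qcow2" is combined with top's filename to /vm/mid.qcow2, which
 * matches after realpath(). A name that is relative to the CWD rather
 * than to the chain (e.g. one taken from bs->filename) may fail to
 * match, which is what the comment above the function warns about.
 */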
4143
4144int bdrv_get_backing_file_depth(BlockDriverState *bs)
4145{
4146 if (!bs->drv) {
4147 return 0;
4148 }
4149
4150 if (!bs->backing_hd) {
4151 return 0;
4152 }
4153
4154 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4155}
4156
4157BlockDriverState *bdrv_find_base(BlockDriverState *bs)
4158{
4159 BlockDriverState *curr_bs = NULL;
4160
4161 if (!bs) {
4162 return NULL;
4163 }
4164
4165 curr_bs = bs;
4166
4167 while (curr_bs->backing_hd) {
4168 curr_bs = curr_bs->backing_hd;
4169 }
4170 return curr_bs;
4171}
4172
4173/**************************************************************/
4174/* async I/Os */
4175
4176BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
4177 QEMUIOVector *qiov, int nb_sectors,
4178 BlockDriverCompletionFunc *cb, void *opaque)
4179{
4180 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
4181
4182 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4183 cb, opaque, false);
4184}
4185
4186BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
4187 QEMUIOVector *qiov, int nb_sectors,
4188 BlockDriverCompletionFunc *cb, void *opaque)
4189{
4190 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
4191
4192 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
4193 cb, opaque, true);
4194}
4195
4196BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
4197 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
4198 BlockDriverCompletionFunc *cb, void *opaque)
4199{
4200 trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
4201
4202 return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
4203 BDRV_REQ_ZERO_WRITE | flags,
4204 cb, opaque, true);
4205}
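
/*
 * A minimal caller sketch for the AIO wrappers above; the buffer, sector
 * and completion handling are illustrative:
 *
 *     static void done_cb(void *opaque, int ret)
 *     {
 *         *(int *)opaque = ret;
 *     }
 *
 *     QEMUIOVector qiov;
 *     struct iovec iov = { .iov_base = buf, .iov_len = 512 };
 *     int result = -EINPROGRESS;
 *
 *     qemu_iovec_init_external(&qiov, &iov, 1);
 *     bdrv_aio_readv(bs, 0, &qiov, 1, done_cb, &result);
 *     // done_cb runs from a bottom half once the request completes
 */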
4206
4207
4208typedef struct MultiwriteCB {
4209 int error;
4210 int num_requests;
4211 int num_callbacks;
4212 struct {
4213 BlockDriverCompletionFunc *cb;
4214 void *opaque;
4215 QEMUIOVector *free_qiov;
4216 } callbacks[];
4217} MultiwriteCB;
4218
4219static void multiwrite_user_cb(MultiwriteCB *mcb)
4220{
4221 int i;
4222
4223 for (i = 0; i < mcb->num_callbacks; i++) {
4224 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
4225 if (mcb->callbacks[i].free_qiov) {
4226 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
4227 }
4228 g_free(mcb->callbacks[i].free_qiov);
4229 }
4230}
4231
4232static void multiwrite_cb(void *opaque, int ret)
4233{
4234 MultiwriteCB *mcb = opaque;
4235
4236 trace_multiwrite_cb(mcb, ret);
4237
4238 if (ret < 0 && !mcb->error) {
4239 mcb->error = ret;
4240 }
4241
4242 mcb->num_requests--;
4243 if (mcb->num_requests == 0) {
4244 multiwrite_user_cb(mcb);
4245 g_free(mcb);
4246 }
4247}
4248
4249static int multiwrite_req_compare(const void *a, const void *b)
4250{
4251 const BlockRequest *req1 = a, *req2 = b;
4252
4253 /*
4254 * Note that we can't simply subtract req2->sector from req1->sector
4255 * here as that could overflow the return value.
4256 */
4257 if (req1->sector > req2->sector) {
4258 return 1;
4259 } else if (req1->sector < req2->sector) {
4260 return -1;
4261 } else {
4262 return 0;
4263 }
4264}
4265
4266/*
4267 * Takes a bunch of requests and tries to merge them. Returns the number of
4268 * requests that remain after merging.
4269 */
4270static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
4271 int num_reqs, MultiwriteCB *mcb)
4272{
4273 int i, outidx;
4274
4275 // Sort requests by start sector
4276 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
4277
4278 // Check if adjacent requests touch the same clusters. If so, combine them,
4279 // filling up gaps with zero sectors.
4280 outidx = 0;
4281 for (i = 1; i < num_reqs; i++) {
4282 int merge = 0;
4283 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
4284
4285 // Handle exactly sequential writes and overlapping writes.
4286 if (reqs[i].sector <= oldreq_last) {
4287 merge = 1;
4288 }
4289
4290 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
4291 merge = 0;
4292 }
4293
4294 if (merge) {
4295 size_t size;
4296 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
4297 qemu_iovec_init(qiov,
4298 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
4299
4300 // Add the first request to the merged one. If the requests are
4301 // overlapping, drop the last sectors of the first request.
4302 size = (reqs[i].sector - reqs[outidx].sector) << 9;
4303 qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
4304
4305 // We should not need to add any zeros between the two requests
4306 assert(reqs[i].sector <= oldreq_last);
4307
4308 // Add the second request
4309 qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
4310
4311 reqs[outidx].nb_sectors = qiov->size >> 9;
4312 reqs[outidx].qiov = qiov;
4313
4314 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
4315 } else {
4316 outidx++;
4317 reqs[outidx].sector = reqs[i].sector;
4318 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
4319 reqs[outidx].qiov = reqs[i].qiov;
4320 }
4321 }
4322
4323 return outidx + 1;
4324}
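
/*
 * Worked example for the merge above: requests covering sectors 0-7 and
 * 8-15 satisfy reqs[1].sector <= oldreq_last (8 <= 8), so they become one
 * 16-sector request whose qiov concatenates both vectors. Requests at
 * 0-7 and 16-23 stay separate because of the gap, and a merge is also
 * refused when the combined vector would exceed IOV_MAX entries.
 */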
4325
4326/*
4327 * Submit multiple AIO write requests at once.
4328 *
4329 * On success, the function returns 0 and all requests in the reqs array have
4330 * been submitted. On error, it returns -1 and each individual request may or
4331 * may not have been submitted; the callback runs for some requests and not
4332 * for others. The caller must check the error field of each BlockRequest to
4333 * know which callbacks to wait for (if error != 0, no callback will be
4334 * called for that request).
4335 *
4336 * The implementation may modify the contents of the reqs array, e.g. to merge
4337 * requests. However, the fields opaque and error are left unmodified as they
4338 * are used to signal failure for a single request to the caller.
4339 */
4340int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
4341{
4342 MultiwriteCB *mcb;
4343 int i;
4344
4345 /* don't submit writes if we don't have a medium */
4346 if (bs->drv == NULL) {
4347 for (i = 0; i < num_reqs; i++) {
4348 reqs[i].error = -ENOMEDIUM;
4349 }
4350 return -1;
4351 }
4352
4353 if (num_reqs == 0) {
4354 return 0;
4355 }
4356
4357 // Create MultiwriteCB structure
4358 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
4359 mcb->num_requests = 0;
4360 mcb->num_callbacks = num_reqs;
4361
4362 for (i = 0; i < num_reqs; i++) {
4363 mcb->callbacks[i].cb = reqs[i].cb;
4364 mcb->callbacks[i].opaque = reqs[i].opaque;
4365 }
4366
4367 // Check for mergeable requests
4368 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
4369
4370 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
4371
4372 /* Run the aio requests. */
4373 mcb->num_requests = num_reqs;
4374 for (i = 0; i < num_reqs; i++) {
4375 bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
4376 reqs[i].nb_sectors, reqs[i].flags,
4377 multiwrite_cb, mcb,
4378 true);
4379 }
4380
4381 return 0;
4382}
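
/*
 * A minimal submission sketch for bdrv_aio_multiwrite(); the callback,
 * vectors and state variables are illustrative:
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0, .nb_sectors = 1, .qiov = &qiov0,
 *           .cb = write_done, .opaque = &st0 },
 *         { .sector = 1, .nb_sectors = 1, .qiov = &qiov1,
 *           .cb = write_done, .opaque = &st1 },
 *     };
 *
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         // only requests with reqs[i].error == 0 will get a callback
 *     }
 *
 * The two requests are adjacent, so multiwrite_merge() folds them into
 * one driver request while both user callbacks still fire.
 */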
4383
4384void bdrv_aio_cancel(BlockDriverAIOCB *acb)
4385{
4386 acb->aiocb_info->cancel(acb);
4387}
4388
4389/**************************************************************/
4390/* async block device emulation */
4391
4392typedef struct BlockDriverAIOCBSync {
4393 BlockDriverAIOCB common;
4394 QEMUBH *bh;
4395 int ret;
4396 /* vector translation state */
4397 QEMUIOVector *qiov;
4398 uint8_t *bounce;
4399 int is_write;
4400} BlockDriverAIOCBSync;
4401
4402static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
4403{
4404 BlockDriverAIOCBSync *acb =
4405 container_of(blockacb, BlockDriverAIOCBSync, common);
4406 qemu_bh_delete(acb->bh);
4407 acb->bh = NULL;
4408 qemu_aio_release(acb);
4409}
4410
4411static const AIOCBInfo bdrv_em_aiocb_info = {
4412 .aiocb_size = sizeof(BlockDriverAIOCBSync),
4413 .cancel = bdrv_aio_cancel_em,
4414};
4415
4416static void bdrv_aio_bh_cb(void *opaque)
4417{
4418 BlockDriverAIOCBSync *acb = opaque;
4419
4420 if (!acb->is_write)
4421 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
4422 qemu_vfree(acb->bounce);
4423 acb->common.cb(acb->common.opaque, acb->ret);
4424 qemu_bh_delete(acb->bh);
4425 acb->bh = NULL;
4426 qemu_aio_release(acb);
4427}
4428
4429static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
4430 int64_t sector_num,
4431 QEMUIOVector *qiov,
4432 int nb_sectors,
4433 BlockDriverCompletionFunc *cb,
4434 void *opaque,
4435 int is_write)
4436
4437{
4438 BlockDriverAIOCBSync *acb;
4439
4440 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
4441 acb->is_write = is_write;
4442 acb->qiov = qiov;
4443 acb->bounce = qemu_blockalign(bs, qiov->size);
4444 acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
4445
4446 if (is_write) {
4447 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
4448 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
4449 } else {
4450 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
4451 }
4452
4453 qemu_bh_schedule(acb->bh);
4454
4455 return &acb->common;
4456}
4457
4458static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
4459 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4460 BlockDriverCompletionFunc *cb, void *opaque)
4461{
4462 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
4463}
4464
4465static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
4466 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
4467 BlockDriverCompletionFunc *cb, void *opaque)
4468{
4469 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
4470}
4471
4472
4473typedef struct BlockDriverAIOCBCoroutine {
4474 BlockDriverAIOCB common;
4475 BlockRequest req;
4476 bool is_write;
4477 bool *done;
4478 QEMUBH* bh;
4479} BlockDriverAIOCBCoroutine;
4480
4481static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
4482{
4483 BlockDriverAIOCBCoroutine *acb =
4484 container_of(blockacb, BlockDriverAIOCBCoroutine, common);
4485 bool done = false;
4486
4487 acb->done = &done;
4488 while (!done) {
4489 qemu_aio_wait();
4490 }
4491}
4492
4493static const AIOCBInfo bdrv_em_co_aiocb_info = {
4494 .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
4495 .cancel = bdrv_aio_co_cancel_em,
4496};
4497
4498static void bdrv_co_em_bh(void *opaque)
4499{
4500 BlockDriverAIOCBCoroutine *acb = opaque;
4501
4502 acb->common.cb(acb->common.opaque, acb->req.error);
4503
4504 if (acb->done) {
4505 *acb->done = true;
4506 }
4507
4508 qemu_bh_delete(acb->bh);
4509 qemu_aio_release(acb);
4510}
4511
4512/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
4513static void coroutine_fn bdrv_co_do_rw(void *opaque)
4514{
4515 BlockDriverAIOCBCoroutine *acb = opaque;
4516 BlockDriverState *bs = acb->common.bs;
4517
4518 if (!acb->is_write) {
4519 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
4520 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4521 } else {
4522 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
4523 acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
4524 }
4525
4526 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4527 qemu_bh_schedule(acb->bh);
4528}
4529
4530static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
4531 int64_t sector_num,
4532 QEMUIOVector *qiov,
4533 int nb_sectors,
4534 BdrvRequestFlags flags,
4535 BlockDriverCompletionFunc *cb,
4536 void *opaque,
4537 bool is_write)
4538{
4539 Coroutine *co;
4540 BlockDriverAIOCBCoroutine *acb;
4541
4542 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4543 acb->req.sector = sector_num;
4544 acb->req.nb_sectors = nb_sectors;
4545 acb->req.qiov = qiov;
4546 acb->req.flags = flags;
4547 acb->is_write = is_write;
4548 acb->done = NULL;
4549
4550 co = qemu_coroutine_create(bdrv_co_do_rw);
4551 qemu_coroutine_enter(co, acb);
4552
4553 return &acb->common;
4554}
4555
4556static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
4557{
4558 BlockDriverAIOCBCoroutine *acb = opaque;
4559 BlockDriverState *bs = acb->common.bs;
4560
4561 acb->req.error = bdrv_co_flush(bs);
4562 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4563 qemu_bh_schedule(acb->bh);
4564}
4565
4566BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
4567 BlockDriverCompletionFunc *cb, void *opaque)
4568{
4569 trace_bdrv_aio_flush(bs, opaque);
4570
4571 Coroutine *co;
4572 BlockDriverAIOCBCoroutine *acb;
4573
4574 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4575 acb->done = NULL;
4576
4577 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
4578 qemu_coroutine_enter(co, acb);
4579
4580 return &acb->common;
4581}
4582
4583static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
4584{
4585 BlockDriverAIOCBCoroutine *acb = opaque;
4586 BlockDriverState *bs = acb->common.bs;
4587
4588 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
4589 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
4590 qemu_bh_schedule(acb->bh);
4591}
4592
4593BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
4594 int64_t sector_num, int nb_sectors,
4595 BlockDriverCompletionFunc *cb, void *opaque)
4596{
4597 Coroutine *co;
4598 BlockDriverAIOCBCoroutine *acb;
4599
4600 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
4601
4602 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
4603 acb->req.sector = sector_num;
4604 acb->req.nb_sectors = nb_sectors;
4605 acb->done = NULL;
4606 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
4607 qemu_coroutine_enter(co, acb);
4608
4609 return &acb->common;
4610}
4611
4612void bdrv_init(void)
4613{
4614 module_call_init(MODULE_INIT_BLOCK);
4615}
4616
4617void bdrv_init_with_whitelist(void)
4618{
4619 use_bdrv_whitelist = 1;
4620 bdrv_init();
4621}
4622
4623void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
4624 BlockDriverCompletionFunc *cb, void *opaque)
4625{
4626 BlockDriverAIOCB *acb;
4627
4628 acb = g_slice_alloc(aiocb_info->aiocb_size);
4629 acb->aiocb_info = aiocb_info;
4630 acb->bs = bs;
4631 acb->cb = cb;
4632 acb->opaque = opaque;
4633 return acb;
4634}
4635
4636void qemu_aio_release(void *p)
4637{
4638 BlockDriverAIOCB *acb = p;
4639 g_slice_free1(acb->aiocb_info->aiocb_size, acb);
4640}
4641
4642/**************************************************************/
4643/* Coroutine block device emulation */
4644
4645typedef struct CoroutineIOCompletion {
4646 Coroutine *coroutine;
4647 int ret;
4648} CoroutineIOCompletion;
4649
4650static void bdrv_co_io_em_complete(void *opaque, int ret)
4651{
4652 CoroutineIOCompletion *co = opaque;
4653
4654 co->ret = ret;
4655 qemu_coroutine_enter(co->coroutine, NULL);
4656}
4657
4658static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
4659 int nb_sectors, QEMUIOVector *iov,
4660 bool is_write)
4661{
4662 CoroutineIOCompletion co = {
4663 .coroutine = qemu_coroutine_self(),
4664 };
4665 BlockDriverAIOCB *acb;
4666
4667 if (is_write) {
4668 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
4669 bdrv_co_io_em_complete, &co);
4670 } else {
4671 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
4672 bdrv_co_io_em_complete, &co);
4673 }
4674
4675 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
4676 if (!acb) {
4677 return -EIO;
4678 }
4679 qemu_coroutine_yield();
4680
4681 return co.ret;
4682}
4683
4684static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
4685 int64_t sector_num, int nb_sectors,
4686 QEMUIOVector *iov)
4687{
4688 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
4689}
4690
4691static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
4692 int64_t sector_num, int nb_sectors,
4693 QEMUIOVector *iov)
4694{
4695 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
4696}
4697
4698static void coroutine_fn bdrv_flush_co_entry(void *opaque)
4699{
4700 RwCo *rwco = opaque;
4701
4702 rwco->ret = bdrv_co_flush(rwco->bs);
4703}
4704
4705int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
4706{
4707 int ret;
4708
4709 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
4710 return 0;
4711 }
4712
4713 /* Write back cached data to the OS even with cache=unsafe */
4714 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
4715 if (bs->drv->bdrv_co_flush_to_os) {
4716 ret = bs->drv->bdrv_co_flush_to_os(bs);
4717 if (ret < 0) {
4718 return ret;
4719 }
4720 }
4721
4722 /* But don't actually force it to the disk with cache=unsafe */
4723 if (bs->open_flags & BDRV_O_NO_FLUSH) {
4724 goto flush_parent;
4725 }
4726
4727 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
4728 if (bs->drv->bdrv_co_flush_to_disk) {
4729 ret = bs->drv->bdrv_co_flush_to_disk(bs);
4730 } else if (bs->drv->bdrv_aio_flush) {
4731 BlockDriverAIOCB *acb;
4732 CoroutineIOCompletion co = {
4733 .coroutine = qemu_coroutine_self(),
4734 };
4735
4736 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
4737 if (acb == NULL) {
4738 ret = -EIO;
4739 } else {
4740 qemu_coroutine_yield();
4741 ret = co.ret;
4742 }
4743 } else {
4744 /*
4745 * Some block drivers always operate in either writethrough or unsafe
4746 * mode and therefore don't support bdrv_flush. Usually qemu doesn't
4747 * know how the server works (because the behaviour is hardcoded or
4748 * depends on server-side configuration), so we can't ensure that
4749 * everything is safe on disk. Returning an error doesn't work because
4750 * that would break guests even if the server operates in writethrough
4751 * mode.
4752 *
4753 * Let's hope the user knows what they're doing.
4754 */
4755 ret = 0;
4756 }
4757 if (ret < 0) {
4758 return ret;
4759 }
4760
4761 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
4762 * in the case of cache=unsafe, so there are no useless flushes.
4763 */
4764flush_parent:
4765 return bdrv_co_flush(bs->file);
4766}
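
/*
 * A driver plugs into the flush cascade above by implementing one of the
 * hooks it probes; a hypothetical sketch for a file-backed driver (the
 * ExampleState type and its fd field are assumptions):
 *
 *     static int coroutine_fn example_co_flush_to_disk(BlockDriverState *bs)
 *     {
 *         ExampleState *s = bs->opaque;
 *         return qemu_fdatasync(s->fd) < 0 ? -errno : 0;
 *     }
 *
 *     static BlockDriver bdrv_example = {
 *         ...
 *         .bdrv_co_flush_to_disk = example_co_flush_to_disk,
 *     };
 */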
4767
4768void bdrv_invalidate_cache(BlockDriverState *bs)
4769{
4770 if (bs->drv && bs->drv->bdrv_invalidate_cache) {
4771 bs->drv->bdrv_invalidate_cache(bs);
4772 }
4773}
4774
4775void bdrv_invalidate_cache_all(void)
4776{
4777 BlockDriverState *bs;
4778
4779 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4780 bdrv_invalidate_cache(bs);
4781 }
4782}
4783
4784void bdrv_clear_incoming_migration_all(void)
4785{
4786 BlockDriverState *bs;
4787
4788 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
4789 bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
4790 }
4791}
4792
4793int bdrv_flush(BlockDriverState *bs)
4794{
4795 Coroutine *co;
4796 RwCo rwco = {
4797 .bs = bs,
4798 .ret = NOT_DONE,
4799 };
4800
4801 if (qemu_in_coroutine()) {
4802 /* Fast-path if already in coroutine context */
4803 bdrv_flush_co_entry(&rwco);
4804 } else {
4805 co = qemu_coroutine_create(bdrv_flush_co_entry);
4806 qemu_coroutine_enter(co, &rwco);
4807 while (rwco.ret == NOT_DONE) {
4808 qemu_aio_wait();
4809 }
4810 }
4811
4812 return rwco.ret;
4813}
4814
4815typedef struct DiscardCo {
4816 BlockDriverState *bs;
4817 int64_t sector_num;
4818 int nb_sectors;
4819 int ret;
4820} DiscardCo;
4821static void coroutine_fn bdrv_discard_co_entry(void *opaque)
4822{
4823 DiscardCo *rwco = opaque;
4824
4825 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
4826}
4827
4828/* If no limit is specified in the BlockLimits, use a default
4829 * of 32768 512-byte sectors (16 MiB) per request.
4830 */
4831#define MAX_DISCARD_DEFAULT 32768
4832
4833int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
4834 int nb_sectors)
4835{
4836 int max_discard;
4837
4838 if (!bs->drv) {
4839 return -ENOMEDIUM;
4840 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
4841 return -EIO;
4842 } else if (bs->read_only) {
4843 return -EROFS;
4844 }
4845
4846 bdrv_reset_dirty(bs, sector_num, nb_sectors);
4847
4848 /* Do nothing if disabled. */
4849 if (!(bs->open_flags & BDRV_O_UNMAP)) {
4850 return 0;
4851 }
4852
4853 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
4854 return 0;
4855 }
4856
4857 max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
4858 while (nb_sectors > 0) {
4859 int ret;
4860 int num = nb_sectors;
4861
4862 /* align request */
4863 if (bs->bl.discard_alignment &&
4864 num >= bs->bl.discard_alignment &&
4865 sector_num % bs->bl.discard_alignment) {
4866 if (num > bs->bl.discard_alignment) {
4867 num = bs->bl.discard_alignment;
4868 }
4869 num -= sector_num % bs->bl.discard_alignment;
4870 }
4871
4872 /* limit request size */
4873 if (num > max_discard) {
4874 num = max_discard;
4875 }
4876
4877 if (bs->drv->bdrv_co_discard) {
4878 ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
4879 } else {
4880 BlockDriverAIOCB *acb;
4881 CoroutineIOCompletion co = {
4882 .coroutine = qemu_coroutine_self(),
4883 };
4884
4885 acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
4886 bdrv_co_io_em_complete, &co);
4887 if (acb == NULL) {
4888 return -EIO;
4889 } else {
4890 qemu_coroutine_yield();
4891 ret = co.ret;
4892 }
4893 }
4894 if (ret && ret != -ENOTSUP) {
4895 return ret;
4896 }
4897
4898 sector_num += num;
4899 nb_sectors -= num;
4900 }
4901 return 0;
4902}
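
/*
 * Worked example for the splitting above, assuming bs->bl.max_discard is
 * 4096 and bs->bl.discard_alignment is 2048 sectors: a discard of 10000
 * sectors starting at sector 1000 is issued as a 1048-sector chunk (up to
 * the alignment boundary at sector 2048), then chunks of 4096, 4096 and
 * 760 sectors.
 */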
4903
4904int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
4905{
4906 Coroutine *co;
4907 DiscardCo rwco = {
4908 .bs = bs,
4909 .sector_num = sector_num,
4910 .nb_sectors = nb_sectors,
4911 .ret = NOT_DONE,
4912 };
4913
4914 if (qemu_in_coroutine()) {
4915 /* Fast-path if already in coroutine context */
4916 bdrv_discard_co_entry(&rwco);
4917 } else {
4918 co = qemu_coroutine_create(bdrv_discard_co_entry);
4919 qemu_coroutine_enter(co, &rwco);
4920 while (rwco.ret == NOT_DONE) {
4921 qemu_aio_wait();
4922 }
4923 }
4924
4925 return rwco.ret;
4926}
4927
4928/**************************************************************/
4929/* removable device support */
4930
4931/**
4932 * Return true if the media is present.
4933 */
4934int bdrv_is_inserted(BlockDriverState *bs)
4935{
4936 BlockDriver *drv = bs->drv;
4937
4938 if (!drv)
4939 return 0;
4940 if (!drv->bdrv_is_inserted)
4941 return 1;
4942 return drv->bdrv_is_inserted(bs);
4943}
4944
4945/**
4946 * Return whether the media changed since the last call to this
4947 * function, or -ENOTSUP if we don't know. Most drivers don't know.
4948 */
4949int bdrv_media_changed(BlockDriverState *bs)
4950{
4951 BlockDriver *drv = bs->drv;
4952
4953 if (drv && drv->bdrv_media_changed) {
4954 return drv->bdrv_media_changed(bs);
4955 }
4956 return -ENOTSUP;
4957}
4958
4959/**
4960 * If eject_flag is true, eject the media; otherwise, close the tray.
4961 */
4962void bdrv_eject(BlockDriverState *bs, bool eject_flag)
4963{
4964 BlockDriver *drv = bs->drv;
4965
4966 if (drv && drv->bdrv_eject) {
4967 drv->bdrv_eject(bs, eject_flag);
4968 }
4969
4970 if (bs->device_name[0] != '\0') {
4971 bdrv_emit_qmp_eject_event(bs, eject_flag);
4972 }
4973}
4974
4975/**
4976 * Lock or unlock the media (if it is locked, the user won't be able
4977 * to eject it manually).
4978 */
4979void bdrv_lock_medium(BlockDriverState *bs, bool locked)
4980{
4981 BlockDriver *drv = bs->drv;
4982
4983 trace_bdrv_lock_medium(bs, locked);
4984
4985 if (drv && drv->bdrv_lock_medium) {
4986 drv->bdrv_lock_medium(bs, locked);
4987 }
4988}
4989
4990/* needed for generic scsi interface */
4991
4992int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
4993{
4994 BlockDriver *drv = bs->drv;
4995
4996 if (drv && drv->bdrv_ioctl)
4997 return drv->bdrv_ioctl(bs, req, buf);
4998 return -ENOTSUP;
4999}
5000
5001BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5002 unsigned long int req, void *buf,
5003 BlockDriverCompletionFunc *cb, void *opaque)
5004{
5005 BlockDriver *drv = bs->drv;
5006
5007 if (drv && drv->bdrv_aio_ioctl)
5008 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5009 return NULL;
5010}
5011
5012void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
5013{
5014 bs->guest_block_size = align;
5015}
5016
5017void *qemu_blockalign(BlockDriverState *bs, size_t size)
5018{
5019 return qemu_memalign(bdrv_opt_mem_align(bs), size);
5020}
5021
5022/*
5023 * Check if all memory in this vector is aligned to bdrv_opt_mem_align(bs).
5024 */
5025bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5026{
5027 int i;
5028 size_t alignment = bdrv_opt_mem_align(bs);
5029
5030 for (i = 0; i < qiov->niov; i++) {
5031 if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
5032 return false;
5033 }
5034 if (qiov->iov[i].iov_len % alignment) {
5035 return false;
5036 }
5037 }
5038
5039 return true;
5040}
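
/*
 * A sketch of building a vector that passes the check above, assuming the
 * device's memory alignment divides 4096 (qemu_blockalign() already
 * returns suitably aligned memory; the length must be a multiple too):
 *
 *     QEMUIOVector qiov;
 *     struct iovec iov = {
 *         .iov_base = qemu_blockalign(bs, 4096),
 *         .iov_len = 4096,
 *     };
 *
 *     qemu_iovec_init_external(&qiov, &iov, 1);
 *     assert(bdrv_qiov_is_aligned(bs, &qiov));
 */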
5041
5042BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity)
5043{
5044 int64_t bitmap_size;
5045 BdrvDirtyBitmap *bitmap;
5046
5047 assert((granularity & (granularity - 1)) == 0);
5048
5049 granularity >>= BDRV_SECTOR_BITS;
5050 assert(granularity);
5051 bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
5052 bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
5053 bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
5054 QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
5055 return bitmap;
5056}
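
/*
 * Worked example for the granularity handling above: passing
 * granularity = 65536 (64 KiB, a power of two) yields 65536 >> 9 = 128
 * sectors per bit, so hbitmap_alloc() is called with ffs(128) - 1 = 7 and
 * each bit of the dirty bitmap tracks 2^7 sectors.
 */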
5057
5058void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5059{
5060 BdrvDirtyBitmap *bm, *next;
5061 QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5062 if (bm == bitmap) {
5063 QLIST_REMOVE(bitmap, list);
5064 hbitmap_free(bitmap->bitmap);
5065 g_free(bitmap);
5066 return;
5067 }
5068 }
5069}
5070
5071BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5072{
5073 BdrvDirtyBitmap *bm;
5074 BlockDirtyInfoList *list = NULL;
5075 BlockDirtyInfoList **plist = &list;
5076
5077 QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5078 BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
5079 BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
5080 info->count = bdrv_get_dirty_count(bs, bm);
5081 info->granularity =
5082 ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5083 entry->value = info;
5084 *plist = entry;
5085 plist = &entry->next;
5086 }
5087
5088 return list;
5089}
5090
5091int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
5092{
5093 if (bitmap) {
5094 return hbitmap_get(bitmap->bitmap, sector);
5095 } else {
5096 return 0;
5097 }
5098}
5099
5100void bdrv_dirty_iter_init(BlockDriverState *bs,
5101 BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
5102{
5103 hbitmap_iter_init(hbi, bitmap->bitmap, 0);
5104}
5105
5106void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5107 int nr_sectors)
5108{
5109 BdrvDirtyBitmap *bitmap;
5110 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5111 hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5112 }
5113}
5114
5115void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
5116{
5117 BdrvDirtyBitmap *bitmap;
5118 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5119 hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5120 }
5121}
5122
5123int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5124{
5125 return hbitmap_count(bitmap->bitmap);
5126}
5127
5128/* Get a reference to bs */
5129void bdrv_ref(BlockDriverState *bs)
5130{
5131 bs->refcnt++;
5132}
5133
5134/* Release a previously grabbed reference to bs.
5135 * If after releasing, reference count is zero, the BlockDriverState is
5136 * deleted. */
5137void bdrv_unref(BlockDriverState *bs)
5138{
5139 assert(bs->refcnt > 0);
5140 if (--bs->refcnt == 0) {
5141 bdrv_delete(bs);
5142 }
5143}
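
/*
 * A typical pattern (sketch): take a reference before an operation that
 * may drop the last other reference to bs, and release it afterwards:
 *
 *     bdrv_ref(bs);
 *     // ... operation that might delete or replace the device ...
 *     bdrv_unref(bs);
 */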
5144
5145void bdrv_set_in_use(BlockDriverState *bs, int in_use)
5146{
5147 assert(bs->in_use != in_use);
5148 bs->in_use = in_use;
5149}
5150
5151int bdrv_in_use(BlockDriverState *bs)
5152{
5153 return bs->in_use;
5154}
5155
5156void bdrv_iostatus_enable(BlockDriverState *bs)
5157{
5158 bs->iostatus_enabled = true;
5159 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5160}
5161
5162/* The I/O status is only enabled if the drive explicitly
5163 * enables it _and_ the VM is configured to stop on errors */
5164bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5165{
5166 return (bs->iostatus_enabled &&
5167 (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5168 bs->on_write_error == BLOCKDEV_ON_ERROR_STOP ||
5169 bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
5170}
5171
5172void bdrv_iostatus_disable(BlockDriverState *bs)
5173{
5174 bs->iostatus_enabled = false;
5175}
5176
5177void bdrv_iostatus_reset(BlockDriverState *bs)
5178{
5179 if (bdrv_iostatus_is_enabled(bs)) {
5180 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
5181 if (bs->job) {
5182 block_job_iostatus_reset(bs->job);
5183 }
5184 }
5185}
5186
5187void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5188{
5189 assert(bdrv_iostatus_is_enabled(bs));
5190 if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
5191 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5192 BLOCK_DEVICE_IO_STATUS_FAILED;
5193 }
5194}
5195
5196void
5197bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
5198 enum BlockAcctType type)
5199{
5200 assert(type < BDRV_MAX_IOTYPE);
5201
5202 cookie->bytes = bytes;
5203 cookie->start_time_ns = get_clock();
5204 cookie->type = type;
5205}
5206
5207void
5208bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
5209{
5210 assert(cookie->type < BDRV_MAX_IOTYPE);
5211
5212 bs->nr_bytes[cookie->type] += cookie->bytes;
5213 bs->nr_ops[cookie->type]++;
5214 bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
5215}
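
/*
 * Device models bracket each request with the two helpers above; a sketch
 * for a 4 KiB read (the cookie usually lives in the request state):
 *
 *     BlockAcctCookie cookie;
 *
 *     bdrv_acct_start(bs, &cookie, 4096, BDRV_ACCT_READ);
 *     // ... issue the read and wait for completion ...
 *     bdrv_acct_done(bs, &cookie);
 */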
5216
5217void bdrv_img_create(const char *filename, const char *fmt,
5218 const char *base_filename, const char *base_fmt,
5219 char *options, uint64_t img_size, int flags,
5220 Error **errp, bool quiet)
5221{
5222 QEMUOptionParameter *param = NULL, *create_options = NULL;
5223 QEMUOptionParameter *backing_fmt, *backing_file, *size;
5224 BlockDriver *drv, *proto_drv;
5225 BlockDriver *backing_drv = NULL;
5226 Error *local_err = NULL;
5227 int ret = 0;
5228
5229 /* Find driver and parse its options */
5230 drv = bdrv_find_format(fmt);
5231 if (!drv) {
5232 error_setg(errp, "Unknown file format '%s'", fmt);
5233 return;
5234 }
5235
5236 proto_drv = bdrv_find_protocol(filename, true);
5237 if (!proto_drv) {
5238 error_setg(errp, "Unknown protocol '%s'", filename);
5239 return;
5240 }
5241
5242 create_options = append_option_parameters(create_options,
5243 drv->create_options);
5244 create_options = append_option_parameters(create_options,
5245 proto_drv->create_options);
5246
5247 /* Create parameter list with default values */
5248 param = parse_option_parameters("", create_options, param);
5249
5250 set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
5251
5252 /* Parse -o options */
5253 if (options) {
5254 param = parse_option_parameters(options, create_options, param);
5255 if (param == NULL) {
5256 error_setg(errp, "Invalid options for file format '%s'.", fmt);
5257 goto out;
5258 }
5259 }
5260
5261 if (base_filename) {
5262 if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
5263 base_filename)) {
5264 error_setg(errp, "Backing file not supported for file format '%s'",
5265 fmt);
5266 goto out;
5267 }
5268 }
5269
5270 if (base_fmt) {
5271 if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
5272 error_setg(errp, "Backing file format not supported for file "
5273 "format '%s'", fmt);
5274 goto out;
5275 }
5276 }
5277
5278 backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
5279 if (backing_file && backing_file->value.s) {
5280 if (!strcmp(filename, backing_file->value.s)) {
5281 error_setg(errp, "Error: Trying to create an image with the "
5282 "same filename as the backing file");
5283 goto out;
5284 }
5285 }
5286
5287 backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
5288 if (backing_fmt && backing_fmt->value.s) {
5289 backing_drv = bdrv_find_format(backing_fmt->value.s);
5290 if (!backing_drv) {
5291 error_setg(errp, "Unknown backing file format '%s'",
5292 backing_fmt->value.s);
5293 goto out;
5294 }
5295 }
5296
5297 // The size for the image must always be specified, with one exception:
5298 // If we are using a backing file, we can obtain the size from there
5299 size = get_option_parameter(param, BLOCK_OPT_SIZE);
5300 if (size && size->value.n == -1) {
5301 if (backing_file && backing_file->value.s) {
5302 BlockDriverState *bs;
5303 uint64_t size;
5304 char buf[32];
5305 int back_flags;
5306
5307 /* backing files always opened read-only */
5308 back_flags =
5309 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
5310
5311 bs = NULL;
5312 ret = bdrv_open(&bs, backing_file->value.s, NULL, back_flags,
5313 backing_drv, &local_err);
5314 if (ret < 0) {
5315 error_setg_errno(errp, -ret, "Could not open '%s': %s",
5316 backing_file->value.s,
5317 error_get_pretty(local_err));
5318 error_free(local_err);
5319 local_err = NULL;
5320 goto out;
5321 }
5322 bdrv_get_geometry(bs, &size);
5323 size *= 512;
5324
5325 snprintf(buf, sizeof(buf), "%" PRId64, size);
5326 set_option_parameter(param, BLOCK_OPT_SIZE, buf);
5327
5328 bdrv_unref(bs);
5329 } else {
5330 error_setg(errp, "Image creation needs a size parameter");
5331 goto out;
5332 }
5333 }
5334
5335 if (!quiet) {
5336 printf("Formatting '%s', fmt=%s ", filename, fmt);
5337 print_option_parameters(param);
5338 puts("");
5339 }
5340 ret = bdrv_create(drv, filename, param, &local_err);
5341 if (ret == -EFBIG) {
5342 /* This is generally a better message than whatever the driver would
5343 * deliver (especially because of the cluster_size_hint), since that
5344 * is most probably not much different from "image too large". */
5345 const char *cluster_size_hint = "";
5346 if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
5347 cluster_size_hint = " (try using a larger cluster size)";
5348 }
5349 error_setg(errp, "The image size is too large for file format '%s'"
5350 "%s", fmt, cluster_size_hint);
5351 error_free(local_err);
5352 local_err = NULL;
5353 }
5354
5355out:
5356 free_option_parameters(create_options);
5357 free_option_parameters(param);
5358
5359 if (local_err) {
5360 error_propagate(errp, local_err);
5361 }
5362}
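
/*
 * A minimal caller sketch for bdrv_img_create(); the filename, format and
 * size are illustrative:
 *
 *     Error *err = NULL;
 *
 *     bdrv_img_create("test.qcow2", "qcow2", NULL, NULL, NULL,
 *                     64 * 1024 * 1024, 0, &err, true);
 *     if (err) {
 *         // report with error_get_pretty(err), then error_free(err)
 *     }
 */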
5363
5364AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5365{
5366 /* Currently BlockDriverState always uses the main loop AioContext */
5367 return qemu_get_aio_context();
5368}
5369
5370void bdrv_add_before_write_notifier(BlockDriverState *bs,
5371 NotifierWithReturn *notifier)
5372{
5373 notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5374}
5375
5376int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
5377{
5378 if (bs->drv->bdrv_amend_options == NULL) {
5379 return -ENOTSUP;
5380 }
5381 return bs->drv->bdrv_amend_options(bs, options);
5382}
5383
5384/* Used to recurse on single-child block filters.
5385 * Single-child block filters store their child in bs->file.
5386 */
5387bool bdrv_generic_is_first_non_filter(BlockDriverState *bs,
5388 BlockDriverState *candidate)
5389{
5390 if (!bs->drv) {
5391 return false;
5392 }
5393
5394 if (!bs->drv->authorizations[BS_IS_A_FILTER]) {
5395 if (bs == candidate) {
5396 return true;
5397 } else {
5398 return false;
5399 }
5400 }
5401
5402 if (!bs->drv->authorizations[BS_FILTER_PASS_DOWN]) {
5403 return false;
5404 }
5405
5406 if (!bs->file) {
5407 return false;
5408 }
5409
5410 return bdrv_recurse_is_first_non_filter(bs->file, candidate);
5411}
5412
5413bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
5414 BlockDriverState *candidate)
5415{
5416 if (bs->drv && bs->drv->bdrv_recurse_is_first_non_filter) {
5417 return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
5418 }
5419
5420 return bdrv_generic_is_first_non_filter(bs, candidate);
5421}
5422
5423/* This function checks whether the candidate is the first non-filter bs down
5424 * its bs chain. Since we don't have pointers to parents, it explores all bs
5425 * chains from the top. Some filters can choose not to pass down the recursion.
5426 */
5427bool bdrv_is_first_non_filter(BlockDriverState *candidate)
5428{
5429 BlockDriverState *bs;
5430
5431 /* walk down the bs forest recursively */
5432 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5433 bool perm;
5434
5435 perm = bdrv_recurse_is_first_non_filter(bs, candidate);
5436
5437 /* candidate is the first non-filter */
5438 if (perm) {
5439 return true;
5440 }
5441 }
5442
5443 return false;
5444}
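
/*
 * Example of the recursion above, assuming a hypothetical filter driver
 * that sets both BS_IS_A_FILTER and BS_FILTER_PASS_DOWN and sits on top
 * of a qcow2 node: the walk passes through the filter via bs->file,
 * reaches the qcow2 BDS, and bdrv_is_first_non_filter(qcow2_bs) returns
 * true, while the filter node itself is never reported as a match.
 */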