]> git.ipfire.org Git - thirdparty/qemu.git/blame - block.c
block: Create bdrv_inherited_flags()
[thirdparty/qemu.git] / block.c
CommitLineData
fc01f7e7
FB
1/*
2 * QEMU System Emulator block driver
5fafdf24 3 *
fc01f7e7 4 * Copyright (c) 2003 Fabrice Bellard
5fafdf24 5 *
fc01f7e7
FB
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
3990d09a 24#include "config-host.h"
faf07963 25#include "qemu-common.h"
6d519a5f 26#include "trace.h"
83c9089e 27#include "monitor/monitor.h"
737e150e
PB
28#include "block/block_int.h"
29#include "block/blockjob.h"
1de7afc9 30#include "qemu/module.h"
7b1b5d19 31#include "qapi/qmp/qjson.h"
9c17d615 32#include "sysemu/sysemu.h"
1de7afc9 33#include "qemu/notify.h"
737e150e 34#include "block/coroutine.h"
c13163fb 35#include "block/qapi.h"
b2023818 36#include "qmp-commands.h"
1de7afc9 37#include "qemu/timer.h"
fc01f7e7 38
71e72a19 39#ifdef CONFIG_BSD
7674e7bf
FB
40#include <sys/types.h>
41#include <sys/stat.h>
42#include <sys/ioctl.h>
72cf2d4f 43#include <sys/queue.h>
c5e97233 44#ifndef __DragonFly__
7674e7bf
FB
45#include <sys/disk.h>
46#endif
c5e97233 47#endif
7674e7bf 48
49dc768d
AL
49#ifdef _WIN32
50#include <windows.h>
51#endif
52
e4654d2d
FZ
53struct BdrvDirtyBitmap {
54 HBitmap *bitmap;
55 QLIST_ENTRY(BdrvDirtyBitmap) list;
56};
57
1c9805a3
SH
58#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
59
7d4b4ba5 60static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
f141eafe
AL
61static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
62 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
c87c0672 63 BlockDriverCompletionFunc *cb, void *opaque);
f141eafe
AL
64static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
65 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
ce1a14dc 66 BlockDriverCompletionFunc *cb, void *opaque);
f9f05dc5
KW
67static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
68 int64_t sector_num, int nb_sectors,
69 QEMUIOVector *iov);
70static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
71 int64_t sector_num, int nb_sectors,
72 QEMUIOVector *iov);
775aa8b6
KW
73static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
74 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
470c0504 75 BdrvRequestFlags flags);
775aa8b6
KW
76static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
77 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
f08f2dda 78 BdrvRequestFlags flags);
b2a61371
SH
79static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
80 int64_t sector_num,
81 QEMUIOVector *qiov,
82 int nb_sectors,
d20d9b7c 83 BdrvRequestFlags flags,
b2a61371
SH
84 BlockDriverCompletionFunc *cb,
85 void *opaque,
8c5873d6 86 bool is_write);
b2a61371 87static void coroutine_fn bdrv_co_do_rw(void *opaque);
621f0589 88static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
aa7bfbff 89 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
ec530c81 90
1b7bdbc1
SH
91static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
92 QTAILQ_HEAD_INITIALIZER(bdrv_states);
7ee930d0 93
dc364f4c
BC
94static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
95 QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
96
8a22f02a
SH
97static QLIST_HEAD(, BlockDriver) bdrv_drivers =
98 QLIST_HEAD_INITIALIZER(bdrv_drivers);
ea2384d3 99
eb852011
MA
100/* If non-zero, use only whitelisted block drivers */
101static int use_bdrv_whitelist;
102
9e0b22f4
SH
103#ifdef _WIN32
/* Return non-zero if @filename begins with a DOS drive letter ("X:"). */
static int is_windows_drive_prefix(const char *filename)
{
    char c = filename[0];

    if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))) {
        return 0;
    }
    return filename[1] == ':';
}
110
111int is_windows_drive(const char *filename)
112{
113 if (is_windows_drive_prefix(filename) &&
114 filename[2] == '\0')
115 return 1;
116 if (strstart(filename, "\\\\.\\", NULL) ||
117 strstart(filename, "//./", NULL))
118 return 1;
119 return 0;
120}
121#endif
122
0563e191 123/* throttling disk I/O limits */
cc0681c4
BC
124void bdrv_set_io_limits(BlockDriverState *bs,
125 ThrottleConfig *cfg)
98f90dba 126{
cc0681c4 127 int i;
98f90dba 128
cc0681c4 129 throttle_config(&bs->throttle_state, cfg);
98f90dba 130
cc0681c4
BC
131 for (i = 0; i < 2; i++) {
132 qemu_co_enter_next(&bs->throttled_reqs[i]);
98f90dba 133 }
cc0681c4
BC
134}
135
136/* this function drain all the throttled IOs */
137static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
138{
139 bool drained = false;
140 bool enabled = bs->io_limits_enabled;
141 int i;
142
143 bs->io_limits_enabled = false;
144
145 for (i = 0; i < 2; i++) {
146 while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
147 drained = true;
148 }
149 }
150
151 bs->io_limits_enabled = enabled;
98f90dba 152
cc0681c4 153 return drained;
98f90dba
ZYW
154}
155
cc0681c4 156void bdrv_io_limits_disable(BlockDriverState *bs)
0563e191 157{
cc0681c4 158 bs->io_limits_enabled = false;
0563e191 159
cc0681c4
BC
160 bdrv_start_throttled_reqs(bs);
161
162 throttle_destroy(&bs->throttle_state);
0563e191
ZYW
163}
164
cc0681c4 165static void bdrv_throttle_read_timer_cb(void *opaque)
0563e191 166{
cc0681c4
BC
167 BlockDriverState *bs = opaque;
168 qemu_co_enter_next(&bs->throttled_reqs[0]);
0563e191
ZYW
169}
170
cc0681c4 171static void bdrv_throttle_write_timer_cb(void *opaque)
0563e191 172{
cc0681c4
BC
173 BlockDriverState *bs = opaque;
174 qemu_co_enter_next(&bs->throttled_reqs[1]);
0563e191
ZYW
175}
176
cc0681c4
BC
177/* should be called before bdrv_set_io_limits if a limit is set */
178void bdrv_io_limits_enable(BlockDriverState *bs)
179{
180 assert(!bs->io_limits_enabled);
181 throttle_init(&bs->throttle_state,
182 QEMU_CLOCK_VIRTUAL,
183 bdrv_throttle_read_timer_cb,
184 bdrv_throttle_write_timer_cb,
185 bs);
186 bs->io_limits_enabled = true;
187}
188
189/* This function makes an IO wait if needed
190 *
191 * @nb_sectors: the number of sectors of the IO
192 * @is_write: is the IO a write
193 */
98f90dba 194static void bdrv_io_limits_intercept(BlockDriverState *bs,
d5103588 195 unsigned int bytes,
cc0681c4 196 bool is_write)
98f90dba 197{
cc0681c4
BC
198 /* does this io must wait */
199 bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
98f90dba 200
cc0681c4
BC
201 /* if must wait or any request of this type throttled queue the IO */
202 if (must_wait ||
203 !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
204 qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
98f90dba
ZYW
205 }
206
cc0681c4 207 /* the IO will be executed, do the accounting */
d5103588
KW
208 throttle_account(&bs->throttle_state, is_write, bytes);
209
98f90dba 210
cc0681c4
BC
211 /* if the next request must wait -> do nothing */
212 if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
213 return;
98f90dba
ZYW
214 }
215
cc0681c4
BC
216 /* else queue next request for execution */
217 qemu_co_queue_next(&bs->throttled_reqs[is_write]);
98f90dba
ZYW
218}
219
339064d5
KW
220size_t bdrv_opt_mem_align(BlockDriverState *bs)
221{
222 if (!bs || !bs->drv) {
223 /* 4k should be on the safe side */
224 return 4096;
225 }
226
227 return bs->bl.opt_mem_alignment;
228}
229
9e0b22f4
SH
/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *sep;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    /* a ':' only counts as a protocol separator before any path separator */
    sep = path + strcspn(path, ":/\\");
#else
    sep = path + strcspn(path, ":/");
#endif

    return *sep == ':';
}
247
/* Return non-zero if @path is absolute (drive/UNC names count on Windows). */
int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return *path == '/' || *path == '\\';
#else
    return *path == '/';
#endif
}
260
83f64091
FB
/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URL are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *prefix_end, *last_sep;
    int len;

    if (dest_size <= 0) {
        return;
    }
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* Skip a "<protocol>:" prefix of base_path, if present. */
    prefix_end = strchr(base_path, ':');
    prefix_end = prefix_end ? prefix_end + 1 : base_path;

    /* Find the start of the last path component of base_path. */
    last_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *bslash = strrchr(base_path, '\\');
        if (!last_sep || bslash > last_sep) {
            last_sep = bslash;
        }
    }
#endif
    last_sep = last_sep ? last_sep + 1 : base_path;
    if (last_sep > prefix_end) {
        prefix_end = last_sep;
    }

    /* Copy the directory part of base_path, then append filename. */
    len = prefix_end - base_path;
    if (len > dest_size - 1) {
        len = dest_size - 1;
    }
    memcpy(dest, base_path, len);
    dest[len] = '\0';
    pstrcat(dest, dest_size, filename);
}
304
dc5a1371
PB
305void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
306{
307 if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
308 pstrcpy(dest, sz, bs->backing_file);
309 } else {
310 path_combine(dest, sz, bs->filename, bs->backing_file);
311 }
312}
313
5efa9d5a 314void bdrv_register(BlockDriver *bdrv)
ea2384d3 315{
8c5873d6
SH
316 /* Block drivers without coroutine functions need emulation */
317 if (!bdrv->bdrv_co_readv) {
f9f05dc5
KW
318 bdrv->bdrv_co_readv = bdrv_co_readv_em;
319 bdrv->bdrv_co_writev = bdrv_co_writev_em;
320
f8c35c1d
SH
321 /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
322 * the block driver lacks aio we need to emulate that too.
323 */
f9f05dc5
KW
324 if (!bdrv->bdrv_aio_readv) {
325 /* add AIO emulation layer */
326 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
327 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
f9f05dc5 328 }
83f64091 329 }
b2e12bc6 330
8a22f02a 331 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
ea2384d3 332}
b338082b
FB
333
334/* create a new block device (by default it is empty) */
98522f63 335BlockDriverState *bdrv_new(const char *device_name, Error **errp)
b338082b 336{
1b7bdbc1 337 BlockDriverState *bs;
b338082b 338
f2d953ec
KW
339 if (bdrv_find(device_name)) {
340 error_setg(errp, "Device with id '%s' already exists",
341 device_name);
342 return NULL;
343 }
344 if (bdrv_find_node(device_name)) {
345 error_setg(errp, "Device with node-name '%s' already exists",
346 device_name);
347 return NULL;
348 }
349
7267c094 350 bs = g_malloc0(sizeof(BlockDriverState));
e4654d2d 351 QLIST_INIT(&bs->dirty_bitmaps);
b338082b 352 pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
ea2384d3 353 if (device_name[0] != '\0') {
dc364f4c 354 QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
ea2384d3 355 }
28a7282a 356 bdrv_iostatus_disable(bs);
d7d512f6 357 notifier_list_init(&bs->close_notifiers);
d616b224 358 notifier_with_return_list_init(&bs->before_write_notifiers);
cc0681c4
BC
359 qemu_co_queue_init(&bs->throttled_reqs[0]);
360 qemu_co_queue_init(&bs->throttled_reqs[1]);
9fcb0251 361 bs->refcnt = 1;
d7d512f6 362
b338082b
FB
363 return bs;
364}
365
d7d512f6
PB
366void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
367{
368 notifier_list_add(&bs->close_notifiers, notify);
369}
370
ea2384d3
FB
371BlockDriver *bdrv_find_format(const char *format_name)
372{
373 BlockDriver *drv1;
8a22f02a
SH
374 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
375 if (!strcmp(drv1->format_name, format_name)) {
ea2384d3 376 return drv1;
8a22f02a 377 }
ea2384d3
FB
378 }
379 return NULL;
380}
381
b64ec4e4 382static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
eb852011 383{
b64ec4e4
FZ
384 static const char *whitelist_rw[] = {
385 CONFIG_BDRV_RW_WHITELIST
386 };
387 static const char *whitelist_ro[] = {
388 CONFIG_BDRV_RO_WHITELIST
eb852011
MA
389 };
390 const char **p;
391
b64ec4e4 392 if (!whitelist_rw[0] && !whitelist_ro[0]) {
eb852011 393 return 1; /* no whitelist, anything goes */
b64ec4e4 394 }
eb852011 395
b64ec4e4 396 for (p = whitelist_rw; *p; p++) {
eb852011
MA
397 if (!strcmp(drv->format_name, *p)) {
398 return 1;
399 }
400 }
b64ec4e4
FZ
401 if (read_only) {
402 for (p = whitelist_ro; *p; p++) {
403 if (!strcmp(drv->format_name, *p)) {
404 return 1;
405 }
406 }
407 }
eb852011
MA
408 return 0;
409}
410
b64ec4e4
FZ
411BlockDriver *bdrv_find_whitelisted_format(const char *format_name,
412 bool read_only)
eb852011
MA
413{
414 BlockDriver *drv = bdrv_find_format(format_name);
b64ec4e4 415 return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL;
eb852011
MA
416}
417
5b7e1542
ZYW
418typedef struct CreateCo {
419 BlockDriver *drv;
420 char *filename;
421 QEMUOptionParameter *options;
422 int ret;
cc84d90f 423 Error *err;
5b7e1542
ZYW
424} CreateCo;
425
426static void coroutine_fn bdrv_create_co_entry(void *opaque)
427{
cc84d90f
HR
428 Error *local_err = NULL;
429 int ret;
430
5b7e1542
ZYW
431 CreateCo *cco = opaque;
432 assert(cco->drv);
433
cc84d90f 434 ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err);
84d18f06 435 if (local_err) {
cc84d90f
HR
436 error_propagate(&cco->err, local_err);
437 }
438 cco->ret = ret;
5b7e1542
ZYW
439}
440
0e7e1989 441int bdrv_create(BlockDriver *drv, const char* filename,
cc84d90f 442 QEMUOptionParameter *options, Error **errp)
ea2384d3 443{
5b7e1542
ZYW
444 int ret;
445
446 Coroutine *co;
447 CreateCo cco = {
448 .drv = drv,
449 .filename = g_strdup(filename),
450 .options = options,
451 .ret = NOT_DONE,
cc84d90f 452 .err = NULL,
5b7e1542
ZYW
453 };
454
455 if (!drv->bdrv_create) {
cc84d90f 456 error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
80168bff
LC
457 ret = -ENOTSUP;
458 goto out;
5b7e1542
ZYW
459 }
460
461 if (qemu_in_coroutine()) {
462 /* Fast-path if already in coroutine context */
463 bdrv_create_co_entry(&cco);
464 } else {
465 co = qemu_coroutine_create(bdrv_create_co_entry);
466 qemu_coroutine_enter(co, &cco);
467 while (cco.ret == NOT_DONE) {
468 qemu_aio_wait();
469 }
470 }
471
472 ret = cco.ret;
cc84d90f 473 if (ret < 0) {
84d18f06 474 if (cco.err) {
cc84d90f
HR
475 error_propagate(errp, cco.err);
476 } else {
477 error_setg_errno(errp, -ret, "Could not create image");
478 }
479 }
0e7e1989 480
80168bff
LC
481out:
482 g_free(cco.filename);
5b7e1542 483 return ret;
ea2384d3
FB
484}
485
cc84d90f
HR
486int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
487 Error **errp)
84a12e66
CH
488{
489 BlockDriver *drv;
cc84d90f
HR
490 Error *local_err = NULL;
491 int ret;
84a12e66 492
98289620 493 drv = bdrv_find_protocol(filename, true);
84a12e66 494 if (drv == NULL) {
cc84d90f 495 error_setg(errp, "Could not find protocol for file '%s'", filename);
16905d71 496 return -ENOENT;
84a12e66
CH
497 }
498
cc84d90f 499 ret = bdrv_create(drv, filename, options, &local_err);
84d18f06 500 if (local_err) {
cc84d90f
HR
501 error_propagate(errp, local_err);
502 }
503 return ret;
84a12e66
CH
504}
505
355ef4ac 506int bdrv_refresh_limits(BlockDriverState *bs)
d34682cd
KW
507{
508 BlockDriver *drv = bs->drv;
509
510 memset(&bs->bl, 0, sizeof(bs->bl));
511
466ad822
KW
512 if (!drv) {
513 return 0;
514 }
515
516 /* Take some limits from the children as a default */
517 if (bs->file) {
518 bdrv_refresh_limits(bs->file);
519 bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
339064d5
KW
520 bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
521 } else {
522 bs->bl.opt_mem_alignment = 512;
466ad822
KW
523 }
524
525 if (bs->backing_hd) {
526 bdrv_refresh_limits(bs->backing_hd);
527 bs->bl.opt_transfer_length =
528 MAX(bs->bl.opt_transfer_length,
529 bs->backing_hd->bl.opt_transfer_length);
339064d5
KW
530 bs->bl.opt_mem_alignment =
531 MAX(bs->bl.opt_mem_alignment,
532 bs->backing_hd->bl.opt_mem_alignment);
466ad822
KW
533 }
534
535 /* Then let the driver override it */
536 if (drv->bdrv_refresh_limits) {
d34682cd
KW
537 return drv->bdrv_refresh_limits(bs);
538 }
539
540 return 0;
541}
542
eba25057
JM
/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater. */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    const char *tmpdir = getenv("TMPDIR");
    int fd;

    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    /* close() can report delayed write errors; clean up if it does. */
    if (close(fd) != 0) {
        unlink(filename);
        return -errno;
    }
    return 0;
#endif
}
fc01f7e7 578
84a12e66
CH
579/*
580 * Detect host devices. By convention, /dev/cdrom[N] is always
581 * recognized as a host CDROM.
582 */
583static BlockDriver *find_hdev_driver(const char *filename)
584{
585 int score_max = 0, score;
586 BlockDriver *drv = NULL, *d;
587
588 QLIST_FOREACH(d, &bdrv_drivers, list) {
589 if (d->bdrv_probe_device) {
590 score = d->bdrv_probe_device(filename);
591 if (score > score_max) {
592 score_max = score;
593 drv = d;
594 }
595 }
596 }
597
598 return drv;
599}
600
98289620
KW
601BlockDriver *bdrv_find_protocol(const char *filename,
602 bool allow_protocol_prefix)
83f64091
FB
603{
604 BlockDriver *drv1;
605 char protocol[128];
1cec71e3 606 int len;
83f64091 607 const char *p;
19cb3738 608
66f82cee
KW
609 /* TODO Drivers without bdrv_file_open must be specified explicitly */
610
39508e7a
CH
611 /*
612 * XXX(hch): we really should not let host device detection
613 * override an explicit protocol specification, but moving this
614 * later breaks access to device names with colons in them.
615 * Thanks to the brain-dead persistent naming schemes on udev-
616 * based Linux systems those actually are quite common.
617 */
618 drv1 = find_hdev_driver(filename);
619 if (drv1) {
620 return drv1;
621 }
622
98289620 623 if (!path_has_protocol(filename) || !allow_protocol_prefix) {
39508e7a 624 return bdrv_find_format("file");
84a12e66 625 }
98289620 626
9e0b22f4
SH
627 p = strchr(filename, ':');
628 assert(p != NULL);
1cec71e3
AL
629 len = p - filename;
630 if (len > sizeof(protocol) - 1)
631 len = sizeof(protocol) - 1;
632 memcpy(protocol, filename, len);
633 protocol[len] = '\0';
8a22f02a 634 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
5fafdf24 635 if (drv1->protocol_name &&
8a22f02a 636 !strcmp(drv1->protocol_name, protocol)) {
83f64091 637 return drv1;
8a22f02a 638 }
83f64091
FB
639 }
640 return NULL;
641}
642
f500a6d3 643static int find_image_format(BlockDriverState *bs, const char *filename,
34b5d2c6 644 BlockDriver **pdrv, Error **errp)
f3a5d3f8 645{
f500a6d3 646 int score, score_max;
f3a5d3f8
CH
647 BlockDriver *drv1, *drv;
648 uint8_t buf[2048];
f500a6d3 649 int ret = 0;
f8ea0b00 650
08a00559 651 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
8e895599 652 if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
c98ac35d
SW
653 drv = bdrv_find_format("raw");
654 if (!drv) {
34b5d2c6 655 error_setg(errp, "Could not find raw image format");
c98ac35d
SW
656 ret = -ENOENT;
657 }
658 *pdrv = drv;
659 return ret;
1a396859 660 }
f8ea0b00 661
83f64091 662 ret = bdrv_pread(bs, 0, buf, sizeof(buf));
83f64091 663 if (ret < 0) {
34b5d2c6
HR
664 error_setg_errno(errp, -ret, "Could not read image for determining its "
665 "format");
c98ac35d
SW
666 *pdrv = NULL;
667 return ret;
83f64091
FB
668 }
669
ea2384d3 670 score_max = 0;
84a12e66 671 drv = NULL;
8a22f02a 672 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
83f64091
FB
673 if (drv1->bdrv_probe) {
674 score = drv1->bdrv_probe(buf, ret, filename);
675 if (score > score_max) {
676 score_max = score;
677 drv = drv1;
678 }
0849bf08 679 }
fc01f7e7 680 }
c98ac35d 681 if (!drv) {
34b5d2c6
HR
682 error_setg(errp, "Could not determine image format: No compatible "
683 "driver found");
c98ac35d
SW
684 ret = -ENOENT;
685 }
686 *pdrv = drv;
687 return ret;
ea2384d3
FB
688}
689
51762288
SH
690/**
691 * Set the current 'total_sectors' value
692 */
693static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
694{
695 BlockDriver *drv = bs->drv;
696
396759ad
NB
697 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
698 if (bs->sg)
699 return 0;
700
51762288
SH
701 /* query actual device if possible, otherwise just trust the hint */
702 if (drv->bdrv_getlength) {
703 int64_t length = drv->bdrv_getlength(bs);
704 if (length < 0) {
705 return length;
706 }
7e382003 707 hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
51762288
SH
708 }
709
710 bs->total_sectors = hint;
711 return 0;
712}
713
9e8f1835
PB
714/**
715 * Set open flags for a given discard mode
716 *
717 * Return 0 on success, -1 if the discard mode was invalid.
718 */
719int bdrv_parse_discard_flags(const char *mode, int *flags)
720{
721 *flags &= ~BDRV_O_UNMAP;
722
723 if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
724 /* do nothing */
725 } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
726 *flags |= BDRV_O_UNMAP;
727 } else {
728 return -1;
729 }
730
731 return 0;
732}
733
c3993cdc
SH
734/**
735 * Set open flags for a given cache mode
736 *
737 * Return 0 on success, -1 if the cache mode was invalid.
738 */
739int bdrv_parse_cache_flags(const char *mode, int *flags)
740{
741 *flags &= ~BDRV_O_CACHE_MASK;
742
743 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
744 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
92196b2f
SH
745 } else if (!strcmp(mode, "directsync")) {
746 *flags |= BDRV_O_NOCACHE;
c3993cdc
SH
747 } else if (!strcmp(mode, "writeback")) {
748 *flags |= BDRV_O_CACHE_WB;
749 } else if (!strcmp(mode, "unsafe")) {
750 *flags |= BDRV_O_CACHE_WB;
751 *flags |= BDRV_O_NO_FLUSH;
752 } else if (!strcmp(mode, "writethrough")) {
753 /* this is the default */
754 } else {
755 return -1;
756 }
757
758 return 0;
759}
760
53fec9d3
SH
761/**
762 * The copy-on-read flag is actually a reference count so multiple users may
763 * use the feature without worrying about clobbering its previous state.
764 * Copy-on-read stays enabled until all users have called to disable it.
765 */
766void bdrv_enable_copy_on_read(BlockDriverState *bs)
767{
768 bs->copy_on_read++;
769}
770
771void bdrv_disable_copy_on_read(BlockDriverState *bs)
772{
773 assert(bs->copy_on_read > 0);
774 bs->copy_on_read--;
775}
776
0b50cc88
KW
777/*
778 * Returns the flags that bs->file should get, based on the given flags for
779 * the parent BDS
780 */
781static int bdrv_inherited_flags(int flags)
782{
783 /* Enable protocol handling, disable format probing for bs->file */
784 flags |= BDRV_O_PROTOCOL;
785
786 /* Our block drivers take care to send flushes and respect unmap policy,
787 * so we can enable both unconditionally on lower layers. */
788 flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP;
789
790 /* The backing file of a temporary snapshot is read-only */
791 if (flags & BDRV_O_SNAPSHOT) {
792 flags &= ~BDRV_O_RDWR;
793 }
794
795 /* Clear flags that only apply to the top layer */
796 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
797
798 return flags;
799}
800
7b272452
KW
801static int bdrv_open_flags(BlockDriverState *bs, int flags)
802{
803 int open_flags = flags | BDRV_O_CACHE_WB;
804
b998875d
KW
805 /* The backing file of a temporary snapshot is read-only */
806 if (flags & BDRV_O_SNAPSHOT) {
807 open_flags &= ~BDRV_O_RDWR;
808 }
809
7b272452
KW
810 /*
811 * Clear flags that are internal to the block layer before opening the
812 * image.
813 */
814 open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
815
816 /*
817 * Snapshots should be writable.
818 */
819 if (bs->is_temporary) {
820 open_flags |= BDRV_O_RDWR;
821 }
822
823 return open_flags;
824}
825
636ea370
KW
826static void bdrv_assign_node_name(BlockDriverState *bs,
827 const char *node_name,
828 Error **errp)
6913c0c2
BC
829{
830 if (!node_name) {
636ea370 831 return;
6913c0c2
BC
832 }
833
834 /* empty string node name is invalid */
835 if (node_name[0] == '\0') {
836 error_setg(errp, "Empty node name");
636ea370 837 return;
6913c0c2
BC
838 }
839
0c5e94ee
BC
840 /* takes care of avoiding namespaces collisions */
841 if (bdrv_find(node_name)) {
842 error_setg(errp, "node-name=%s is conflicting with a device id",
843 node_name);
636ea370 844 return;
0c5e94ee
BC
845 }
846
6913c0c2
BC
847 /* takes care of avoiding duplicates node names */
848 if (bdrv_find_node(node_name)) {
849 error_setg(errp, "Duplicate node name");
636ea370 850 return;
6913c0c2
BC
851 }
852
853 /* copy node name into the bs and insert it into the graph list */
854 pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
855 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
6913c0c2
BC
856}
857
57915332
KW
858/*
859 * Common part for opening disk images and files
b6ad491a
KW
860 *
861 * Removes all processed options from *options.
57915332 862 */
f500a6d3 863static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
34b5d2c6 864 QDict *options, int flags, BlockDriver *drv, Error **errp)
57915332
KW
865{
866 int ret, open_flags;
035fccdf 867 const char *filename;
6913c0c2 868 const char *node_name = NULL;
34b5d2c6 869 Error *local_err = NULL;
57915332
KW
870
871 assert(drv != NULL);
6405875c 872 assert(bs->file == NULL);
707ff828 873 assert(options != NULL && bs->options != options);
57915332 874
45673671
KW
875 if (file != NULL) {
876 filename = file->filename;
877 } else {
878 filename = qdict_get_try_str(options, "filename");
879 }
880
765003db
KW
881 if (drv->bdrv_needs_filename && !filename) {
882 error_setg(errp, "The '%s' block driver requires a file name",
883 drv->format_name);
884 return -EINVAL;
885 }
886
45673671 887 trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);
28dcee10 888
6913c0c2 889 node_name = qdict_get_try_str(options, "node-name");
636ea370 890 bdrv_assign_node_name(bs, node_name, &local_err);
0fb6395c 891 if (local_err) {
636ea370
KW
892 error_propagate(errp, local_err);
893 return -EINVAL;
6913c0c2
BC
894 }
895 qdict_del(options, "node-name");
896
5d186eb0
KW
897 /* bdrv_open() with directly using a protocol as drv. This layer is already
898 * opened, so assign it to bs (while file becomes a closed BlockDriverState)
899 * and return immediately. */
900 if (file != NULL && drv->bdrv_file_open) {
901 bdrv_swap(file, bs);
902 return 0;
903 }
904
57915332 905 bs->open_flags = flags;
1b7fd729 906 bs->guest_block_size = 512;
c25f53b0 907 bs->request_alignment = 512;
0d51b4de 908 bs->zero_beyond_eof = true;
b64ec4e4
FZ
909 open_flags = bdrv_open_flags(bs, flags);
910 bs->read_only = !(open_flags & BDRV_O_RDWR);
911
912 if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
8f94a6e4
KW
913 error_setg(errp,
914 !bs->read_only && bdrv_is_whitelisted(drv, true)
915 ? "Driver '%s' can only be used for read-only devices"
916 : "Driver '%s' is not whitelisted",
917 drv->format_name);
b64ec4e4
FZ
918 return -ENOTSUP;
919 }
57915332 920
53fec9d3 921 assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
0ebd24e0
KW
922 if (flags & BDRV_O_COPY_ON_READ) {
923 if (!bs->read_only) {
924 bdrv_enable_copy_on_read(bs);
925 } else {
926 error_setg(errp, "Can't use copy-on-read on read-only device");
927 return -EINVAL;
928 }
53fec9d3
SH
929 }
930
c2ad1b0c
KW
931 if (filename != NULL) {
932 pstrcpy(bs->filename, sizeof(bs->filename), filename);
933 } else {
934 bs->filename[0] = '\0';
935 }
57915332 936
57915332 937 bs->drv = drv;
7267c094 938 bs->opaque = g_malloc0(drv->instance_size);
57915332 939
03f541bd 940 bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
e7c63796 941
66f82cee
KW
942 /* Open the image, either directly or using a protocol */
943 if (drv->bdrv_file_open) {
5d186eb0 944 assert(file == NULL);
030be321 945 assert(!drv->bdrv_needs_filename || filename != NULL);
34b5d2c6 946 ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
f500a6d3 947 } else {
2af5ef70 948 if (file == NULL) {
34b5d2c6
HR
949 error_setg(errp, "Can't use '%s' as a block driver for the "
950 "protocol level", drv->format_name);
2af5ef70
KW
951 ret = -EINVAL;
952 goto free_and_fail;
953 }
f500a6d3 954 bs->file = file;
34b5d2c6 955 ret = drv->bdrv_open(bs, options, open_flags, &local_err);
66f82cee
KW
956 }
957
57915332 958 if (ret < 0) {
84d18f06 959 if (local_err) {
34b5d2c6 960 error_propagate(errp, local_err);
2fa9aa59
DH
961 } else if (bs->filename[0]) {
962 error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
34b5d2c6
HR
963 } else {
964 error_setg_errno(errp, -ret, "Could not open image");
965 }
57915332
KW
966 goto free_and_fail;
967 }
968
51762288
SH
969 ret = refresh_total_sectors(bs, bs->total_sectors);
970 if (ret < 0) {
34b5d2c6 971 error_setg_errno(errp, -ret, "Could not refresh total sector count");
51762288 972 goto free_and_fail;
57915332 973 }
51762288 974
d34682cd 975 bdrv_refresh_limits(bs);
c25f53b0 976 assert(bdrv_opt_mem_align(bs) != 0);
47ea2de2 977 assert((bs->request_alignment != 0) || bs->sg);
d34682cd 978
57915332
KW
979#ifndef _WIN32
980 if (bs->is_temporary) {
d4cea8df
DH
981 assert(bs->filename[0] != '\0');
982 unlink(bs->filename);
57915332
KW
983 }
984#endif
985 return 0;
986
987free_and_fail:
f500a6d3 988 bs->file = NULL;
7267c094 989 g_free(bs->opaque);
57915332
KW
990 bs->opaque = NULL;
991 bs->drv = NULL;
992 return ret;
993}
994
b6ce07aa
KW
995/*
996 * Opens a file using a protocol (file, host_device, nbd, ...)
787e4a85 997 *
5acd9d81
HR
998 * options is an indirect pointer to a QDict of options to pass to the block
999 * drivers, or pointer to NULL for an empty set of options. If this function
1000 * takes ownership of the QDict reference, it will set *options to NULL;
1001 * otherwise, it will contain unused/unrecognized options after this function
1002 * returns. Then, the caller is responsible for freeing it. If it intends to
1003 * reuse the QDict, QINCREF() should be called beforehand.
b6ce07aa 1004 */
static int bdrv_file_open(BlockDriverState *bs, const char *filename,
                          QDict **options, int flags, Error **errp)
{
    BlockDriver *drv;
    const char *drvname;
    bool parse_filename = false;
    Error *local_err = NULL;
    int ret;

    /* Fetch the file name from the options QDict if necessary.  If a filename
     * argument was given as well as a "filename" option, that is an error. */
    if (!filename) {
        filename = qdict_get_try_str(*options, "filename");
    } else if (filename && !qdict_haskey(*options, "filename")) {
        qdict_put(*options, "filename", qstring_from_str(filename));
        /* Only a plain filename (no "filename" option) may be parsed into
         * individual options by the driver below. */
        parse_filename = true;
    } else {
        error_setg(errp, "Can't specify 'file' and 'filename' options at the "
                   "same time");
        ret = -EINVAL;
        goto fail;
    }

    /* Find the right block driver: explicit "driver" option wins, otherwise
     * the protocol prefix of the filename decides. */
    drvname = qdict_get_try_str(*options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        if (!drv) {
            error_setg(errp, "Unknown driver '%s'", drvname);
        }
        /* Consumed either way so it does not show up as an unused option */
        qdict_del(*options, "driver");
    } else if (filename) {
        drv = bdrv_find_protocol(filename, parse_filename);
        if (!drv) {
            error_setg(errp, "Unknown protocol");
        }
    } else {
        error_setg(errp, "Must specify either driver or file");
        drv = NULL;
    }

    if (!drv) {
        /* errp has been set already */
        ret = -ENOENT;
        goto fail;
    }

    /* Parse the filename and open it.  Drivers may split a plain filename
     * (e.g. an URL) into individual options in *options. */
    if (drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            ret = -EINVAL;
            goto fail;
        }

        if (!drv->bdrv_needs_filename) {
            /* Filename fully decomposed into options; drop the original */
            qdict_del(*options, "filename");
        } else {
            filename = qdict_get_str(*options, "filename");
        }
    }

    if (!drv->bdrv_file_open) {
        /* Driver has no protocol-level open; recurse through bdrv_open(),
         * which takes ownership of the options QDict (signalled to the
         * caller by setting *options to NULL). */
        ret = bdrv_open(&bs, filename, NULL, *options, flags, drv, &local_err);
        *options = NULL;
    } else {
        /* Protocol-level open; unused options remain in *options for the
         * caller to inspect/free. */
        ret = bdrv_open_common(bs, NULL, *options, flags, drv, &local_err);
    }
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto fail;
    }

    /* Protocol BDSes may be resized by writes beyond EOF */
    bs->growable = 1;
    return 0;

fail:
    return ret;
}
1084
31ca6d07
KW
1085/*
1086 * Opens the backing file for a BlockDriverState if not yet open
1087 *
1088 * options is a QDict of options to pass to the block drivers, or NULL for an
1089 * empty set of options. The reference to the QDict is transferred to this
1090 * function (even on failure), so if the caller intends to reuse the dictionary,
1091 * it needs to use QINCREF() before calling bdrv_file_open.
1092 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
{
    char *backing_filename = g_malloc0(PATH_MAX);
    int back_flags, ret = 0;
    BlockDriver *back_drv = NULL;
    Error *local_err = NULL;

    /* Already open: nothing to do, but we still own the options reference */
    if (bs->backing_hd != NULL) {
        QDECREF(options);
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    bs->open_flags &= ~BDRV_O_NO_BACKING;
    if (qdict_haskey(options, "file.filename")) {
        /* Filename given through options; leave the buffer empty */
        backing_filename[0] = '\0';
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        /* No backing file at all: neither recorded in the image nor given
         * as options */
        QDECREF(options);
        goto free_exit;
    } else {
        bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX);
    }

    if (bs->backing_format[0] != '\0') {
        back_drv = bdrv_find_format(bs->backing_format);
    }

    /* backing files always opened read-only */
    back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT |
                                    BDRV_O_COPY_ON_READ);

    assert(bs->backing_hd == NULL);
    /* bdrv_open() takes ownership of the options reference */
    ret = bdrv_open(&bs->backing_hd,
                    *backing_filename ? backing_filename : NULL, NULL, options,
                    back_flags, back_drv, &local_err);
    if (ret < 0) {
        bs->backing_hd = NULL;
        /* Remember the failure so a reopen doesn't retry implicitly */
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_setg(errp, "Could not open backing file: %s",
                   error_get_pretty(local_err));
        error_free(local_err);
        goto free_exit;
    }

    if (bs->backing_hd->file) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file),
                bs->backing_hd->file->filename);
    }

    /* Recalculate the BlockLimits with the backing file */
    bdrv_refresh_limits(bs);

free_exit:
    g_free(backing_filename);
    return ret;
}
1153
da557aac
HR
1154/*
1155 * Opens a disk image whose options are given as BlockdevRef in another block
1156 * device's options.
1157 *
da557aac
HR
1158 * If allow_none is true, no image will be opened if filename is false and no
1159 * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
1160 *
1161 * bdrev_key specifies the key for the image's BlockdevRef in the options QDict.
1162 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
1163 * itself, all options starting with "${bdref_key}." are considered part of the
1164 * BlockdevRef.
1165 *
1166 * The BlockdevRef will be removed from the options QDict.
f67503e5
HR
1167 *
1168 * To conform with the behavior of bdrv_open(), *pbs has to be NULL.
da557aac
HR
1169 */
1170int bdrv_open_image(BlockDriverState **pbs, const char *filename,
1171 QDict *options, const char *bdref_key, int flags,
f7d9fd8c 1172 bool allow_none, Error **errp)
da557aac
HR
1173{
1174 QDict *image_options;
1175 int ret;
1176 char *bdref_key_dot;
1177 const char *reference;
1178
f67503e5
HR
1179 assert(pbs);
1180 assert(*pbs == NULL);
1181
da557aac
HR
1182 bdref_key_dot = g_strdup_printf("%s.", bdref_key);
1183 qdict_extract_subqdict(options, &image_options, bdref_key_dot);
1184 g_free(bdref_key_dot);
1185
1186 reference = qdict_get_try_str(options, bdref_key);
1187 if (!filename && !reference && !qdict_size(image_options)) {
1188 if (allow_none) {
1189 ret = 0;
1190 } else {
1191 error_setg(errp, "A block device must be specified for \"%s\"",
1192 bdref_key);
1193 ret = -EINVAL;
1194 }
1195 goto done;
1196 }
1197
f7d9fd8c 1198 ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp);
da557aac
HR
1199
1200done:
1201 qdict_del(options, bdref_key);
1202 return ret;
1203}
1204
b998875d
KW
1205void bdrv_append_temp_snapshot(BlockDriverState *bs, Error **errp)
1206{
1207 /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
1ba4b6a5 1208 char *tmp_filename = g_malloc0(PATH_MAX + 1);
b998875d
KW
1209 int64_t total_size;
1210 BlockDriver *bdrv_qcow2;
1211 QEMUOptionParameter *create_options;
1212 QDict *snapshot_options;
1213 BlockDriverState *bs_snapshot;
1214 Error *local_err;
1215 int ret;
1216
1217 /* if snapshot, we create a temporary backing file and open it
1218 instead of opening 'filename' directly */
1219
1220 /* Get the required size from the image */
f187743a
KW
1221 total_size = bdrv_getlength(bs);
1222 if (total_size < 0) {
1223 error_setg_errno(errp, -total_size, "Could not get image size");
1ba4b6a5 1224 goto out;
f187743a
KW
1225 }
1226 total_size &= BDRV_SECTOR_MASK;
b998875d
KW
1227
1228 /* Create the temporary image */
1ba4b6a5 1229 ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
b998875d
KW
1230 if (ret < 0) {
1231 error_setg_errno(errp, -ret, "Could not get temporary filename");
1ba4b6a5 1232 goto out;
b998875d
KW
1233 }
1234
1235 bdrv_qcow2 = bdrv_find_format("qcow2");
1236 create_options = parse_option_parameters("", bdrv_qcow2->create_options,
1237 NULL);
1238
1239 set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size);
1240
1241 ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err);
1242 free_option_parameters(create_options);
1243 if (ret < 0) {
1244 error_setg_errno(errp, -ret, "Could not create temporary overlay "
1245 "'%s': %s", tmp_filename,
1246 error_get_pretty(local_err));
1247 error_free(local_err);
1ba4b6a5 1248 goto out;
b998875d
KW
1249 }
1250
1251 /* Prepare a new options QDict for the temporary file */
1252 snapshot_options = qdict_new();
1253 qdict_put(snapshot_options, "file.driver",
1254 qstring_from_str("file"));
1255 qdict_put(snapshot_options, "file.filename",
1256 qstring_from_str(tmp_filename));
1257
98522f63 1258 bs_snapshot = bdrv_new("", &error_abort);
b998875d
KW
1259 bs_snapshot->is_temporary = 1;
1260
1261 ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options,
1262 bs->open_flags & ~BDRV_O_SNAPSHOT, bdrv_qcow2, &local_err);
1263 if (ret < 0) {
1264 error_propagate(errp, local_err);
1ba4b6a5 1265 goto out;
b998875d
KW
1266 }
1267
1268 bdrv_append(bs_snapshot, bs);
1ba4b6a5
BC
1269
1270out:
1271 g_free(tmp_filename);
b998875d
KW
1272}
1273
b6ce07aa
KW
1274/*
1275 * Opens a disk image (raw, qcow2, vmdk, ...)
de9c0cec
KW
1276 *
1277 * options is a QDict of options to pass to the block drivers, or NULL for an
1278 * empty set of options. The reference to the QDict belongs to the block layer
1279 * after the call (even on failure), so if the caller intends to reuse the
1280 * dictionary, it needs to use QINCREF() before calling bdrv_open.
f67503e5
HR
1281 *
1282 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
1283 * If it is not NULL, the referenced BDS will be reused.
ddf5636d
HR
1284 *
1285 * The reference parameter may be used to specify an existing block device which
1286 * should be opened. If specified, neither options nor a filename may be given,
1287 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
b6ce07aa 1288 */
ddf5636d
HR
int bdrv_open(BlockDriverState **pbs, const char *filename,
              const char *reference, QDict *options, int flags,
              BlockDriver *drv, Error **errp)
{
    int ret;
    BlockDriverState *file = NULL, *bs;
    const char *drvname;
    Error *local_err = NULL;

    assert(pbs);

    /* A reference names an already-open BDS: no options, no filename and no
     * BDS reuse are allowed; just look it up and take a reference. */
    if (reference) {
        bool options_non_empty = options ? qdict_size(options) : false;
        QDECREF(options);

        if (*pbs) {
            error_setg(errp, "Cannot reuse an existing BDS when referencing "
                       "another block device");
            return -EINVAL;
        }

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return -EINVAL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return -ENODEV;
        }
        bdrv_ref(bs);
        *pbs = bs;
        return 0;
    }

    if (*pbs) {
        bs = *pbs;
    } else {
        bs = bdrv_new("", &error_abort);
    }

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    /* bs->options keeps the caller's reference (for reopen); the shallow
     * clone is consumed below, with recognized keys deleted as they are
     * processed. */
    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Protocol-level open: delegate to bdrv_file_open(), which may consume
     * the options clone (it sets options to NULL in that case). */
    if (flags & BDRV_O_PROTOCOL) {
        assert(!drv);
        ret = bdrv_file_open(bs, filename, &options, flags & ~BDRV_O_PROTOCOL,
                             &local_err);
        if (!ret) {
            drv = bs->drv;
            goto done;
        } else if (bs->drv) {
            /* Partially opened: must go through bdrv_close() */
            goto close_and_fail;
        } else {
            goto fail;
        }
    }

    /* Open image file without format layer */
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }

    assert(file == NULL);
    ret = bdrv_open_image(&file, filename, options, "file",
                          bdrv_inherited_flags(flags),
                          true, &local_err);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    /* Find the right image format driver: explicit "driver" option wins,
     * otherwise probe the protocol layer. */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        qdict_del(options, "driver");
        if (!drv) {
            error_setg(errp, "Invalid driver: '%s'", drvname);
            ret = -EINVAL;
            goto unlink_and_fail;
        }
    }

    if (!drv) {
        if (file) {
            ret = find_image_format(file, filename, &drv, &local_err);
        } else {
            error_setg(errp, "Must specify either driver or file");
            ret = -EINVAL;
            goto unlink_and_fail;
        }
    }

    if (!drv) {
        /* find_image_format() set ret and local_err */
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, flags, drv, &local_err);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    /* Drop our reference unless the format layer adopted the protocol BDS */
    if (file && (bs->file != file)) {
        bdrv_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        QDict *backing_options;

        qdict_extract_subqdict(options, &backing_options, "backing.");
        ret = bdrv_open_backing_file(bs, backing_options, &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (flags & BDRV_O_SNAPSHOT) {
        bdrv_append_temp_snapshot(bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            goto close_and_fail;
        }
    }


done:
    /* Check if any unknown options were used */
    if (options && (qdict_size(options) != 0)) {
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp, "Block format '%s' used by device '%s' doesn't "
                       "support the option '%s'", drv->format_name,
                       bs->device_name, entry->key);
        }

        ret = -EINVAL;
        goto close_and_fail;
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    } else if (!runstate_check(RUN_STATE_PRELAUNCH)
               && !runstate_check(RUN_STATE_INMIGRATE)
               && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
        /* Encrypted images need a key before I/O; only allow this while the
         * guest is not running. */
        error_setg(errp,
                   "Guest must be stopped for opening of encrypted image");
        ret = -EBUSY;
        goto close_and_fail;
    }

    QDECREF(options);
    *pbs = bs;
    return 0;

unlink_and_fail:
    if (file != NULL) {
        bdrv_unref(file);
    }
    if (bs->is_temporary) {
        unlink(filename);
    }
fail:
    QDECREF(bs->options);
    QDECREF(options);
    bs->options = NULL;
    if (!*pbs) {
        /* If *pbs is NULL, a new BDS has been created in this function and
           needs to be freed now. Otherwise, it does not need to be closed,
           since it has not really been opened yet. */
        bdrv_unref(bs);
    }
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;

close_and_fail:
    /* See fail path, but now the BDS has to be always closed */
    if (*pbs) {
        bdrv_close(bs);
    } else {
        bdrv_unref(bs);
    }
    QDECREF(options);
    if (local_err) {
        error_propagate(errp, local_err);
    }
    return ret;
}
1492
e971aa12
JC
/* One staged reopen within an atomic multi-device reopen transaction. */
typedef struct BlockReopenQueueEntry {
    bool prepared;                /* bdrv_reopen_prepare() succeeded, so this
                                   * entry needs abort() on rollback */
    BDRVReopenState state;        /* staged state for the BDS being reopened */
    QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;
1498
1499/*
1500 * Adds a BlockDriverState to a simple queue for an atomic, transactional
1501 * reopen of multiple devices.
1502 *
1503 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
1504 * already performed, or alternatively may be NULL a new BlockReopenQueue will
1505 * be created and initialized. This newly created BlockReopenQueue should be
1506 * passed back in for subsequent calls that are intended to be of the same
1507 * atomic 'set'.
1508 *
1509 * bs is the BlockDriverState to add to the reopen queue.
1510 *
1511 * flags contains the open flags for the associated bs
1512 *
1513 * returns a pointer to bs_queue, which is either the newly allocated
1514 * bs_queue, or the existing bs_queue being used.
1515 *
1516 */
1517BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
1518 BlockDriverState *bs, int flags)
1519{
1520 assert(bs != NULL);
1521
1522 BlockReopenQueueEntry *bs_entry;
1523 if (bs_queue == NULL) {
1524 bs_queue = g_new0(BlockReopenQueue, 1);
1525 QSIMPLEQ_INIT(bs_queue);
1526 }
1527
1528 if (bs->file) {
1529 bdrv_reopen_queue(bs_queue, bs->file, flags);
1530 }
1531
1532 bs_entry = g_new0(BlockReopenQueueEntry, 1);
1533 QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
1534
1535 bs_entry->state.bs = bs;
1536 bs_entry->state.flags = flags;
1537
1538 return bs_queue;
1539}
1540
1541/*
1542 * Reopen multiple BlockDriverStates atomically & transactionally.
1543 *
1544 * The queue passed in (bs_queue) must have been built up previous
1545 * via bdrv_reopen_queue().
1546 *
1547 * Reopens all BDS specified in the queue, with the appropriate
1548 * flags. All devices are prepared for reopen, and failure of any
1549 * device will cause all device changes to be abandonded, and intermediate
1550 * data cleaned up.
1551 *
1552 * If all devices prepare successfully, then the changes are committed
1553 * to all devices.
1554 *
1555 */
1556int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
1557{
1558 int ret = -1;
1559 BlockReopenQueueEntry *bs_entry, *next;
1560 Error *local_err = NULL;
1561
1562 assert(bs_queue != NULL);
1563
1564 bdrv_drain_all();
1565
1566 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1567 if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
1568 error_propagate(errp, local_err);
1569 goto cleanup;
1570 }
1571 bs_entry->prepared = true;
1572 }
1573
1574 /* If we reach this point, we have success and just need to apply the
1575 * changes
1576 */
1577 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
1578 bdrv_reopen_commit(&bs_entry->state);
1579 }
1580
1581 ret = 0;
1582
1583cleanup:
1584 QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
1585 if (ret && bs_entry->prepared) {
1586 bdrv_reopen_abort(&bs_entry->state);
1587 }
1588 g_free(bs_entry);
1589 }
1590 g_free(bs_queue);
1591 return ret;
1592}
1593
1594
1595/* Reopen a single BlockDriverState with the specified flags. */
1596int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
1597{
1598 int ret = -1;
1599 Error *local_err = NULL;
1600 BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
1601
1602 ret = bdrv_reopen_multiple(queue, &local_err);
1603 if (local_err != NULL) {
1604 error_propagate(errp, local_err);
1605 }
1606 return ret;
1607}
1608
1609
1610/*
1611 * Prepares a BlockDriverState for reopen. All changes are staged in the
1612 * 'opaque' field of the BDRVReopenState, which is used and allocated by
1613 * the block driver layer .bdrv_reopen_prepare()
1614 *
1615 * bs is the BlockDriverState to reopen
1616 * flags are the new open flags
1617 * queue is the reopen queue
1618 *
1619 * Returns 0 on success, non-zero on error. On error errp will be set
1620 * as well.
1621 *
1622 * On failure, bdrv_reopen_abort() will be called to clean up any data.
1623 * It is the responsibility of the caller to then call the abort() or
1624 * commit() for any other BDS that have been left in a prepare() state
1625 *
1626 */
int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
                        Error **errp)
{
    int ret = -1;
    Error *local_err = NULL;
    BlockDriver *drv;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* if we are to stay read-only, do not allow permission change
     * to r/w */
    if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
        reopen_state->flags & BDRV_O_RDWR) {
        error_set(errp, QERR_DEVICE_IS_READ_ONLY,
                  reopen_state->bs->device_name);
        goto error;
    }


    /* Flush pending data so the driver sees a clean image before reopening */
    ret = bdrv_flush(reopen_state->bs);
    if (ret) {
        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
                  strerror(-ret));
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                /* Driver reported failure without details */
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
                  drv->format_name, reopen_state->bs->device_name,
                  "reopening of file");
        ret = -1;
        goto error;
    }

    ret = 0;

error:
    return ret;
}
1681
1682/*
1683 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1684 * makes them final by swapping the staging BlockDriverState contents into
1685 * the active BlockDriverState contents.
1686 */
1687void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1688{
1689 BlockDriver *drv;
1690
1691 assert(reopen_state != NULL);
1692 drv = reopen_state->bs->drv;
1693 assert(drv != NULL);
1694
1695 /* If there are any driver level actions to take */
1696 if (drv->bdrv_reopen_commit) {
1697 drv->bdrv_reopen_commit(reopen_state);
1698 }
1699
1700 /* set BDS specific flags now */
1701 reopen_state->bs->open_flags = reopen_state->flags;
1702 reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1703 BDRV_O_CACHE_WB);
1704 reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
355ef4ac
KW
1705
1706 bdrv_refresh_limits(reopen_state->bs);
e971aa12
JC
1707}
1708
1709/*
1710 * Abort the reopen, and delete and free the staged changes in
1711 * reopen_state
1712 */
1713void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1714{
1715 BlockDriver *drv;
1716
1717 assert(reopen_state != NULL);
1718 drv = reopen_state->bs->drv;
1719 assert(drv != NULL);
1720
1721 if (drv->bdrv_reopen_abort) {
1722 drv->bdrv_reopen_abort(reopen_state);
1723 }
1724}
1725
1726
fc01f7e7
FB
void bdrv_close(BlockDriverState *bs)
{
    /* Cancel any block job tied to this BDS before tearing it down */
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all(); /* complete I/O */
    bdrv_flush(bs);
    bdrv_drain_all(); /* in case flush left pending I/O */
    notifier_list_notify(&bs->close_notifiers, bs);

    if (bs->drv) {
        /* Close the backing file before the image itself */
        if (bs->backing_hd) {
            bdrv_unref(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        /* On Windows, temporary files cannot be unlinked while open, so it
         * happens here rather than at open time */
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        /* Reset per-image state so the BDS can be reused by a later open */
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;
        bs->zero_beyond_eof = false;
        QDECREF(bs->options);
        bs->options = NULL;

        /* Release the underlying protocol BDS last */
        if (bs->file != NULL) {
            bdrv_unref(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /*throttling disk I/O limits*/
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}
1776
2bc93fed
MK
1777void bdrv_close_all(void)
1778{
1779 BlockDriverState *bs;
1780
dc364f4c 1781 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
2bc93fed
MK
1782 bdrv_close(bs);
1783 }
1784}
1785
88266f5a
SH
1786/* Check if any requests are in-flight (including throttled requests) */
1787static bool bdrv_requests_pending(BlockDriverState *bs)
1788{
1789 if (!QLIST_EMPTY(&bs->tracked_requests)) {
1790 return true;
1791 }
cc0681c4
BC
1792 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
1793 return true;
1794 }
1795 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
88266f5a
SH
1796 return true;
1797 }
1798 if (bs->file && bdrv_requests_pending(bs->file)) {
1799 return true;
1800 }
1801 if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
1802 return true;
1803 }
1804 return false;
1805}
1806
1807static bool bdrv_requests_pending_all(void)
1808{
1809 BlockDriverState *bs;
dc364f4c 1810 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
88266f5a
SH
1811 if (bdrv_requests_pending(bs)) {
1812 return true;
1813 }
1814 }
1815 return false;
1816}
1817
922453bc
SH
1818/*
1819 * Wait for pending requests to complete across all BlockDriverStates
1820 *
1821 * This function does not flush data to disk, use bdrv_flush_all() for that
1822 * after calling this function.
4c355d53
ZYW
1823 *
1824 * Note that completion of an asynchronous I/O operation can trigger any
1825 * number of other I/O operations on other devices---for example a coroutine
1826 * can be arbitrarily complex and a constant flow of I/O can come until the
1827 * coroutine is complete. Because of this, it is not possible to have a
1828 * function to drain a single device's I/O queue.
922453bc
SH
1829 */
1830void bdrv_drain_all(void)
1831{
88266f5a
SH
1832 /* Always run first iteration so any pending completion BHs run */
1833 bool busy = true;
922453bc
SH
1834 BlockDriverState *bs;
1835
88266f5a 1836 while (busy) {
dc364f4c 1837 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
0b06ef3b 1838 bdrv_start_throttled_reqs(bs);
4c355d53 1839 }
922453bc 1840
88266f5a
SH
1841 busy = bdrv_requests_pending_all();
1842 busy |= aio_poll(qemu_get_aio_context(), busy);
922453bc
SH
1843 }
1844}
1845
dc364f4c
BC
1846/* make a BlockDriverState anonymous by removing from bdrv_state and
1847 * graph_bdrv_state list.
d22b2f41
RH
1848 Also, NULL terminate the device_name to prevent double remove */
1849void bdrv_make_anon(BlockDriverState *bs)
1850{
1851 if (bs->device_name[0] != '\0') {
dc364f4c 1852 QTAILQ_REMOVE(&bdrv_states, bs, device_list);
d22b2f41
RH
1853 }
1854 bs->device_name[0] = '\0';
dc364f4c
BC
1855 if (bs->node_name[0] != '\0') {
1856 QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
1857 }
1858 bs->node_name[0] = '\0';
d22b2f41
RH
1859}
1860
e023b2e2
PB
1861static void bdrv_rebind(BlockDriverState *bs)
1862{
1863 if (bs->drv && bs->drv->bdrv_rebind) {
1864 bs->drv->bdrv_rebind(bs);
1865 }
1866}
1867
4ddc07ca
PB
/* Copy the fields that must stay with the device (rather than travel with
 * the image chain) from bs_src to bs_dest.  Used by bdrv_swap() to undo the
 * wholesale struct swap for these fields.  Any new "device-owned" field in
 * BlockDriverState must be added here. */
static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */
    bs_dest->open_flags = bs_src->open_flags;

    /* dev info */
    bs_dest->dev_ops = bs_src->dev_ops;
    bs_dest->dev_opaque = bs_src->dev_opaque;
    bs_dest->dev = bs_src->dev;
    bs_dest->guest_block_size = bs_src->guest_block_size;
    bs_dest->copy_on_read = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o throttled req */
    memcpy(&bs_dest->throttle_state,
           &bs_src->throttle_state,
           sizeof(ThrottleState));
    bs_dest->throttled_reqs[0] = bs_src->throttled_reqs[0];
    bs_dest->throttled_reqs[1] = bs_src->throttled_reqs[1];
    bs_dest->io_limits_enabled = bs_src->io_limits_enabled;

    /* r/w error */
    bs_dest->on_read_error = bs_src->on_read_error;
    bs_dest->on_write_error = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
    bs_dest->iostatus = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_bitmaps = bs_src->dirty_bitmaps;

    /* reference count */
    bs_dest->refcnt = bs_src->refcnt;

    /* job */
    bs_dest->in_use = bs_src->in_use;
    bs_dest->job = bs_src->job;

    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->device_list = bs_src->device_list;
}
8802d1fd 1914
4ddc07ca
PB
1915/*
1916 * Swap bs contents for two image chains while they are live,
1917 * while keeping required fields on the BlockDriverState that is
1918 * actually attached to a device.
1919 *
1920 * This will modify the BlockDriverState fields, and swap contents
1921 * between bs_new and bs_old. Both bs_new and bs_old are modified.
1922 *
1923 * bs_new is required to be anonymous.
1924 *
1925 * This function does not create any image files.
1926 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* The code needs to swap the node_name but simply swapping node_list won't
     * work so first remove the nodes from the graph list, do the swap then
     * insert them back if needed.
     */
    if (bs_new->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list);
    }
    if (bs_old->node_name[0] != '\0') {
        QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list);
    }

    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
    assert(bs_new->device_name[0] == '\0');
    assert(QLIST_EMPTY(&bs_new->dirty_bitmaps));
    assert(bs_new->job == NULL);
    assert(bs_new->dev == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    /* Swap the entire struct contents... */
    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new shouldn't be in bdrv_states even after the swap! */
    assert(bs_new->device_name[0] == '\0');

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->dev == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(!throttle_have_timer(&bs_new->throttle_state));

    /* insert the nodes back into the graph node list if needed */
    if (bs_new->node_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list);
    }
    if (bs_old->node_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list);
    }

    /* Let drivers fix any pointers they keep back into their BDS */
    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}
1981
1982/*
1983 * Add new bs contents at the top of an image chain while the chain is
1984 * live, while keeping required fields on the top layer.
1985 *
1986 * This will modify the BlockDriverState fields, and swap contents
1987 * between bs_new and bs_top. Both bs_new and bs_top are modified.
1988 *
1989 * bs_new is required to be anonymous.
1990 *
1991 * This function does not create any image files.
1992 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    /* After the swap, the device identity stays on bs_top while the old
     * image contents have moved into bs_new. */
    bdrv_swap(bs_new, bs_top);

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    bs_top->backing_hd = bs_new;
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
            bs_new->filename);
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
            bs_new->drv ? bs_new->drv->format_name : "");
}
2006
4f6fd349 2007static void bdrv_delete(BlockDriverState *bs)
b338082b 2008{
fa879d62 2009 assert(!bs->dev);
3e914655
PB
2010 assert(!bs->job);
2011 assert(!bs->in_use);
4f6fd349 2012 assert(!bs->refcnt);
e4654d2d 2013 assert(QLIST_EMPTY(&bs->dirty_bitmaps));
18846dee 2014
e1b5c52e
SH
2015 bdrv_close(bs);
2016
1b7bdbc1 2017 /* remove from list, if necessary */
d22b2f41 2018 bdrv_make_anon(bs);
34c6f050 2019
7267c094 2020 g_free(bs);
fc01f7e7
FB
2021}
2022
fa879d62
MA
2023int bdrv_attach_dev(BlockDriverState *bs, void *dev)
2024/* TODO change to DeviceState *dev when all users are qdevified */
18846dee 2025{
fa879d62 2026 if (bs->dev) {
18846dee
MA
2027 return -EBUSY;
2028 }
fa879d62 2029 bs->dev = dev;
28a7282a 2030 bdrv_iostatus_reset(bs);
18846dee
MA
2031 return 0;
2032}
2033
fa879d62
MA
2034/* TODO qdevified devices don't use this, remove when devices are qdevified */
2035void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
18846dee 2036{
fa879d62
MA
2037 if (bdrv_attach_dev(bs, dev) < 0) {
2038 abort();
2039 }
2040}
2041
2042void bdrv_detach_dev(BlockDriverState *bs, void *dev)
2043/* TODO change to DeviceState *dev when all users are qdevified */
2044{
2045 assert(bs->dev == dev);
2046 bs->dev = NULL;
0e49de52
MA
2047 bs->dev_ops = NULL;
2048 bs->dev_opaque = NULL;
1b7fd729 2049 bs->guest_block_size = 512;
18846dee
MA
2050}
2051
/* TODO change to return DeviceState * when all users are qdevified */
/* Return the guest device attached via bdrv_attach_dev(), or NULL. */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}
2057
0e49de52
MA
2058void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
2059 void *opaque)
2060{
2061 bs->dev_ops = ops;
2062 bs->dev_opaque = opaque;
2063}
2064
32c81a4a
PB
2065void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
2066 enum MonitorEvent ev,
2067 BlockErrorAction action, bool is_read)
329c0a48
LC
2068{
2069 QObject *data;
2070 const char *action_str;
2071
2072 switch (action) {
2073 case BDRV_ACTION_REPORT:
2074 action_str = "report";
2075 break;
2076 case BDRV_ACTION_IGNORE:
2077 action_str = "ignore";
2078 break;
2079 case BDRV_ACTION_STOP:
2080 action_str = "stop";
2081 break;
2082 default:
2083 abort();
2084 }
2085
2086 data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
2087 bdrv->device_name,
2088 action_str,
2089 is_read ? "read" : "write");
32c81a4a 2090 monitor_protocol_event(ev, data);
329c0a48
LC
2091
2092 qobject_decref(data);
2093}
2094
6f382ed2
LC
2095static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
2096{
2097 QObject *data;
2098
2099 data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
2100 bdrv_get_device_name(bs), ejected);
2101 monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
2102
2103 qobject_decref(data);
2104}
2105
7d4b4ba5 2106static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
0e49de52 2107{
145feb17 2108 if (bs->dev_ops && bs->dev_ops->change_media_cb) {
6f382ed2 2109 bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
7d4b4ba5 2110 bs->dev_ops->change_media_cb(bs->dev_opaque, load);
6f382ed2
LC
2111 if (tray_was_closed) {
2112 /* tray open */
2113 bdrv_emit_qmp_eject_event(bs, true);
2114 }
2115 if (load) {
2116 /* tray close */
2117 bdrv_emit_qmp_eject_event(bs, false);
2118 }
145feb17
MA
2119 }
2120}
2121
2c6942fa
MA
2122bool bdrv_dev_has_removable_media(BlockDriverState *bs)
2123{
2124 return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
2125}
2126
025ccaa7
PB
2127void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
2128{
2129 if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
2130 bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
2131 }
2132}
2133
e4def80b
MA
2134bool bdrv_dev_is_tray_open(BlockDriverState *bs)
2135{
2136 if (bs->dev_ops && bs->dev_ops->is_tray_open) {
2137 return bs->dev_ops->is_tray_open(bs->dev_opaque);
2138 }
2139 return false;
2140}
2141
145feb17
MA
2142static void bdrv_dev_resize_cb(BlockDriverState *bs)
2143{
2144 if (bs->dev_ops && bs->dev_ops->resize_cb) {
2145 bs->dev_ops->resize_cb(bs->dev_opaque);
0e49de52
MA
2146 }
2147}
2148
f107639a
MA
2149bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
2150{
2151 if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
2152 return bs->dev_ops->is_medium_locked(bs->dev_opaque);
2153 }
2154 return false;
2155}
2156
e97fc193
AL
2157/*
2158 * Run consistency checks on an image
2159 *
e076f338 2160 * Returns 0 if the check could be completed (it doesn't mean that the image is
a1c7273b 2161 * free of errors) or -errno when an internal error occurred. The results of the
e076f338 2162 * check are stored in res.
e97fc193 2163 */
4534ff54 2164int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
e97fc193
AL
2165{
2166 if (bs->drv->bdrv_check == NULL) {
2167 return -ENOTSUP;
2168 }
2169
e076f338 2170 memset(res, 0, sizeof(*res));
4534ff54 2171 return bs->drv->bdrv_check(bs, res, fix);
e97fc193
AL
2172}
2173
/* Sectors copied per bdrv_is_allocated()/read/write iteration below. */
#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
/*
 * Copy every sector allocated in @bs into its backing file, then empty
 * @bs if the driver supports it.  The backing file is temporarily
 * reopened read-write when it was read-only.
 *
 * Returns 0 on success; -ENOMEDIUM without a driver, -ENOTSUP without a
 * backing file, -EBUSY while either image is in use, -EACCES when the
 * read-only backing file cannot be reopened writable, or another
 * negative errno from the copy/truncate path.
 */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    int64_t sector, total_sectors, length, backing_length;
    int n, ro, open_flags;
    int ret = 0;
    uint8_t *buf = NULL;
    char filename[PATH_MAX];

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
        return -EBUSY;
    }

    ro = bs->backing_hd->read_only;
    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
    /* NOTE(review): filename is copied but not used below — presumably a
     * leftover; confirm before removing. */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags =  bs->backing_hd->open_flags;

    if (ro) {
        /* Temporarily reopen the backing file read-write. */
        if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
            return -EACCES;
        }
    }

    length = bdrv_getlength(bs);
    if (length < 0) {
        ret = length;
        goto ro_cleanup;
    }

    backing_length = bdrv_getlength(bs->backing_hd);
    if (backing_length < 0) {
        ret = backing_length;
        goto ro_cleanup;
    }

    /* If our top snapshot is larger than the backing file image,
     * grow the backing file image if possible.  If not possible,
     * we must return an error */
    if (length > backing_length) {
        ret = bdrv_truncate(bs->backing_hd, length);
        if (ret < 0) {
            goto ro_cleanup;
        }
    }

    total_sectors = length >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    /* Copy only the ranges actually allocated in the top image. */
    for (sector = 0; sector < total_sectors; sector += n) {
        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
        if (ret < 0) {
            goto ro_cleanup;
        }
        if (ret) {
            ret = bdrv_read(bs, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }

            ret = bdrv_write(bs->backing_hd, sector, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        if (ret < 0) {
            goto ro_cleanup;
        }
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd) {
        bdrv_flush(bs->backing_hd);
    }

    ret = 0;
ro_cleanup:
    g_free(buf);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
    }

    return ret;
}
2278
e8877497 2279int bdrv_commit_all(void)
6ab4b5ab
MA
2280{
2281 BlockDriverState *bs;
2282
dc364f4c 2283 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
272d2d8e
JC
2284 if (bs->drv && bs->backing_hd) {
2285 int ret = bdrv_commit(bs);
2286 if (ret < 0) {
2287 return ret;
2288 }
e8877497 2289 }
6ab4b5ab 2290 }
e8877497 2291 return 0;
6ab4b5ab
MA
2292}
2293
dbffbdcf
SH
2294/**
2295 * Remove an active request from the tracked requests list
2296 *
2297 * This function should be called when a tracked request is completing.
2298 */
2299static void tracked_request_end(BdrvTrackedRequest *req)
2300{
2dbafdc0
KW
2301 if (req->serialising) {
2302 req->bs->serialising_in_flight--;
2303 }
2304
dbffbdcf 2305 QLIST_REMOVE(req, list);
f4658285 2306 qemu_co_queue_restart_all(&req->wait_queue);
dbffbdcf
SH
2307}
2308
2309/**
2310 * Add an active request to the tracked requests list
2311 */
2312static void tracked_request_begin(BdrvTrackedRequest *req,
2313 BlockDriverState *bs,
793ed47a
KW
2314 int64_t offset,
2315 unsigned int bytes, bool is_write)
dbffbdcf
SH
2316{
2317 *req = (BdrvTrackedRequest){
2318 .bs = bs,
2dbafdc0
KW
2319 .offset = offset,
2320 .bytes = bytes,
2321 .is_write = is_write,
2322 .co = qemu_coroutine_self(),
2323 .serialising = false,
7327145f
KW
2324 .overlap_offset = offset,
2325 .overlap_bytes = bytes,
dbffbdcf
SH
2326 };
2327
f4658285
SH
2328 qemu_co_queue_init(&req->wait_queue);
2329
dbffbdcf
SH
2330 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
2331}
2332
e96126ff 2333static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
2dbafdc0 2334{
7327145f 2335 int64_t overlap_offset = req->offset & ~(align - 1);
e96126ff
KW
2336 unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
2337 - overlap_offset;
7327145f 2338
2dbafdc0
KW
2339 if (!req->serialising) {
2340 req->bs->serialising_in_flight++;
2341 req->serialising = true;
2342 }
7327145f
KW
2343
2344 req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
2345 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
2dbafdc0
KW
2346}
2347
d83947ac
SH
2348/**
2349 * Round a region to cluster boundaries
2350 */
343bded4
PB
2351void bdrv_round_to_clusters(BlockDriverState *bs,
2352 int64_t sector_num, int nb_sectors,
2353 int64_t *cluster_sector_num,
2354 int *cluster_nb_sectors)
d83947ac
SH
2355{
2356 BlockDriverInfo bdi;
2357
2358 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
2359 *cluster_sector_num = sector_num;
2360 *cluster_nb_sectors = nb_sectors;
2361 } else {
2362 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
2363 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
2364 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
2365 nb_sectors, c);
2366 }
2367}
2368
7327145f 2369static int bdrv_get_cluster_size(BlockDriverState *bs)
793ed47a
KW
2370{
2371 BlockDriverInfo bdi;
7327145f 2372 int ret;
793ed47a 2373
7327145f
KW
2374 ret = bdrv_get_info(bs, &bdi);
2375 if (ret < 0 || bdi.cluster_size == 0) {
2376 return bs->request_alignment;
793ed47a 2377 } else {
7327145f 2378 return bdi.cluster_size;
793ed47a
KW
2379 }
2380}
2381
f4658285 2382static bool tracked_request_overlaps(BdrvTrackedRequest *req,
793ed47a
KW
2383 int64_t offset, unsigned int bytes)
2384{
d83947ac 2385 /* aaaa bbbb */
7327145f 2386 if (offset >= req->overlap_offset + req->overlap_bytes) {
d83947ac
SH
2387 return false;
2388 }
2389 /* bbbb aaaa */
7327145f 2390 if (req->overlap_offset >= offset + bytes) {
d83947ac
SH
2391 return false;
2392 }
2393 return true;
f4658285
SH
2394}
2395
/*
 * Wait until no tracked request that conflicts with @self is in flight.
 * A request conflicts when it overlaps @self's overlap window and either
 * side is marked serialising.
 *
 * Returns true if this coroutine had to wait at least once.
 */
static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    /* Fast path: nothing serialising is in flight on this BDS. */
    if (!bs->serialising_in_flight) {
        return false;
    }

    /* Restart the scan after every wait: the list may have changed. */
    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
    } while (retry);

    return waited;
}
2439
756e6736
KW
2440/*
2441 * Return values:
2442 * 0 - success
2443 * -EINVAL - backing format specified, but no file
2444 * -ENOSPC - can't update the backing file because no space is left in the
2445 * image file header
2446 * -ENOTSUP - format driver doesn't support changing the backing file
2447 */
2448int bdrv_change_backing_file(BlockDriverState *bs,
2449 const char *backing_file, const char *backing_fmt)
2450{
2451 BlockDriver *drv = bs->drv;
469ef350 2452 int ret;
756e6736 2453
5f377794
PB
2454 /* Backing file format doesn't make sense without a backing file */
2455 if (backing_fmt && !backing_file) {
2456 return -EINVAL;
2457 }
2458
756e6736 2459 if (drv->bdrv_change_backing_file != NULL) {
469ef350 2460 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
756e6736 2461 } else {
469ef350 2462 ret = -ENOTSUP;
756e6736 2463 }
469ef350
PB
2464
2465 if (ret == 0) {
2466 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2467 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2468 }
2469 return ret;
756e6736
KW
2470}
2471
6ebdcee2
JC
2472/*
2473 * Finds the image layer in the chain that has 'bs' as its backing file.
2474 *
2475 * active is the current topmost image.
2476 *
2477 * Returns NULL if bs is not found in active's image chain,
2478 * or if active == bs.
2479 */
2480BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
2481 BlockDriverState *bs)
2482{
2483 BlockDriverState *overlay = NULL;
2484 BlockDriverState *intermediate;
2485
2486 assert(active != NULL);
2487 assert(bs != NULL);
2488
2489 /* if bs is the same as active, then by definition it has no overlay
2490 */
2491 if (active == bs) {
2492 return NULL;
2493 }
2494
2495 intermediate = active;
2496 while (intermediate->backing_hd) {
2497 if (intermediate->backing_hd == bs) {
2498 overlay = intermediate;
2499 break;
2500 }
2501 intermediate = intermediate->backing_hd;
2502 }
2503
2504 return overlay;
2505}
2506
/* One element of the deletion queue built by bdrv_drop_intermediate(). */
typedef struct BlkIntermediateStates {
    BlockDriverState *bs;                        /* intermediate image to drop */
    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry; /* queue linkage */
} BlkIntermediateStates;
2511
2512
/*
 * Drops images above 'base' up to and including 'top', and sets the image
 * above 'top' to have base as its backing file.
 *
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
 * information in 'bs' can be properly updated.
 *
 * E.g., this will convert the following chain:
 * bottom <- base <- intermediate <- top <- active
 *
 * to
 *
 * bottom <- base <- active
 *
 * It is allowed for bottom==base, in which case it converts:
 *
 * base <- intermediate <- top <- active
 *
 * to
 *
 * base <- active
 *
 * Error conditions:
 *  if active == top, that is considered an error
 *
 * Returns 0 on success, -EIO on any failure (chain not found, broken
 * chain) or the error from bdrv_change_backing_file().
 */
int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
                           BlockDriverState *base)
{
    BlockDriverState *intermediate;
    BlockDriverState *base_bs = NULL;
    BlockDriverState *new_top_bs = NULL;
    BlkIntermediateStates *intermediate_state, *next;
    int ret = -EIO;

    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
    QSIMPLEQ_INIT(&states_to_delete);

    if (!top->drv || !base->drv) {
        goto exit;
    }

    new_top_bs = bdrv_find_overlay(active, top);

    if (new_top_bs == NULL) {
        /* we could not find the image above 'top', this is an error */
        goto exit;
    }

    /* special case of new_top_bs->backing_hd already pointing to base - nothing
     * to do, no intermediate images */
    if (new_top_bs->backing_hd == base) {
        ret = 0;
        goto exit;
    }

    intermediate = top;

    /* now we will go down through the list, and add each BDS we find
     * into our deletion queue, until we hit the 'base'
     */
    while (intermediate) {
        intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
        intermediate_state->bs = intermediate;
        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);

        if (intermediate->backing_hd == base) {
            base_bs = intermediate->backing_hd;
            break;
        }
        intermediate = intermediate->backing_hd;
    }
    if (base_bs == NULL) {
        /* something went wrong, we did not end at the base. safely
         * unravel everything, and exit with error */
        goto exit;
    }

    /* success - we can delete the intermediate states, and link top->base */
    ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
                                   base_bs->drv ? base_bs->drv->format_name : "");
    if (ret) {
        goto exit;
    }
    new_top_bs->backing_hd = base_bs;

    /* The chain changed, so cached limits may no longer be valid. */
    bdrv_refresh_limits(new_top_bs);

    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        /* so that bdrv_close() does not recursively close the chain */
        intermediate_state->bs->backing_hd = NULL;
        bdrv_unref(intermediate_state->bs);
    }
    ret = 0;

exit:
    /* Free the queue nodes themselves in both the success and error paths. */
    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
        g_free(intermediate_state);
    }
    return ret;
}
2614
2615
71d0770c
AL
2616static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
2617 size_t size)
2618{
2619 int64_t len;
2620
1dd3a447
KW
2621 if (size > INT_MAX) {
2622 return -EIO;
2623 }
2624
71d0770c
AL
2625 if (!bdrv_is_inserted(bs))
2626 return -ENOMEDIUM;
2627
2628 if (bs->growable)
2629 return 0;
2630
2631 len = bdrv_getlength(bs);
2632
fbb7b4e0
KW
2633 if (offset < 0)
2634 return -EIO;
2635
2636 if ((offset > len) || (len - offset < size))
71d0770c
AL
2637 return -EIO;
2638
2639 return 0;
2640}
2641
/* Validate a sector-granularity request by converting it to bytes and
 * delegating to bdrv_check_byte_request(). */
static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    /* Reject negative counts and counts whose byte size overflows int. */
    if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
        return -EIO;
    }

    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}
2652
/* Context shared between a synchronous caller and bdrv_rw_co_entry(). */
typedef struct RwCo {
    BlockDriverState *bs;     /* device to operate on */
    int64_t offset;           /* byte offset of the request */
    QEMUIOVector *qiov;       /* buffers; qiov->size is the request length */
    bool is_write;            /* true for write, false for read */
    int ret;                  /* result; NOT_DONE while still in flight */
    BdrvRequestFlags flags;   /* BDRV_REQ_* flags forwarded to the request */
} RwCo;
2661
2662static void coroutine_fn bdrv_rw_co_entry(void *opaque)
fc01f7e7 2663{
1c9805a3 2664 RwCo *rwco = opaque;
ea2384d3 2665
1c9805a3 2666 if (!rwco->is_write) {
775aa8b6
KW
2667 rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
2668 rwco->qiov->size, rwco->qiov,
4105eaaa 2669 rwco->flags);
775aa8b6
KW
2670 } else {
2671 rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
2672 rwco->qiov->size, rwco->qiov,
2673 rwco->flags);
1c9805a3
SH
2674 }
2675}
e7a8a783 2676
1c9805a3 2677/*
8d3b1a2d 2678 * Process a vectored synchronous request using coroutines
1c9805a3 2679 */
775aa8b6
KW
2680static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
2681 QEMUIOVector *qiov, bool is_write,
2682 BdrvRequestFlags flags)
1c9805a3 2683{
1c9805a3
SH
2684 Coroutine *co;
2685 RwCo rwco = {
2686 .bs = bs,
775aa8b6 2687 .offset = offset,
8d3b1a2d 2688 .qiov = qiov,
1c9805a3
SH
2689 .is_write = is_write,
2690 .ret = NOT_DONE,
4105eaaa 2691 .flags = flags,
1c9805a3 2692 };
e7a8a783 2693
498e386c
ZYW
2694 /**
2695 * In sync call context, when the vcpu is blocked, this throttling timer
2696 * will not fire; so the I/O throttling function has to be disabled here
2697 * if it has been enabled.
2698 */
2699 if (bs->io_limits_enabled) {
2700 fprintf(stderr, "Disabling I/O throttling on '%s' due "
2701 "to synchronous I/O.\n", bdrv_get_device_name(bs));
2702 bdrv_io_limits_disable(bs);
2703 }
2704
1c9805a3
SH
2705 if (qemu_in_coroutine()) {
2706 /* Fast-path if already in coroutine context */
2707 bdrv_rw_co_entry(&rwco);
2708 } else {
2709 co = qemu_coroutine_create(bdrv_rw_co_entry);
2710 qemu_coroutine_enter(co, &rwco);
2711 while (rwco.ret == NOT_DONE) {
2712 qemu_aio_wait();
2713 }
2714 }
2715 return rwco.ret;
2716}
b338082b 2717
8d3b1a2d
KW
2718/*
2719 * Process a synchronous request using coroutines
2720 */
2721static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
4105eaaa 2722 int nb_sectors, bool is_write, BdrvRequestFlags flags)
8d3b1a2d
KW
2723{
2724 QEMUIOVector qiov;
2725 struct iovec iov = {
2726 .iov_base = (void *)buf,
2727 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
2728 };
2729
da15ee51
KW
2730 if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
2731 return -EINVAL;
2732 }
2733
8d3b1a2d 2734 qemu_iovec_init_external(&qiov, &iov, 1);
775aa8b6
KW
2735 return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
2736 &qiov, is_write, flags);
8d3b1a2d
KW
2737}
2738
/* return < 0 if error. See bdrv_write() for the return codes */
/* Synchronously read nb_sectors starting at sector_num into buf. */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
}
2745
07d27a44
MA
2746/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
2747int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
2748 uint8_t *buf, int nb_sectors)
2749{
2750 bool enabled;
2751 int ret;
2752
2753 enabled = bs->io_limits_enabled;
2754 bs->io_limits_enabled = false;
4e7395e8 2755 ret = bdrv_read(bs, sector_num, buf, nb_sectors);
07d27a44
MA
2756 bs->io_limits_enabled = enabled;
2757 return ret;
2758}
2759
/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
/* Synchronously write nb_sectors from buf starting at sector_num. */
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
}
2771
/* Synchronously write zeroes over nb_sectors starting at sector_num.
 * Extra flags (e.g. BDRV_REQ_MAY_UNMAP) are ORed into the zero-write
 * request.  Return value: see bdrv_write(). */
int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
                      int nb_sectors, BdrvRequestFlags flags)
{
    return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
                      BDRV_REQ_ZERO_WRITE | flags);
}
2778
/*
 * Completely zero out a block device with the help of bdrv_write_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 */
int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
{
    int64_t target_size;
    int64_t ret, nb_sectors, sector_num = 0;
    int n;

    target_size = bdrv_getlength(bs);
    if (target_size < 0) {
        return target_size;
    }
    /* Work in sectors from here on. */
    target_size /= BDRV_SECTOR_SIZE;

    for (;;) {
        nb_sectors = target_size - sector_num;
        if (nb_sectors <= 0) {
            return 0;
        }
        /* Clamp so the per-call sector count fits the int interface. */
        if (nb_sectors > INT_MAX) {
            nb_sectors = INT_MAX;
        }
        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
        if (ret < 0) {
            error_report("error getting block status at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        /* Skip ranges that already read back as zeroes. */
        if (ret & BDRV_BLOCK_ZERO) {
            sector_num += n;
            continue;
        }
        ret = bdrv_write_zeroes(bs, sector_num, n, flags);
        if (ret < 0) {
            error_report("error writing zeroes at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        sector_num += n;
    }
}
2826
a3ef6571 2827int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
83f64091 2828{
a3ef6571
KW
2829 QEMUIOVector qiov;
2830 struct iovec iov = {
2831 .iov_base = (void *)buf,
2832 .iov_len = bytes,
2833 };
9a8c4cce 2834 int ret;
83f64091 2835
a3ef6571
KW
2836 if (bytes < 0) {
2837 return -EINVAL;
83f64091
FB
2838 }
2839
a3ef6571
KW
2840 qemu_iovec_init_external(&qiov, &iov, 1);
2841 ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
2842 if (ret < 0) {
2843 return ret;
83f64091 2844 }
a3ef6571
KW
2845
2846 return bytes;
83f64091
FB
2847}
2848
8d3b1a2d 2849int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
83f64091 2850{
9a8c4cce 2851 int ret;
83f64091 2852
8407d5d7
KW
2853 ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
2854 if (ret < 0) {
2855 return ret;
83f64091
FB
2856 }
2857
8d3b1a2d
KW
2858 return qiov->size;
2859}
2860
2861int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
8407d5d7 2862 const void *buf, int bytes)
8d3b1a2d
KW
2863{
2864 QEMUIOVector qiov;
2865 struct iovec iov = {
2866 .iov_base = (void *) buf,
8407d5d7 2867 .iov_len = bytes,
8d3b1a2d
KW
2868 };
2869
8407d5d7
KW
2870 if (bytes < 0) {
2871 return -EINVAL;
2872 }
2873
8d3b1a2d
KW
2874 qemu_iovec_init_external(&qiov, &iov, 1);
2875 return bdrv_pwritev(bs, offset, &qiov);
83f64091 2876}
83f64091 2877
f08145fe
KW
2878/*
2879 * Writes to the file and ensures that no writes are reordered across this
2880 * request (acts as a barrier)
2881 *
2882 * Returns 0 on success, -errno in error cases.
2883 */
2884int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2885 const void *buf, int count)
2886{
2887 int ret;
2888
2889 ret = bdrv_pwrite(bs, offset, buf, count);
2890 if (ret < 0) {
2891 return ret;
2892 }
2893
f05fa4ad
PB
2894 /* No flush needed for cache modes that already do it */
2895 if (bs->enable_write_cache) {
f08145fe
KW
2896 bdrv_flush(bs);
2897 }
2898
2899 return 0;
2900}
2901
/*
 * Copy-on-read helper: read the whole cluster range covering the request
 * into a private bounce buffer, write it back into this image (so the
 * cluster becomes allocated here), then copy the requested subrange into
 * @qiov.  Returns 0 on success or a negative errno.
 */
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                           &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    /* Prefer an efficient zero write when the cluster is all zeroes and
     * the driver supports it. */
    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors, 0);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    /* Copy only the originally requested subrange to the caller. */
    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
                        nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}
2967
/*
 * Forwards an already correctly aligned request to the BlockDriver. This
 * handles copy on read and zeroing after EOF; any other features must be
 * implemented by the caller.
 *
 * @offset and @bytes must be sector-aligned (asserted below); @req is the
 * caller's tracked request.  Returns 0 on success or a negative errno.
 */
static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int ret;

    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        /* If we touch the same cluster it counts as an overlap.  This
         * guarantees that allocating writes will be serialized and not race
         * with each other for the same cluster.  For example, in copy-on-read
         * it ensures that the CoR read and write operations are atomic and
         * guest writes cannot interleave between them. */
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
    }

    wait_serialising_requests(req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        /* Only take the CoR path when part of the range is unallocated. */
        ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    /* Forward the request to the BlockDriver */
    if (!(bs->zero_beyond_eof && bs->growable)) {
        ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
    } else {
        /* Read zeros after EOF of growable BDSes */
        int64_t len, total_sectors, max_nb_sectors;

        len = bdrv_getlength(bs);
        if (len < 0) {
            ret = len;
            goto out;
        }

        total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
        max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
                                  align >> BDRV_SECTOR_BITS);
        if (max_nb_sectors > 0) {
            ret = drv->bdrv_co_readv(bs, sector_num,
                                     MIN(nb_sectors, max_nb_sectors), qiov);
        } else {
            ret = 0;
        }

        /* Reading beyond end of file is supposed to produce zeroes */
        if (ret == 0 && total_sectors < sector_num + nb_sectors) {
            uint64_t offset = MAX(0, total_sectors - sector_num);
            uint64_t bytes = (sector_num + nb_sectors - offset) *
                              BDRV_SECTOR_SIZE;
            qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
        }
    }

out:
    return ret;
}
3047
d0c7f642
KW
3048/*
3049 * Handle a read request in coroutine context
3050 */
1b0288ae
KW
3051static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
3052 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
d0c7f642
KW
3053 BdrvRequestFlags flags)
3054{
3055 BlockDriver *drv = bs->drv;
65afd211
KW
3056 BdrvTrackedRequest req;
3057
1b0288ae
KW
3058 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3059 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3060 uint8_t *head_buf = NULL;
3061 uint8_t *tail_buf = NULL;
3062 QEMUIOVector local_qiov;
3063 bool use_local_qiov = false;
d0c7f642
KW
3064 int ret;
3065
3066 if (!drv) {
3067 return -ENOMEDIUM;
3068 }
1b0288ae 3069 if (bdrv_check_byte_request(bs, offset, bytes)) {
d0c7f642
KW
3070 return -EIO;
3071 }
3072
3073 if (bs->copy_on_read) {
3074 flags |= BDRV_REQ_COPY_ON_READ;
3075 }
3076
3077 /* throttling disk I/O */
3078 if (bs->io_limits_enabled) {
d5103588 3079 bdrv_io_limits_intercept(bs, bytes, false);
1b0288ae
KW
3080 }
3081
3082 /* Align read if necessary by padding qiov */
3083 if (offset & (align - 1)) {
3084 head_buf = qemu_blockalign(bs, align);
3085 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3086 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3087 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3088 use_local_qiov = true;
3089
3090 bytes += offset & (align - 1);
3091 offset = offset & ~(align - 1);
3092 }
3093
3094 if ((offset + bytes) & (align - 1)) {
3095 if (!use_local_qiov) {
3096 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3097 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3098 use_local_qiov = true;
3099 }
3100 tail_buf = qemu_blockalign(bs, align);
3101 qemu_iovec_add(&local_qiov, tail_buf,
3102 align - ((offset + bytes) & (align - 1)));
3103
3104 bytes = ROUND_UP(bytes, align);
3105 }
3106
65afd211 3107 tracked_request_begin(&req, bs, offset, bytes, false);
ec746e10 3108 ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
1b0288ae
KW
3109 use_local_qiov ? &local_qiov : qiov,
3110 flags);
65afd211 3111 tracked_request_end(&req);
1b0288ae
KW
3112
3113 if (use_local_qiov) {
3114 qemu_iovec_destroy(&local_qiov);
3115 qemu_vfree(head_buf);
3116 qemu_vfree(tail_buf);
d0c7f642
KW
3117 }
3118
d0c7f642
KW
3119 return ret;
3120}
3121
1b0288ae
KW
3122static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
3123 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3124 BdrvRequestFlags flags)
3125{
3126 if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
3127 return -EINVAL;
3128 }
3129
3130 return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
3131 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3132}
3133
c5fbe571 3134int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
da1fa91d
KW
3135 int nb_sectors, QEMUIOVector *qiov)
3136{
c5fbe571 3137 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
da1fa91d 3138
470c0504
SH
3139 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
3140}
3141
3142int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
3143 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
3144{
3145 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
3146
3147 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
3148 BDRV_REQ_COPY_ON_READ);
c5fbe571
SH
3149}
3150
c31cb707
PL
3151/* if no limit is specified in the BlockLimits use a default
3152 * of 32768 512-byte sectors (16 MiB) per request.
3153 */
3154#define MAX_WRITE_ZEROES_DEFAULT 32768
3155
f08f2dda 3156static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
aa7bfbff 3157 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
f08f2dda
SH
3158{
3159 BlockDriver *drv = bs->drv;
3160 QEMUIOVector qiov;
c31cb707
PL
3161 struct iovec iov = {0};
3162 int ret = 0;
f08f2dda 3163
c31cb707
PL
3164 int max_write_zeroes = bs->bl.max_write_zeroes ?
3165 bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
621f0589 3166
c31cb707
PL
3167 while (nb_sectors > 0 && !ret) {
3168 int num = nb_sectors;
3169
b8d71c09
PB
3170 /* Align request. Block drivers can expect the "bulk" of the request
3171 * to be aligned.
3172 */
3173 if (bs->bl.write_zeroes_alignment
3174 && num > bs->bl.write_zeroes_alignment) {
3175 if (sector_num % bs->bl.write_zeroes_alignment != 0) {
3176 /* Make a small request up to the first aligned sector. */
c31cb707 3177 num = bs->bl.write_zeroes_alignment;
b8d71c09
PB
3178 num -= sector_num % bs->bl.write_zeroes_alignment;
3179 } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
3180 /* Shorten the request to the last aligned sector. num cannot
3181 * underflow because num > bs->bl.write_zeroes_alignment.
3182 */
3183 num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
c31cb707 3184 }
621f0589 3185 }
f08f2dda 3186
c31cb707
PL
3187 /* limit request size */
3188 if (num > max_write_zeroes) {
3189 num = max_write_zeroes;
3190 }
3191
3192 ret = -ENOTSUP;
3193 /* First try the efficient write zeroes operation */
3194 if (drv->bdrv_co_write_zeroes) {
3195 ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
3196 }
3197
3198 if (ret == -ENOTSUP) {
3199 /* Fall back to bounce buffer if write zeroes is unsupported */
3200 iov.iov_len = num * BDRV_SECTOR_SIZE;
3201 if (iov.iov_base == NULL) {
b8d71c09
PB
3202 iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE);
3203 memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
c31cb707
PL
3204 }
3205 qemu_iovec_init_external(&qiov, &iov, 1);
f08f2dda 3206
c31cb707 3207 ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
b8d71c09
PB
3208
3209 /* Keep bounce buffer around if it is big enough for all
3210 * all future requests.
3211 */
3212 if (num < max_write_zeroes) {
3213 qemu_vfree(iov.iov_base);
3214 iov.iov_base = NULL;
3215 }
c31cb707
PL
3216 }
3217
3218 sector_num += num;
3219 nb_sectors -= num;
3220 }
f08f2dda
SH
3221
3222 qemu_vfree(iov.iov_base);
3223 return ret;
3224}
3225
c5fbe571 3226/*
b404f720 3227 * Forwards an already correctly aligned write request to the BlockDriver.
c5fbe571 3228 */
b404f720 3229static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
65afd211
KW
3230 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
3231 QEMUIOVector *qiov, int flags)
c5fbe571
SH
3232{
3233 BlockDriver *drv = bs->drv;
28de2dcd 3234 bool waited;
6b7cb247 3235 int ret;
da1fa91d 3236
b404f720
KW
3237 int64_t sector_num = offset >> BDRV_SECTOR_BITS;
3238 unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
f4658285 3239
b404f720
KW
3240 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
3241 assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
cc0681c4 3242
28de2dcd
KW
3243 waited = wait_serialising_requests(req);
3244 assert(!waited || !req->serialising);
af91f9a7
KW
3245 assert(req->overlap_offset <= offset);
3246 assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
244eadef 3247
65afd211 3248 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
d616b224
SH
3249
3250 if (ret < 0) {
3251 /* Do nothing, write notifier decided to fail this request */
3252 } else if (flags & BDRV_REQ_ZERO_WRITE) {
9e1cb96d 3253 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
aa7bfbff 3254 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
f08f2dda 3255 } else {
9e1cb96d 3256 BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
f08f2dda
SH
3257 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
3258 }
9e1cb96d 3259 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
6b7cb247 3260
f05fa4ad
PB
3261 if (ret == 0 && !bs->enable_write_cache) {
3262 ret = bdrv_co_flush(bs);
3263 }
3264
e4654d2d 3265 bdrv_set_dirty(bs, sector_num, nb_sectors);
da1fa91d
KW
3266
3267 if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
3268 bs->wr_highest_sector = sector_num + nb_sectors - 1;
3269 }
df2a6f29
PB
3270 if (bs->growable && ret >= 0) {
3271 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
3272 }
da1fa91d 3273
6b7cb247 3274 return ret;
da1fa91d
KW
3275}
3276
b404f720
KW
3277/*
3278 * Handle a write request in coroutine context
3279 */
6601553e
KW
3280static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
3281 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
b404f720
KW
3282 BdrvRequestFlags flags)
3283{
65afd211 3284 BdrvTrackedRequest req;
3b8242e0
KW
3285 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
3286 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
3287 uint8_t *head_buf = NULL;
3288 uint8_t *tail_buf = NULL;
3289 QEMUIOVector local_qiov;
3290 bool use_local_qiov = false;
b404f720
KW
3291 int ret;
3292
3293 if (!bs->drv) {
3294 return -ENOMEDIUM;
3295 }
3296 if (bs->read_only) {
3297 return -EACCES;
3298 }
6601553e 3299 if (bdrv_check_byte_request(bs, offset, bytes)) {
b404f720
KW
3300 return -EIO;
3301 }
3302
b404f720
KW
3303 /* throttling disk I/O */
3304 if (bs->io_limits_enabled) {
d5103588 3305 bdrv_io_limits_intercept(bs, bytes, true);
b404f720
KW
3306 }
3307
3b8242e0
KW
3308 /*
3309 * Align write if necessary by performing a read-modify-write cycle.
3310 * Pad qiov with the read parts and be sure to have a tracked request not
3311 * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
3312 */
65afd211 3313 tracked_request_begin(&req, bs, offset, bytes, true);
3b8242e0
KW
3314
3315 if (offset & (align - 1)) {
3316 QEMUIOVector head_qiov;
3317 struct iovec head_iov;
3318
3319 mark_request_serialising(&req, align);
3320 wait_serialising_requests(&req);
3321
3322 head_buf = qemu_blockalign(bs, align);
3323 head_iov = (struct iovec) {
3324 .iov_base = head_buf,
3325 .iov_len = align,
3326 };
3327 qemu_iovec_init_external(&head_qiov, &head_iov, 1);
3328
9e1cb96d 3329 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
3b8242e0
KW
3330 ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
3331 align, &head_qiov, 0);
3332 if (ret < 0) {
3333 goto fail;
3334 }
9e1cb96d 3335 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
3b8242e0
KW
3336
3337 qemu_iovec_init(&local_qiov, qiov->niov + 2);
3338 qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
3339 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3340 use_local_qiov = true;
3341
3342 bytes += offset & (align - 1);
3343 offset = offset & ~(align - 1);
3344 }
3345
3346 if ((offset + bytes) & (align - 1)) {
3347 QEMUIOVector tail_qiov;
3348 struct iovec tail_iov;
3349 size_t tail_bytes;
28de2dcd 3350 bool waited;
3b8242e0
KW
3351
3352 mark_request_serialising(&req, align);
28de2dcd
KW
3353 waited = wait_serialising_requests(&req);
3354 assert(!waited || !use_local_qiov);
3b8242e0
KW
3355
3356 tail_buf = qemu_blockalign(bs, align);
3357 tail_iov = (struct iovec) {
3358 .iov_base = tail_buf,
3359 .iov_len = align,
3360 };
3361 qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
3362
9e1cb96d 3363 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
3b8242e0
KW
3364 ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
3365 align, &tail_qiov, 0);
3366 if (ret < 0) {
3367 goto fail;
3368 }
9e1cb96d 3369 BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
3b8242e0
KW
3370
3371 if (!use_local_qiov) {
3372 qemu_iovec_init(&local_qiov, qiov->niov + 1);
3373 qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
3374 use_local_qiov = true;
3375 }
3376
3377 tail_bytes = (offset + bytes) & (align - 1);
3378 qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
3379
3380 bytes = ROUND_UP(bytes, align);
3381 }
3382
3383 ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
3384 use_local_qiov ? &local_qiov : qiov,
3385 flags);
3386
3387fail:
65afd211 3388 tracked_request_end(&req);
b404f720 3389
3b8242e0
KW
3390 if (use_local_qiov) {
3391 qemu_iovec_destroy(&local_qiov);
3b8242e0 3392 }
99c4a85c
KW
3393 qemu_vfree(head_buf);
3394 qemu_vfree(tail_buf);
3b8242e0 3395
b404f720
KW
3396 return ret;
3397}
3398
6601553e
KW
3399static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
3400 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
3401 BdrvRequestFlags flags)
3402{
3403 if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
3404 return -EINVAL;
3405 }
3406
3407 return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
3408 nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
3409}
3410
c5fbe571
SH
3411int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
3412 int nb_sectors, QEMUIOVector *qiov)
3413{
3414 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
3415
f08f2dda
SH
3416 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
3417}
3418
3419int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
aa7bfbff
PL
3420 int64_t sector_num, int nb_sectors,
3421 BdrvRequestFlags flags)
f08f2dda 3422{
94d6ff21 3423 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
f08f2dda 3424
d32f35cb
PL
3425 if (!(bs->open_flags & BDRV_O_UNMAP)) {
3426 flags &= ~BDRV_REQ_MAY_UNMAP;
3427 }
3428
f08f2dda 3429 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
aa7bfbff 3430 BDRV_REQ_ZERO_WRITE | flags);
c5fbe571
SH
3431}
3432
83f64091
FB
3433/**
3434 * Truncate file to 'offset' bytes (needed only for file protocols)
3435 */
3436int bdrv_truncate(BlockDriverState *bs, int64_t offset)
3437{
3438 BlockDriver *drv = bs->drv;
51762288 3439 int ret;
83f64091 3440 if (!drv)
19cb3738 3441 return -ENOMEDIUM;
83f64091
FB
3442 if (!drv->bdrv_truncate)
3443 return -ENOTSUP;
59f2689d
NS
3444 if (bs->read_only)
3445 return -EACCES;
8591675f
MT
3446 if (bdrv_in_use(bs))
3447 return -EBUSY;
51762288
SH
3448 ret = drv->bdrv_truncate(bs, offset);
3449 if (ret == 0) {
3450 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
145feb17 3451 bdrv_dev_resize_cb(bs);
51762288
SH
3452 }
3453 return ret;
83f64091
FB
3454}
3455
4a1d5e1f
FZ
3456/**
3457 * Length of a allocated file in bytes. Sparse files are counted by actual
3458 * allocated space. Return < 0 if error or unknown.
3459 */
3460int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
3461{
3462 BlockDriver *drv = bs->drv;
3463 if (!drv) {
3464 return -ENOMEDIUM;
3465 }
3466 if (drv->bdrv_get_allocated_file_size) {
3467 return drv->bdrv_get_allocated_file_size(bs);
3468 }
3469 if (bs->file) {
3470 return bdrv_get_allocated_file_size(bs->file);
3471 }
3472 return -ENOTSUP;
3473}
3474
83f64091
FB
3475/**
3476 * Length of a file in bytes. Return < 0 if error or unknown.
3477 */
3478int64_t bdrv_getlength(BlockDriverState *bs)
3479{
3480 BlockDriver *drv = bs->drv;
3481 if (!drv)
19cb3738 3482 return -ENOMEDIUM;
51762288 3483
b94a2610
KW
3484 if (drv->has_variable_length) {
3485 int ret = refresh_total_sectors(bs, bs->total_sectors);
3486 if (ret < 0) {
3487 return ret;
46a4e4e6 3488 }
83f64091 3489 }
46a4e4e6 3490 return bs->total_sectors * BDRV_SECTOR_SIZE;
fc01f7e7
FB
3491}
3492
19cb3738 3493/* return 0 as number of sectors if no device present or error */
96b8f136 3494void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
fc01f7e7 3495{
19cb3738
FB
3496 int64_t length;
3497 length = bdrv_getlength(bs);
3498 if (length < 0)
3499 length = 0;
3500 else
6ea44308 3501 length = length >> BDRV_SECTOR_BITS;
19cb3738 3502 *nb_sectors_ptr = length;
fc01f7e7 3503}
cf98951b 3504
ff06f5f3
PB
3505void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
3506 BlockdevOnError on_write_error)
abd7f68d
MA
3507{
3508 bs->on_read_error = on_read_error;
3509 bs->on_write_error = on_write_error;
3510}
3511
1ceee0d5 3512BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
abd7f68d
MA
3513{
3514 return is_read ? bs->on_read_error : bs->on_write_error;
3515}
3516
3e1caa5f
PB
3517BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
3518{
3519 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
3520
3521 switch (on_err) {
3522 case BLOCKDEV_ON_ERROR_ENOSPC:
3523 return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
3524 case BLOCKDEV_ON_ERROR_STOP:
3525 return BDRV_ACTION_STOP;
3526 case BLOCKDEV_ON_ERROR_REPORT:
3527 return BDRV_ACTION_REPORT;
3528 case BLOCKDEV_ON_ERROR_IGNORE:
3529 return BDRV_ACTION_IGNORE;
3530 default:
3531 abort();
3532 }
3533}
3534
3535/* This is done by device models because, while the block layer knows
3536 * about the error, it does not know whether an operation comes from
3537 * the device or the block layer (from a job, for example).
3538 */
3539void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
3540 bool is_read, int error)
3541{
3542 assert(error >= 0);
32c81a4a 3543 bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
3e1caa5f
PB
3544 if (action == BDRV_ACTION_STOP) {
3545 vm_stop(RUN_STATE_IO_ERROR);
3546 bdrv_iostatus_set_err(bs, error);
3547 }
3548}
3549
b338082b
FB
3550int bdrv_is_read_only(BlockDriverState *bs)
3551{
3552 return bs->read_only;
3553}
3554
985a03b0
TS
3555int bdrv_is_sg(BlockDriverState *bs)
3556{
3557 return bs->sg;
3558}
3559
e900a7b7
CH
3560int bdrv_enable_write_cache(BlockDriverState *bs)
3561{
3562 return bs->enable_write_cache;
3563}
3564
425b0148
PB
3565void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
3566{
3567 bs->enable_write_cache = wce;
55b110f2
JC
3568
3569 /* so a reopen() will preserve wce */
3570 if (wce) {
3571 bs->open_flags |= BDRV_O_CACHE_WB;
3572 } else {
3573 bs->open_flags &= ~BDRV_O_CACHE_WB;
3574 }
425b0148
PB
3575}
3576
ea2384d3
FB
3577int bdrv_is_encrypted(BlockDriverState *bs)
3578{
3579 if (bs->backing_hd && bs->backing_hd->encrypted)
3580 return 1;
3581 return bs->encrypted;
3582}
3583
c0f4ce77
AL
3584int bdrv_key_required(BlockDriverState *bs)
3585{
3586 BlockDriverState *backing_hd = bs->backing_hd;
3587
3588 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
3589 return 1;
3590 return (bs->encrypted && !bs->valid_key);
3591}
3592
ea2384d3
FB
3593int bdrv_set_key(BlockDriverState *bs, const char *key)
3594{
3595 int ret;
3596 if (bs->backing_hd && bs->backing_hd->encrypted) {
3597 ret = bdrv_set_key(bs->backing_hd, key);
3598 if (ret < 0)
3599 return ret;
3600 if (!bs->encrypted)
3601 return 0;
3602 }
fd04a2ae
SH
3603 if (!bs->encrypted) {
3604 return -EINVAL;
3605 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
3606 return -ENOMEDIUM;
3607 }
c0f4ce77 3608 ret = bs->drv->bdrv_set_key(bs, key);
bb5fc20f
AL
3609 if (ret < 0) {
3610 bs->valid_key = 0;
3611 } else if (!bs->valid_key) {
3612 bs->valid_key = 1;
3613 /* call the change callback now, we skipped it on open */
7d4b4ba5 3614 bdrv_dev_change_media_cb(bs, true);
bb5fc20f 3615 }
c0f4ce77 3616 return ret;
ea2384d3
FB
3617}
3618
f8d6bba1 3619const char *bdrv_get_format_name(BlockDriverState *bs)
ea2384d3 3620{
f8d6bba1 3621 return bs->drv ? bs->drv->format_name : NULL;
ea2384d3
FB
3622}
3623
5fafdf24 3624void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
ea2384d3
FB
3625 void *opaque)
3626{
3627 BlockDriver *drv;
e855e4fb
JC
3628 int count = 0;
3629 const char **formats = NULL;
ea2384d3 3630
8a22f02a 3631 QLIST_FOREACH(drv, &bdrv_drivers, list) {
e855e4fb
JC
3632 if (drv->format_name) {
3633 bool found = false;
3634 int i = count;
3635 while (formats && i && !found) {
3636 found = !strcmp(formats[--i], drv->format_name);
3637 }
3638
3639 if (!found) {
3640 formats = g_realloc(formats, (count + 1) * sizeof(char *));
3641 formats[count++] = drv->format_name;
3642 it(opaque, drv->format_name);
3643 }
3644 }
ea2384d3 3645 }
e855e4fb 3646 g_free(formats);
ea2384d3
FB
3647}
3648
dc364f4c 3649/* This function is to find block backend bs */
b338082b
FB
3650BlockDriverState *bdrv_find(const char *name)
3651{
3652 BlockDriverState *bs;
3653
dc364f4c 3654 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
1b7bdbc1 3655 if (!strcmp(name, bs->device_name)) {
b338082b 3656 return bs;
1b7bdbc1 3657 }
b338082b
FB
3658 }
3659 return NULL;
3660}
3661
dc364f4c
BC
3662/* This function is to find a node in the bs graph */
3663BlockDriverState *bdrv_find_node(const char *node_name)
3664{
3665 BlockDriverState *bs;
3666
3667 assert(node_name);
3668
3669 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3670 if (!strcmp(node_name, bs->node_name)) {
3671 return bs;
3672 }
3673 }
3674 return NULL;
3675}
3676
c13163fb
BC
3677/* Put this QMP function here so it can access the static graph_bdrv_states. */
3678BlockDeviceInfoList *bdrv_named_nodes_list(void)
3679{
3680 BlockDeviceInfoList *list, *entry;
3681 BlockDriverState *bs;
3682
3683 list = NULL;
3684 QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
3685 entry = g_malloc0(sizeof(*entry));
3686 entry->value = bdrv_block_device_info(bs);
3687 entry->next = list;
3688 list = entry;
3689 }
3690
3691 return list;
3692}
3693
12d3ba82
BC
3694BlockDriverState *bdrv_lookup_bs(const char *device,
3695 const char *node_name,
3696 Error **errp)
3697{
3698 BlockDriverState *bs = NULL;
3699
12d3ba82
BC
3700 if (device) {
3701 bs = bdrv_find(device);
3702
dd67fa50
BC
3703 if (bs) {
3704 return bs;
12d3ba82 3705 }
12d3ba82
BC
3706 }
3707
dd67fa50
BC
3708 if (node_name) {
3709 bs = bdrv_find_node(node_name);
12d3ba82 3710
dd67fa50
BC
3711 if (bs) {
3712 return bs;
3713 }
12d3ba82
BC
3714 }
3715
dd67fa50
BC
3716 error_setg(errp, "Cannot find device=%s nor node_name=%s",
3717 device ? device : "",
3718 node_name ? node_name : "");
3719 return NULL;
12d3ba82
BC
3720}
3721
2f399b0a
MA
3722BlockDriverState *bdrv_next(BlockDriverState *bs)
3723{
3724 if (!bs) {
3725 return QTAILQ_FIRST(&bdrv_states);
3726 }
dc364f4c 3727 return QTAILQ_NEXT(bs, device_list);
2f399b0a
MA
3728}
3729
51de9760 3730void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
81d0912d
FB
3731{
3732 BlockDriverState *bs;
3733
dc364f4c 3734 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
51de9760 3735 it(opaque, bs);
81d0912d
FB
3736 }
3737}
3738
ea2384d3
FB
3739const char *bdrv_get_device_name(BlockDriverState *bs)
3740{
3741 return bs->device_name;
3742}
3743
c8433287
MA
3744int bdrv_get_flags(BlockDriverState *bs)
3745{
3746 return bs->open_flags;
3747}
3748
f0f0fdfe 3749int bdrv_flush_all(void)
c6ca28d6
AL
3750{
3751 BlockDriverState *bs;
f0f0fdfe 3752 int result = 0;
c6ca28d6 3753
dc364f4c 3754 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
f0f0fdfe
KW
3755 int ret = bdrv_flush(bs);
3756 if (ret < 0 && !result) {
3757 result = ret;
3758 }
1b7bdbc1 3759 }
f0f0fdfe
KW
3760
3761 return result;
c6ca28d6
AL
3762}
3763
3ac21627
PL
3764int bdrv_has_zero_init_1(BlockDriverState *bs)
3765{
3766 return 1;
3767}
3768
f2feebbd
KW
3769int bdrv_has_zero_init(BlockDriverState *bs)
3770{
3771 assert(bs->drv);
3772
11212d8f
PB
3773 /* If BS is a copy on write image, it is initialized to
3774 the contents of the base image, which may not be zeroes. */
3775 if (bs->backing_hd) {
3776 return 0;
3777 }
336c1c12
KW
3778 if (bs->drv->bdrv_has_zero_init) {
3779 return bs->drv->bdrv_has_zero_init(bs);
f2feebbd
KW
3780 }
3781
3ac21627
PL
3782 /* safe default */
3783 return 0;
f2feebbd
KW
3784}
3785
4ce78691
PL
3786bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
3787{
3788 BlockDriverInfo bdi;
3789
3790 if (bs->backing_hd) {
3791 return false;
3792 }
3793
3794 if (bdrv_get_info(bs, &bdi) == 0) {
3795 return bdi.unallocated_blocks_are_zero;
3796 }
3797
3798 return false;
3799}
3800
3801bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
3802{
3803 BlockDriverInfo bdi;
3804
3805 if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) {
3806 return false;
3807 }
3808
3809 if (bdrv_get_info(bs, &bdi) == 0) {
3810 return bdi.can_write_zeroes_with_unmap;
3811 }
3812
3813 return false;
3814}
3815
b6b8a333 3816typedef struct BdrvCoGetBlockStatusData {
376ae3f1 3817 BlockDriverState *bs;
b35b2bba 3818 BlockDriverState *base;
376ae3f1
SH
3819 int64_t sector_num;
3820 int nb_sectors;
3821 int *pnum;
b6b8a333 3822 int64_t ret;
376ae3f1 3823 bool done;
b6b8a333 3824} BdrvCoGetBlockStatusData;
376ae3f1 3825
f58c7b35
TS
3826/*
3827 * Returns true iff the specified sector is present in the disk image. Drivers
3828 * not implementing the functionality are assumed to not support backing files,
3829 * hence all their sectors are reported as allocated.
3830 *
bd9533e3
SH
3831 * If 'sector_num' is beyond the end of the disk image the return value is 0
3832 * and 'pnum' is set to 0.
3833 *
f58c7b35
TS
3834 * 'pnum' is set to the number of sectors (including and immediately following
3835 * the specified sector) that are known to be in the same
3836 * allocated/unallocated state.
3837 *
bd9533e3
SH
3838 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
3839 * beyond the end of the disk image it will be clamped.
f58c7b35 3840 */
b6b8a333
PB
3841static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
3842 int64_t sector_num,
3843 int nb_sectors, int *pnum)
f58c7b35 3844{
617ccb46 3845 int64_t length;
bd9533e3 3846 int64_t n;
5daa74a6 3847 int64_t ret, ret2;
bd9533e3 3848
617ccb46
PB
3849 length = bdrv_getlength(bs);
3850 if (length < 0) {
3851 return length;
3852 }
3853
3854 if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
bd9533e3
SH
3855 *pnum = 0;
3856 return 0;
3857 }
3858
3859 n = bs->total_sectors - sector_num;
3860 if (n < nb_sectors) {
3861 nb_sectors = n;
3862 }
3863
b6b8a333 3864 if (!bs->drv->bdrv_co_get_block_status) {
bd9533e3 3865 *pnum = nb_sectors;
918e92d7
PB
3866 ret = BDRV_BLOCK_DATA;
3867 if (bs->drv->protocol_name) {
3868 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
3869 }
3870 return ret;
f58c7b35 3871 }
6aebab14 3872
415b5b01
PB
3873 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
3874 if (ret < 0) {
3e0a233d 3875 *pnum = 0;
415b5b01
PB
3876 return ret;
3877 }
3878
92bc50a5
PL
3879 if (ret & BDRV_BLOCK_RAW) {
3880 assert(ret & BDRV_BLOCK_OFFSET_VALID);
3881 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3882 *pnum, pnum);
3883 }
3884
c3d86884
PL
3885 if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
3886 if (bdrv_unallocated_blocks_are_zero(bs)) {
f0ad5712 3887 ret |= BDRV_BLOCK_ZERO;
1f9db224 3888 } else if (bs->backing_hd) {
f0ad5712
PB
3889 BlockDriverState *bs2 = bs->backing_hd;
3890 int64_t length2 = bdrv_getlength(bs2);
3891 if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
3892 ret |= BDRV_BLOCK_ZERO;
3893 }
3894 }
415b5b01 3895 }
5daa74a6
PB
3896
3897 if (bs->file &&
3898 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
3899 (ret & BDRV_BLOCK_OFFSET_VALID)) {
3900 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
3901 *pnum, pnum);
3902 if (ret2 >= 0) {
3903 /* Ignore errors. This is just providing extra information, it
3904 * is useful but not necessary.
3905 */
3906 ret |= (ret2 & BDRV_BLOCK_ZERO);
3907 }
3908 }
3909
415b5b01 3910 return ret;
060f51c9
SH
3911}
3912
b6b8a333
PB
3913/* Coroutine wrapper for bdrv_get_block_status() */
3914static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
060f51c9 3915{
b6b8a333 3916 BdrvCoGetBlockStatusData *data = opaque;
060f51c9
SH
3917 BlockDriverState *bs = data->bs;
3918
b6b8a333
PB
3919 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
3920 data->pnum);
060f51c9
SH
3921 data->done = true;
3922}
3923
3924/*
b6b8a333 3925 * Synchronous wrapper around bdrv_co_get_block_status().
060f51c9 3926 *
b6b8a333 3927 * See bdrv_co_get_block_status() for details.
060f51c9 3928 */
b6b8a333
PB
3929int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
3930 int nb_sectors, int *pnum)
060f51c9 3931{
6aebab14 3932 Coroutine *co;
b6b8a333 3933 BdrvCoGetBlockStatusData data = {
6aebab14
SH
3934 .bs = bs,
3935 .sector_num = sector_num,
3936 .nb_sectors = nb_sectors,
3937 .pnum = pnum,
3938 .done = false,
3939 };
3940
bdad13b9
PB
3941 if (qemu_in_coroutine()) {
3942 /* Fast-path if already in coroutine context */
b6b8a333 3943 bdrv_get_block_status_co_entry(&data);
bdad13b9 3944 } else {
b6b8a333 3945 co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
bdad13b9
PB
3946 qemu_coroutine_enter(co, &data);
3947 while (!data.done) {
3948 qemu_aio_wait();
3949 }
6aebab14
SH
3950 }
3951 return data.ret;
f58c7b35
TS
3952}
3953
b6b8a333
PB
3954int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
3955 int nb_sectors, int *pnum)
3956{
4333bb71
PB
3957 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
3958 if (ret < 0) {
3959 return ret;
3960 }
3961 return
3962 (ret & BDRV_BLOCK_DATA) ||
3963 ((ret & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs));
b6b8a333
PB
3964}
3965
188a7bbf
PB
3966/*
3967 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
3968 *
3969 * Return true if the given sector is allocated in any image between
3970 * BASE and TOP (inclusive). BASE can be NULL to check if the given
3971 * sector is allocated in any image of the chain. Return false otherwise.
3972 *
3973 * 'pnum' is set to the number of sectors (including and immediately following
3974 * the specified sector) that are known to be in the same
3975 * allocated/unallocated state.
3976 *
3977 */
4f578637
PB
3978int bdrv_is_allocated_above(BlockDriverState *top,
3979 BlockDriverState *base,
3980 int64_t sector_num,
3981 int nb_sectors, int *pnum)
188a7bbf
PB
3982{
3983 BlockDriverState *intermediate;
3984 int ret, n = nb_sectors;
3985
3986 intermediate = top;
3987 while (intermediate && intermediate != base) {
3988 int pnum_inter;
bdad13b9
PB
3989 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
3990 &pnum_inter);
188a7bbf
PB
3991 if (ret < 0) {
3992 return ret;
3993 } else if (ret) {
3994 *pnum = pnum_inter;
3995 return 1;
3996 }
3997
3998 /*
3999 * [sector_num, nb_sectors] is unallocated on top but intermediate
4000 * might have
4001 *
4002 * [sector_num+x, nr_sectors] allocated.
4003 */
63ba17d3
VI
4004 if (n > pnum_inter &&
4005 (intermediate == top ||
4006 sector_num + pnum_inter < intermediate->total_sectors)) {
188a7bbf
PB
4007 n = pnum_inter;
4008 }
4009
4010 intermediate = intermediate->backing_hd;
4011 }
4012
4013 *pnum = n;
4014 return 0;
4015}
4016
045df330
AL
4017const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
4018{
4019 if (bs->backing_hd && bs->backing_hd->encrypted)
4020 return bs->backing_file;
4021 else if (bs->encrypted)
4022 return bs->filename;
4023 else
4024 return NULL;
4025}
4026
5fafdf24 4027void bdrv_get_backing_filename(BlockDriverState *bs,
83f64091
FB
4028 char *filename, int filename_size)
4029{
3574c608 4030 pstrcpy(filename, filename_size, bs->backing_file);
83f64091
FB
4031}
4032
/* Write @nb_sectors of compressed data at @sector_num.
 * Returns -ENOMEDIUM without a driver, -ENOTSUP if the driver cannot
 * compress, -EIO on an out-of-range request, else the driver's result. */
int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_write_compressed)
        return -ENOTSUP;
    if (bdrv_check_request(bs, sector_num, nb_sectors))
        return -EIO;

    /* Compressed writes bypass dirty-bitmap tracking, so they are only
     * allowed while no dirty bitmap is active. */
    assert(QLIST_EMPTY(&bs->dirty_bitmaps));

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}

/* Fill @bdi with driver-specific image information.  The structure is
 * zeroed first so fields the driver does not set read as 0/NULL. */
int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_get_info)
        return -ENOTSUP;
    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}

/* Return format-specific image info, or NULL if the driver (or the lack
 * of one) does not provide any. */
ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (drv && drv->bdrv_get_specific_info) {
        return drv->bdrv_get_specific_info(bs);
    }
    return NULL;
}
4068
/* Save @size bytes of VM state at offset @pos by wrapping the buffer in a
 * single-element QEMUIOVector and delegating to bdrv_writev_vmstate(). */
int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *) buf,
        .iov_len = size,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_writev_vmstate(bs, &qiov, pos);
}

/* Vectored VM-state write: try the driver first, then recurse into the
 * protocol layer (bs->file); -ENOTSUP if nobody can handle it. */
int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    } else if (drv->bdrv_save_vmstate) {
        return drv->bdrv_save_vmstate(bs, qiov, pos);
    } else if (bs->file) {
        return bdrv_writev_vmstate(bs->file, qiov, pos);
    }

    return -ENOTSUP;
}

/* Load @size bytes of VM state from offset @pos, mirroring the delegation
 * strategy of bdrv_writev_vmstate() (driver first, then bs->file). */
int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_load_vmstate)
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_load_vmstate(bs->file, buf, pos, size);
    return -ENOTSUP;
}
4109
/* Forward a blkdebug event to the driver; silently ignored if the node or
 * its driver does not implement event handling. */
void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
        return;
    }

    bs->drv->bdrv_debug_event(bs, event);
}

/* Install a blkdebug breakpoint named @tag on @event.  Walks down the
 * bs->file chain to find the first driver (normally blkdebug) that
 * implements breakpoints; -ENOTSUP if none does. */
int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
                          const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
        return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
    }

    return -ENOTSUP;
}

/* Remove the blkdebug breakpoint named @tag; same chain walk as above. */
int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
        return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
    }

    return -ENOTSUP;
}

/* Resume a request suspended at breakpoint @tag.  Note the loop condition
 * also skips nodes without a driver, unlike the breakpoint setters. */
int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
{
    while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
        return bs->drv->bdrv_debug_resume(bs, tag);
    }

    return -ENOTSUP;
}

/* Return true if a request is currently suspended at breakpoint @tag;
 * false if nothing in the chain supports the query. */
bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
{
    while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
        bs = bs->file;
    }

    if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
        return bs->drv->bdrv_debug_is_suspended(bs, tag);
    }

    return false;
}
4171
199630b6
BS
4172int bdrv_is_snapshot(BlockDriverState *bs)
4173{
4174 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
4175}
4176
/* backing_file can either be relative, or absolute, or a protocol.  If it is
 * relative, it must be relative to the chain.  So, passing in bs->filename
 * from a BDS as backing_file should not be done, as that may be relative to
 * the CWD rather than the chain. */
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
        const char *backing_file)
{
    char *filename_full = NULL;
    char *backing_file_full = NULL;
    char *filename_tmp = NULL;
    int is_protocol = 0;
    BlockDriverState *curr_bs = NULL;
    BlockDriverState *retval = NULL;

    if (!bs || !bs->drv || !backing_file) {
        return NULL;
    }

    filename_full = g_malloc(PATH_MAX);
    backing_file_full = g_malloc(PATH_MAX);
    filename_tmp = g_malloc(PATH_MAX);

    is_protocol = path_has_protocol(backing_file);

    /* Walk down the backing chain; at each level compare @backing_file
     * against that level's recorded backing filename. */
    for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) {

        /* If either of the filename paths is actually a protocol, then
         * compare unmodified paths; otherwise make paths relative */
        if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        } else {
            /* If not an absolute filename path, make it relative to the current
             * image's filename path */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         backing_file);

            /* We are going to compare absolute pathnames */
            if (!realpath(filename_tmp, filename_full)) {
                /* realpath failure (e.g. dangling path): not a match,
                 * keep walking the chain. */
                continue;
            }

            /* We need to make sure the backing filename we are comparing against
             * is relative to the current image filename (or absolute) */
            path_combine(filename_tmp, PATH_MAX, curr_bs->filename,
                         curr_bs->backing_file);

            if (!realpath(filename_tmp, backing_file_full)) {
                continue;
            }

            if (strcmp(backing_file_full, filename_full) == 0) {
                retval = curr_bs->backing_hd;
                break;
            }
        }
    }

    g_free(filename_full);
    g_free(backing_file_full);
    g_free(filename_tmp);
    return retval;
}
4242
f198fd1c
BC
4243int bdrv_get_backing_file_depth(BlockDriverState *bs)
4244{
4245 if (!bs->drv) {
4246 return 0;
4247 }
4248
4249 if (!bs->backing_hd) {
4250 return 0;
4251 }
4252
4253 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
4254}
4255
79fac568
JC
4256BlockDriverState *bdrv_find_base(BlockDriverState *bs)
4257{
4258 BlockDriverState *curr_bs = NULL;
4259
4260 if (!bs) {
4261 return NULL;
4262 }
4263
4264 curr_bs = bs;
4265
4266 while (curr_bs->backing_hd) {
4267 curr_bs = curr_bs->backing_hd;
4268 }
4269 return curr_bs;
4270}
4271
/**************************************************************/
/* async I/Os */

/* Asynchronous vectored read: thin wrapper that traces the request and
 * dispatches into the coroutine-based implementation. */
BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                 QEMUIOVector *qiov, int nb_sectors,
                                 BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, false);
}

/* Asynchronous vectored write; same shape as bdrv_aio_readv(). */
BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                  QEMUIOVector *qiov, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
                                 cb, opaque, true);
}

/* Asynchronous zero-write: issued as a write with BDRV_REQ_ZERO_WRITE set
 * and no data iovec (qiov == NULL). */
BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
                                 BDRV_REQ_ZERO_WRITE | flags,
                                 cb, opaque, true);
}
4305
/* Shared completion state for one bdrv_aio_multiwrite() batch.  The first
 * failing request records its error; user callbacks run once the last
 * in-flight request completes. */
typedef struct MultiwriteCB {
    int error;              /* first error seen, 0 if none */
    int num_requests;       /* merged requests still in flight */
    int num_callbacks;      /* original (pre-merge) request count */
    struct {
        BlockDriverCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;  /* merged qiov to destroy, or NULL */
    } callbacks[];          /* flexible array, one slot per original request */
} MultiwriteCB;

/* Invoke every original caller's completion callback with the batch result
 * and release any qiovs that were allocated during merging. */
static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int i;

    for (i = 0; i < mcb->num_callbacks; i++) {
        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
        if (mcb->callbacks[i].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
    }
}

/* Per-request completion for a multiwrite batch: remember the first error,
 * and fire the user callbacks when the last request finishes. */
static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}

/* qsort comparator ordering BlockRequests by start sector. */
static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}
4364
/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
    int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // Handle exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        // Don't merge if the combined iovec would exceed the IOV_MAX limit
        // (one extra slot is reserved for a potential gap filler).
        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);

            // Requests are adjacent or overlapping, so no zero padding is
            // ever needed between them.
            assert (reqs[i].sector <= oldreq_last);

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            // Record the merged qiov so multiwrite_user_cb() can free it.
            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov = reqs[i].qiov;
        }
    }

    return outidx + 1;
}
4424
/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. In error case this function returns -1, and any of the
 * requests may or may not be submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    /* Remember every original caller's callback before merging collapses
     * the request array. */
    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
                              reqs[i].nb_sectors, reqs[i].flags,
                              multiwrite_cb, mcb,
                              true);
    }

    return 0;
}
4482
/* Cancel an in-flight AIO request via its implementation's cancel hook. */
void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->aiocb_info->cancel(acb);
}

/**************************************************************/
/* async block device emulation */

/* AIOCB used to emulate async I/O on top of synchronous driver callbacks:
 * the work happens immediately and completion is deferred to a bottom half. */
typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;             /* completion bottom half */
    int ret;                /* result of the synchronous read/write */
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;        /* linear bounce buffer for the sync call */
    int is_write;
} BlockDriverAIOCBSync;

/* Cancel hook: drop the pending completion BH and release the AIOCB. */
static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static const AIOCBInfo bdrv_em_aiocb_info = {
    .aiocb_size = sizeof(BlockDriverAIOCBSync),
    .cancel = bdrv_aio_cancel_em,
};
4514
/* Bottom half run after the synchronous I/O: copy read data back into the
 * caller's qiov, free the bounce buffer, invoke the completion callback,
 * then tear down the BH and the AIOCB. */
static void bdrv_aio_bh_cb(void *opaque)
{
    BlockDriverAIOCBSync *acb = opaque;

    if (!acb->is_write)
        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

/* Emulate async vectored I/O with the driver's synchronous bdrv_read/
 * bdrv_write: perform the I/O through a bounce buffer right away and
 * schedule a BH so completion still appears asynchronous to the caller. */
static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                            int64_t sector_num,
                                            QEMUIOVector *qiov,
                                            int nb_sectors,
                                            BlockDriverCompletionFunc *cb,
                                            void *opaque,
                                            int is_write)

{
    BlockDriverAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_blockalign(bs, qiov->size);
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);

    if (is_write) {
        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}
4556
/* Sync-emulation entry points registered for drivers that only provide
 * synchronous bdrv_read/bdrv_write. */
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}
beac80cd 4570
/* AIOCB backing the coroutine-based async implementation. */
typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;       /* parameters and result of the request */
    bool is_write;
    bool *done;             /* set by the completion BH; used by cancel */
    QEMUBH* bh;
} BlockDriverAIOCBCoroutine;

/* Cancel by synchronously waiting for the request to complete: point
 * acb->done at a local flag and pump the AIO loop until the completion
 * BH sets it. */
static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBCoroutine *acb =
        container_of(blockacb, BlockDriverAIOCBCoroutine, common);
    bool done = false;

    acb->done = &done;
    while (!done) {
        qemu_aio_wait();
    }
}

static const AIOCBInfo bdrv_em_co_aiocb_info = {
    .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
    .cancel = bdrv_aio_co_cancel_em,
};
4596
/* Completion bottom half shared by the coroutine-based AIO paths: deliver
 * the result to the caller, signal a pending cancel (if any), then free
 * the BH and the AIOCB. */
static void bdrv_co_em_bh(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);

    if (acb->done) {
        *acb->done = true;
    }

    qemu_bh_delete(acb->bh);
    qemu_aio_release(acb);
}

/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
    }

    /* Defer completion to a BH so the callback never runs in coroutine
     * context. */
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

/* Core of the public bdrv_aio_* entry points: package the request into an
 * AIOCB and kick off a coroutine that performs the actual I/O. */
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BdrvRequestFlags flags,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->req.flags = flags;
    acb->is_write = is_write;
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}
4654
/* Coroutine entry point for bdrv_aio_flush(). */
static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

/* Asynchronous flush: run bdrv_co_flush() in a coroutine and complete via
 * the shared bdrv_co_em_bh bottom half. */
BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->done = NULL;

    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

/* Coroutine entry point for bdrv_aio_discard(). */
static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

/* Asynchronous discard of @nb_sectors starting at @sector_num. */
BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->done = NULL;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}
4710
/* Register all built-in block drivers (module init hook). */
void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}

/* Like bdrv_init(), but restrict format probing/opening to the configured
 * driver whitelist. */
void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}

/* Allocate and initialize the common part of an AIOCB; the per-type size
 * comes from @aiocb_info.  Freed with qemu_aio_release(). */
void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
                   BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriverAIOCB *acb;

    acb = g_slice_alloc(aiocb_info->aiocb_size);
    acb->aiocb_info = aiocb_info;
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    return acb;
}

/* Release an AIOCB obtained from qemu_aio_get(). */
void qemu_aio_release(void *p)
{
    BlockDriverAIOCB *acb = p;
    g_slice_free1(acb->aiocb_info->aiocb_size, acb);
}
19cb3738 4740
/**************************************************************/
/* Coroutine block device emulation */

/* Rendezvous between an AIO completion callback and a waiting coroutine. */
typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;   /* coroutine to re-enter on completion */
    int ret;                /* request result */
} CoroutineIOCompletion;

/* AIO completion callback: stash the result and wake the coroutine. */
static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

/* Emulate coroutine I/O on top of a driver's bdrv_aio_readv/writev:
 * submit the AIO request, yield until bdrv_co_io_em_complete() wakes us,
 * then return the result.  -EIO if submission itself fails. */
static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}

/* Coroutine read/write entry points registered for AIO-only drivers. */
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}
4796
/* Coroutine entry point for the synchronous bdrv_flush() wrapper. */
static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

/* Flush @bs and, recursively, its protocol layer.
 * Order matters: format-level data is flushed to the OS first
 * (bdrv_co_flush_to_os), then forced to stable storage unless
 * BDRV_O_NO_FLUSH (cache=unsafe) is set, and finally bs->file is flushed. */
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    /* Nothing to do without a medium, and read-only nodes have no dirty
     * data to flush. */
    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        /* AIO-only driver: submit the flush and yield until it completes. */
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and don't support bdrv_flush therefore. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    return bdrv_co_flush(bs->file);
}
4866
/* Invalidate any cached metadata/data for @bs (e.g. after incoming
 * migration): delegate to the driver or recurse into bs->file, then
 * refresh the cached total-sector count.  Errors go through @errp. */
void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
{
    Error *local_err = NULL;
    int ret;

    if (!bs->drv) {
        return;
    }

    if (bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs, &local_err);
    } else if (bs->file) {
        bdrv_invalidate_cache(bs->file, &local_err);
    }
    if (local_err) {
        error_propagate(errp, local_err);
        return;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        return;
    }
}

/* Invalidate caches on every registered BlockDriverState; stop at the
 * first failure and propagate it. */
void bdrv_invalidate_cache_all(Error **errp)
{
    BlockDriverState *bs;
    Error *local_err = NULL;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bdrv_invalidate_cache(bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
    }
}

/* Clear the BDRV_O_INCOMING flag on all nodes once incoming migration
 * has finished. */
void bdrv_clear_incoming_migration_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
    }
}
4915
/* Synchronous flush wrapper around bdrv_co_flush(): run directly when
 * already in coroutine context, otherwise spawn a coroutine and pump the
 * AIO loop until it finishes. */
int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}
4937
/* Parameter/result bundle passed to the discard coroutine. */
typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int ret;                /* NOT_DONE until the coroutine completes */
} DiscardCo;
/* Coroutine entry point for the synchronous bdrv_discard() wrapper. */
static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

/* if no limit is specified in the BlockLimits use a default
 * of 32768 512-byte sectors (16 MiB) per request.
 */
#define MAX_DISCARD_DEFAULT 32768
4955
4265d620
PB
4956int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
4957 int nb_sectors)
4958{
d51e9fe5
PB
4959 int max_discard;
4960
4265d620
PB
4961 if (!bs->drv) {
4962 return -ENOMEDIUM;
4963 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
4964 return -EIO;
4965 } else if (bs->read_only) {
4966 return -EROFS;
df702c9b
PB
4967 }
4968
e4654d2d 4969 bdrv_reset_dirty(bs, sector_num, nb_sectors);
df702c9b 4970
9e8f1835
PB
4971 /* Do nothing if disabled. */
4972 if (!(bs->open_flags & BDRV_O_UNMAP)) {
4973 return 0;
4974 }
4975
d51e9fe5
PB
4976 if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
4977 return 0;
4978 }
6f14da52 4979
d51e9fe5
PB
4980 max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT;
4981 while (nb_sectors > 0) {
4982 int ret;
4983 int num = nb_sectors;
6f14da52 4984
d51e9fe5
PB
4985 /* align request */
4986 if (bs->bl.discard_alignment &&
4987 num >= bs->bl.discard_alignment &&
4988 sector_num % bs->bl.discard_alignment) {
4989 if (num > bs->bl.discard_alignment) {
4990 num = bs->bl.discard_alignment;
6f14da52 4991 }
d51e9fe5
PB
4992 num -= sector_num % bs->bl.discard_alignment;
4993 }
6f14da52 4994
d51e9fe5
PB
4995 /* limit request size */
4996 if (num > max_discard) {
4997 num = max_discard;
4998 }
6f14da52 4999
d51e9fe5 5000 if (bs->drv->bdrv_co_discard) {
6f14da52 5001 ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
d51e9fe5
PB
5002 } else {
5003 BlockDriverAIOCB *acb;
5004 CoroutineIOCompletion co = {
5005 .coroutine = qemu_coroutine_self(),
5006 };
5007
5008 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
5009 bdrv_co_io_em_complete, &co);
5010 if (acb == NULL) {
5011 return -EIO;
5012 } else {
5013 qemu_coroutine_yield();
5014 ret = co.ret;
6f14da52 5015 }
6f14da52 5016 }
7ce21016 5017 if (ret && ret != -ENOTSUP) {
d51e9fe5 5018 return ret;
4265d620 5019 }
d51e9fe5
PB
5020
5021 sector_num += num;
5022 nb_sectors -= num;
4265d620 5023 }
d51e9fe5 5024 return 0;
4265d620
PB
5025}
5026
/* Synchronous discard wrapper around bdrv_co_discard(): run directly when
 * already in coroutine context, otherwise spawn a coroutine and pump the
 * AIO loop until it finishes. */
int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}
5050
19cb3738
FB
5051/**************************************************************/
5052/* removable device support */
5053
5054/**
5055 * Return TRUE if the media is present
5056 */
5057int bdrv_is_inserted(BlockDriverState *bs)
5058{
5059 BlockDriver *drv = bs->drv;
a1aff5bf 5060
19cb3738
FB
5061 if (!drv)
5062 return 0;
5063 if (!drv->bdrv_is_inserted)
a1aff5bf
MA
5064 return 1;
5065 return drv->bdrv_is_inserted(bs);
19cb3738
FB
5066}
5067
/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know. Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}

/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    /* Emit a QMP DEVICE_TRAY_MOVED-style event only for named devices. */
    if (bs->device_name[0] != '\0') {
        bdrv_emit_qmp_eject_event(bs, eject_flag);
    }
}

/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}
985a03b0
TS
5112
5113/* needed for generic scsi interface */
5114
5115int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
5116{
5117 BlockDriver *drv = bs->drv;
5118
5119 if (drv && drv->bdrv_ioctl)
5120 return drv->bdrv_ioctl(bs, req, buf);
5121 return -ENOTSUP;
5122}
7d780669 5123
221f715d
AL
5124BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
5125 unsigned long int req, void *buf,
5126 BlockDriverCompletionFunc *cb, void *opaque)
7d780669 5127{
221f715d 5128 BlockDriver *drv = bs->drv;
7d780669 5129
221f715d
AL
5130 if (drv && drv->bdrv_aio_ioctl)
5131 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
5132 return NULL;
7d780669 5133}
e268ca52 5134
1b7fd729 5135void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
7b6f9300 5136{
1b7fd729 5137 bs->guest_block_size = align;
7b6f9300 5138}
7cd1e32a 5139
e268ca52
AL
/* Allocate a buffer aligned to the optimal memory alignment for I/O on
 * this BDS (see bdrv_opt_mem_align). Caller frees with qemu_vfree. */
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}
7cd1e32a 5144
c53b1c51
SH
5145/*
5146 * Check if all memory in this vector is sector aligned.
5147 */
5148bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
5149{
5150 int i;
339064d5 5151 size_t alignment = bdrv_opt_mem_align(bs);
c53b1c51
SH
5152
5153 for (i = 0; i < qiov->niov; i++) {
339064d5 5154 if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
c53b1c51 5155 return false;
1ff735bd 5156 }
339064d5 5157 if (qiov->iov[i].iov_len % alignment) {
1ff735bd 5158 return false;
c53b1c51
SH
5159 }
5160 }
5161
5162 return true;
5163}
5164
b8afb520
FZ
/*
 * Allocate a dirty bitmap covering the whole device and link it into
 * bs->dirty_bitmaps.
 *
 * @granularity: bytes tracked per bitmap bit; must be a power of two
 *               and at least one sector.
 * Returns the new bitmap, or NULL (with @errp and errno set) when the
 * device length cannot be determined.
 */
BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity,
                                          Error **errp)
{
    int64_t bitmap_size;
    BdrvDirtyBitmap *bitmap;

    /* power-of-two check */
    assert((granularity & (granularity - 1)) == 0);

    /* convert byte granularity to a sector count */
    granularity >>= BDRV_SECTOR_BITS;
    assert(granularity);
    bitmap_size = bdrv_getlength(bs);
    if (bitmap_size < 0) {
        error_setg_errno(errp, -bitmap_size, "could not get length of device");
        /* also report through errno for callers that check it */
        errno = -bitmap_size;
        return NULL;
    }
    /* bitmap is indexed by sector number */
    bitmap_size >>= BDRV_SECTOR_BITS;
    bitmap = g_malloc0(sizeof(BdrvDirtyBitmap));
    /* ffs(granularity) - 1 == log2 of the per-bit sector granularity */
    bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
    QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list);
    return bitmap;
}
5187
5188void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
5189{
5190 BdrvDirtyBitmap *bm, *next;
5191 QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
5192 if (bm == bitmap) {
5193 QLIST_REMOVE(bitmap, list);
5194 hbitmap_free(bitmap->bitmap);
5195 g_free(bitmap);
5196 return;
a55eb92c 5197 }
7cd1e32a
LS
5198 }
5199}
5200
21b56835
FZ
5201BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs)
5202{
5203 BdrvDirtyBitmap *bm;
5204 BlockDirtyInfoList *list = NULL;
5205 BlockDirtyInfoList **plist = &list;
5206
5207 QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) {
5208 BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo));
5209 BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList));
5210 info->count = bdrv_get_dirty_count(bs, bm);
5211 info->granularity =
5212 ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap));
5213 entry->value = info;
5214 *plist = entry;
5215 plist = &entry->next;
5216 }
5217
5218 return list;
5219}
5220
e4654d2d 5221int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector)
7cd1e32a 5222{
e4654d2d
FZ
5223 if (bitmap) {
5224 return hbitmap_get(bitmap->bitmap, sector);
7cd1e32a
LS
5225 } else {
5226 return 0;
5227 }
5228}
5229
e4654d2d
FZ
/* Initialize @hbi to iterate over @bitmap starting from sector 0. */
void bdrv_dirty_iter_init(BlockDriverState *bs,
                          BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
{
    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
}
5235
5236void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
5237 int nr_sectors)
5238{
e4654d2d
FZ
5239 BdrvDirtyBitmap *bitmap;
5240 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5241 hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
5242 }
1755da16
PB
5243}
5244
e4654d2d 5245void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors)
7cd1e32a 5246{
e4654d2d
FZ
5247 BdrvDirtyBitmap *bitmap;
5248 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
5249 hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
5250 }
7cd1e32a 5251}
aaa0eb75 5252
e4654d2d 5253int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
aaa0eb75 5254{
e4654d2d 5255 return hbitmap_count(bitmap->bitmap);
aaa0eb75 5256}
f88e1a42 5257
9fcb0251
FZ
/* Get a reference to bs.  Pair every call with bdrv_unref(). */
void bdrv_ref(BlockDriverState *bs)
{
    bs->refcnt++;
}
5263
5264/* Release a previously grabbed reference to bs.
5265 * If after releasing, reference count is zero, the BlockDriverState is
5266 * deleted. */
5267void bdrv_unref(BlockDriverState *bs)
5268{
5269 assert(bs->refcnt > 0);
5270 if (--bs->refcnt == 0) {
5271 bdrv_delete(bs);
5272 }
5273}
5274
db593f25
MT
/* Mark @bs as in use (e.g. by a block job) or release it.  The assert
 * catches unbalanced set/clear pairs. */
void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}
5280
/* Return non-zero if @bs is currently marked in use. */
int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}
5285
28a7282a
LC
/* Enable I/O status tracking for @bs and reset it to OK. */
void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}
5291
5292/* The I/O status is only enabled if the drive explicitly
5293 * enables it _and_ the VM is configured to stop on errors */
5294bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
5295{
d6bf279e 5296 return (bs->iostatus_enabled &&
92aa5c6d
PB
5297 (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
5298 bs->on_write_error == BLOCKDEV_ON_ERROR_STOP ||
5299 bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
28a7282a
LC
5300}
5301
/* Disable I/O status tracking for @bs. */
void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}
5306
5307void bdrv_iostatus_reset(BlockDriverState *bs)
5308{
5309 if (bdrv_iostatus_is_enabled(bs)) {
58e21ef5 5310 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3bd293c3
PB
5311 if (bs->job) {
5312 block_job_iostatus_reset(bs->job);
5313 }
28a7282a
LC
5314 }
5315}
5316
28a7282a
LC
5317void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
5318{
3e1caa5f
PB
5319 assert(bdrv_iostatus_is_enabled(bs));
5320 if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
58e21ef5
LC
5321 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
5322 BLOCK_DEVICE_IO_STATUS_FAILED;
28a7282a
LC
5323 }
5324}
5325
a597e79c
CH
5326void
5327bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
5328 enum BlockAcctType type)
5329{
5330 assert(type < BDRV_MAX_IOTYPE);
5331
5332 cookie->bytes = bytes;
c488c7f6 5333 cookie->start_time_ns = get_clock();
a597e79c
CH
5334 cookie->type = type;
5335}
5336
5337void
5338bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
5339{
5340 assert(cookie->type < BDRV_MAX_IOTYPE);
5341
5342 bs->nr_bytes[cookie->type] += cookie->bytes;
5343 bs->nr_ops[cookie->type]++;
c488c7f6 5344 bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
a597e79c
CH
5345}
5346
d92ada22
LC
/*
 * Create a new disk image.
 *
 * @filename:      image to create
 * @fmt:           format name of the new image (must be known)
 * @base_filename: optional backing file (overrides -o backing_file)
 * @base_fmt:      optional backing file format
 * @options:       "-o" style option string for the format driver
 * @img_size:      requested size in bytes; may be derived from the
 *                 backing file when left unset (-1)
 * @flags:         BDRV_O_* open flags used when probing the backing file
 * @errp:          error out-parameter
 * @quiet:         suppress the "Formatting ..." message when true
 *
 * On failure, @errp is set and no further state is reported via the
 * return value (void).  Cleanup of option lists happens via the "out"
 * label on all paths.
 */
void bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags,
                     Error **errp, bool quiet)
{
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    Error *local_err = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_setg(errp, "Unknown file format '%s'", fmt);
        return;
    }

    /* The protocol driver (file, nbd, ...) contributes options too */
    proto_drv = bdrv_find_protocol(filename, true);
    if (!proto_drv) {
        error_setg(errp, "Unknown protocol '%s'", filename);
        return;
    }

    create_options = append_option_parameters(create_options,
                                              drv->create_options);
    create_options = append_option_parameters(create_options,
                                              proto_drv->create_options);

    /* Create parameter list with default values */
    param = parse_option_parameters("", create_options, param);

    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        param = parse_option_parameters(options, create_options, param);
        if (param == NULL) {
            error_setg(errp, "Invalid options for file format '%s'.", fmt);
            goto out;
        }
    }

    /* Explicit -b/-F arguments override any -o equivalents */
    if (base_filename) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
                                 base_filename)) {
            error_setg(errp, "Backing file not supported for file format '%s'",
                       fmt);
            goto out;
        }
    }

    if (base_fmt) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_setg(errp, "Backing file format not supported for file "
                             "format '%s'", fmt);
            goto out;
        }
    }

    /* Refuse an image that uses itself as backing file */
    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
    if (backing_file && backing_file->value.s) {
        if (!strcmp(filename, backing_file->value.s)) {
            error_setg(errp, "Error: Trying to create an image with the "
                             "same filename as the backing file");
            goto out;
        }
    }

    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt && backing_fmt->value.s) {
        backing_drv = bdrv_find_format(backing_fmt->value.s);
        if (!backing_drv) {
            error_setg(errp, "Unknown backing file format '%s'",
                       backing_fmt->value.s);
            goto out;
        }
    }

    // The size for the image must always be specified, with one exception:
    // If we are using a backing file, we can obtain the size from there
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
            BlockDriverState *bs;
            /* NOTE: inner 'size' shadows the outer option pointer */
            uint64_t size;
            char buf[32];
            int back_flags;

            /* backing files always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = NULL;
            ret = bdrv_open(&bs, backing_file->value.s, NULL, NULL, back_flags,
                            backing_drv, &local_err);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Could not open '%s': %s",
                                 backing_file->value.s,
                                 error_get_pretty(local_err));
                error_free(local_err);
                local_err = NULL;
                goto out;
            }
            /* size is reported in 512-byte sectors */
            bdrv_get_geometry(bs, &size);
            size *= 512;

            snprintf(buf, sizeof(buf), "%" PRId64, size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);

            bdrv_unref(bs);
        } else {
            error_setg(errp, "Image creation needs a size parameter");
            goto out;
        }
    }

    if (!quiet) {
        printf("Formatting '%s', fmt=%s ", filename, fmt);
        print_option_parameters(param);
        puts("");
    }
    ret = bdrv_create(drv, filename, param, &local_err);
    if (ret == -EFBIG) {
        /* This is generally a better message than whatever the driver would
         * deliver (especially because of the cluster_size_hint), since that
         * is most probably not much different from "image too large". */
        const char *cluster_size_hint = "";
        if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) {
            cluster_size_hint = " (try using a larger cluster size)";
        }
        error_setg(errp, "The image size is too large for file format '%s'"
                   "%s", fmt, cluster_size_hint);
        error_free(local_err);
        local_err = NULL;
    }

out:
    /* free_option_parameters tolerates NULL, so all paths converge here */
    free_option_parameters(create_options);
    free_option_parameters(param);

    if (local_err) {
        error_propagate(errp, local_err);
    }
}
85d126f3
SH
5493
5494AioContext *bdrv_get_aio_context(BlockDriverState *bs)
5495{
5496 /* Currently BlockDriverState always uses the main loop AioContext */
5497 return qemu_get_aio_context();
5498}
d616b224
SH
5499
5500void bdrv_add_before_write_notifier(BlockDriverState *bs,
5501 NotifierWithReturn *notifier)
5502{
5503 notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
5504}
6f176b48
HR
5505
5506int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
5507{
5508 if (bs->drv->bdrv_amend_options == NULL) {
5509 return -ENOTSUP;
5510 }
5511 return bs->drv->bdrv_amend_options(bs, options);
5512}
f6186f49 5513
b5042a36
BC
5514/* This function will be called by the bdrv_recurse_is_first_non_filter method
5515 * of block filter and by bdrv_is_first_non_filter.
5516 * It is used to test if the given bs is the candidate or recurse more in the
5517 * node graph.
212a5a8f 5518 */
b5042a36 5519bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
212a5a8f 5520 BlockDriverState *candidate)
f6186f49 5521{
b5042a36
BC
5522 /* return false if basic checks fails */
5523 if (!bs || !bs->drv) {
212a5a8f 5524 return false;
f6186f49
BC
5525 }
5526
b5042a36
BC
5527 /* the code reached a non block filter driver -> check if the bs is
5528 * the same as the candidate. It's the recursion termination condition.
5529 */
5530 if (!bs->drv->is_filter) {
5531 return bs == candidate;
212a5a8f 5532 }
b5042a36 5533 /* Down this path the driver is a block filter driver */
212a5a8f 5534
b5042a36
BC
5535 /* If the block filter recursion method is defined use it to recurse down
5536 * the node graph.
5537 */
5538 if (bs->drv->bdrv_recurse_is_first_non_filter) {
212a5a8f 5539 return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
f6186f49
BC
5540 }
5541
b5042a36
BC
5542 /* the driver is a block filter but don't allow to recurse -> return false
5543 */
5544 return false;
f6186f49
BC
5545}
5546
212a5a8f
BC
5547/* This function checks if the candidate is the first non filter bs down it's
5548 * bs chain. Since we don't have pointers to parents it explore all bs chains
5549 * from the top. Some filters can choose not to pass down the recursion.
5550 */
5551bool bdrv_is_first_non_filter(BlockDriverState *candidate)
f6186f49 5552{
212a5a8f
BC
5553 BlockDriverState *bs;
5554
5555 /* walk down the bs forest recursively */
5556 QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
5557 bool perm;
5558
b5042a36 5559 /* try to recurse in this top level bs */
e6dc8a1f 5560 perm = bdrv_recurse_is_first_non_filter(bs, candidate);
212a5a8f
BC
5561
5562 /* candidate is the first non filter */
5563 if (perm) {
5564 return true;
5565 }
5566 }
5567
5568 return false;
f6186f49 5569}