]> git.ipfire.org Git - thirdparty/qemu.git/blame - nbd/server.c
nbd/server: do not poll within a coroutine context
[thirdparty/qemu.git] / nbd / server.c
CommitLineData
75818250 1/*
a7c8ed36 2 * Copyright Red Hat
7a5ca864
FB
3 * Copyright (C) 2005 Anthony Liguori <anthony@codemonkey.ws>
4 *
798bfe00 5 * Network Block Device Server Side
7a5ca864
FB
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; under version 2 of the License.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
8167ee88 17 * along with this program; if not, see <http://www.gnu.org/licenses/>.
75818250 18 */
7a5ca864 19
d38ea87a 20#include "qemu/osdep.h"
56ee8626 21
e2c1c34f 22#include "block/block_int.h"
56ee8626 23#include "block/export.h"
e2c1c34f 24#include "block/dirty-bitmap.h"
da34e65c 25#include "qapi/error.h"
dc5e9ac7 26#include "qemu/queue.h"
9588463e 27#include "trace.h"
798bfe00 28#include "nbd-internal.h"
416e34bd 29#include "qemu/units.h"
5df022cf 30#include "qemu/memalign.h"
ca441480 31
/* Fixed metadata-context ids reported in NBD_REP_META_CONTEXT replies. */
#define NBD_META_ID_BASE_ALLOCATION 0
#define NBD_META_ID_ALLOCATION_DEPTH 1
/* Dirty bitmaps use 'NBD_META_ID_DIRTY_BITMAP + i', so keep this id last. */
#define NBD_META_ID_DIRTY_BITMAP 2

/*
 * NBD_MAX_BLOCK_STATUS_EXTENTS: 1 MiB of extents data. An empirical
 * constant. If an increase is needed, note that the NBD protocol
 * recommends no larger than 32 mb, so that the client won't consider
 * the reply as a denial of service attack.
 */
#define NBD_MAX_BLOCK_STATUS_EXTENTS (1 * MiB / 8)
e7b1948d 44
ca441480
PB
45static int system_errno_to_nbd_errno(int err)
46{
47 switch (err) {
48 case 0:
49 return NBD_SUCCESS;
50 case EPERM:
c0301fcc 51 case EROFS:
ca441480
PB
52 return NBD_EPERM;
53 case EIO:
54 return NBD_EIO;
55 case ENOMEM:
56 return NBD_ENOMEM;
57#ifdef EDQUOT
58 case EDQUOT:
59#endif
60 case EFBIG:
61 case ENOSPC:
62 return NBD_ENOSPC;
bae245d1
EB
63 case EOVERFLOW:
64 return NBD_EOVERFLOW;
0a479545
EB
65 case ENOTSUP:
66#if ENOTSUP != EOPNOTSUPP
67 case EOPNOTSUPP:
68#endif
69 return NBD_ENOTSUP;
b6f5d3b5
EB
70 case ESHUTDOWN:
71 return NBD_ESHUTDOWN;
ca441480
PB
72 case EINVAL:
73 default:
74 return NBD_EINVAL;
75 }
76}
77
9a304d29
PB
/* Definitions for opaque data types */

typedef struct NBDRequestData NBDRequestData;

/* State for one in-flight transmission-phase request of @client. */
struct NBDRequestData {
    NBDClient *client;
    uint8_t *data;      /* request/reply payload buffer */
    bool complete;      /* presumably: payload fully received — confirm at use sites */
};
87
/* One block device exported over NBD. */
struct NBDExport {
    BlockExport common;

    char *name;             /* export name presented to clients */
    char *description;      /* optional free-form description */
    uint64_t size;          /* export size in bytes */
    uint16_t nbdflags;      /* NBD_FLAG_* transmission flags */
    QTAILQ_HEAD(, NBDClient) clients;   /* clients attached to this export */
    QTAILQ_ENTRY(NBDExport) next;       /* link in global 'exports' list */

    BlockBackend *eject_notifier_blk;
    Notifier eject_notifier;

    bool allocation_depth;  /* expose qemu:allocation-depth context */
    /* Dirty bitmaps use 'NBD_META_ID_DIRTY_BITMAP + i', so keep this id last. */
    BdrvDirtyBitmap **export_bitmaps;
    size_t nr_export_bitmaps;
};
105
ee0a19ec
PB
/* Global list of all NBD exports. */
static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports);
107
fd358d83
EB
/*
 * NBDMetaContexts represents a list of meta contexts in use,
 * as selected by NBD_OPT_SET_META_CONTEXT. Also used for
 * NBD_OPT_LIST_META_CONTEXT.
 */
struct NBDMetaContexts {
    const NBDExport *exp; /* associated export */
    size_t count; /* number of negotiated contexts */
    bool base_allocation; /* export base:allocation context (block status) */
    bool allocation_depth; /* export qemu:allocation-depth */
    bool *bitmaps; /*
                    * export qemu:dirty-bitmap:<export bitmap name>,
                    * sized by exp->nr_export_bitmaps
                    */
};
e7b1948d 123
/* Per-connection state for one NBD client. */
struct NBDClient {
    int refcount; /* atomic */
    void (*close_fn)(NBDClient *client, bool negotiated);

    QemuMutex lock;

    NBDExport *exp;
    QCryptoTLSCreds *tlscreds;
    char *tlsauthz;
    QIOChannelSocket *sioc; /* The underlying data channel */
    QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */

    Coroutine *recv_coroutine; /* protected by lock */

    CoMutex send_lock;
    Coroutine *send_coroutine;

    bool read_yielding; /* protected by lock */
    bool quiescing; /* protected by lock */

    QTAILQ_ENTRY(NBDClient) next;
    int nb_requests; /* protected by lock */
    bool closing; /* protected by lock */

    uint32_t check_align; /* If non-zero, check for aligned client requests */

    NBDMode mode;
    NBDMetaContexts contexts; /* Negotiated meta contexts */

    uint32_t opt; /* Current option being negotiated */
    uint32_t optlen; /* remaining length of data in ioc for the option being
                        negotiated now */
};
7a5ca864 157
ff82911c 158static void nbd_client_receive_next_request(NBDClient *client);
958c717d 159
6b8c01e7 160/* Basic flow for negotiation
7a5ca864
FB
161
162 Server Client
7a5ca864 163 Negotiate
6b8c01e7
PB
164
165 or
166
167 Server Client
168 Negotiate #1
169 Option
170 Negotiate #2
171
172 ----
173
174 followed by
175
176 Server Client
7a5ca864
FB
177 Request
178 Response
179 Request
180 Response
181 ...
182 ...
183 Request (type == 2)
6b8c01e7 184
7a5ca864
FB
185*/
186
1d17922a
VSO
/* Fill @rep with an option-reply header, all fields in network byte order. */
static inline void set_be_option_rep(NBDOptionReply *rep, uint32_t option,
                                     uint32_t type, uint32_t length)
{
    stq_be_p(&rep->magic, NBD_REP_MAGIC);
    stl_be_p(&rep->option, option);
    stl_be_p(&rep->type, type);
    stl_be_p(&rep->length, length);
}
195
526e5c65
EB
/* Send a reply header, including length, but no payload.
 * Return -errno on error, 0 on success. */
static int nbd_negotiate_send_rep_len(NBDClient *client, uint32_t type,
                                      uint32_t len, Error **errp)
{
    NBDOptionReply rep;

    trace_nbd_negotiate_send_rep_len(client->opt, nbd_opt_lookup(client->opt),
                                     type, nbd_rep_lookup(type), len);

    /* Callers never announce a payload the client could read as huge. */
    assert(len < NBD_MAX_BUFFER_SIZE);

    set_be_option_rep(&rep, client->opt, type, len);
    return nbd_write(client->ioc, &rep, sizeof(rep), errp);
}
6b8c01e7 211
526e5c65
EB
/* Send a reply header with default 0 length.
 * Return -errno on error, 0 on success. */
static int nbd_negotiate_send_rep(NBDClient *client, uint32_t type,
                                  Error **errp)
{
    return nbd_negotiate_send_rep_len(client, type, 0, errp);
}
219
36683283
EB
/* Send an error reply.
 * Return -errno on error, 0 on success. */
static int G_GNUC_PRINTF(4, 0)
nbd_negotiate_send_rep_verr(NBDClient *client, uint32_t type,
                            Error **errp, const char *fmt, va_list va)
{
    ERRP_GUARD();
    g_autofree char *msg = NULL;  /* freed automatically on every exit path */
    int ret;
    size_t len;

    msg = g_strdup_vprintf(fmt, va);
    len = strlen(msg);
    assert(len < NBD_MAX_STRING_SIZE);
    trace_nbd_negotiate_send_rep_err(msg);
    /* Header first (carries the payload length), then the message body. */
    ret = nbd_negotiate_send_rep_len(client, type, len, errp);
    if (ret < 0) {
        return ret;
    }
    if (nbd_write(client->ioc, msg, len, errp) < 0) {
        error_prepend(errp, "write failed (error message): ");
        return -EIO;
    }

    return 0;
}
246
5c4fe018
EB
/*
 * Return a malloc'd copy of @name suitable for use in an error reply.
 * Names of 80 bytes or more are truncated to 80 bytes plus "...".
 */
static char *
nbd_sanitize_name(const char *name)
{
    /* XXX Should we also try to sanitize any control characters? */
    if (strnlen(name, 80) == 80) {
        return g_strdup_printf("%.80s...", name);
    }
    return g_strdup(name);
}
259
41f5dfaf
EB
/* Send an error reply.
 * Return -errno on error, 0 on success. */
static int G_GNUC_PRINTF(4, 5)
nbd_negotiate_send_rep_err(NBDClient *client, uint32_t type,
                           Error **errp, const char *fmt, ...)
{
    va_list va;
    int ret;

    /* Varargs shim over nbd_negotiate_send_rep_verr(). */
    va_start(va, fmt);
    ret = nbd_negotiate_send_rep_verr(client, type, errp, fmt, va);
    va_end(va);
    return ret;
}
274
894e0280
EB
/* Drop remainder of the current option, and send a reply with the
 * given error type and message. Return -errno on read or write
 * failure; or 0 if connection is still live. */
static int G_GNUC_PRINTF(4, 0)
nbd_opt_vdrop(NBDClient *client, uint32_t type, Error **errp,
              const char *fmt, va_list va)
{
    int ret = nbd_drop(client->ioc, client->optlen, errp);

    /* Remaining payload has been consumed (or the connection is dead). */
    client->optlen = 0;
    if (!ret) {
        ret = nbd_negotiate_send_rep_verr(client, type, errp, fmt, va);
    }
    return ret;
}
290
/* Varargs wrapper for nbd_opt_vdrop(); same return contract. */
static int G_GNUC_PRINTF(4, 5)
nbd_opt_drop(NBDClient *client, uint32_t type, Error **errp,
             const char *fmt, ...)
{
    int ret;
    va_list va;

    va_start(va, fmt);
    ret = nbd_opt_vdrop(client, type, errp, fmt, va);
    va_end(va);

    return ret;
}
304
/* Drop the current option and reply NBD_REP_ERR_INVALID with @fmt message.
 * Return -errno on I/O failure, 0 if the connection is still live. */
static int G_GNUC_PRINTF(3, 4)
nbd_opt_invalid(NBDClient *client, Error **errp, const char *fmt, ...)
{
    int ret;
    va_list va;

    va_start(va, fmt);
    ret = nbd_opt_vdrop(client, NBD_REP_ERR_INVALID, errp, fmt, va);
    va_end(va);

    return ret;
}
317
/* Read size bytes from the unparsed payload of the current option.
 * If @check_nul, require that no NUL bytes appear in buffer.
 * Return -errno on I/O error, 0 if option was completely handled by
 * sending a reply about inconsistent lengths, or 1 on success. */
static int nbd_opt_read(NBDClient *client, void *buffer, size_t size,
                        bool check_nul, Error **errp)
{
    if (size > client->optlen) {
        /* Client lied about its option length; reply and drop payload. */
        return nbd_opt_invalid(client, errp,
                               "Inconsistent lengths in option %s",
                               nbd_opt_lookup(client->opt));
    }
    client->optlen -= size;
    if (qio_channel_read_all(client->ioc, buffer, size, errp) < 0) {
        return -EIO;
    }

    /* strnlen < size means an embedded NUL inside the declared length. */
    if (check_nul && strnlen(buffer, size) != size) {
        return nbd_opt_invalid(client, errp,
                               "Unexpected embedded NUL in option %s",
                               nbd_opt_lookup(client->opt));
    }
    return 1;
}
342
e7b1948d
VSO
343/* Drop size bytes from the unparsed payload of the current option.
344 * Return -errno on I/O error, 0 if option was completely handled by
345 * sending a reply about inconsistent lengths, or 1 on success. */
346static int nbd_opt_skip(NBDClient *client, size_t size, Error **errp)
347{
348 if (size > client->optlen) {
349 return nbd_opt_invalid(client, errp,
350 "Inconsistent lengths in option %s",
351 nbd_opt_lookup(client->opt));
352 }
353 client->optlen -= size;
354 return nbd_drop(client->ioc, size, errp) < 0 ? -EIO : 1;
355}
356
12296459
VSO
/* nbd_opt_read_name
 *
 * Read a string with the format:
 *   uint32_t len (<= NBD_MAX_STRING_SIZE)
 *   len bytes string (not 0-terminated)
 *
 * On success, @name will be allocated.
 * If @length is non-null, it will be set to the actual string length.
 *
 * Return -errno on I/O error, 0 if option was completely handled by
 * sending a reply about inconsistent lengths, or 1 on success.
 */
static int nbd_opt_read_name(NBDClient *client, char **name, uint32_t *length,
                             Error **errp)
{
    int ret;
    uint32_t len;
    g_autofree char *local_name = NULL;

    *name = NULL;
    ret = nbd_opt_read(client, &len, sizeof(len), false, errp);
    if (ret <= 0) {
        return ret;
    }
    /* Byteswap from wire order; cpu_to_be32 is its own inverse. */
    len = cpu_to_be32(len);

    if (len > NBD_MAX_STRING_SIZE) {
        return nbd_opt_invalid(client, errp,
                               "Invalid name length: %" PRIu32, len);
    }

    local_name = g_malloc(len + 1);
    /* check_nul=true: export names must not embed NUL bytes. */
    ret = nbd_opt_read(client, local_name, len, true, errp);
    if (ret <= 0) {
        return ret;
    }
    local_name[len] = '\0';

    if (length) {
        *length = len;
    }
    /* Transfer ownership to the caller only on full success. */
    *name = g_steal_pointer(&local_name);

    return 1;
}
402
526e5c65
EB
/* Send a single NBD_REP_SERVER reply to NBD_OPT_LIST, including payload.
 * Return -errno on error, 0 on success. */
static int nbd_negotiate_send_rep_list(NBDClient *client, NBDExport *exp,
                                       Error **errp)
{
    ERRP_GUARD();
    size_t name_len, desc_len;
    uint32_t len;
    const char *name = exp->name ? exp->name : "";
    const char *desc = exp->description ? exp->description : "";
    QIOChannel *ioc = client->ioc;
    int ret;

    trace_nbd_negotiate_send_rep_list(name, desc);
    name_len = strlen(name);
    desc_len = strlen(desc);
    assert(name_len <= NBD_MAX_STRING_SIZE && desc_len <= NBD_MAX_STRING_SIZE);
    /* Payload: 32-bit name length + name + description. */
    len = name_len + desc_len + sizeof(len);
    ret = nbd_negotiate_send_rep_len(client, NBD_REP_SERVER, len, errp);
    if (ret < 0) {
        return ret;
    }

    len = cpu_to_be32(name_len);
    if (nbd_write(ioc, &len, sizeof(len), errp) < 0) {
        error_prepend(errp, "write failed (name length): ");
        return -EINVAL;
    }

    if (nbd_write(ioc, name, name_len, errp) < 0) {
        error_prepend(errp, "write failed (name buffer): ");
        return -EINVAL;
    }

    if (nbd_write(ioc, desc, desc_len, errp) < 0) {
        error_prepend(errp, "write failed (description buffer): ");
        return -EINVAL;
    }

    return 0;
}
444
526e5c65
EB
445/* Process the NBD_OPT_LIST command, with a potential series of replies.
446 * Return -errno on error, 0 on success. */
e68c35cf 447static int nbd_negotiate_handle_list(NBDClient *client, Error **errp)
32d7d2e0 448{
32d7d2e0 449 NBDExport *exp;
0cfae925 450 assert(client->opt == NBD_OPT_LIST);
32d7d2e0 451
32d7d2e0
HB
452 /* For each export, send a NBD_REP_SERVER reply. */
453 QTAILQ_FOREACH(exp, &exports, next) {
0cfae925 454 if (nbd_negotiate_send_rep_list(client, exp, errp)) {
32d7d2e0
HB
455 return -EINVAL;
456 }
457 }
458 /* Finish with a NBD_REP_ACK. */
0cfae925 459 return nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
32d7d2e0
HB
460}
461
/* Invalidate any previously negotiated meta contexts if they were
 * negotiated against a different export than @exp. */
static void nbd_check_meta_export(NBDClient *client, NBDExport *exp)
{
    if (exp != client->contexts.exp) {
        client->contexts.count = 0;
    }
}
468
f37708f6
EB
/* Send a reply to NBD_OPT_EXPORT_NAME.
 * Return -errno on error, 0 on success. */
static int nbd_negotiate_handle_export_name(NBDClient *client, bool no_zeroes,
                                            Error **errp)
{
    ERRP_GUARD();
    g_autofree char *name = NULL;
    char buf[NBD_REPLY_EXPORT_NAME_SIZE] = "";
    size_t len;
    int ret;
    uint16_t myflags;

    /* Client sends:
        [20 .. xx]   export name (length bytes)
       Server replies:
        [ 0 ..   7]   size
        [ 8 ..   9]   export flags
        [10 .. 133]   reserved     (0) [unless no_zeroes]
     */
    trace_nbd_negotiate_handle_export_name();
    if (client->mode >= NBD_MODE_EXTENDED) {
        /* NBD_OPT_EXPORT_NAME is not valid after extended headers. */
        error_setg(errp, "Extended headers already negotiated");
        return -EINVAL;
    }
    if (client->optlen > NBD_MAX_STRING_SIZE) {
        error_setg(errp, "Bad length received");
        return -EINVAL;
    }
    name = g_malloc(client->optlen + 1);
    if (nbd_read(client->ioc, name, client->optlen, "export name", errp) < 0) {
        return -EIO;
    }
    name[client->optlen] = '\0';
    client->optlen = 0;

    trace_nbd_negotiate_handle_export_name_request(name);

    client->exp = nbd_export_find(name);
    if (!client->exp) {
        error_setg(errp, "export not found");
        return -EINVAL;
    }
    /* Drop meta contexts negotiated against a different export. */
    nbd_check_meta_export(client, client->exp);

    myflags = client->exp->nbdflags;
    if (client->mode >= NBD_MODE_STRUCTURED) {
        myflags |= NBD_FLAG_SEND_DF;
    }
    if (client->mode >= NBD_MODE_EXTENDED && client->contexts.count) {
        myflags |= NBD_FLAG_BLOCK_STAT_PAYLOAD;
    }
    trace_nbd_negotiate_new_style_size_flags(client->exp->size, myflags);
    stq_be_p(buf, client->exp->size);
    stw_be_p(buf + 8, myflags);
    /* no_zeroes: client asked us to omit the 124 reserved zero bytes. */
    len = no_zeroes ? 10 : sizeof(buf);
    ret = nbd_write(client->ioc, buf, len, errp);
    if (ret < 0) {
        error_prepend(errp, "write failed: ");
        return ret;
    }

    QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
    blk_exp_ref(&client->exp->common);

    return 0;
}
535
f37708f6
EB
/* Send a single NBD_REP_INFO, with a buffer @buf of @length bytes.
 * The buffer does NOT include the info type prefix.
 * Return -errno on error, 0 if ready to send more. */
static int nbd_negotiate_send_info(NBDClient *client,
                                   uint16_t info, uint32_t length, void *buf,
                                   Error **errp)
{
    int rc;

    trace_nbd_negotiate_send_info(info, nbd_info_lookup(info), length);
    /* Reply length covers the 16-bit info type plus the payload. */
    rc = nbd_negotiate_send_rep_len(client, NBD_REP_INFO,
                                    sizeof(info) + length, errp);
    if (rc < 0) {
        return rc;
    }
    info = cpu_to_be16(info);
    if (nbd_write(client->ioc, &info, sizeof(info), errp) < 0) {
        return -EIO;
    }
    if (nbd_write(client->ioc, buf, length, errp) < 0) {
        return -EIO;
    }
    return 0;
}
560
a16a7907
EB
/* nbd_reject_length: Handle any unexpected payload.
 * @fatal requests that we quit talking to the client, even if we are able
 * to successfully send an error reply.
 * Return:
 * -errno  transmission error occurred or @fatal was requested, errp is set
 * 0       error message successfully sent to client, errp is not set
 */
static int nbd_reject_length(NBDClient *client, bool fatal, Error **errp)
{
    int ret;

    assert(client->optlen);
    ret = nbd_opt_invalid(client, errp, "option '%s' has unexpected length",
                          nbd_opt_lookup(client->opt));
    if (fatal && !ret) {
        /* Reply was delivered, but the caller wants the connection dead. */
        error_setg(errp, "option '%s' has unexpected length",
                   nbd_opt_lookup(client->opt));
        return -EINVAL;
    }
    return ret;
}
582
f37708f6
EB
/* Handle NBD_OPT_INFO and NBD_OPT_GO.
 * Return -errno on error, 0 if ready for next option, and 1 to move
 * into transmission phase. */
static int nbd_negotiate_handle_info(NBDClient *client, Error **errp)
{
    int rc;
    g_autofree char *name = NULL;
    NBDExport *exp;
    uint16_t requests;
    uint16_t request;
    uint32_t namelen = 0;
    bool sendname = false;
    bool blocksize = false;
    uint32_t sizes[3];
    char buf[sizeof(uint64_t) + sizeof(uint16_t)];
    uint32_t check_align = 0;
    uint16_t myflags;

    /* Client sends:
        4 bytes: L, name length (can be 0)
        L bytes: export name
        2 bytes: N, number of requests (can be 0)
        N * 2 bytes: N requests
    */
    rc = nbd_opt_read_name(client, &name, &namelen, errp);
    if (rc <= 0) {
        return rc;
    }
    trace_nbd_negotiate_handle_export_name_request(name);

    rc = nbd_opt_read(client, &requests, sizeof(requests), false, errp);
    if (rc <= 0) {
        return rc;
    }
    requests = be16_to_cpu(requests);
    trace_nbd_negotiate_handle_info_requests(requests);
    while (requests--) {
        rc = nbd_opt_read(client, &request, sizeof(request), false, errp);
        if (rc <= 0) {
            return rc;
        }
        request = be16_to_cpu(request);
        trace_nbd_negotiate_handle_info_request(request,
                                                nbd_info_lookup(request));
        /* We care about NBD_INFO_NAME and NBD_INFO_BLOCK_SIZE;
         * everything else is either a request we don't know or
         * something we send regardless of request */
        switch (request) {
        case NBD_INFO_NAME:
            sendname = true;
            break;
        case NBD_INFO_BLOCK_SIZE:
            blocksize = true;
            break;
        }
    }
    if (client->optlen) {
        /* Trailing bytes beyond the declared requests: protocol error. */
        return nbd_reject_length(client, false, errp);
    }

    exp = nbd_export_find(name);
    if (!exp) {
        g_autofree char *sane_name = nbd_sanitize_name(name);

        return nbd_negotiate_send_rep_err(client, NBD_REP_ERR_UNKNOWN,
                                          errp, "export '%s' not present",
                                          sane_name);
    }
    if (client->opt == NBD_OPT_GO) {
        nbd_check_meta_export(client, exp);
    }

    /* Don't bother sending NBD_INFO_NAME unless client requested it */
    if (sendname) {
        rc = nbd_negotiate_send_info(client, NBD_INFO_NAME, namelen, name,
                                     errp);
        if (rc < 0) {
            return rc;
        }
    }

    /* Send NBD_INFO_DESCRIPTION only if available, regardless of
     * client request */
    if (exp->description) {
        size_t len = strlen(exp->description);

        assert(len <= NBD_MAX_STRING_SIZE);
        rc = nbd_negotiate_send_info(client, NBD_INFO_DESCRIPTION,
                                     len, exp->description, errp);
        if (rc < 0) {
            return rc;
        }
    }

    /* Send NBD_INFO_BLOCK_SIZE always, but tweak the minimum size
     * according to whether the client requested it, and according to
     * whether this is OPT_INFO or OPT_GO. */
    /* minimum - 1 for back-compat, or actual if client will obey it. */
    if (client->opt == NBD_OPT_INFO || blocksize) {
        check_align = sizes[0] = blk_get_request_alignment(exp->common.blk);
    } else {
        sizes[0] = 1;
    }
    assert(sizes[0] <= NBD_MAX_BUFFER_SIZE);
    /* preferred - Hard-code to 4096 for now.
     * TODO: is blk_bs(blk)->bl.opt_transfer appropriate? */
    sizes[1] = MAX(4096, sizes[0]);
    /* maximum - At most 32M, but smaller as appropriate. */
    sizes[2] = MIN(blk_get_max_transfer(exp->common.blk), NBD_MAX_BUFFER_SIZE);
    trace_nbd_negotiate_handle_info_block_size(sizes[0], sizes[1], sizes[2]);
    sizes[0] = cpu_to_be32(sizes[0]);
    sizes[1] = cpu_to_be32(sizes[1]);
    sizes[2] = cpu_to_be32(sizes[2]);
    rc = nbd_negotiate_send_info(client, NBD_INFO_BLOCK_SIZE,
                                 sizeof(sizes), sizes, errp);
    if (rc < 0) {
        return rc;
    }

    /* Send NBD_INFO_EXPORT always */
    myflags = exp->nbdflags;
    if (client->mode >= NBD_MODE_STRUCTURED) {
        myflags |= NBD_FLAG_SEND_DF;
    }
    if (client->mode >= NBD_MODE_EXTENDED &&
        (client->contexts.count || client->opt == NBD_OPT_INFO)) {
        myflags |= NBD_FLAG_BLOCK_STAT_PAYLOAD;
    }
    trace_nbd_negotiate_new_style_size_flags(exp->size, myflags);
    stq_be_p(buf, exp->size);
    stw_be_p(buf + 8, myflags);
    rc = nbd_negotiate_send_info(client, NBD_INFO_EXPORT,
                                 sizeof(buf), buf, errp);
    if (rc < 0) {
        return rc;
    }

    /*
     * If the client is just asking for NBD_OPT_INFO, but forgot to
     * request block sizes in a situation that would impact
     * performance, then return an error. But for NBD_OPT_GO, we
     * tolerate all clients, regardless of alignments.
     */
    if (client->opt == NBD_OPT_INFO && !blocksize &&
        blk_get_request_alignment(exp->common.blk) > 1) {
        return nbd_negotiate_send_rep_err(client,
                                          NBD_REP_ERR_BLOCK_SIZE_REQD,
                                          errp,
                                          "request NBD_INFO_BLOCK_SIZE to "
                                          "use this export");
    }

    /* Final reply */
    rc = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
    if (rc < 0) {
        return rc;
    }

    /* Only NBD_OPT_GO binds this client to the export. */
    if (client->opt == NBD_OPT_GO) {
        client->exp = exp;
        client->check_align = check_align;
        QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
        blk_exp_ref(&client->exp->common);
        rc = 1;
    }
    return rc;
}
750
ae6d91a7
ZY
/* Callback to learn when QIO TLS upgrade is complete */
struct NBDTLSServerHandshakeData {
    bool complete;      /* set once the handshake has finished */
    Error *error;       /* propagated handshake failure, if any */
    Coroutine *co;      /* coroutine that kicked off the handshake */
};

/* Completion callback for qio_channel_tls_handshake(): record the
 * outcome and wake the waiting coroutine unless it is still running
 * (i.e. the handshake finished synchronously). */
static void nbd_server_tls_handshake(QIOTask *task, void *opaque)
{
    struct NBDTLSServerHandshakeData *data = opaque;

    qio_task_propagate_error(task, &data->error);
    data->complete = true;
    if (!qemu_coroutine_entered(data->co)) {
        aio_co_wake(data->co);
    }
}
f37708f6 768
36683283
EB
/* Handle NBD_OPT_STARTTLS. Return NULL to drop connection, or else the
 * new channel for all further (now-encrypted) communication. */
static QIOChannel *nbd_negotiate_handle_starttls(NBDClient *client,
                                                 Error **errp)
{
    QIOChannel *ioc;
    QIOChannelTLS *tioc;
    struct NBDTLSServerHandshakeData data = { 0 };

    assert(client->opt == NBD_OPT_STARTTLS);

    trace_nbd_negotiate_handle_starttls();
    ioc = client->ioc;

    /* Ack the option before switching the channel to TLS. */
    if (nbd_negotiate_send_rep(client, NBD_REP_ACK, errp) < 0) {
        return NULL;
    }

    tioc = qio_channel_tls_new_server(ioc,
                                      client->tlscreds,
                                      client->tlsauthz,
                                      errp);
    if (!tioc) {
        return NULL;
    }

    qio_channel_set_name(QIO_CHANNEL(tioc), "nbd-server-tls");
    trace_nbd_negotiate_handle_starttls_handshake();
    data.co = qemu_coroutine_self();
    qio_channel_tls_handshake(tioc,
                              nbd_server_tls_handshake,
                              &data,
                              NULL,
                              NULL);

    if (!data.complete) {
        /* Handshake still in flight: yield (do not poll — we run in
         * coroutine context); nbd_server_tls_handshake() wakes us. */
        qemu_coroutine_yield();
        assert(data.complete);
    }

    if (data.error) {
        object_unref(OBJECT(tioc));
        error_propagate(errp, data.error);
        return NULL;
    }

    return QIO_CHANNEL(tioc);
}
817
e7b1948d
VSO
/* nbd_negotiate_send_meta_context
 *
 * Send one chunk of reply to NBD_OPT_{LIST,SET}_META_CONTEXT
 *
 * For NBD_OPT_LIST_META_CONTEXT @context_id is ignored, 0 is used instead.
 */
static int nbd_negotiate_send_meta_context(NBDClient *client,
                                           const char *context,
                                           uint32_t context_id,
                                           Error **errp)
{
    NBDOptionReplyMetaContext opt;
    struct iovec iov[] = {
        {.iov_base = &opt, .iov_len = sizeof(opt)},
        {.iov_base = (void *)context, .iov_len = strlen(context)}
    };

    assert(iov[1].iov_len <= NBD_MAX_STRING_SIZE);
    if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
        context_id = 0;
    }

    trace_nbd_negotiate_meta_query_reply(context, context_id);
    /* Reply length = context_id field + context name (header excluded). */
    set_be_option_rep(&opt.h, client->opt, NBD_REP_META_CONTEXT,
                      sizeof(opt) - sizeof(opt.h) + iov[1].iov_len);
    stl_be_p(&opt.context_id, context_id);

    return qio_channel_writev_all(client->ioc, iov, 2, errp) < 0 ? -EIO : 0;
}
847
ebd57062
EB
848/*
849 * Return true if @query matches @pattern, or if @query is empty when
850 * the @client is performing _LIST_.
dbb8b396 851 */
ebd57062
EB
852static bool nbd_meta_empty_or_pattern(NBDClient *client, const char *pattern,
853 const char *query)
e7b1948d 854{
ebd57062
EB
855 if (!*query) {
856 trace_nbd_negotiate_meta_query_parse("empty");
857 return client->opt == NBD_OPT_LIST_META_CONTEXT;
e7b1948d 858 }
ebd57062 859 if (strcmp(query, pattern) == 0) {
b0769d8f 860 trace_nbd_negotiate_meta_query_parse(pattern);
ebd57062 861 return true;
e7b1948d 862 }
ebd57062
EB
863 trace_nbd_negotiate_meta_query_skip("pattern not matched");
864 return false;
e7b1948d
VSO
865}
866
/*
 * Return true and adjust @str in place if it begins with @prefix.
 */
static bool nbd_strshift(const char **str, const char *prefix)
{
    size_t prefix_len = strlen(prefix);

    if (strncmp(*str, prefix, prefix_len) != 0) {
        return false;
    }
    *str += prefix_len;
    return true;
}
880
/* nbd_meta_base_query
 *
 * Handle queries to 'base' namespace. For now, only the base:allocation
 * context is available. Return true if @query has been handled.
 */
static bool nbd_meta_base_query(NBDClient *client, NBDMetaContexts *meta,
                                const char *query)
{
    if (!nbd_strshift(&query, "base:")) {
        /* Not our namespace; let another handler look at it. */
        return false;
    }
    trace_nbd_negotiate_meta_query_parse("base:");

    if (nbd_meta_empty_or_pattern(client, "allocation", query)) {
        meta->base_allocation = true;
    }
    return true;
}
899
/* nbd_meta_qemu_query
 *
 * Handle queries to 'qemu' namespace. For now, only the qemu:dirty-bitmap:
 * and qemu:allocation-depth contexts are available. Return true if @query
 * has been handled.
 */
static bool nbd_meta_qemu_query(NBDClient *client, NBDMetaContexts *meta,
                                const char *query)
{
    size_t i;

    if (!nbd_strshift(&query, "qemu:")) {
        /* Not our namespace; let another handler look at it. */
        return false;
    }
    trace_nbd_negotiate_meta_query_parse("qemu:");

    if (!*query) {
        /* Empty leaf: for _LIST_, select every qemu context at once. */
        if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
            meta->allocation_depth = meta->exp->allocation_depth;
            if (meta->exp->nr_export_bitmaps) {
                memset(meta->bitmaps, 1, meta->exp->nr_export_bitmaps);
            }
        }
        trace_nbd_negotiate_meta_query_parse("empty");
        return true;
    }

    if (strcmp(query, "allocation-depth") == 0) {
        trace_nbd_negotiate_meta_query_parse("allocation-depth");
        meta->allocation_depth = meta->exp->allocation_depth;
        return true;
    }

    if (nbd_strshift(&query, "dirty-bitmap:")) {
        trace_nbd_negotiate_meta_query_parse("dirty-bitmap:");
        if (!*query) {
            /* Empty bitmap name: for _LIST_, select all exported bitmaps. */
            if (client->opt == NBD_OPT_LIST_META_CONTEXT &&
                meta->exp->nr_export_bitmaps) {
                memset(meta->bitmaps, 1, meta->exp->nr_export_bitmaps);
            }
            trace_nbd_negotiate_meta_query_parse("empty");
            return true;
        }

        for (i = 0; i < meta->exp->nr_export_bitmaps; i++) {
            const char *bm_name;

            bm_name = bdrv_dirty_bitmap_name(meta->exp->export_bitmaps[i]);
            if (strcmp(bm_name, query) == 0) {
                meta->bitmaps[i] = true;
                trace_nbd_negotiate_meta_query_parse(query);
                return true;
            }
        }
        trace_nbd_negotiate_meta_query_skip("no dirty-bitmap match");
        return true;
    }

    trace_nbd_negotiate_meta_query_skip("unknown qemu context");
    return true;
}
961
/* nbd_negotiate_meta_query
 *
 * Parse namespace name and call corresponding function to parse body of the
 * query.
 *
 * The only supported namespaces are 'base' and 'qemu'.
 *
 * Return -errno on I/O error, 0 if option was completely handled by
 * sending a reply about inconsistent lengths, or 1 on success. */
static int nbd_negotiate_meta_query(NBDClient *client,
                                    NBDMetaContexts *meta, Error **errp)
{
    int ret;
    g_autofree char *query = NULL;
    uint32_t len;

    ret = nbd_opt_read(client, &len, sizeof(len), false, errp);
    if (ret <= 0) {
        return ret;
    }
    /*
     * Byte-swap the big-endian wire value; QEMU's cpu_to_be32() is its
     * own inverse, so it doubles as be32_to_cpu() here.
     */
    len = cpu_to_be32(len);

    if (len > NBD_MAX_STRING_SIZE) {
        /* Over-long query: skip its payload rather than fail the option. */
        trace_nbd_negotiate_meta_query_skip("length too long");
        return nbd_opt_skip(client, len, errp);
    }

    /* +1 for the NUL terminator we append below. */
    query = g_malloc(len + 1);
    ret = nbd_opt_read(client, query, len, true, errp);
    if (ret <= 0) {
        return ret;
    }
    query[len] = '\0';

    if (nbd_meta_base_query(client, meta, query)) {
        return 1;
    }
    if (nbd_meta_qemu_query(client, meta, query)) {
        return 1;
    }

    /* Unknown namespaces are silently skipped per the NBD spec. */
    trace_nbd_negotiate_meta_query_skip("unknown namespace");
    return 1;
}
1006
/* nbd_negotiate_meta_queries
 * Handle NBD_OPT_LIST_META_CONTEXT and NBD_OPT_SET_META_CONTEXT
 *
 * Return -errno on I/O error, or 0 if option was completely handled. */
static int nbd_negotiate_meta_queries(NBDClient *client, Error **errp)
{
    int ret;
    g_autofree char *export_name = NULL;
    /* Mark unused to work around https://bugs.llvm.org/show_bug.cgi?id=3888 */
    g_autofree G_GNUC_UNUSED bool *bitmaps = NULL;
    NBDMetaContexts local_meta = {0};
    NBDMetaContexts *meta;
    uint32_t nb_queries;
    size_t i;
    size_t count = 0;

    /* SET requires structured replies to have been negotiated first. */
    if (client->opt == NBD_OPT_SET_META_CONTEXT &&
        client->mode < NBD_MODE_STRUCTURED) {
        return nbd_opt_invalid(client, errp,
                               "request option '%s' when structured reply "
                               "is not negotiated",
                               nbd_opt_lookup(client->opt));
    }

    if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
        /* Only change the caller's meta on SET. */
        meta = &local_meta;
    } else {
        meta = &client->contexts;
    }

    /* Reset any selection left over from a previous SET. */
    g_free(meta->bitmaps);
    memset(meta, 0, sizeof(*meta));

    ret = nbd_opt_read_name(client, &export_name, NULL, errp);
    if (ret <= 0) {
        return ret;
    }

    meta->exp = nbd_export_find(export_name);
    if (meta->exp == NULL) {
        g_autofree char *sane_name = nbd_sanitize_name(export_name);

        return nbd_opt_drop(client, NBD_REP_ERR_UNKNOWN, errp,
                            "export '%s' not present", sane_name);
    }
    meta->bitmaps = g_new0(bool, meta->exp->nr_export_bitmaps);
    if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
        /* For LIST the array is temporary; auto-free it on return. */
        bitmaps = meta->bitmaps;
    }

    ret = nbd_opt_read(client, &nb_queries, sizeof(nb_queries), false, errp);
    if (ret <= 0) {
        return ret;
    }
    /* cpu_to_be32() is an involution, serving as be32_to_cpu() here. */
    nb_queries = cpu_to_be32(nb_queries);
    trace_nbd_negotiate_meta_context(nbd_opt_lookup(client->opt),
                                     export_name, nb_queries);

    if (client->opt == NBD_OPT_LIST_META_CONTEXT && !nb_queries) {
        /* enable all known contexts */
        meta->base_allocation = true;
        meta->allocation_depth = meta->exp->allocation_depth;
        if (meta->exp->nr_export_bitmaps) {
            memset(meta->bitmaps, 1, meta->exp->nr_export_bitmaps);
        }
    } else {
        for (i = 0; i < nb_queries; ++i) {
            ret = nbd_negotiate_meta_query(client, meta, errp);
            if (ret <= 0) {
                return ret;
            }
        }
    }

    /* Reply with one NBD_REP_META_CONTEXT per selected context. */
    if (meta->base_allocation) {
        ret = nbd_negotiate_send_meta_context(client, "base:allocation",
                                              NBD_META_ID_BASE_ALLOCATION,
                                              errp);
        if (ret < 0) {
            return ret;
        }
        count++;
    }

    if (meta->allocation_depth) {
        ret = nbd_negotiate_send_meta_context(client, "qemu:allocation-depth",
                                              NBD_META_ID_ALLOCATION_DEPTH,
                                              errp);
        if (ret < 0) {
            return ret;
        }
        count++;
    }

    for (i = 0; i < meta->exp->nr_export_bitmaps; i++) {
        const char *bm_name;
        g_autofree char *context = NULL;

        if (!meta->bitmaps[i]) {
            continue;
        }

        bm_name = bdrv_dirty_bitmap_name(meta->exp->export_bitmaps[i]);
        context = g_strdup_printf("qemu:dirty-bitmap:%s", bm_name);

        /* Context IDs for bitmaps are NBD_META_ID_DIRTY_BITMAP + index. */
        ret = nbd_negotiate_send_meta_context(client, context,
                                              NBD_META_ID_DIRTY_BITMAP + i,
                                              errp);
        if (ret < 0) {
            return ret;
        }
        count++;
    }

    ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
    if (ret == 0) {
        /* Only record the selection count once the final ACK succeeded. */
        meta->count = count;
    }

    return ret;
}
1129
/* nbd_negotiate_options
 * Process all NBD_OPT_* client option commands, during fixed newstyle
 * negotiation.
 * Return:
 * -errno  on error, errp is set
 * 0       on successful negotiation, errp is not set
 * 1       if client sent NBD_OPT_ABORT, i.e. on valid disconnect,
 *         errp is not set
 */
static int nbd_negotiate_options(NBDClient *client, Error **errp)
{
    uint32_t flags;
    bool fixedNewstyle = false;
    bool no_zeroes = false;

    /* Client sends:
        [ 0 ..   3]   client flags

       Then we loop until NBD_OPT_EXPORT_NAME or NBD_OPT_GO:
        [ 0 ..   7]   NBD_OPTS_MAGIC
        [ 8 ..  11]   NBD option
        [12 ..  15]   Data length
        ...           Rest of request

        [ 0 ..   7]   NBD_OPTS_MAGIC
        [ 8 ..  11]   Second NBD option
        [12 ..  15]   Data length
        ...           Rest of request
    */

    if (nbd_read32(client->ioc, &flags, "flags", errp) < 0) {
        return -EIO;
    }
    client->mode = NBD_MODE_EXPORT_NAME;
    trace_nbd_negotiate_options_flags(flags);
    if (flags & NBD_FLAG_C_FIXED_NEWSTYLE) {
        fixedNewstyle = true;
        flags &= ~NBD_FLAG_C_FIXED_NEWSTYLE;
        client->mode = NBD_MODE_SIMPLE;
    }
    if (flags & NBD_FLAG_C_NO_ZEROES) {
        no_zeroes = true;
        flags &= ~NBD_FLAG_C_NO_ZEROES;
    }
    /* Any flag bit we did not consume above is an error. */
    if (flags != 0) {
        error_setg(errp, "Unknown client flags 0x%" PRIx32 " received", flags);
        return -EINVAL;
    }

    while (1) {
        int ret;
        uint32_t option, length;
        uint64_t magic;

        if (nbd_read64(client->ioc, &magic, "opts magic", errp) < 0) {
            return -EINVAL;
        }
        trace_nbd_negotiate_options_check_magic(magic);
        if (magic != NBD_OPTS_MAGIC) {
            error_setg(errp, "Bad magic received");
            return -EINVAL;
        }

        if (nbd_read32(client->ioc, &option, "option", errp) < 0) {
            return -EINVAL;
        }
        client->opt = option;

        if (nbd_read32(client->ioc, &length, "option length", errp) < 0) {
            return -EINVAL;
        }
        /* Previous option must have consumed its entire payload. */
        assert(!client->optlen);
        client->optlen = length;

        if (length > NBD_MAX_BUFFER_SIZE) {
            error_setg(errp, "len (%" PRIu32 ") is larger than max len (%u)",
                       length, NBD_MAX_BUFFER_SIZE);
            return -EINVAL;
        }

        trace_nbd_negotiate_options_check_option(option,
                                                 nbd_opt_lookup(option));
        if (client->tlscreds &&
            client->ioc == (QIOChannel *)client->sioc) {
            /*
             * TLS is configured but the channel is still plaintext:
             * only NBD_OPT_STARTTLS (and ABORT) are acceptable.
             */
            QIOChannel *tioc;
            if (!fixedNewstyle) {
                error_setg(errp, "Unsupported option 0x%" PRIx32, option);
                return -EINVAL;
            }
            switch (option) {
            case NBD_OPT_STARTTLS:
                if (length) {
                    /* Unconditionally drop the connection if the client
                     * can't start a TLS negotiation correctly */
                    return nbd_reject_length(client, true, errp);
                }
                tioc = nbd_negotiate_handle_starttls(client, errp);
                if (!tioc) {
                    return -EIO;
                }
                ret = 0;
                /* Swap the plaintext channel for the TLS-wrapped one. */
                object_unref(OBJECT(client->ioc));
                client->ioc = tioc;
                break;

            case NBD_OPT_EXPORT_NAME:
                /* No way to return an error to client, so drop connection */
                error_setg(errp, "Option 0x%x not permitted before TLS",
                           option);
                return -EINVAL;

            default:
                /* Let the client keep trying, unless they asked to
                 * quit. Always try to give an error back to the
                 * client; but when replying to OPT_ABORT, be aware
                 * that the client may hang up before receiving the
                 * error, in which case we are fine ignoring the
                 * resulting EPIPE. */
                ret = nbd_opt_drop(client, NBD_REP_ERR_TLS_REQD,
                                   option == NBD_OPT_ABORT ? NULL : errp,
                                   "Option 0x%" PRIx32
                                   " not permitted before TLS", option);
                if (option == NBD_OPT_ABORT) {
                    return 1;
                }
                break;
            }
        } else if (fixedNewstyle) {
            switch (option) {
            case NBD_OPT_LIST:
                if (length) {
                    ret = nbd_reject_length(client, false, errp);
                } else {
                    ret = nbd_negotiate_handle_list(client, errp);
                }
                break;

            case NBD_OPT_ABORT:
                /* NBD spec says we must try to reply before
                 * disconnecting, but that we must also tolerate
                 * guests that don't wait for our reply. */
                nbd_negotiate_send_rep(client, NBD_REP_ACK, NULL);
                return 1;

            case NBD_OPT_EXPORT_NAME:
                return nbd_negotiate_handle_export_name(client, no_zeroes,
                                                        errp);

            case NBD_OPT_INFO:
            case NBD_OPT_GO:
                ret = nbd_negotiate_handle_info(client, errp);
                if (ret == 1) {
                    /* ret == 1 means negotiation finished via OPT_GO. */
                    assert(option == NBD_OPT_GO);
                    return 0;
                }
                break;

            case NBD_OPT_STARTTLS:
                if (length) {
                    ret = nbd_reject_length(client, false, errp);
                } else if (client->tlscreds) {
                    ret = nbd_negotiate_send_rep_err(client,
                                                     NBD_REP_ERR_INVALID, errp,
                                                     "TLS already enabled");
                } else {
                    ret = nbd_negotiate_send_rep_err(client,
                                                     NBD_REP_ERR_POLICY, errp,
                                                     "TLS not configured");
                }
                break;

            case NBD_OPT_STRUCTURED_REPLY:
                if (length) {
                    ret = nbd_reject_length(client, false, errp);
                } else if (client->mode >= NBD_MODE_EXTENDED) {
                    ret = nbd_negotiate_send_rep_err(
                        client, NBD_REP_ERR_EXT_HEADER_REQD, errp,
                        "extended headers already negotiated");
                } else if (client->mode >= NBD_MODE_STRUCTURED) {
                    ret = nbd_negotiate_send_rep_err(
                        client, NBD_REP_ERR_INVALID, errp,
                        "structured reply already negotiated");
                } else {
                    ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
                    client->mode = NBD_MODE_STRUCTURED;
                }
                break;

            case NBD_OPT_LIST_META_CONTEXT:
            case NBD_OPT_SET_META_CONTEXT:
                ret = nbd_negotiate_meta_queries(client, errp);
                break;

            case NBD_OPT_EXTENDED_HEADERS:
                if (length) {
                    ret = nbd_reject_length(client, false, errp);
                } else if (client->mode >= NBD_MODE_EXTENDED) {
                    ret = nbd_negotiate_send_rep_err(
                        client, NBD_REP_ERR_INVALID, errp,
                        "extended headers already negotiated");
                } else {
                    ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
                    client->mode = NBD_MODE_EXTENDED;
                }
                break;

            default:
                ret = nbd_opt_drop(client, NBD_REP_ERR_UNSUP, errp,
                                   "Unsupported option %" PRIu32 " (%s)",
                                   option, nbd_opt_lookup(option));
                break;
            }
        } else {
            /*
             * If broken new-style we should drop the connection
             * for anything except NBD_OPT_EXPORT_NAME
             */
            switch (option) {
            case NBD_OPT_EXPORT_NAME:
                return nbd_negotiate_handle_export_name(client, no_zeroes,
                                                        errp);

            default:
                error_setg(errp, "Unsupported option %" PRIu32 " (%s)",
                           option, nbd_opt_lookup(option));
                return -EINVAL;
            }
        }
        if (ret < 0) {
            return ret;
        }
    }
}
1363
/* nbd_negotiate
 * Return:
 * -errno  on error, errp is set
 * 0       on successful negotiation, errp is not set
 * 1       if client sent NBD_OPT_ABORT, i.e. on valid disconnect,
 *         errp is not set
 */
static coroutine_fn int nbd_negotiate(NBDClient *client, Error **errp)
{
    ERRP_GUARD();
    char buf[NBD_OLDSTYLE_NEGOTIATE_SIZE] = "";
    int ret;

    /* Old style negotiation header, no room for options
        [ 0 ..   7]   passwd       ("NBDMAGIC")
        [ 8 ..  15]   magic        (NBD_CLIENT_MAGIC)
        [16 ..  23]   size
        [24 ..  27]   export flags (zero-extended)
        [28 .. 151]   reserved     (0)

       New style negotiation header, client can send options
        [ 0 ..   7]   passwd       ("NBDMAGIC")
        [ 8 ..  15]   magic        (NBD_OPTS_MAGIC)
        [16 ..  17]   server flags (0)
        ....options sent, ending in NBD_OPT_EXPORT_NAME or NBD_OPT_GO....
     */

    /* Non-blocking I/O so the coroutine can yield instead of polling. */
    qio_channel_set_blocking(client->ioc, false, NULL);
    qio_channel_set_follow_coroutine_ctx(client->ioc, true);

    trace_nbd_negotiate_begin();
    memcpy(buf, "NBDMAGIC", 8);

    /* This server only speaks new style; advertise fixed-newstyle flags. */
    stq_be_p(buf + 8, NBD_OPTS_MAGIC);
    stw_be_p(buf + 16, NBD_FLAG_FIXED_NEWSTYLE | NBD_FLAG_NO_ZEROES);

    if (nbd_write(client->ioc, buf, 18, errp) < 0) {
        error_prepend(errp, "write failed: ");
        return -EINVAL;
    }
    ret = nbd_negotiate_options(client, errp);
    if (ret != 0) {
        if (ret < 0) {
            error_prepend(errp, "option negotiation failed: ");
        }
        return ret;
    }

    /* Successful negotiation must leave no unread option payload. */
    assert(!client->optlen);
    trace_nbd_negotiate_success();

    return 0;
}
1417
/* nbd_read_eof
 * Tries to read @size bytes from @ioc. This is a local implementation of
 * qio_channel_readv_all_eof. We have it here because we need it to be
 * interruptible and to know when the coroutine is yielding.
 * Returns 1 on success
 *         0 on eof, when no data was read (errp is not set)
 *         negative errno on failure (errp is set)
 */
static inline int coroutine_fn
nbd_read_eof(NBDClient *client, void *buffer, size_t size, Error **errp)
{
    bool partial = false;

    assert(size);
    while (size > 0) {
        struct iovec iov = { .iov_base = buffer, .iov_len = size };
        ssize_t len;

        len = qio_channel_readv(client->ioc, &iov, 1, errp);
        if (len == QIO_CHANNEL_ERR_BLOCK) {
            /*
             * No data ready: record that we are about to yield so
             * nbd_drained_poll() can see it, then yield until readable.
             * read_yielding is protected by client->lock.
             */
            WITH_QEMU_LOCK_GUARD(&client->lock) {
                client->read_yielding = true;

                /* Prompt main loop thread to re-run nbd_drained_poll() */
                aio_wait_kick();
            }
            qio_channel_yield(client->ioc, G_IO_IN);
            WITH_QEMU_LOCK_GUARD(&client->lock) {
                client->read_yielding = false;
                if (client->quiescing) {
                    /* Drained section started while we slept: back off. */
                    return -EAGAIN;
                }
            }
            continue;
        } else if (len < 0) {
            return -EIO;
        } else if (len == 0) {
            /* EOF mid-message is an error; EOF before any byte is clean. */
            if (partial) {
                error_setg(errp,
                           "Unexpected end-of-file before all bytes were read");
                return -EIO;
            } else {
                return 0;
            }
        }

        partial = true;
        size -= len;
        buffer = (uint8_t *) buffer + len;
    }
    return 1;
}
1470
/*
 * Read and decode one client request header into @request.
 * The header size depends on whether extended headers were negotiated.
 * Returns 0 on success, -EIO on short read/EOF, other -errno on failure.
 */
static int coroutine_fn nbd_receive_request(NBDClient *client, NBDRequest *request,
                                            Error **errp)
{
    uint8_t buf[NBD_EXTENDED_REQUEST_SIZE];
    uint32_t magic, expect;
    int ret;
    size_t size = client->mode >= NBD_MODE_EXTENDED ?
        NBD_EXTENDED_REQUEST_SIZE : NBD_REQUEST_SIZE;

    ret = nbd_read_eof(client, buf, size, errp);
    if (ret < 0) {
        return ret;
    }
    if (ret == 0) {
        /* Clean EOF where a request was expected is still an error here. */
        return -EIO;
    }

    /*
     * Compact request
     *  [ 0 ..  3]   magic   (NBD_REQUEST_MAGIC)
     *  [ 4 ..  5]   flags   (NBD_CMD_FLAG_FUA, ...)
     *  [ 6 ..  7]   type    (NBD_CMD_READ, ...)
     *  [ 8 .. 15]   cookie
     *  [16 .. 23]   from
     *  [24 .. 27]   len
     * Extended request
     *  [ 0 ..  3]   magic   (NBD_EXTENDED_REQUEST_MAGIC)
     *  [ 4 ..  5]   flags   (NBD_CMD_FLAG_FUA, NBD_CMD_FLAG_PAYLOAD_LEN, ...)
     *  [ 6 ..  7]   type    (NBD_CMD_READ, ...)
     *  [ 8 .. 15]   cookie
     *  [16 .. 23]   from
     *  [24 .. 31]   len
     */

    magic = ldl_be_p(buf);
    request->flags  = lduw_be_p(buf + 4);
    request->type   = lduw_be_p(buf + 6);
    request->cookie = ldq_be_p(buf + 8);
    request->from   = ldq_be_p(buf + 16);
    if (client->mode >= NBD_MODE_EXTENDED) {
        request->len = ldq_be_p(buf + 24);
        expect = NBD_EXTENDED_REQUEST_MAGIC;
    } else {
        request->len = (uint32_t)ldl_be_p(buf + 24); /* widen 32 to 64 bits */
        expect = NBD_REQUEST_MAGIC;
    }

    trace_nbd_receive_request(magic, request->flags, request->type,
                              request->from, request->len);

    if (magic != expect) {
        error_setg(errp, "invalid magic (got 0x%" PRIx32 ", expected 0x%"
                   PRIx32 ")", magic, expect);
        return -EINVAL;
    }
    return 0;
}
1528
41996e38
PB
/* Upper bound on requests concurrently in flight per client connection. */
#define MAX_NBD_REQUESTS 16

/* Runs in export AioContext and main loop thread */
void nbd_client_get(NBDClient *client)
{
    /* Atomic: refcount is shared between export and main-loop threads. */
    qatomic_inc(&client->refcount);
}
1536
/*
 * Drop a reference to @client; on the last reference, tear down all
 * client state. Must run in the main loop thread (final teardown touches
 * export lists and object refs owned by the main loop).
 */
void nbd_client_put(NBDClient *client)
{
    assert(qemu_in_main_thread());

    if (qatomic_fetch_dec(&client->refcount) == 1) {
        /* The last reference should be dropped by client->close,
         * which is called by client_close.
         */
        assert(client->closing);

        object_unref(OBJECT(client->sioc));
        object_unref(OBJECT(client->ioc));
        if (client->tlscreds) {
            object_unref(OBJECT(client->tlscreds));
        }
        g_free(client->tlsauthz);
        if (client->exp) {
            QTAILQ_REMOVE(&client->exp->clients, client, next);
            blk_exp_unref(&client->exp->common);
        }
        g_free(client->contexts.bitmaps);
        qemu_mutex_destroy(&client->lock);
        g_free(client);
    }
}
1562
/*
 * Tries to release the reference to @client, but only if other references
 * remain. This is an optimization for the common case where we want to avoid
 * the expense of scheduling nbd_client_put() in the main loop thread.
 *
 * Returns true upon success or false if the reference was not released because
 * it is the last reference.
 */
static bool nbd_client_put_nonzero(NBDClient *client)
{
    int old = qatomic_read(&client->refcount);
    int expected;

    /* Lock-free decrement that refuses to take the count from 1 to 0. */
    do {
        if (old == 1) {
            return false;
        }

        expected = old;
        old = qatomic_cmpxchg(&client->refcount, expected, expected - 1);
    } while (old != expected);

    return true;
}
1587
/*
 * Initiate shutdown of a client connection (idempotent). Shuts the
 * channel down so pending requests fail, then invokes close_fn so the
 * owner drops its reference. Must run in the main loop thread.
 * @negotiated is forwarded to close_fn.
 */
static void client_close(NBDClient *client, bool negotiated)
{
    assert(qemu_in_main_thread());

    /* First caller wins; subsequent calls are no-ops. */
    WITH_QEMU_LOCK_GUARD(&client->lock) {
        if (client->closing) {
            return;
        }

        client->closing = true;
    }

    /* Force requests to finish. They will drop their own references,
     * then we'll close the socket and free the NBDClient.
     */
    qio_channel_shutdown(client->ioc, QIO_CHANNEL_SHUTDOWN_BOTH,
                         NULL);

    /* Also tell the client, so that they release their reference. */
    if (client->close_fn) {
        client->close_fn(client, negotiated);
    }
}
1611
/* Runs in export AioContext with client->lock held */
static NBDRequestData *nbd_request_get(NBDClient *client)
{
    NBDRequestData *req;

    /* Caller must have throttled via MAX_NBD_REQUESTS before calling. */
    assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
    client->nb_requests++;

    req = g_new0(NBDRequestData, 1);
    req->client = client;
    return req;
}
1624
/* Runs in export AioContext with client->lock held */
static void nbd_request_put(NBDRequestData *req)
{
    NBDClient *client = req->client;

    if (req->data) {
        /* Paired with qemu_memalign()-style allocation; see qemu_vfree. */
        qemu_vfree(req->data);
    }
    g_free(req);

    client->nb_requests--;

    /* Wake any drained-section waiter once the last request completes. */
    if (client->quiescing && client->nb_requests == 0) {
        aio_wait_kick();
    }

    nbd_client_receive_next_request(client);
}
1643
/*
 * AioContext attach notifier for the export's BlockBackend.
 * Records the new context; by this point draining guarantees that no
 * client activity is in flight, which the asserts verify.
 */
static void blk_aio_attached(AioContext *ctx, void *opaque)
{
    NBDExport *exp = opaque;
    NBDClient *client;

    assert(qemu_in_main_thread());

    trace_nbd_blk_aio_attached(exp->name, ctx);

    exp->common.ctx = ctx;

    QTAILQ_FOREACH(client, &exp->clients, next) {
        WITH_QEMU_LOCK_GUARD(&client->lock) {
            assert(client->nb_requests == 0);
            assert(client->recv_coroutine == NULL);
            assert(client->send_coroutine == NULL);
        }
    }
}
1663
/*
 * AioContext detach notifier: the export no longer has a home context
 * until blk_aio_attached() runs again.
 */
static void blk_aio_detach(void *opaque)
{
    NBDExport *exp = opaque;

    assert(qemu_in_main_thread());

    trace_nbd_blk_aio_detach(exp->name, exp->common.ctx);

    exp->common.ctx = NULL;
}
1674
/*
 * drained_begin callback: mark every client quiescing so in-flight
 * coroutines stop picking up new requests (see nbd_read_eof).
 */
static void nbd_drained_begin(void *opaque)
{
    NBDExport *exp = opaque;
    NBDClient *client;

    assert(qemu_in_main_thread());

    QTAILQ_FOREACH(client, &exp->clients, next) {
        WITH_QEMU_LOCK_GUARD(&client->lock) {
            client->quiescing = true;
        }
    }
}
f148ae7d 1688
/*
 * drained_end callback: clear the quiescing flag and restart request
 * reception for each client.
 */
static void nbd_drained_end(void *opaque)
{
    NBDExport *exp = opaque;
    NBDClient *client;

    assert(qemu_in_main_thread());

    QTAILQ_FOREACH(client, &exp->clients, next) {
        WITH_QEMU_LOCK_GUARD(&client->lock) {
            client->quiescing = false;
            nbd_client_receive_next_request(client);
        }
    }
}
1703
/* Runs in export AioContext */
static void nbd_wake_read_bh(void *opaque)
{
    NBDClient *client = opaque;
    /* Kick a coroutine parked in qio_channel_yield(G_IO_IN). */
    qio_channel_wake_read(client->ioc);
}
1710
/*
 * drained_poll callback: report whether any client still has requests
 * in flight. Returns true while draining must keep polling.
 */
static bool nbd_drained_poll(void *opaque)
{
    NBDExport *exp = opaque;
    NBDClient *client;

    assert(qemu_in_main_thread());

    QTAILQ_FOREACH(client, &exp->clients, next) {
        WITH_QEMU_LOCK_GUARD(&client->lock) {
            if (client->nb_requests != 0) {
                /*
                 * If there's a coroutine waiting for a request on nbd_read_eof()
                 * enter it here so we don't depend on the client to wake it up.
                 *
                 * Schedule a BH in the export AioContext to avoid missing the
                 * wake up due to the race between qio_channel_wake_read() and
                 * qio_channel_yield().
                 */
                if (client->recv_coroutine != NULL && client->read_yielding) {
                    aio_bh_schedule_oneshot(nbd_export_aio_context(client->exp),
                                            nbd_wake_read_bh, client);
                }

                return true;
            }
        }
    }

    return false;
}
1741
/*
 * Medium-removal notifier: shut the export down when the backing device
 * is ejected.
 */
static void nbd_eject_notifier(Notifier *n, void *data)
{
    NBDExport *exp = container_of(n, NBDExport, eject_notifier);

    assert(qemu_in_main_thread());

    blk_exp_request_shutdown(&exp->common);
}
1750
/*
 * Register @blk so that ejecting its medium shuts down @exp.
 * Takes a reference on @blk, released when the notifier owner tears down.
 * May be called at most once per export (see assert).
 */
void nbd_export_set_on_eject_blk(BlockExport *exp, BlockBackend *blk)
{
    NBDExport *nbd_exp = container_of(exp, NBDExport, common);
    assert(exp->drv == &blk_exp_nbd);
    assert(nbd_exp->eject_notifier_blk == NULL);

    blk_ref(blk);
    nbd_exp->eject_notifier_blk = blk;
    nbd_exp->eject_notifier.notify = nbd_eject_notifier;
    blk_add_remove_bs_notifier(blk, &nbd_exp->eject_notifier);
}
1762
/* Device ops wiring the NBD export into the block layer's drain protocol. */
static const BlockDevOps nbd_block_ops = {
    .drained_begin = nbd_drained_begin,
    .drained_end = nbd_drained_end,
    .drained_poll = nbd_drained_poll,
};
1768
/*
 * BlockExportDriver create callback: validate options, resolve any
 * requested dirty bitmaps, and register the export in the global list.
 * Returns 0 on success or a negative errno with @errp set.
 */
static int nbd_export_create(BlockExport *blk_exp, BlockExportOptions *exp_args,
                             Error **errp)
{
    NBDExport *exp = container_of(blk_exp, NBDExport, common);
    BlockExportOptionsNbd *arg = &exp_args->u.nbd;
    /* Fall back to the node name when no explicit export name is given. */
    const char *name = arg->name ?: exp_args->node_name;
    BlockBackend *blk = blk_exp->blk;
    int64_t size;
    uint64_t perm, shared_perm;
    bool readonly = !exp_args->writable;
    BlockDirtyBitmapOrStrList *bitmaps;
    size_t i;
    int ret;

    GLOBAL_STATE_CODE();
    assert(exp_args->type == BLOCK_EXPORT_TYPE_NBD);

    if (!nbd_server_is_running()) {
        error_setg(errp, "NBD server not running");
        return -EINVAL;
    }

    if (strlen(name) > NBD_MAX_STRING_SIZE) {
        error_setg(errp, "export name '%s' too long", name);
        return -EINVAL;
    }

    if (arg->description && strlen(arg->description) > NBD_MAX_STRING_SIZE) {
        error_setg(errp, "description '%s' too long", arg->description);
        return -EINVAL;
    }

    if (nbd_export_find(name)) {
        error_setg(errp, "NBD server already has export named '%s'", name);
        return -EEXIST;
    }

    size = blk_getlength(blk);
    if (size < 0) {
        error_setg_errno(errp, -size,
                         "Failed to determine the NBD export's length");
        return size;
    }

    /* Don't allow resize while the NBD server is running, otherwise we don't
     * care what happens with the node. */
    blk_get_perm(blk, &perm, &shared_perm);
    ret = blk_set_perm(blk, perm, shared_perm & ~BLK_PERM_RESIZE, errp);
    if (ret < 0) {
        return ret;
    }

    QTAILQ_INIT(&exp->clients);
    exp->name = g_strdup(name);
    exp->description = g_strdup(arg->description);
    exp->nbdflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_FLUSH |
                     NBD_FLAG_SEND_FUA | NBD_FLAG_SEND_CACHE);

    if (nbd_server_max_connections() != 1) {
        exp->nbdflags |= NBD_FLAG_CAN_MULTI_CONN;
    }
    if (readonly) {
        exp->nbdflags |= NBD_FLAG_READ_ONLY;
    } else {
        exp->nbdflags |= (NBD_FLAG_SEND_TRIM | NBD_FLAG_SEND_WRITE_ZEROES |
                          NBD_FLAG_SEND_FAST_ZERO);
    }
    exp->size = QEMU_ALIGN_DOWN(size, BDRV_SECTOR_SIZE);

    /* Bitmap lookup walks the node graph; hold the reader lock. */
    bdrv_graph_rdlock_main_loop();

    for (bitmaps = arg->bitmaps; bitmaps; bitmaps = bitmaps->next) {
        exp->nr_export_bitmaps++;
    }
    exp->export_bitmaps = g_new0(BdrvDirtyBitmap *, exp->nr_export_bitmaps);
    for (i = 0, bitmaps = arg->bitmaps; bitmaps;
         i++, bitmaps = bitmaps->next)
    {
        const char *bitmap;
        BlockDriverState *bs = blk_bs(blk);
        BdrvDirtyBitmap *bm = NULL;

        switch (bitmaps->value->type) {
        case QTYPE_QSTRING:
            /* Plain name: search this node and its backing/filter chain. */
            bitmap = bitmaps->value->u.local;
            while (bs) {
                bm = bdrv_find_dirty_bitmap(bs, bitmap);
                if (bm != NULL) {
                    break;
                }

                bs = bdrv_filter_or_cow_bs(bs);
            }

            if (bm == NULL) {
                ret = -ENOENT;
                error_setg(errp, "Bitmap '%s' is not found",
                           bitmaps->value->u.local);
                goto fail;
            }

            if (readonly && bdrv_is_writable(bs) &&
                bdrv_dirty_bitmap_enabled(bm)) {
                ret = -EINVAL;
                error_setg(errp, "Enabled bitmap '%s' incompatible with "
                           "readonly export", bitmap);
                goto fail;
            }
            break;
        case QTYPE_QDICT:
            /* Explicit {node, name} pair: look up on that exact node. */
            bitmap = bitmaps->value->u.external.name;
            bm = block_dirty_bitmap_lookup(bitmaps->value->u.external.node,
                                           bitmap, NULL, errp);
            if (!bm) {
                ret = -ENOENT;
                goto fail;
            }
            break;
        default:
            abort();
        }

        assert(bm);

        if (bdrv_dirty_bitmap_check(bm, BDRV_BITMAP_ALLOW_RO, errp)) {
            ret = -EINVAL;
            goto fail;
        }

        exp->export_bitmaps[i] = bm;
        assert(strlen(bitmap) <= BDRV_BITMAP_MAX_NAME_SIZE);
    }

    /* Mark bitmaps busy in a separate loop, to simplify roll-back concerns. */
    for (i = 0; i < exp->nr_export_bitmaps; i++) {
        bdrv_dirty_bitmap_set_busy(exp->export_bitmaps[i], true);
    }

    exp->allocation_depth = arg->allocation_depth;

    /*
     * We need to inhibit request queuing in the block layer to ensure we can
     * be properly quiesced when entering a drained section, as our coroutines
     * servicing pending requests might enter blk_pread().
     */
    blk_set_disable_request_queuing(blk, true);

    blk_add_aio_context_notifier(blk, blk_aio_attached, blk_aio_detach, exp);

    blk_set_dev_ops(blk, &nbd_block_ops, exp);

    QTAILQ_INSERT_TAIL(&exports, exp, next);

    bdrv_graph_rdunlock_main_loop();

    return 0;

fail:
    bdrv_graph_rdunlock_main_loop();
    g_free(exp->export_bitmaps);
    g_free(exp->name);
    g_free(exp->description);
    return ret;
}
1933
ee0a19ec
PB
1934NBDExport *nbd_export_find(const char *name)
1935{
1936 NBDExport *exp;
1937 QTAILQ_FOREACH(exp, &exports, next) {
1938 if (strcmp(name, exp->name) == 0) {
1939 return exp;
1940 }
1941 }
1942
1943 return NULL;
1944}
1945
61bc846d
EB
/* Return the AioContext the export's common block-export state is bound to. */
AioContext *
nbd_export_aio_context(NBDExport *exp)
{
    return exp->common.ctx;
}
1951
/*
 * BlockExportDriver.request_shutdown callback: stop advertising the
 * export and force-close all connected clients.  Safe to invoke more
 * than once (see TODO below); the name is freed and the export removed
 * from the global list only on the first call.
 */
static void nbd_export_request_shutdown(BlockExport *blk_exp)
{
    NBDExport *exp = container_of(blk_exp, NBDExport, common);
    NBDClient *client, *next;

    /* Hold a temporary reference so closing the last client cannot
     * delete the export out from under us. */
    blk_exp_ref(&exp->common);
    /*
     * TODO: Should we expand QMP NbdServerRemoveNode enum to allow a
     * close mode that stops advertising the export to new clients but
     * still permits existing clients to run to completion? Because of
     * that possibility, nbd_export_close() can be called more than
     * once on an export.
     */
    QTAILQ_FOREACH_SAFE(client, &exp->clients, next, next) {
        client_close(client, true);
    }
    if (exp->name) {
        /* First shutdown: withdraw the export from nbd_export_find(). */
        g_free(exp->name);
        exp->name = NULL;
        QTAILQ_REMOVE(&exports, exp, next);
    }
    blk_exp_unref(&exp->common);
}
1975
/*
 * BlockExportDriver.delete callback: final teardown once the last
 * reference is gone.  Requires that the export was already shut down
 * (name cleared, no clients) -- asserted below.
 */
static void nbd_export_delete(BlockExport *blk_exp)
{
    size_t i;
    NBDExport *exp = container_of(blk_exp, NBDExport, common);

    assert(exp->name == NULL);
    assert(QTAILQ_EMPTY(&exp->clients));

    g_free(exp->description);
    exp->description = NULL;

    if (exp->eject_notifier_blk) {
        notifier_remove(&exp->eject_notifier);
        blk_unref(exp->eject_notifier_blk);
    }
    /* Undo the notifier/queuing setup done at export creation time. */
    blk_remove_aio_context_notifier(exp->common.blk, blk_aio_attached,
                                    blk_aio_detach, exp);
    blk_set_disable_request_queuing(exp->common.blk, false);

    /* Release the busy marking taken on each exported dirty bitmap. */
    for (i = 0; i < exp->nr_export_bitmaps; i++) {
        bdrv_dirty_bitmap_set_busy(exp->export_bitmaps[i], false);
    }
}
1999
56ee8626
KW
/* NBD implementation of the generic block-export driver interface. */
const BlockExportDriver blk_exp_nbd = {
    .type               = BLOCK_EXPORT_TYPE_NBD,
    .instance_size      = sizeof(NBDExport),
    .create             = nbd_export_create,
    .delete             = nbd_export_delete,
    .request_shutdown   = nbd_export_request_shutdown,
};
2007
de79bfc3
VSO
/*
 * Send all of @iov to the client.  send_lock serializes writers so that
 * replies from concurrent request coroutines never interleave on the
 * wire; send_coroutine is published while the lock is held (presumably
 * consulted by wakeup logic elsewhere -- not visible in this chunk).
 *
 * Returns 0 on success, -EIO on any transmission failure.
 */
static int coroutine_fn nbd_co_send_iov(NBDClient *client, struct iovec *iov,
                                        unsigned niov, Error **errp)
{
    int ret;

    g_assert(qemu_in_coroutine());
    qemu_co_mutex_lock(&client->send_lock);
    client->send_coroutine = qemu_coroutine_self();

    ret = qio_channel_writev_all(client->ioc, iov, niov, errp) < 0 ? -EIO : 0;

    client->send_coroutine = NULL;
    qemu_co_mutex_unlock(&client->send_lock);

    return ret;
}
2024
/*
 * Fill in @reply (simple reply header) in network byte order.
 * @error must already be an NBD protocol error code (the caller runs
 * system_errno_to_nbd_errno() first), not a host errno.
 */
static inline void set_be_simple_reply(NBDSimpleReply *reply, uint64_t error,
                                       uint64_t cookie)
{
    stl_be_p(&reply->magic, NBD_SIMPLE_REPLY_MAGIC);
    stl_be_p(&reply->error, error);
    stq_be_p(&reply->cookie, cookie);
}
2032
/*
 * Send a simple (non-structured) reply, optionally followed by @len
 * bytes of payload.  A payload is only permitted on success (asserted),
 * and simple replies to NBD_CMD_READ are only valid for clients that
 * did not negotiate structured replies (also asserted).
 *
 * Returns 0 if the connection is still usable, -EIO on send failure.
 */
static int coroutine_fn nbd_co_send_simple_reply(NBDClient *client,
                                                 NBDRequest *request,
                                                 uint32_t error,
                                                 void *data,
                                                 uint64_t len,
                                                 Error **errp)
{
    NBDSimpleReply reply;
    int nbd_err = system_errno_to_nbd_errno(error);
    struct iovec iov[] = {
        {.iov_base = &reply, .iov_len = sizeof(reply)},
        {.iov_base = data, .iov_len = len}
    };

    assert(!len || !nbd_err);
    assert(len <= NBD_MAX_BUFFER_SIZE);
    assert(client->mode < NBD_MODE_STRUCTURED ||
           (client->mode == NBD_MODE_STRUCTURED &&
            request->type != NBD_CMD_READ));
    trace_nbd_co_send_simple_reply(request->cookie, nbd_err,
                                   nbd_err_lookup(nbd_err), len);
    set_be_simple_reply(&reply, nbd_err, request->cookie);

    return nbd_co_send_iov(client, iov, 2, errp);
}
2058
a7c8ed36
EB
/*
 * Prepare the header of a reply chunk for network transmission.
 *
 * On input, @iov is partially initialized: iov[0].iov_base must point
 * to an uninitialized NBDReply, while the remaining @niov elements
 * (if any) must be ready for transmission. This function then
 * populates iov[0] for transmission.
 *
 * The header layout depends on the negotiated mode: extended-header
 * clients get the 64-bit NBDExtendedReplyChunk (which also echoes the
 * request offset), everyone else the NBDStructuredReplyChunk.
 */
static inline void set_be_chunk(NBDClient *client, struct iovec *iov,
                                size_t niov, uint16_t flags, uint16_t type,
                                NBDRequest *request)
{
    size_t i, length = 0;

    /* Payload length is the sum of everything after the header slot. */
    for (i = 1; i < niov; i++) {
        length += iov[i].iov_len;
    }
    assert(length <= NBD_MAX_BUFFER_SIZE + sizeof(NBDStructuredReadData));

    if (client->mode >= NBD_MODE_EXTENDED) {
        NBDExtendedReplyChunk *chunk = iov->iov_base;

        iov[0].iov_len = sizeof(*chunk);
        stl_be_p(&chunk->magic, NBD_EXTENDED_REPLY_MAGIC);
        stw_be_p(&chunk->flags, flags);
        stw_be_p(&chunk->type, type);
        stq_be_p(&chunk->cookie, request->cookie);
        stq_be_p(&chunk->offset, request->from);
        stq_be_p(&chunk->length, length);
    } else {
        NBDStructuredReplyChunk *chunk = iov->iov_base;

        iov[0].iov_len = sizeof(*chunk);
        stl_be_p(&chunk->magic, NBD_STRUCTURED_REPLY_MAGIC);
        stw_be_p(&chunk->flags, flags);
        stw_be_p(&chunk->type, type);
        stq_be_p(&chunk->cookie, request->cookie);
        stl_be_p(&chunk->length, length);
    }
}
2099
/*
 * Send a bare NBD_REPLY_TYPE_NONE chunk with the DONE flag, ending a
 * structured/extended reply sequence without any payload.
 */
static int coroutine_fn nbd_co_send_chunk_done(NBDClient *client,
                                               NBDRequest *request,
                                               Error **errp)
{
    NBDReply hdr;
    struct iovec iov[] = {
        {.iov_base = &hdr},
    };

    trace_nbd_co_send_chunk_done(request->cookie);
    set_be_chunk(client, iov, 1, NBD_REPLY_FLAG_DONE,
                 NBD_REPLY_TYPE_NONE, request);
    return nbd_co_send_iov(client, iov, 1, errp);
}
2114
/*
 * Send one NBD_REPLY_TYPE_OFFSET_DATA chunk carrying @size bytes of
 * read data at @offset.  @final sets NBD_REPLY_FLAG_DONE, marking the
 * last chunk of the reply.  @size must be non-zero (asserted).
 */
static int coroutine_fn nbd_co_send_chunk_read(NBDClient *client,
                                               NBDRequest *request,
                                               uint64_t offset,
                                               void *data,
                                               uint64_t size,
                                               bool final,
                                               Error **errp)
{
    NBDReply hdr;
    NBDStructuredReadData chunk;
    struct iovec iov[] = {
        {.iov_base = &hdr},
        {.iov_base = &chunk, .iov_len = sizeof(chunk)},
        {.iov_base = data, .iov_len = size}
    };

    assert(size && size <= NBD_MAX_BUFFER_SIZE);
    trace_nbd_co_send_chunk_read(request->cookie, offset, data, size);
    set_be_chunk(client, iov, 3, final ? NBD_REPLY_FLAG_DONE : 0,
                 NBD_REPLY_TYPE_OFFSET_DATA, request);
    stq_be_p(&chunk.offset, offset);

    return nbd_co_send_iov(client, iov, 3, errp);
}
ac132d05 2139
/*
 * Send an NBD_REPLY_TYPE_ERROR chunk (with DONE flag) carrying the NBD
 * translation of host errno @error plus an optional human-readable
 * @msg.  @error must be non-zero after translation (asserted).
 */
static int coroutine_fn nbd_co_send_chunk_error(NBDClient *client,
                                                NBDRequest *request,
                                                uint32_t error,
                                                const char *msg,
                                                Error **errp)
{
    NBDReply hdr;
    NBDStructuredError chunk;
    int nbd_err = system_errno_to_nbd_errno(error);
    struct iovec iov[] = {
        {.iov_base = &hdr},
        {.iov_base = &chunk, .iov_len = sizeof(chunk)},
        {.iov_base = (char *)msg, .iov_len = msg ? strlen(msg) : 0},
    };

    assert(nbd_err);
    trace_nbd_co_send_chunk_error(request->cookie, nbd_err,
                                  nbd_err_lookup(nbd_err), msg ? msg : "");
    set_be_chunk(client, iov, 3, NBD_REPLY_FLAG_DONE,
                 NBD_REPLY_TYPE_ERROR, request);
    stl_be_p(&chunk.error, nbd_err);
    stw_be_p(&chunk.message_length, iov[2].iov_len);

    return nbd_co_send_iov(client, iov, 3, errp);
}
2165
/* Do a sparse read and send the structured reply to the client.
 * Returns -errno if sending fails. blk_co_block_status_above() failure is
 * reported to the client, at which point this function succeeds.
 *
 * Zero ranges are sent as NBD_REPLY_TYPE_OFFSET_HOLE chunks (no data on
 * the wire); everything else is read into @data and sent as OFFSET_DATA
 * chunks.  The chunk covering the end of the range carries the DONE flag.
 */
static int coroutine_fn nbd_co_send_sparse_read(NBDClient *client,
                                                NBDRequest *request,
                                                uint64_t offset,
                                                uint8_t *data,
                                                uint64_t size,
                                                Error **errp)
{
    int ret = 0;
    NBDExport *exp = client->exp;
    size_t progress = 0;    /* bytes of [offset, offset+size) handled so far */

    assert(size <= NBD_MAX_BUFFER_SIZE);
    while (progress < size) {
        int64_t pnum;
        int status = blk_co_block_status_above(exp->common.blk, NULL,
                                               offset + progress,
                                               size - progress, &pnum, NULL,
                                               NULL);
        bool final;

        if (status < 0) {
            /* Report the failure to the client; the reply itself succeeds. */
            char *msg = g_strdup_printf("unable to check for holes: %s",
                                        strerror(-status));

            ret = nbd_co_send_chunk_error(client, request, -status, msg, errp);
            g_free(msg);
            return ret;
        }
        assert(pnum && pnum <= size - progress);
        final = progress + pnum == size;
        if (status & BDRV_BLOCK_ZERO) {
            /* Hole: describe the range instead of transmitting zeroes. */
            NBDReply hdr;
            NBDStructuredReadHole chunk;
            struct iovec iov[] = {
                {.iov_base = &hdr},
                {.iov_base = &chunk, .iov_len = sizeof(chunk)},
            };

            trace_nbd_co_send_chunk_read_hole(request->cookie,
                                              offset + progress, pnum);
            set_be_chunk(client, iov, 2,
                         final ? NBD_REPLY_FLAG_DONE : 0,
                         NBD_REPLY_TYPE_OFFSET_HOLE, request);
            stq_be_p(&chunk.offset, offset + progress);
            stl_be_p(&chunk.length, pnum);
            ret = nbd_co_send_iov(client, iov, 2, errp);
        } else {
            ret = blk_co_pread(exp->common.blk, offset + progress, pnum,
                               data + progress, 0);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "reading from file failed");
                break;
            }
            ret = nbd_co_send_chunk_read(client, request, offset + progress,
                                         data + progress, pnum, final, errp);
        }

        if (ret < 0) {
            break;
        }
        progress += pnum;
    }
    return ret;
}
2234
/*
 * Accumulator for block-status extents prior to transmission.  Extents
 * are stored in 64-bit form; narrow (32-bit) clients get a converted
 * copy at send time.
 */
typedef struct NBDExtentArray {
    NBDExtent64 *extents;        /* array of nb_alloc entries, count used */
    unsigned int nb_alloc;       /* allocated capacity */
    unsigned int count;          /* entries currently populated */
    uint64_t total_length;       /* sum of all extent lengths added */
    bool extended;               /* client negotiated 64-bit extents */
    bool can_add;                /* false once full or converted */
    bool converted_to_be;        /* guards against double conversion */
} NBDExtentArray;
2244
bcc16cc1
EB
2245static NBDExtentArray *nbd_extent_array_new(unsigned int nb_alloc,
2246 NBDMode mode)
89cbc7e3
VSO
2247{
2248 NBDExtentArray *ea = g_new0(NBDExtentArray, 1);
2249
bcc16cc1 2250 assert(mode >= NBD_MODE_STRUCTURED);
89cbc7e3 2251 ea->nb_alloc = nb_alloc;
bcc16cc1
EB
2252 ea->extents = g_new(NBDExtent64, nb_alloc);
2253 ea->extended = mode >= NBD_MODE_EXTENDED;
89cbc7e3
VSO
2254 ea->can_add = true;
2255
2256 return ea;
2257}
2258
/* Release an extent array and its backing storage. */
static void nbd_extent_array_free(NBDExtentArray *ea)
{
    g_free(ea->extents);
    g_free(ea);
}
/* Enables g_autoptr(NBDExtentArray) for scope-bound cleanup. */
G_DEFINE_AUTOPTR_CLEANUP_FUNC(NBDExtentArray, nbd_extent_array_free)
89cbc7e3
VSO
2265
/* Further modifications of the array after conversion are abandoned */
/* Convert all extents to big-endian in place (64-bit wire format only). */
static void nbd_extent_array_convert_to_be(NBDExtentArray *ea)
{
    int i;

    assert(!ea->converted_to_be);
    assert(ea->extended);    /* narrow clients use ..._convert_to_narrow() */
    ea->can_add = false;
    ea->converted_to_be = true;

    for (i = 0; i < ea->count; i++) {
        ea->extents[i].length = cpu_to_be64(ea->extents[i].length);
        ea->extents[i].flags = cpu_to_be64(ea->extents[i].flags);
    }
}
2281
bcc16cc1
EB
/* Further modifications of the array after conversion are abandoned */
/*
 * Produce a newly-allocated 32-bit big-endian copy of the extents for
 * non-extended clients; caller owns (g_free) the result.  All stored
 * values must fit in 32 bits (asserted).
 */
static NBDExtent32 *nbd_extent_array_convert_to_narrow(NBDExtentArray *ea)
{
    int i;
    NBDExtent32 *extents = g_new(NBDExtent32, ea->count);

    assert(!ea->converted_to_be);
    assert(!ea->extended);
    ea->can_add = false;
    ea->converted_to_be = true;    /* reused to lock out further conversion */

    for (i = 0; i < ea->count; i++) {
        assert((ea->extents[i].length | ea->extents[i].flags) <= UINT32_MAX);
        extents[i].length = cpu_to_be32(ea->extents[i].length);
        extents[i].flags = cpu_to_be32(ea->extents[i].flags);
    }

    return extents;
}
2301
/*
 * Add extent to NBDExtentArray. If extent can't be added (no available space),
 * return -1.
 * For safety, when returning -1 for the first time, .can_add is set to false,
 * and further calls to nbd_extent_array_add() will crash.
 * (this avoids the situation where a caller ignores failure to add one extent,
 * where adding another extent that would squash into the last array entry
 * would result in an incorrect range reported to the client)
 */
static int nbd_extent_array_add(NBDExtentArray *ea,
                                uint64_t length, uint32_t flags)
{
    assert(ea->can_add);

    if (!length) {
        return 0;
    }
    if (!ea->extended) {
        /* Narrow clients can only be sent 32-bit extent lengths. */
        assert(length <= UINT32_MAX);
    }

    /* Extend previous extent if flags are the same */
    if (ea->count > 0 && flags == ea->extents[ea->count - 1].flags) {
        uint64_t sum = length + ea->extents[ea->count - 1].length;

        /*
         * sum cannot overflow: the block layer bounds image size at
         * 2^63, and ea->extents[].length comes from the block layer.
         */
        assert(sum >= length);
        if (sum <= UINT32_MAX || ea->extended) {
            ea->extents[ea->count - 1].length = sum;
            ea->total_length += length;
            return 0;
        }
        /* Else: merged length would overflow 32 bits; start a new entry. */
    }

    if (ea->count >= ea->nb_alloc) {
        ea->can_add = false;
        return -1;
    }

    ea->total_length += length;
    ea->extents[ea->count] = (NBDExtent64) {.length = length, .flags = flags};
    ea->count++;

    return 0;
}
2350
ff7e261b 2351static int coroutine_fn blockstatus_to_extents(BlockBackend *blk,
6f58ac55
EGE
2352 uint64_t offset, uint64_t bytes,
2353 NBDExtentArray *ea)
89cbc7e3
VSO
2354{
2355 while (bytes) {
e7b1948d
VSO
2356 uint32_t flags;
2357 int64_t num;
ff7e261b
EGE
2358 int ret = blk_co_block_status_above(blk, NULL, offset, bytes, &num,
2359 NULL, NULL);
fb7afc79 2360
e7b1948d
VSO
2361 if (ret < 0) {
2362 return ret;
2363 }
2364
0da98568
NS
2365 flags = (ret & BDRV_BLOCK_DATA ? 0 : NBD_STATE_HOLE) |
2366 (ret & BDRV_BLOCK_ZERO ? NBD_STATE_ZERO : 0);
e7b1948d 2367
89cbc7e3
VSO
2368 if (nbd_extent_array_add(ea, num, flags) < 0) {
2369 return 0;
e7b1948d 2370 }
fb7afc79 2371
89cbc7e3
VSO
2372 offset += num;
2373 bytes -= num;
e7b1948d
VSO
2374 }
2375
e7b1948d
VSO
2376 return 0;
2377}
2378
ff7e261b 2379static int coroutine_fn blockalloc_to_extents(BlockBackend *blk,
6f58ac55
EGE
2380 uint64_t offset, uint64_t bytes,
2381 NBDExtentArray *ea)
71719cd5
EB
2382{
2383 while (bytes) {
2384 int64_t num;
ff7e261b
EGE
2385 int ret = blk_co_is_allocated_above(blk, NULL, false, offset, bytes,
2386 &num);
71719cd5
EB
2387
2388 if (ret < 0) {
2389 return ret;
2390 }
2391
2392 if (nbd_extent_array_add(ea, num, ret) < 0) {
2393 return 0;
2394 }
2395
2396 offset += num;
2397 bytes -= num;
2398 }
2399
2400 return 0;
2401}
2402
89cbc7e3
VSO
/*
 * nbd_co_send_extents
 *
 * @ea is converted to BE by the function
 * @last controls whether NBD_REPLY_FLAG_DONE is sent.
 *
 * Extended-mode clients get a BLOCK_STATUS_EXT chunk with 64-bit
 * extents converted in place; others get a BLOCK_STATUS chunk with a
 * freshly allocated 32-bit copy (freed automatically via g_autofree).
 */
static int coroutine_fn
nbd_co_send_extents(NBDClient *client, NBDRequest *request, NBDExtentArray *ea,
                    bool last, uint32_t context_id, Error **errp)
{
    NBDReply hdr;
    NBDStructuredMeta meta;
    NBDExtendedMeta meta_ext;
    g_autofree NBDExtent32 *extents = NULL;
    uint16_t type;
    struct iovec iov[] = { {.iov_base = &hdr}, {0}, {0} };

    if (client->mode >= NBD_MODE_EXTENDED) {
        type = NBD_REPLY_TYPE_BLOCK_STATUS_EXT;

        iov[1].iov_base = &meta_ext;
        iov[1].iov_len = sizeof(meta_ext);
        stl_be_p(&meta_ext.context_id, context_id);
        stl_be_p(&meta_ext.count, ea->count);

        nbd_extent_array_convert_to_be(ea);
        iov[2].iov_base = ea->extents;
        iov[2].iov_len = ea->count * sizeof(ea->extents[0]);
    } else {
        type = NBD_REPLY_TYPE_BLOCK_STATUS;

        iov[1].iov_base = &meta;
        iov[1].iov_len = sizeof(meta);
        stl_be_p(&meta.context_id, context_id);

        extents = nbd_extent_array_convert_to_narrow(ea);
        iov[2].iov_base = extents;
        iov[2].iov_len = ea->count * sizeof(extents[0]);
    }

    trace_nbd_co_send_extents(request->cookie, ea->count, context_id,
                              ea->total_length, last);
    set_be_chunk(client, iov, 3, last ? NBD_REPLY_FLAG_DONE : 0, type,
                 request);

    return nbd_co_send_iov(client, iov, 3, errp);
}
2450
/* Get block status from the exported device and send it to the client */
/*
 * @context_id selects the source: base:allocation uses block status,
 * anything else (here: allocation depth) uses the allocation query.
 * Query errors are reported to the client as an error chunk; only send
 * failures propagate as a negative return.
 */
static int
coroutine_fn nbd_co_send_block_status(NBDClient *client, NBDRequest *request,
                                      BlockBackend *blk, uint64_t offset,
                                      uint64_t length, bool dont_fragment,
                                      bool last, uint32_t context_id,
                                      Error **errp)
{
    int ret;
    unsigned int nb_extents = dont_fragment ? 1 : NBD_MAX_BLOCK_STATUS_EXTENTS;
    g_autoptr(NBDExtentArray) ea =
        nbd_extent_array_new(nb_extents, client->mode);

    if (context_id == NBD_META_ID_BASE_ALLOCATION) {
        ret = blockstatus_to_extents(blk, offset, length, ea);
    } else {
        ret = blockalloc_to_extents(blk, offset, length, ea);
    }
    if (ret < 0) {
        return nbd_co_send_chunk_error(client, request, -ret,
                                       "can't get block status", errp);
    }

    return nbd_co_send_extents(client, request, ea, last, context_id, errp);
}
2476
/* Populate @ea from a dirty bitmap. */
/*
 * Walks [offset, offset + length) under the bitmap lock, emitting
 * alternating clean (flags 0) and dirty (NBD_STATE_DIRTY) extents.
 * If the array fills up, the walk stops and the trailing clean extent
 * is omitted; otherwise the remainder of the range is reported clean.
 */
static void bitmap_to_extents(BdrvDirtyBitmap *bitmap,
                              uint64_t offset, uint64_t length,
                              NBDExtentArray *es)
{
    int64_t start, dirty_start, dirty_count;
    int64_t end = offset + length;
    bool full = false;
    /* Per-extent length cap: narrow clients cannot exceed 32 bits. */
    int64_t bound = es->extended ? INT64_MAX : INT32_MAX;

    bdrv_dirty_bitmap_lock(bitmap);

    for (start = offset;
         bdrv_dirty_bitmap_next_dirty_area(bitmap, start, end, bound,
                                           &dirty_start, &dirty_count);
         start = dirty_start + dirty_count)
    {
        /* Clean gap before the dirty area, then the dirty area itself. */
        if ((nbd_extent_array_add(es, dirty_start - start, 0) < 0) ||
            (nbd_extent_array_add(es, dirty_count, NBD_STATE_DIRTY) < 0))
        {
            full = true;
            break;
        }
    }

    if (!full) {
        /* last non dirty extent, nothing to do if array is now full */
        (void) nbd_extent_array_add(es, end - start, 0);
    }

    bdrv_dirty_bitmap_unlock(bitmap);
}
2509
66d4f4fe
EB
2510static int coroutine_fn nbd_co_send_bitmap(NBDClient *client,
2511 NBDRequest *request,
2512 BdrvDirtyBitmap *bitmap,
2513 uint64_t offset,
bcc16cc1 2514 uint64_t length, bool dont_fragment,
66d4f4fe
EB
2515 bool last, uint32_t context_id,
2516 Error **errp)
3d068aff 2517{
416e34bd 2518 unsigned int nb_extents = dont_fragment ? 1 : NBD_MAX_BLOCK_STATUS_EXTENTS;
bcc16cc1
EB
2519 g_autoptr(NBDExtentArray) ea =
2520 nbd_extent_array_new(nb_extents, client->mode);
3d068aff 2521
dacbb6eb 2522 bitmap_to_extents(bitmap, offset, length, ea);
3d068aff 2523
66d4f4fe 2524 return nbd_co_send_extents(client, request, ea, last, context_id, errp);
e7b1948d
VSO
2525}
2526
2dcbb11b
EB
/*
 * nbd_co_block_status_payload_read
 * Called when a client wants a subset of negotiated contexts via a
 * BLOCK_STATUS payload. Check the payload for valid length and
 * contents. On success, return 0 with request updated to effective
 * length. If request was invalid but all payload consumed, return 0
 * with request->len and request->contexts->count set to 0 (which will
 * trigger an appropriate NBD_EINVAL response later on). Return
 * negative errno if the payload was not fully consumed.
 */
static int
nbd_co_block_status_payload_read(NBDClient *client, NBDRequest *request,
                                 Error **errp)
{
    uint64_t payload_len = request->len;
    g_autofree char *buf = NULL;
    size_t count, i, nr_bitmaps;
    uint32_t id;

    if (payload_len > NBD_MAX_BUFFER_SIZE) {
        error_setg(errp, "len (%" PRIu64 ") is larger than max len (%u)",
                   request->len, NBD_MAX_BUFFER_SIZE);
        return -EINVAL;
    }

    assert(client->contexts.exp == client->exp);
    nr_bitmaps = client->exp->nr_export_bitmaps;
    /* Per-request context selection, freed elsewhere with the request. */
    request->contexts = g_new0(NBDMetaContexts, 1);
    request->contexts->exp = client->exp;

    /* Payload must be the fixed header plus whole context ids, bounded
     * by the number of contexts negotiated for the connection. */
    if (payload_len % sizeof(uint32_t) ||
        payload_len < sizeof(NBDBlockStatusPayload) ||
        payload_len > (sizeof(NBDBlockStatusPayload) +
                       sizeof(id) * client->contexts.count)) {
        goto skip;
    }

    buf = g_malloc(payload_len);
    if (nbd_read(client->ioc, buf, payload_len,
                 "CMD_BLOCK_STATUS data", errp) < 0) {
        return -EIO;
    }
    trace_nbd_co_receive_request_payload_received(request->cookie,
                                                  payload_len);
    request->contexts->bitmaps = g_new0(bool, nr_bitmaps);
    count = (payload_len - sizeof(NBDBlockStatusPayload)) / sizeof(id);
    payload_len = 0;

    /* Each id must name a context the client negotiated, at most once. */
    for (i = 0; i < count; i++) {
        id = ldl_be_p(buf + sizeof(NBDBlockStatusPayload) + sizeof(id) * i);
        if (id == NBD_META_ID_BASE_ALLOCATION) {
            if (!client->contexts.base_allocation ||
                request->contexts->base_allocation) {
                goto skip;
            }
            request->contexts->base_allocation = true;
        } else if (id == NBD_META_ID_ALLOCATION_DEPTH) {
            if (!client->contexts.allocation_depth ||
                request->contexts->allocation_depth) {
                goto skip;
            }
            request->contexts->allocation_depth = true;
        } else {
            unsigned idx = id - NBD_META_ID_DIRTY_BITMAP;

            if (idx >= nr_bitmaps || !client->contexts.bitmaps[idx] ||
                request->contexts->bitmaps[idx]) {
                goto skip;
            }
            request->contexts->bitmaps[idx] = true;
        }
    }

    /* Effective length lives in the payload header, not request->len. */
    request->len = ldq_be_p(buf);
    request->contexts->count = count;
    return 0;

 skip:
    trace_nbd_co_receive_block_status_payload_compliance(request->from,
                                                         request->len);
    request->len = request->contexts->count = 0;
    return nbd_drop(client->ioc, payload_len, errp);
}
2610
2a6e128b
VSO
/* nbd_co_receive_request
 * Collect a client request. Return 0 if request looks valid, -EIO to drop
 * connection right away, -EAGAIN to indicate we were interrupted and the
 * channel should be quiesced, and any other negative value to report an error
 * to the client (although the caller may still need to disconnect after
 * reporting the error).
 */
static int coroutine_fn nbd_co_receive_request(NBDRequestData *req,
                                               NBDRequest *request,
                                               Error **errp)
{
    NBDClient *client = req->client;
    bool extended_with_payload;
    bool check_length = false;      /* enforce NBD_MAX_BUFFER_SIZE on len */
    bool check_rofs = false;        /* reject on read-only exports */
    bool allocate_buffer = false;   /* req->data needed (READ/WRITE) */
    bool payload_okay = false;      /* payload is expected (WRITE) */
    uint64_t payload_len = 0;
    int valid_flags = NBD_CMD_FLAG_FUA;
    int ret;

    g_assert(qemu_in_coroutine());
    ret = nbd_receive_request(client, request, errp);
    if (ret < 0) {
        return ret;
    }

    trace_nbd_co_receive_request_decode_type(request->cookie, request->type,
                                             nbd_cmd_lookup(request->type));
    /* In extended-header mode, PAYLOAD_LEN means len counts payload bytes. */
    extended_with_payload = client->mode >= NBD_MODE_EXTENDED &&
        request->flags & NBD_CMD_FLAG_PAYLOAD_LEN;
    if (extended_with_payload) {
        payload_len = request->len;
        check_length = true;
    }

    /* Per-command policy: which flags are valid and what to verify. */
    switch (request->type) {
    case NBD_CMD_DISC:
        /* Special case: we're going to disconnect without a reply,
         * whether or not flags, from, or len are bogus */
        req->complete = true;
        return -EIO;

    case NBD_CMD_READ:
        if (client->mode >= NBD_MODE_STRUCTURED) {
            valid_flags |= NBD_CMD_FLAG_DF;
        }
        check_length = true;
        allocate_buffer = true;
        break;

    case NBD_CMD_WRITE:
        if (client->mode >= NBD_MODE_EXTENDED) {
            if (!extended_with_payload) {
                /* The client is noncompliant. Trace it, but proceed. */
                trace_nbd_co_receive_ext_payload_compliance(request->from,
                                                            request->len);
            }
            valid_flags |= NBD_CMD_FLAG_PAYLOAD_LEN;
        }
        payload_okay = true;
        payload_len = request->len;
        check_length = true;
        allocate_buffer = true;
        check_rofs = true;
        break;

    case NBD_CMD_FLUSH:
        break;

    case NBD_CMD_TRIM:
        check_rofs = true;
        break;

    case NBD_CMD_CACHE:
        check_length = true;
        break;

    case NBD_CMD_WRITE_ZEROES:
        valid_flags |= NBD_CMD_FLAG_NO_HOLE | NBD_CMD_FLAG_FAST_ZERO;
        check_rofs = true;
        break;

    case NBD_CMD_BLOCK_STATUS:
        if (extended_with_payload) {
            ret = nbd_co_block_status_payload_read(client, request, errp);
            if (ret < 0) {
                return ret;
            }
            /* payload now consumed */
            check_length = false;
            payload_len = 0;
            valid_flags |= NBD_CMD_FLAG_PAYLOAD_LEN;
        } else {
            request->contexts = &client->contexts;
        }
        valid_flags |= NBD_CMD_FLAG_REQ_ONE;
        break;

    default:
        /* Unrecognized, will fail later */
        ;
    }

    /* Payload and buffer handling. */
    if (!payload_len) {
        req->complete = true;
    }
    if (check_length && request->len > NBD_MAX_BUFFER_SIZE) {
        /* READ, WRITE, CACHE */
        error_setg(errp, "len (%" PRIu64 ") is larger than max len (%u)",
                   request->len, NBD_MAX_BUFFER_SIZE);
        return -EINVAL;
    }
    if (payload_len && !payload_okay) {
        /*
         * For now, we don't support payloads on other commands; but
         * we can keep the connection alive by ignoring the payload.
         * We will fail the command later with NBD_EINVAL for the use
         * of an unsupported flag (and not for access beyond bounds).
         */
        assert(request->type != NBD_CMD_WRITE);
        request->len = 0;
    }
    if (allocate_buffer) {
        /* READ, WRITE */
        req->data = blk_try_blockalign(client->exp->common.blk,
                                       request->len);
        if (req->data == NULL) {
            error_setg(errp, "No memory");
            return -ENOMEM;
        }
    }
    if (payload_len) {
        if (payload_okay) {
            /* WRITE */
            assert(req->data);
            ret = nbd_read(client->ioc, req->data, payload_len,
                           "CMD_WRITE data", errp);
        } else {
            ret = nbd_drop(client->ioc, payload_len, errp);
        }
        if (ret < 0) {
            return -EIO;
        }
        req->complete = true;
        trace_nbd_co_receive_request_payload_received(request->cookie,
                                                      payload_len);
    }

    /* Sanity checks. */
    if (client->exp->nbdflags & NBD_FLAG_READ_ONLY && check_rofs) {
        /* WRITE, TRIM, WRITE_ZEROES */
        error_setg(errp, "Export is read-only");
        return -EROFS;
    }
    if (request->from > client->exp->size ||
        request->len > client->exp->size - request->from) {
        error_setg(errp, "operation past EOF; From: %" PRIu64 ", Len: %" PRIu64
                   ", Size: %" PRIu64, request->from, request->len,
                   client->exp->size);
        return (request->type == NBD_CMD_WRITE ||
                request->type == NBD_CMD_WRITE_ZEROES) ? -ENOSPC : -EINVAL;
    }
    if (client->check_align && !QEMU_IS_ALIGNED(request->from | request->len,
                                                client->check_align)) {
        /*
         * The block layer gracefully handles unaligned requests, but
         * it's still worth tracing client non-compliance
         */
        trace_nbd_co_receive_align_compliance(nbd_cmd_lookup(request->type),
                                              request->from,
                                              request->len,
                                              client->check_align);
    }
    if (request->flags & ~valid_flags) {
        error_setg(errp, "unsupported flags for command %s (got 0x%x)",
                   nbd_cmd_lookup(request->type), request->flags);
        return -EINVAL;
    }

    return 0;
}
2794
6a417599
VSO
/* Send simple reply without a payload, or a structured error
 * @error_msg is ignored if @ret >= 0
 * Returns 0 if connection is still live, -errno on failure to talk to client
 *
 * Mode selection: structured/extended clients get an error chunk on
 * failure; extended clients get a DONE chunk on success (they have no
 * simple replies); everyone else gets a simple reply.
 */
static coroutine_fn int nbd_send_generic_reply(NBDClient *client,
                                               NBDRequest *request,
                                               int ret,
                                               const char *error_msg,
                                               Error **errp)
{
    if (client->mode >= NBD_MODE_STRUCTURED && ret < 0) {
        return nbd_co_send_chunk_error(client, request, -ret, error_msg, errp);
    } else if (client->mode >= NBD_MODE_EXTENDED) {
        return nbd_co_send_chunk_done(client, request, errp);
    } else {
        return nbd_co_send_simple_reply(client, request, ret < 0 ? -ret : 0,
                                        NULL, 0, errp);
    }
}
2814
/* Handle NBD_CMD_READ request.
 * Return -errno if sending fails. Other errors are reported directly to the
 * client as an error reply.
 *
 * Structured clients that did not set FLAG_DF get a sparse read (hole
 * chunks instead of zero data); otherwise the whole range is read into
 * @data and sent in one reply.
 */
static coroutine_fn int nbd_do_cmd_read(NBDClient *client, NBDRequest *request,
                                        uint8_t *data, Error **errp)
{
    int ret;
    NBDExport *exp = client->exp;

    assert(request->type == NBD_CMD_READ);
    assert(request->len <= NBD_MAX_BUFFER_SIZE);

    /* XXX: NBD Protocol only documents use of FUA with WRITE */
    if (request->flags & NBD_CMD_FLAG_FUA) {
        ret = blk_co_flush(exp->common.blk);
        if (ret < 0) {
            return nbd_send_generic_reply(client, request, ret,
                                          "flush failed", errp);
        }
    }

    if (client->mode >= NBD_MODE_STRUCTURED &&
        !(request->flags & NBD_CMD_FLAG_DF) && request->len)
    {
        return nbd_co_send_sparse_read(client, request, request->from,
                                       data, request->len, errp);
    }

    ret = blk_co_pread(exp->common.blk, request->from, request->len, data, 0);
    if (ret < 0) {
        return nbd_send_generic_reply(client, request, ret,
                                      "reading from file failed", errp);
    }

    if (client->mode >= NBD_MODE_STRUCTURED) {
        if (request->len) {
            return nbd_co_send_chunk_read(client, request, request->from, data,
                                          request->len, true, errp);
        } else {
            /* Zero-length read: nothing to send but the final chunk. */
            return nbd_co_send_chunk_done(client, request, errp);
        }
    } else {
        return nbd_co_send_simple_reply(client, request, 0,
                                        data, request->len, errp);
    }
}
2861
7fa5c565
VSO
2862/*
2863 * nbd_do_cmd_cache
2864 *
2865 * Handle NBD_CMD_CACHE request.
2866 * Return -errno if sending fails. Other errors are reported directly to the
2867 * client as an error reply.
2868 */
2869static coroutine_fn int nbd_do_cmd_cache(NBDClient *client, NBDRequest *request,
2870 Error **errp)
2871{
2872 int ret;
2873 NBDExport *exp = client->exp;
2874
2875 assert(request->type == NBD_CMD_CACHE);
b2578459 2876 assert(request->len <= NBD_MAX_BUFFER_SIZE);
7fa5c565 2877
37a4f70c 2878 ret = blk_co_preadv(exp->common.blk, request->from, request->len,
7fa5c565
VSO
2879 NULL, BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH);
2880
66d4f4fe 2881 return nbd_send_generic_reply(client, request, ret,
7fa5c565
VSO
2882 "caching data failed", errp);
2883}

/* Handle NBD request.
 * Return -errno if sending fails. Other errors are reported directly to the
 * client as an error reply. */
static coroutine_fn int nbd_handle_request(NBDClient *client,
                                           NBDRequest *request,
                                           uint8_t *data, Error **errp)
{
    int ret;
    int flags;
    NBDExport *exp = client->exp;
    char *msg;
    size_t i;

    switch (request->type) {
    case NBD_CMD_CACHE:
        return nbd_do_cmd_cache(client, request, errp);

    case NBD_CMD_READ:
        return nbd_do_cmd_read(client, request, data, errp);

    case NBD_CMD_WRITE:
        flags = 0;
        if (request->flags & NBD_CMD_FLAG_FUA) {
            flags |= BDRV_REQ_FUA;
        }
        /* Payload was already read into @data by the request parser. */
        assert(request->len <= NBD_MAX_BUFFER_SIZE);
        ret = blk_co_pwrite(exp->common.blk, request->from, request->len, data,
                            flags);
        return nbd_send_generic_reply(client, request, ret,
                                      "writing to file failed", errp);

    case NBD_CMD_WRITE_ZEROES:
        /* Map the NBD per-request flags onto block-layer request flags. */
        flags = 0;
        if (request->flags & NBD_CMD_FLAG_FUA) {
            flags |= BDRV_REQ_FUA;
        }
        if (!(request->flags & NBD_CMD_FLAG_NO_HOLE)) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
        if (request->flags & NBD_CMD_FLAG_FAST_ZERO) {
            flags |= BDRV_REQ_NO_FALLBACK;
        }
        ret = blk_co_pwrite_zeroes(exp->common.blk, request->from, request->len,
                                   flags);
        return nbd_send_generic_reply(client, request, ret,
                                      "writing to file failed", errp);

    case NBD_CMD_DISC:
        /* unreachable, thanks to special case in nbd_co_receive_request() */
        abort();

    case NBD_CMD_FLUSH:
        ret = blk_co_flush(exp->common.blk);
        return nbd_send_generic_reply(client, request, ret,
                                      "flush failed", errp);

    case NBD_CMD_TRIM:
        ret = blk_co_pdiscard(exp->common.blk, request->from, request->len);
        /* FUA on TRIM: flush only if the discard itself succeeded. */
        if (ret >= 0 && request->flags & NBD_CMD_FLAG_FUA) {
            ret = blk_co_flush(exp->common.blk);
        }
        return nbd_send_generic_reply(client, request, ret,
                                      "discard failed", errp);

    case NBD_CMD_BLOCK_STATUS:
        assert(request->contexts);
        /* Only extended-header clients may exceed a 32-bit length. */
        assert(client->mode >= NBD_MODE_EXTENDED ||
               request->len <= UINT32_MAX);
        if (request->contexts->count) {
            bool dont_fragment = request->flags & NBD_CMD_FLAG_REQ_ONE;
            /*
             * Counts down as each context is sent; the send helpers are
             * told whether theirs is the final context via the
             * !--contexts_remaining argument, so the decrement order
             * across the three paths below matters.
             */
            int contexts_remaining = request->contexts->count;

            if (!request->len) {
                return nbd_send_generic_reply(client, request, -EINVAL,
                                              "need non-zero length", errp);
            }
            if (request->contexts->base_allocation) {
                ret = nbd_co_send_block_status(client, request,
                                               exp->common.blk,
                                               request->from,
                                               request->len, dont_fragment,
                                               !--contexts_remaining,
                                               NBD_META_ID_BASE_ALLOCATION,
                                               errp);
                if (ret < 0) {
                    return ret;
                }
            }

            if (request->contexts->allocation_depth) {
                ret = nbd_co_send_block_status(client, request,
                                               exp->common.blk,
                                               request->from, request->len,
                                               dont_fragment,
                                               !--contexts_remaining,
                                               NBD_META_ID_ALLOCATION_DEPTH,
                                               errp);
                if (ret < 0) {
                    return ret;
                }
            }

            /* One reply per negotiated dirty bitmap, in export order. */
            assert(request->contexts->exp == client->exp);
            for (i = 0; i < client->exp->nr_export_bitmaps; i++) {
                if (!request->contexts->bitmaps[i]) {
                    continue;
                }
                ret = nbd_co_send_bitmap(client, request,
                                         client->exp->export_bitmaps[i],
                                         request->from, request->len,
                                         dont_fragment, !--contexts_remaining,
                                         NBD_META_ID_DIRTY_BITMAP + i, errp);
                if (ret < 0) {
                    return ret;
                }
            }

            assert(!contexts_remaining);

            return 0;
        } else if (client->contexts.count) {
            /*
             * Contexts were negotiated, but this request's payload selected
             * none of them.
             */
            return nbd_send_generic_reply(client, request, -EINVAL,
                                          "CMD_BLOCK_STATUS payload not valid",
                                          errp);
        } else {
            return nbd_send_generic_reply(client, request, -EINVAL,
                                          "CMD_BLOCK_STATUS not negotiated",
                                          errp);
        }

    default:
        msg = g_strdup_printf("invalid request type (%" PRIu32 ") received",
                              request->type);
        ret = nbd_send_generic_reply(client, request, -EINVAL, msg,
                                     errp);
        g_free(msg);
        return ret;
    }
}

/*
 * Coroutine servicing one client request: receive it, dispatch it, and
 * send the reply.  Owns a reference to the NBDClient passed as opaque,
 * and owns @opaque (an NBDRequestData) until nbd_request_put().
 */
static coroutine_fn void nbd_trip(void *opaque)
{
    NBDRequestData *req = opaque;
    NBDClient *client = req->client;
    NBDRequest request = { 0 }; /* GCC thinks it can be used uninitialized */
    int ret;
    Error *local_err = NULL;

    /*
     * Note that nbd_client_put() and client_close() must be called from the
     * main loop thread. Use aio_co_reschedule_self() to switch AioContext
     * before calling these functions.
     */

    trace_nbd_trip();

    qemu_mutex_lock(&client->lock);

    if (client->closing) {
        goto done;
    }

    if (client->quiescing) {
        /*
         * We're switching between AIO contexts. Don't attempt to receive a new
         * request and kick the main context which may be waiting for us.
         */
        client->recv_coroutine = NULL;
        aio_wait_kick();
        goto done;
    }

    /*
     * nbd_co_receive_request() returns -EAGAIN when nbd_drained_begin() has
     * set client->quiescing but by the time we get back nbd_drained_end() may
     * have already cleared client->quiescing. In that case we try again
     * because nothing else will spawn an nbd_trip() coroutine until we set
     * client->recv_coroutine = NULL further down.
     */
    do {
        assert(client->recv_coroutine == qemu_coroutine_self());
        /* Drop the lock while blocked reading from the client socket. */
        qemu_mutex_unlock(&client->lock);
        ret = nbd_co_receive_request(req, &request, &local_err);
        qemu_mutex_lock(&client->lock);
    } while (ret == -EAGAIN && !client->quiescing);

    client->recv_coroutine = NULL;

    if (client->closing) {
        /*
         * The client may be closed when we are blocked in
         * nbd_co_receive_request()
         */
        goto done;
    }

    if (ret == -EAGAIN) {
        /* Still quiescing: bail out without spawning a successor. */
        goto done;
    }

    /* Start receiving the next request concurrently with handling this one. */
    nbd_client_receive_next_request(client);

    if (ret == -EIO) {
        goto disconnect;
    }

    /* Reply transmission happens outside the lock; cork to batch the send. */
    qemu_mutex_unlock(&client->lock);
    qio_channel_set_cork(client->ioc, true);

    if (ret < 0) {
        /* It wasn't -EIO, so, according to nbd_co_receive_request()
         * semantics, we should return the error to the client. */
        Error *export_err = local_err;

        local_err = NULL;
        ret = nbd_send_generic_reply(client, &request, -EINVAL,
                                     error_get_pretty(export_err), &local_err);
        error_free(export_err);
    } else {
        ret = nbd_handle_request(client, &request, req->data, &local_err);
    }
    /* Free a per-request context list allocated by the request parser. */
    if (request.contexts && request.contexts != &client->contexts) {
        assert(request.type == NBD_CMD_BLOCK_STATUS);
        g_free(request.contexts->bitmaps);
        g_free(request.contexts);
    }

    qio_channel_set_cork(client->ioc, false);
    qemu_mutex_lock(&client->lock);

    if (ret < 0) {
        error_prepend(&local_err, "Failed to send reply: ");
        goto disconnect;
    }

    /*
     * We must disconnect after NBD_CMD_WRITE or BLOCK_STATUS with
     * payload if we did not read the payload.
     */
    if (!req->complete) {
        error_setg(&local_err, "Request handling failed in intermediate state");
        goto disconnect;
    }

done:
    nbd_request_put(req);

    qemu_mutex_unlock(&client->lock);

    /* Dropping the last reference must happen in the main loop thread. */
    if (!nbd_client_put_nonzero(client)) {
        aio_co_reschedule_self(qemu_get_aio_context());
        nbd_client_put(client);
    }
    return;

disconnect:
    if (local_err) {
        error_reportf_err(local_err, "Disconnect client, due to: ");
    }

    nbd_request_put(req);
    qemu_mutex_unlock(&client->lock);

    /* client_close()/nbd_client_put() require the main loop thread. */
    aio_co_reschedule_self(qemu_get_aio_context());
    client_close(client, true);
    nbd_client_put(client);
}
af49bbbe 3153
7075d235
SH
3154/*
3155 * Runs in export AioContext and main loop thread. Caller must hold
3156 * client->lock.
3157 */
ff82911c 3158static void nbd_client_receive_next_request(NBDClient *client)
958c717d 3159{
9c707525
KW
3160 NBDRequestData *req;
3161
f148ae7d
SL
3162 if (!client->recv_coroutine && client->nb_requests < MAX_NBD_REQUESTS &&
3163 !client->quiescing) {
ff82911c 3164 nbd_client_get(client);
9c707525
KW
3165 req = nbd_request_get(client);
3166 client->recv_coroutine = qemu_coroutine_create(nbd_trip, req);
8612c686 3167 aio_co_schedule(client->exp->common.ctx, client->recv_coroutine);
958c717d
HR
3168 }
3169}
3170
1a6245a5
FZ
3171static coroutine_fn void nbd_co_client_start(void *opaque)
3172{
c84087f2 3173 NBDClient *client = opaque;
2fd2c840 3174 Error *local_err = NULL;
1a6245a5 3175
df8ad9f1
EB
3176 qemu_co_mutex_init(&client->send_lock);
3177
2fd2c840
VSO
3178 if (nbd_negotiate(client, &local_err)) {
3179 if (local_err) {
3180 error_report_err(local_err);
3181 }
0c9390d9 3182 client_close(client, false);
c84087f2 3183 return;
1a6245a5 3184 }
ff82911c 3185
7075d235
SH
3186 WITH_QEMU_LOCK_GUARD(&client->lock) {
3187 nbd_client_receive_next_request(client);
3188 }
1a6245a5
FZ
3189}
3190
0c9390d9 3191/*
7f7dfe2a
VSO
3192 * Create a new client listener using the given channel @sioc.
3193 * Begin servicing it in a coroutine. When the connection closes, call
3194 * @close_fn with an indication of whether the client completed negotiation.
0c9390d9 3195 */
7f7dfe2a 3196void nbd_client_new(QIOChannelSocket *sioc,
f95910fe 3197 QCryptoTLSCreds *tlscreds,
b25e12da 3198 const char *tlsauthz,
0c9390d9 3199 void (*close_fn)(NBDClient *, bool))
af49bbbe 3200{
1743b515 3201 NBDClient *client;
c84087f2 3202 Coroutine *co;
1a6245a5 3203
e8d3eb74 3204 client = g_new0(NBDClient, 1);
7075d235 3205 qemu_mutex_init(&client->lock);
1743b515 3206 client->refcount = 1;
f95910fe
DB
3207 client->tlscreds = tlscreds;
3208 if (tlscreds) {
3209 object_ref(OBJECT(client->tlscreds));
3210 }
b25e12da 3211 client->tlsauthz = g_strdup(tlsauthz);
1c778ef7 3212 client->sioc = sioc;
f1426881 3213 qio_channel_set_delay(QIO_CHANNEL(sioc), false);
1c778ef7
DB
3214 object_ref(OBJECT(client->sioc));
3215 client->ioc = QIO_CHANNEL(sioc);
3216 object_ref(OBJECT(client->ioc));
0c9390d9 3217 client->close_fn = close_fn;
2c8d9f06 3218
c84087f2
VSO
3219 co = qemu_coroutine_create(nbd_co_client_start, client);
3220 qemu_coroutine_enter(co);
af49bbbe 3221}