]> git.ipfire.org Git - thirdparty/qemu.git/blob
a2a478d293
[thirdparty/qemu.git] /
1 /*
2 * Present a block device as a raw image through FUSE
3 *
4 * Copyright (c) 2020, 2025 Hanna Czenczek <hreitz@redhat.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; under version 2 or later of the License.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, see <http://www.gnu.org/licenses/>.
17 */
18
19 #define FUSE_USE_VERSION 31
20
21 #include "qemu/osdep.h"
22 #include "qemu/memalign.h"
23 #include "qemu/aio.h"
24 #include "block/block_int-common.h"
25 #include "block/export.h"
26 #include "block/fuse.h"
27 #include "block/qapi.h"
28 #include "qapi/error.h"
29 #include "qapi/qapi-commands-block.h"
30 #include "qemu/coroutine.h"
31 #include "qemu/error-report.h"
32 #include "qemu/main-loop.h"
33 #include "system/block-backend.h"
34 #include "system/iothread.h"
35
36 #include <fuse.h>
37 #include <fuse_lowlevel.h>
38
39 #include "standard-headers/linux/fuse.h"
40 #include <sys/ioctl.h>
41
42 #if defined(CONFIG_FALLOCATE_ZERO_RANGE)
43 #include <linux/falloc.h>
44 #endif
45
46 #ifdef __linux__
47 #include <linux/fs.h>
48 #endif
49
/*
 * Prevent overly long bounce buffer allocations.
 * FUSE_MAX_READ_BYTES is passed to the kernel as the max_read mount option
 * and bounds the size accepted by fuse_co_read(); FUSE_MAX_WRITE_BYTES is
 * announced as max_write in the INIT reply and is the size of the buffer
 * that receives WRITE data.
 */
#define FUSE_MAX_READ_BYTES (MIN(BDRV_REQUEST_MAX_BYTES, 1 * 1024 * 1024))
#define FUSE_MAX_WRITE_BYTES (64 * 1024)
53
/*
 * fuse_init_in structure before 7.36.  We don't need the flags2 field added
 * there, so we can work with the smaller older structure to stay compatible
 * with older kernels.
 */
struct fuse_init_in_compat {
    /* Protocol major version of the kernel; fuse_co_init() requires 7 */
    uint32_t major;
    /* Protocol minor version of the kernel; fuse_co_init() requires >= 9 */
    uint32_t minor;
    /* Echoed back unchanged in the INIT reply */
    uint32_t max_readahead;
    /* Feature flags offered; masked with what we support in the INIT reply */
    uint32_t flags;
};
65
/*
 * Header of an incoming FUSE request: the header common to all requests,
 * followed by the operation-specific header.  Which union member is valid is
 * determined by common.opcode (see req_op_hdr_len()); operations that have no
 * header of their own (or whose header we ignore) use none of them.
 */
typedef struct FuseRequestInHeader {
    struct fuse_in_header common;
    /* All supported requests */
    union {
        struct fuse_init_in_compat init;
        struct fuse_open_in open;
        struct fuse_setattr_in setattr;
        struct fuse_read_in read;
        struct fuse_write_in write;
        struct fuse_fallocate_in fallocate;
#ifdef CONFIG_FUSE_LSEEK
        struct fuse_lseek_in lseek;
#endif
    };
} FuseRequestInHeader;
81
/*
 * Header of an outgoing FUSE reply: the header common to all replies,
 * followed by the operation-specific reply structure (one union member per
 * supported operation that returns a structured result).
 */
typedef struct FuseRequestOutHeader {
    struct fuse_out_header common;
    /* All supported requests */
    union {
        struct fuse_init_out init;
        struct fuse_statfs_out statfs;
        struct fuse_open_out open;
        struct fuse_attr_out attr;
        struct fuse_write_out write;
#ifdef CONFIG_FUSE_LSEEK
        struct fuse_lseek_out lseek;
#endif
    };
} FuseRequestOutHeader;
96
/*
 * Raw view onto a FuseRequestInHeader that splits it into the part that is
 * read directly from the FUSE FD (head) and the part that may have to be
 * copied over from the WRITE data buffer for non-WRITE requests (tail); see
 * co_read_from_fuse_fd().
 */
typedef union FuseRequestInHeaderBuf {
    struct FuseRequestInHeader structured;
    struct {
        /*
         * Part of the request header that is filled for write requests
         * (Needed because we want the data to go into a different buffer, to
         * avoid having to use a bounce buffer)
         */
        char head[sizeof(struct fuse_in_header) +
                  sizeof(struct fuse_write_in)];
        /*
         * Rest of the request header for requests that have a longer header
         * than write requests
         */
        char tail[sizeof(FuseRequestInHeader) -
                  (sizeof(struct fuse_in_header) +
                   sizeof(struct fuse_write_in))];
    };
} FuseRequestInHeaderBuf;
116
/*
 * The raw (head + tail) view must cover exactly the structured header: the
 * union must not be larger than FuseRequestInHeader, and head and tail
 * together must add up to its size.
 */
QEMU_BUILD_BUG_ON(sizeof(FuseRequestInHeaderBuf) !=
                  sizeof(FuseRequestInHeader));
QEMU_BUILD_BUG_ON(sizeof(((FuseRequestInHeaderBuf *)0)->head) +
                  sizeof(((FuseRequestInHeaderBuf *)0)->tail) !=
                  sizeof(FuseRequestInHeader));
122
123 typedef struct FuseExport FuseExport;
124
/*
 * One FUSE "queue", representing one FUSE FD from which requests are fetched
 * and processed.  Each queue is tied to an AioContext.
 */
typedef struct FuseQueue {
    /* Export this queue belongs to */
    FuseExport *exp;

    /* AioContext in which this queue's FD handler is registered */
    AioContext *ctx;
    /*
     * FUSE FD of this queue, -1 while not set up.  Queue 0 uses the FUSE
     * session's FD, all other queues use clones of it.
     */
    int fuse_fd;

    /*
     * Cached buffer to receive the data of WRITE requests.  Cached because:
     * To read requests, we put a FuseRequestInHeaderBuf (FRIHB) object on the
     * stack, and a (WRITE data) buffer on the heap.  We pass FRIHB.head and the
     * data buffer to readv().  This way, for WRITE requests, we get exactly
     * their data in the data buffer and can avoid bounce buffering.
     * However, for non-WRITE requests, some of the header may end up in the
     * data buffer, so we will need to copy that back into the FRIHB object, and
     * then we don't need the heap buffer anymore.  That is why we cache it, so
     * we can trivially reuse it between non-WRITE requests.
     *
     * Note that these data buffers and thus req_write_data_cached are allocated
     * via blk_blockalign() and thus need to be freed via qemu_vfree().
     */
    void *req_write_data_cached;
} FuseQueue;
151
152 struct FuseExport {
153 BlockExport common;
154
155 struct fuse_session *fuse_session;
156 unsigned int in_flight; /* atomic */
157 bool mounted, fd_handler_set_up;
158
159 /*
160 * Set when there was an unrecoverable error and no requests should be read
161 * from the device anymore (basically only in case of something we would
162 * consider a kernel bug). Access atomically.
163 */
164 bool halted;
165
166 int num_queues;
167 FuseQueue *queues;
168 /*
169 * True if this export should follow the generic export's AioContext.
170 * Will be false if the queues' AioContexts have been explicitly set by the
171 * user, i.e. are expected to stay in those contexts.
172 * (I.e. is always false if there is more than one queue.)
173 */
174 bool follow_aio_context;
175
176 char *mountpoint;
177 bool writable;
178 bool growable;
179 /* Whether allow_other was used as a mount option or not */
180 bool allow_other;
181
182 /* All atomic */
183 mode_t st_mode;
184 uid_t st_uid;
185 gid_t st_gid;
186 };
187
/*
 * Verify that FuseRequestInHeaderBuf.head plus the WRITE data buffer (the two
 * buffers passed to readv() when fetching a request) are together big enough
 * to be accepted by the FUSE kernel driver.
 */
QEMU_BUILD_BUG_ON(sizeof(((FuseRequestInHeaderBuf *)0)->head) +
                  FUSE_MAX_WRITE_BYTES <
                  FUSE_MIN_READ_BUFFER);
195
/*
 * Set of mount points that currently carry a FUSE export (string keys, no
 * values).  Allocated lazily by init_exports_table().
 */
static GHashTable *exports;

/* Export teardown and error handling */
static void fuse_export_shutdown(BlockExport *exp);
static void fuse_export_delete(BlockExport *exp);
static void fuse_export_halt(FuseExport *exp);

static void init_exports_table(void);

/* Mounting and FUSE FD set-up */
static int mount_fuse_export(FuseExport *exp, Error **errp);
static int clone_fuse_fd(int fd, Error **errp);

static bool is_regular_file(const char *path, Error **errp);

/* Request fetching and processing */
static void read_from_fuse_fd(void *opaque);
static void coroutine_fn
fuse_co_process_request(FuseQueue *q, const FuseRequestInHeader *in_hdr,
                        const void *data_buffer);
static int fuse_write_err(int fd, const struct fuse_in_header *in_hdr, int err);
214
215 static void fuse_inc_in_flight(FuseExport *exp)
216 {
217 if (qatomic_fetch_inc(&exp->in_flight) == 0) {
218 /* Prevent export from being deleted */
219 blk_exp_ref(&exp->common);
220 }
221 }
222
223 static void fuse_dec_in_flight(FuseExport *exp)
224 {
225 if (qatomic_fetch_dec(&exp->in_flight) == 1) {
226 /* Wake AIO_WAIT_WHILE() */
227 aio_wait_kick();
228
229 /* Now the export can be deleted */
230 blk_exp_unref(&exp->common);
231 }
232 }
233
234 /**
235 * Attach FUSE FD read handler.
236 */
237 static void fuse_attach_handlers(FuseExport *exp)
238 {
239 if (qatomic_read(&exp->halted)) {
240 return;
241 }
242
243 for (int i = 0; i < exp->num_queues; i++) {
244 aio_set_fd_handler(exp->queues[i].ctx, exp->queues[i].fuse_fd,
245 read_from_fuse_fd, NULL, NULL, NULL,
246 &exp->queues[i]);
247 }
248 exp->fd_handler_set_up = true;
249 }
250
251 /**
252 * Detach FUSE FD read handler.
253 */
254 static void fuse_detach_handlers(FuseExport *exp)
255 {
256 for (int i = 0; i < exp->num_queues; i++) {
257 aio_set_fd_handler(exp->queues[i].ctx, exp->queues[i].fuse_fd,
258 NULL, NULL, NULL, NULL, NULL);
259 }
260 exp->fd_handler_set_up = false;
261 }
262
263 static void fuse_export_drained_begin(void *opaque)
264 {
265 fuse_detach_handlers(opaque);
266 }
267
268 static void fuse_export_drained_end(void *opaque)
269 {
270 FuseExport *exp = opaque;
271
272 /* Refresh AioContext in case it changed */
273 exp->common.ctx = blk_get_aio_context(exp->common.blk);
274 if (exp->follow_aio_context) {
275 assert(exp->num_queues == 1);
276 exp->queues[0].ctx = exp->common.ctx;
277 }
278
279 fuse_attach_handlers(exp);
280 }
281
282 static bool fuse_export_drained_poll(void *opaque)
283 {
284 FuseExport *exp = opaque;
285
286 return qatomic_read(&exp->in_flight) > 0;
287 }
288
/*
 * Drain callbacks installed on the export's BlockBackend by
 * fuse_export_create(): detach the FD handlers when a drain begins, re-attach
 * them when it ends, and report in-flight requests while draining.
 */
static const BlockDevOps fuse_export_blk_dev_ops = {
    .drained_begin = fuse_export_drained_begin,
    .drained_end   = fuse_export_drained_end,
    .drained_poll  = fuse_export_drained_poll,
};
294
/**
 * Set up a FUSE export: allocate the request queues, take the block-layer
 * permissions needed, mount a FUSE session on the configured mount point, and
 * attach the FD handlers.
 *
 * @blk_exp: generic export object embedded in the FuseExport to set up
 * @blk_exp_args: export options; must be of type BLOCK_EXPORT_TYPE_FUSE
 * @multithread: if non-NULL, array of @mt_count AioContexts, for each of which
 *               one queue is created; if NULL, a single queue is created that
 *               follows the export's AioContext
 * @mt_count: number of entries in @multithread (0 if @multithread is NULL)
 * @errp: error object
 *
 * Returns 0 on success, and a negative errno on failure.  On failure,
 * fuse_export_shutdown() and fuse_export_delete() have already been called on
 * the export.
 */
static int fuse_export_create(BlockExport *blk_exp,
                              BlockExportOptions *blk_exp_args,
                              AioContext *const *multithread,
                              size_t mt_count,
                              Error **errp)
{
    ERRP_GUARD(); /* ensure clean-up even with error_fatal */
    FuseExport *exp = container_of(blk_exp, FuseExport, common);
    BlockExportOptionsFuse *args = &blk_exp_args->u.fuse;
    uint32_t st_mode;
    int ret;

    assert(blk_exp_args->type == BLOCK_EXPORT_TYPE_FUSE);

    /* Create the queues; their FDs are only set up after mounting */
    if (multithread) {
        /* Guaranteed by common export code */
        assert(mt_count >= 1);

        exp->follow_aio_context = false;
        exp->num_queues = mt_count;
        exp->queues = g_new(FuseQueue, mt_count);

        for (size_t i = 0; i < mt_count; i++) {
            exp->queues[i] = (FuseQueue) {
                .exp = exp,
                .ctx = multithread[i],
                .fuse_fd = -1,
            };
        }
    } else {
        /* Guaranteed by common export code */
        assert(mt_count == 0);

        exp->follow_aio_context = true;
        exp->num_queues = 1;
        exp->queues = g_new(FuseQueue, 1);
        exp->queues[0] = (FuseQueue) {
            .exp = exp,
            .ctx = exp->common.ctx,
            .fuse_fd = -1,
        };
    }

    /* For growable and writable exports, take the RESIZE permission */
    if (args->growable || blk_exp_args->writable) {
        uint64_t blk_perm, blk_shared_perm;

        blk_get_perm(exp->common.blk, &blk_perm, &blk_shared_perm);

        ret = blk_set_perm(exp->common.blk, blk_perm | BLK_PERM_RESIZE,
                           blk_shared_perm, errp);
        if (ret < 0) {
            goto fail;
        }
    }

    blk_set_dev_ops(exp->common.blk, &fuse_export_blk_dev_ops, exp);

    /*
     * We handle draining ourselves using an in-flight counter and by disabling
     * the FUSE fd handler.  Do not queue BlockBackend requests, they need to
     * complete so the in-flight counter reaches zero.
     */
    blk_set_disable_request_queuing(exp->common.blk, true);

    init_exports_table();

    /*
     * It is important to do this check before calling is_regular_file() --
     * that function will do a stat(), which we would have to handle if we
     * already exported something on @mountpoint.  But we cannot, because
     * we are currently caught up here.
     * (Note that ideally we would want to resolve relative paths here,
     * but bdrv_make_absolute_filename() might do the wrong thing for
     * paths that contain colons, and realpath() would resolve symlinks,
     * which we do not want: The mount point is not going to be the
     * symlink's destination, but the link itself.)
     * So this will not catch all potential clashes, but hopefully at
     * least the most common one of specifying exactly the same path
     * string twice.
     */
    if (g_hash_table_contains(exports, args->mountpoint)) {
        error_setg(errp, "There already is a FUSE export on '%s'",
                   args->mountpoint);
        ret = -EEXIST;
        goto fail;
    }

    if (!is_regular_file(args->mountpoint, errp)) {
        ret = -EINVAL;
        goto fail;
    }

    exp->mountpoint = g_strdup(args->mountpoint);
    exp->writable = blk_exp_args->writable;
    exp->growable = args->growable;

    /* set default */
    if (!args->has_allow_other) {
        args->allow_other = FUSE_EXPORT_ALLOW_OTHER_AUTO;
    }

    /* Initially, only the owner (the user running us) gets access */
    st_mode = S_IFREG | S_IRUSR;
    if (exp->writable) {
        st_mode |= S_IWUSR;
    }
    qatomic_set(&exp->st_mode, st_mode);
    qatomic_set(&exp->st_uid, getuid());
    qatomic_set(&exp->st_gid, getgid());

    if (args->allow_other == FUSE_EXPORT_ALLOW_OTHER_AUTO) {
        /* Try allow_other == true first, ignore errors */
        exp->allow_other = true;
        ret = mount_fuse_export(exp, NULL);
        if (ret < 0) {
            /* Fall back to mounting without allow_other, reporting errors */
            exp->allow_other = false;
            ret = mount_fuse_export(exp, errp);
        }
    } else {
        exp->allow_other = args->allow_other == FUSE_EXPORT_ALLOW_OTHER_ON;
        ret = mount_fuse_export(exp, errp);
    }
    if (ret < 0) {
        goto fail;
    }

    /* The table owns its own copy of the key (freed via g_free()) */
    g_hash_table_insert(exports, g_strdup(exp->mountpoint), NULL);

    /* Queue 0 uses the session's FD, all other queues get clones of it */
    assert(exp->num_queues >= 1);
    exp->queues[0].fuse_fd = fuse_session_fd(exp->fuse_session);
    ret = qemu_fcntl_addfl(exp->queues[0].fuse_fd, O_NONBLOCK);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to make FUSE FD non-blocking");
        goto fail;
    }

    for (int i = 1; i < exp->num_queues; i++) {
        int fd = clone_fuse_fd(exp->queues[0].fuse_fd, errp);
        if (fd < 0) {
            ret = fd;
            goto fail;
        }
        exp->queues[i].fuse_fd = fd;
    }

    fuse_attach_handlers(exp);
    return 0;

fail:
    fuse_export_shutdown(blk_exp);
    fuse_export_delete(blk_exp);
    return ret;
}
448
449 /**
450 * Allocates the global @exports hash table.
451 */
452 static void init_exports_table(void)
453 {
454 if (exports) {
455 return;
456 }
457
458 exports = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, NULL);
459 }
460
461 /**
462 * Create exp->fuse_session and mount it. Expects exp->mountpoint,
463 * exp->writable, and exp->allow_other to be set as intended for the mount.
464 */
465 static int mount_fuse_export(FuseExport *exp, Error **errp)
466 {
467 const char *fuse_argv[4];
468 char *mount_opts;
469 struct fuse_args fuse_args;
470 int ret;
471 /*
472 * We just create the session for mounting/unmounting, no need to provide
473 * any operations. However, since libfuse commit 52a633a5d, we have to
474 * provide some op struct and cannot just pass NULL (even though the commit
475 * message ("allow passing ops as NULL") seems to imply the exact opposite,
476 * as does the comment added to fuse_session_new_fn() ("To create a no-op
477 * session just for mounting pass op as NULL.").
478 * This is how said libfuse commit implements a no-op session internally, so
479 * do it the same way.
480 */
481 static const struct fuse_lowlevel_ops null_ops = { 0 };
482
483 /*
484 * Note that these mount options differ from what we would pass to a direct
485 * mount() call:
486 * - nosuid, nodev, and noatime are not understood by the kernel; libfuse
487 * uses those options to construct the mount flags (MS_*)
488 * - The FUSE kernel driver requires additional options (fd, rootmode,
489 * user_id, group_id); these will be set by libfuse.
490 * Note that max_read is set here, while max_write is set via the FUSE INIT
491 * operation.
492 */
493 mount_opts = g_strdup_printf("%s,nosuid,nodev,noatime,max_read=%zu,"
494 "default_permissions%s",
495 exp->writable ? "rw" : "ro",
496 FUSE_MAX_READ_BYTES,
497 exp->allow_other ? ",allow_other" : "");
498
499 fuse_argv[0] = ""; /* Dummy program name */
500 fuse_argv[1] = "-o";
501 fuse_argv[2] = mount_opts;
502 fuse_argv[3] = NULL;
503 fuse_args = (struct fuse_args)FUSE_ARGS_INIT(3, (char **)fuse_argv);
504
505 exp->fuse_session = fuse_session_new(&fuse_args, &null_ops,
506 sizeof(null_ops), NULL);
507 g_free(mount_opts);
508 if (!exp->fuse_session) {
509 error_setg(errp, "Failed to set up FUSE session");
510 return -EIO;
511 }
512
513 ret = fuse_session_mount(exp->fuse_session, exp->mountpoint);
514 if (ret < 0) {
515 error_setg(errp, "Failed to mount FUSE session to export");
516 ret = -EIO;
517 goto fail;
518 }
519 exp->mounted = true;
520
521 return 0;
522
523 fail:
524 fuse_session_destroy(exp->fuse_session);
525 exp->fuse_session = NULL;
526 return ret;
527 }
528
529 /**
530 * Allocate a buffer to receive WRITE data, or take the cached one.
531 */
532 static void *get_write_data_buffer(FuseQueue *q)
533 {
534 if (q->req_write_data_cached) {
535 void *cached = q->req_write_data_cached;
536 q->req_write_data_cached = NULL;
537 return cached;
538 } else {
539 return blk_blockalign(q->exp->common.blk, FUSE_MAX_WRITE_BYTES);
540 }
541 }
542
543 /**
544 * Release a WRITE data buffer, possibly reusing it for a subsequent request.
545 */
546 static void release_write_data_buffer(FuseQueue *q, void **buffer)
547 {
548 if (!*buffer) {
549 return;
550 }
551
552 if (!q->req_write_data_cached) {
553 q->req_write_data_cached = *buffer;
554 } else {
555 qemu_vfree(*buffer);
556 }
557 *buffer = NULL;
558 }
559
560 /**
561 * Return the length of the specific operation's own in_header.
562 * Return -ENOSYS if the operation is not supported.
563 */
564 static ssize_t req_op_hdr_len(const FuseRequestInHeader *in_hdr)
565 {
566 switch (in_hdr->common.opcode) {
567 case FUSE_INIT:
568 return sizeof(in_hdr->init);
569 case FUSE_OPEN:
570 return sizeof(in_hdr->open);
571 case FUSE_SETATTR:
572 return sizeof(in_hdr->setattr);
573 case FUSE_READ:
574 return sizeof(in_hdr->read);
575 case FUSE_WRITE:
576 return sizeof(in_hdr->write);
577 case FUSE_FALLOCATE:
578 return sizeof(in_hdr->fallocate);
579 #ifdef CONFIG_FUSE_LSEEK
580 case FUSE_LSEEK:
581 return sizeof(in_hdr->lseek);
582 #endif
583 case FUSE_DESTROY:
584 case FUSE_STATFS:
585 case FUSE_RELEASE:
586 case FUSE_LOOKUP:
587 case FUSE_FORGET:
588 case FUSE_BATCH_FORGET:
589 case FUSE_GETATTR:
590 case FUSE_FSYNC:
591 case FUSE_FLUSH:
592 /* These requests don't have their own header or we don't care */
593 return 0;
594 default:
595 return -ENOSYS;
596 }
597 }
598
599 /**
600 * Clone the given /dev/fuse file descriptor, yielding a second FD from which
601 * requests can be pulled for the associated filesystem. Returns an FD on
602 * success, and -errno on error.
603 */
604 static int clone_fuse_fd(int fd, Error **errp)
605 {
606 uint32_t src_fd = fd;
607 int new_fd;
608 int ret;
609
610 /*
611 * The name "/dev/fuse" is fixed, see libfuse's lib/fuse_loop_mt.c
612 * (fuse_clone_chan()).
613 */
614 new_fd = open("/dev/fuse", O_RDWR | O_CLOEXEC | O_NONBLOCK);
615 if (new_fd < 0) {
616 ret = -errno;
617 error_setg_errno(errp, errno, "Failed to open /dev/fuse");
618 return ret;
619 }
620
621 ret = ioctl(new_fd, FUSE_DEV_IOC_CLONE, &src_fd);
622 if (ret < 0) {
623 ret = -errno;
624 error_setg_errno(errp, errno, "Failed to clone FUSE FD");
625 close(new_fd);
626 return ret;
627 }
628
629 return new_fd;
630 }
631
632 /**
633 * Try to read a single request from the FUSE FD.
634 * Takes a FuseQueue pointer in `opaque`.
635 *
636 * Assumes the export's in-flight counter has already been incremented.
637 *
638 * If a request is available, process it.
639 */
640 static void coroutine_fn co_read_from_fuse_fd(void *opaque)
641 {
642 FuseQueue *q = opaque;
643 int fuse_fd = q->fuse_fd;
644 FuseExport *exp = q->exp;
645 ssize_t ret;
646 FuseRequestInHeaderBuf in_hdr_buf;
647 const FuseRequestInHeader *in_hdr;
648 void *data_buffer = NULL;
649 struct iovec iov[2];
650 ssize_t op_hdr_len;
651
652 if (unlikely(qatomic_read(&exp->halted))) {
653 goto no_request;
654 }
655
656 data_buffer = get_write_data_buffer(q);
657
658 /* Construct the I/O vector to hold the FUSE request */
659 iov[0] = (struct iovec) { &in_hdr_buf.head, sizeof(in_hdr_buf.head) };
660 iov[1] = (struct iovec) { data_buffer, FUSE_MAX_WRITE_BYTES };
661 ret = RETRY_ON_EINTR(readv(fuse_fd, iov, ARRAY_SIZE(iov)));
662 if (ret < 0 && errno == EAGAIN) {
663 /* No request available */
664 goto no_request;
665 } else if (unlikely(ret < 0)) {
666 error_report("Failed to read from FUSE device: %s", strerror(errno));
667 goto no_request;
668 }
669
670 if (unlikely(ret < sizeof(in_hdr->common))) {
671 error_report("Incomplete read from FUSE device, expected at least %zu "
672 "bytes, read %zi bytes; cannot trust subsequent "
673 "requests, halting the export",
674 sizeof(in_hdr->common), ret);
675 fuse_export_halt(exp);
676 goto no_request;
677 }
678 in_hdr = &in_hdr_buf.structured;
679
680 if (unlikely(ret != in_hdr->common.len)) {
681 error_report("Number of bytes read from FUSE device does not match "
682 "request size, expected %" PRIu32 " bytes, read %zi "
683 "bytes; cannot trust subsequent requests, halting the "
684 "export",
685 in_hdr->common.len, ret);
686 fuse_export_halt(exp);
687 goto no_request;
688 }
689
690 op_hdr_len = req_op_hdr_len(in_hdr);
691 if (op_hdr_len < 0) {
692 fuse_write_err(fuse_fd, &in_hdr->common, op_hdr_len);
693 goto no_request;
694 }
695
696 if (unlikely(ret < sizeof(in_hdr->common) + op_hdr_len)) {
697 error_report("FUSE request truncated, expected %zu bytes, read %zi "
698 "bytes",
699 sizeof(in_hdr->common) + op_hdr_len, ret);
700 fuse_write_err(fuse_fd, &in_hdr->common, -EINVAL);
701 goto no_request;
702 }
703
704 /*
705 * Only WRITE uses the write data buffer, so for non-WRITE requests longer
706 * than .head, we need to copy any data that spilled into data_buffer into
707 * .tail. Then we can release the write data buffer.
708 */
709 if (in_hdr->common.opcode != FUSE_WRITE) {
710 if (ret > sizeof(in_hdr_buf.head)) {
711 size_t len;
712 /* Limit size to prevent overflow */
713 len = MIN(ret - sizeof(in_hdr_buf.head), sizeof(in_hdr_buf.tail));
714 memcpy(in_hdr_buf.tail, data_buffer, len);
715 }
716
717 release_write_data_buffer(q, &data_buffer);
718 }
719
720 fuse_co_process_request(q, in_hdr, data_buffer);
721
722 no_request:
723 release_write_data_buffer(q, &data_buffer);
724 fuse_dec_in_flight(exp);
725 }
726
727 /**
728 * Try to read and process a single request from the FUSE FD.
729 * (To be used as a handler for when the FUSE FD becomes readable.)
730 * Takes a FuseQueue pointer in `opaque`.
731 */
732 static void read_from_fuse_fd(void *opaque)
733 {
734 FuseQueue *q = opaque;
735 Coroutine *co;
736
737 co = qemu_coroutine_create(co_read_from_fuse_fd, q);
738 /* Decremented by co_read_from_fuse_fd() */
739 fuse_inc_in_flight(q->exp);
740 qemu_coroutine_enter(co);
741 }
742
743 static void fuse_export_shutdown(BlockExport *blk_exp)
744 {
745 FuseExport *exp = container_of(blk_exp, FuseExport, common);
746
747 if (exp->fd_handler_set_up) {
748 fuse_detach_handlers(exp);
749 }
750
751 if (exp->mountpoint) {
752 /*
753 * Safe to drop now, because we will not handle any requests for this
754 * export anymore anyway (at least not from the main thread).
755 */
756 g_hash_table_remove(exports, exp->mountpoint);
757 }
758 }
759
760 static void fuse_export_delete(BlockExport *blk_exp)
761 {
762 FuseExport *exp = container_of(blk_exp, FuseExport, common);
763
764 for (int i = 0; i < exp->num_queues; i++) {
765 FuseQueue *q = &exp->queues[i];
766
767 /* Queue 0's FD belongs to the FUSE session */
768 if (i > 0 && q->fuse_fd >= 0) {
769 close(q->fuse_fd);
770 }
771 qemu_vfree(q->req_write_data_cached);
772 }
773 g_free(exp->queues);
774
775 if (exp->fuse_session) {
776 if (exp->mounted) {
777 fuse_session_unmount(exp->fuse_session);
778 }
779
780 fuse_session_destroy(exp->fuse_session);
781 }
782
783 g_free(exp->mountpoint);
784 }
785
786 /**
787 * Halt the export: Detach FD handlers, and set exp->halted to true, preventing
788 * fuse_attach_handlers() from re-attaching them, therefore stopping all further
789 * request processing.
790 *
791 * Call this function when an unrecoverable error happens that makes processing
792 * all future requests unreliable.
793 */
794 static void fuse_export_halt(FuseExport *exp)
795 {
796 qatomic_set(&exp->halted, true);
797 fuse_detach_handlers(exp);
798 }
799
800 /**
801 * Check whether @path points to a regular file. If not, put an
802 * appropriate message into *errp.
803 */
804 static bool is_regular_file(const char *path, Error **errp)
805 {
806 struct stat statbuf;
807 int ret;
808
809 ret = stat(path, &statbuf);
810 if (ret < 0) {
811 error_setg_errno(errp, errno, "Failed to stat '%s'", path);
812 return false;
813 }
814
815 if (!S_ISREG(statbuf.st_mode)) {
816 error_setg(errp, "'%s' is not a regular file", path);
817 return false;
818 }
819
820 return true;
821 }
822
823 /**
824 * Process FUSE INIT.
825 * Return the number of bytes written to *out on success, and -errno on error.
826 */
827 static ssize_t coroutine_fn GRAPH_RDLOCK
828 fuse_co_init(FuseExport *exp, struct fuse_init_out *out,
829 const struct fuse_init_in_compat *in)
830 {
831 const uint32_t supported_flags = FUSE_ASYNC_READ | FUSE_ASYNC_DIO;
832
833 if (in->major != 7) {
834 error_report("FUSE major version mismatch: We have 7, but kernel has %"
835 PRIu32, in->major);
836 return -EINVAL;
837 }
838
839 /* 2007's 7.9 added fuse_attr.blksize; working around that would be hard */
840 if (in->minor < 9) {
841 error_report("FUSE minor version too old: 9 required, but kernel has %"
842 PRIu32, in->minor);
843 return -EINVAL;
844 }
845
846 *out = (struct fuse_init_out) {
847 .major = 7,
848 .minor = MIN(FUSE_KERNEL_MINOR_VERSION, in->minor),
849 .max_readahead = in->max_readahead,
850 .max_write = FUSE_MAX_WRITE_BYTES,
851 .flags = in->flags & supported_flags,
852 .flags2 = 0,
853
854 /* libfuse maximum: 2^16 - 1 */
855 .max_background = UINT16_MAX,
856
857 /* libfuse default: max_background * 3 / 4 */
858 .congestion_threshold = (int)UINT16_MAX * 3 / 4,
859
860 /* libfuse default: 1 */
861 .time_gran = 1,
862
863 /*
864 * probably unneeded without FUSE_MAX_PAGES, but this would be the
865 * libfuse default
866 */
867 .max_pages = DIV_ROUND_UP(FUSE_MAX_WRITE_BYTES,
868 qemu_real_host_page_size()),
869
870 /* Only needed for mappings (i.e. DAX) */
871 .map_alignment = 0,
872 };
873
874 /*
875 * Before 7.23, fuse_init_out is shorter.
876 * Drop the tail (time_gran, max_pages, map_alignment).
877 */
878 return out->minor >= 23 ? sizeof(*out) : FUSE_COMPAT_22_INIT_OUT_SIZE;
879 }
880
881 /**
882 * Return some filesystem information, just to not break e.g. `df`.
883 */
884 static ssize_t coroutine_fn GRAPH_RDLOCK
885 fuse_co_statfs(FuseExport *exp, struct fuse_statfs_out *out)
886 {
887 BlockDriverState *root_bs;
888 uint32_t opt_transfer = 512;
889
890 root_bs = blk_bs(exp->common.blk);
891 if (root_bs) {
892 opt_transfer = root_bs->bl.opt_transfer;
893 if (!opt_transfer) {
894 opt_transfer = root_bs->bl.request_alignment;
895 }
896 opt_transfer = MAX(opt_transfer, 512);
897 }
898
899 *out = (struct fuse_statfs_out) {
900 /* These are the fields libfuse sets by default */
901 .st = {
902 .namelen = 255,
903 .bsize = opt_transfer,
904 },
905 };
906 return sizeof(*out);
907 }
908
909 /**
910 * Let clients get file attributes (i.e., stat() the file).
911 * Return the number of bytes written to *out on success, and -errno on error.
912 */
913 static ssize_t coroutine_fn GRAPH_RDLOCK
914 fuse_co_getattr(FuseExport *exp, struct fuse_attr_out *out)
915 {
916 int64_t length, allocated_blocks;
917 time_t now = time(NULL);
918
919 length = blk_co_getlength(exp->common.blk);
920 if (length < 0) {
921 return length;
922 }
923
924 allocated_blocks = bdrv_co_get_allocated_file_size(blk_bs(exp->common.blk));
925 if (allocated_blocks <= 0) {
926 allocated_blocks = DIV_ROUND_UP(length, 512);
927 } else {
928 allocated_blocks = DIV_ROUND_UP(allocated_blocks, 512);
929 }
930
931 *out = (struct fuse_attr_out) {
932 .attr_valid = 1,
933 .attr = {
934 .ino = 1,
935 .mode = qatomic_read(&exp->st_mode),
936 .nlink = 1,
937 .uid = qatomic_read(&exp->st_uid),
938 .gid = qatomic_read(&exp->st_gid),
939 .size = length,
940 .blksize = blk_bs(exp->common.blk)->bl.request_alignment,
941 .blocks = allocated_blocks,
942 .atime = now,
943 .mtime = now,
944 .ctime = now,
945 },
946 };
947
948 return sizeof(*out);
949 }
950
951 static int coroutine_fn GRAPH_RDLOCK
952 fuse_co_do_truncate(const FuseExport *exp, int64_t size, bool req_zero_write,
953 PreallocMode prealloc)
954 {
955 BdrvRequestFlags truncate_flags = 0;
956
957 if (req_zero_write) {
958 truncate_flags |= BDRV_REQ_ZERO_WRITE;
959 }
960
961 return blk_co_truncate(exp->common.blk, size, true, prealloc,
962 truncate_flags, NULL);
963 }
964
965 /**
966 * Let clients set file attributes. Only resizing and changing
967 * permissions (st_mode, st_uid, st_gid) is allowed.
968 * Changing permissions is only allowed as far as it will actually
969 * permit access: Read-only exports cannot be given +w, and exports
970 * without allow_other cannot be given a different UID or GID, and
971 * they cannot be given non-owner access.
972 * Return the number of bytes written to *out on success, and -errno on error.
973 */
974 static ssize_t coroutine_fn GRAPH_RDLOCK
975 fuse_co_setattr(FuseExport *exp, struct fuse_attr_out *out, uint32_t to_set,
976 uint64_t size, uint32_t mode, uint32_t uid, uint32_t gid)
977 {
978 int supported_attrs;
979 int ret;
980
981 /* SIZE and MODE are actually supported, the others can be safely ignored */
982 supported_attrs = FATTR_SIZE | FATTR_MODE |
983 FATTR_FH | FATTR_LOCKOWNER | FATTR_KILL_SUIDGID;
984 if (exp->allow_other) {
985 supported_attrs |= FATTR_UID | FATTR_GID;
986 }
987
988 if (to_set & ~supported_attrs) {
989 return -ENOTSUP;
990 }
991
992 /* Do some argument checks first before committing to anything */
993 if (to_set & FATTR_MODE) {
994 /*
995 * Without allow_other, non-owners can never access the export, so do
996 * not allow setting permissions for them
997 */
998 if (!exp->allow_other && (mode & (S_IRWXG | S_IRWXO)) != 0) {
999 return -EPERM;
1000 }
1001
1002 /* +w for read-only exports makes no sense, disallow it */
1003 if (!exp->writable && (mode & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0) {
1004 return -EROFS;
1005 }
1006 }
1007
1008 if (to_set & FATTR_SIZE) {
1009 if (!exp->writable) {
1010 return -EACCES;
1011 }
1012
1013 ret = fuse_co_do_truncate(exp, size, true, PREALLOC_MODE_OFF);
1014 if (ret < 0) {
1015 return ret;
1016 }
1017 }
1018
1019 if (to_set & FATTR_MODE) {
1020 /* Ignore FUSE-supplied file type, only change the mode */
1021 qatomic_set(&exp->st_mode, (mode & 07777) | S_IFREG);
1022 }
1023
1024 if (to_set & FATTR_UID) {
1025 qatomic_set(&exp->st_uid, uid);
1026 }
1027
1028 if (to_set & FATTR_GID) {
1029 qatomic_set(&exp->st_gid, gid);
1030 }
1031
1032 return fuse_co_getattr(exp, out);
1033 }
1034
1035 /**
1036 * Open an inode. We only have a single inode in our exported filesystem, so we
1037 * just acknowledge the request.
1038 * Return the number of bytes written to *out on success, and -errno on error.
1039 */
1040 static ssize_t coroutine_fn GRAPH_RDLOCK
1041 fuse_co_open(FuseExport *exp, struct fuse_open_out *out)
1042 {
1043 *out = (struct fuse_open_out) {
1044 .open_flags = FOPEN_DIRECT_IO | FOPEN_PARALLEL_DIRECT_WRITES,
1045 };
1046 return sizeof(*out);
1047 }
1048
1049 /**
1050 * Handle client reads from the exported image. Allocates *bufptr and reads
1051 * data from the block device into that buffer.
1052 * Returns the buffer (read) size on success, and -errno on error.
1053 * Note: If the returned size is 0, *bufptr will be set to NULL.
1054 * After use, *bufptr must be freed via qemu_vfree().
1055 */
1056 static ssize_t coroutine_fn GRAPH_RDLOCK
1057 fuse_co_read(FuseExport *exp, void **bufptr, uint64_t offset, uint32_t size)
1058 {
1059 int64_t blk_len;
1060 void *buf;
1061 int ret;
1062
1063 /* Limited by max_read, should not happen */
1064 if (size > FUSE_MAX_READ_BYTES) {
1065 return -EINVAL;
1066 }
1067
1068 /**
1069 * Clients will expect short reads at EOF, so we have to limit
1070 * offset+size to the image length.
1071 */
1072 blk_len = blk_co_getlength(exp->common.blk);
1073 if (blk_len < 0) {
1074 return blk_len;
1075 }
1076
1077 if (offset >= blk_len) {
1078 /* Explicitly set to NULL because we return success here */
1079 *bufptr = NULL;
1080 return 0;
1081 }
1082
1083 if (offset + size > blk_len) {
1084 size = blk_len - offset;
1085 }
1086
1087 buf = qemu_try_blockalign(blk_bs(exp->common.blk), size);
1088 if (!buf) {
1089 return -ENOMEM;
1090 }
1091
1092 ret = blk_co_pread(exp->common.blk, offset, size, buf, 0);
1093 if (ret < 0) {
1094 qemu_vfree(buf);
1095 return ret;
1096 }
1097
1098 *bufptr = buf;
1099 return size;
1100 }
1101
1102 /**
1103 * Handle client writes to the exported image. @buf has the data to be written.
1104 * Return the number of bytes written to *out on success, and -errno on error.
1105 */
1106 static ssize_t coroutine_fn GRAPH_RDLOCK
1107 fuse_co_write(FuseExport *exp, struct fuse_write_out *out,
1108 uint64_t offset, uint32_t size, const void *buf)
1109 {
1110 int64_t blk_len;
1111 int ret;
1112
1113 QEMU_BUILD_BUG_ON(FUSE_MAX_WRITE_BYTES > BDRV_REQUEST_MAX_BYTES);
1114 /* Limited by max_write, should not happen */
1115 if (size > FUSE_MAX_WRITE_BYTES) {
1116 return -EINVAL;
1117 }
1118
1119 if (!exp->writable) {
1120 return -EACCES;
1121 }
1122
1123 /**
1124 * Clients will expect short writes at EOF, so we have to limit
1125 * offset+size to the image length.
1126 */
1127 blk_len = blk_co_getlength(exp->common.blk);
1128 if (blk_len < 0) {
1129 return blk_len;
1130 }
1131
1132 if (offset >= blk_len && !exp->growable) {
1133 *out = (struct fuse_write_out) {
1134 .size = 0,
1135 };
1136 return sizeof(*out);
1137 }
1138
1139 if (offset + size < offset) {
1140 return -EINVAL;
1141 } else if (offset + size > blk_len) {
1142 if (exp->growable) {
1143 ret = fuse_co_do_truncate(exp, offset + size, true,
1144 PREALLOC_MODE_OFF);
1145 if (ret < 0) {
1146 return ret;
1147 }
1148 } else {
1149 size = blk_len - offset;
1150 }
1151 }
1152
1153 ret = blk_co_pwrite(exp->common.blk, offset, size, buf, 0);
1154 if (ret < 0) {
1155 return ret;
1156 }
1157
1158 *out = (struct fuse_write_out) {
1159 .size = size,
1160 };
1161 return sizeof(*out);
1162 }
1163
1164 /**
1165 * Let clients perform various fallocate() operations.
1166 * Return 0 on success (no 'out' object), and -errno on error.
1167 */
1168 static ssize_t coroutine_fn GRAPH_RDLOCK
1169 fuse_co_fallocate(FuseExport *exp,
1170 uint64_t offset, uint64_t length, uint32_t mode)
1171 {
1172 int64_t blk_len;
1173 int ret;
1174
1175 if (!exp->writable) {
1176 return -EACCES;
1177 }
1178
1179 blk_len = blk_co_getlength(exp->common.blk);
1180 if (blk_len < 0) {
1181 return blk_len;
1182 }
1183
1184 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1185 if (mode & FALLOC_FL_KEEP_SIZE) {
1186 length = MIN(length, blk_len - offset);
1187 }
1188 #endif /* CONFIG_FALLOCATE_PUNCH_HOLE */
1189
1190 if (!mode) {
1191 /* We can only fallocate at the EOF with a truncate */
1192 if (offset < blk_len) {
1193 return -EOPNOTSUPP;
1194 }
1195
1196 if (offset > blk_len) {
1197 /* No preallocation needed here */
1198 ret = fuse_co_do_truncate(exp, offset, true, PREALLOC_MODE_OFF);
1199 if (ret < 0) {
1200 return ret;
1201 }
1202 }
1203
1204 ret = fuse_co_do_truncate(exp, offset + length, true,
1205 PREALLOC_MODE_FALLOC);
1206 }
1207 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1208 else if (mode & FALLOC_FL_PUNCH_HOLE) {
1209 if (!(mode & FALLOC_FL_KEEP_SIZE)) {
1210 return -EINVAL;
1211 }
1212
1213 do {
1214 int size = MIN(length, BDRV_REQUEST_MAX_BYTES);
1215
1216 ret = blk_co_pwrite_zeroes(exp->common.blk, offset, size,
1217 BDRV_REQ_MAY_UNMAP |
1218 BDRV_REQ_NO_FALLBACK);
1219 if (ret == -ENOTSUP) {
1220 /*
1221 * fallocate() specifies to return EOPNOTSUPP for unsupported
1222 * operations
1223 */
1224 ret = -EOPNOTSUPP;
1225 }
1226
1227 offset += size;
1228 length -= size;
1229 } while (ret == 0 && length > 0);
1230 }
1231 #endif /* CONFIG_FALLOCATE_PUNCH_HOLE */
1232 #ifdef CONFIG_FALLOCATE_ZERO_RANGE
1233 else if (mode & FALLOC_FL_ZERO_RANGE) {
1234 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + length > blk_len) {
1235 /* No need for zeroes, we are going to write them ourselves */
1236 ret = fuse_co_do_truncate(exp, offset + length, false,
1237 PREALLOC_MODE_OFF);
1238 if (ret < 0) {
1239 return ret;
1240 }
1241 }
1242
1243 do {
1244 int size = MIN(length, BDRV_REQUEST_MAX_BYTES);
1245
1246 ret = blk_co_pwrite_zeroes(exp->common.blk,
1247 offset, size, 0);
1248 offset += size;
1249 length -= size;
1250 } while (ret == 0 && length > 0);
1251 }
1252 #endif /* CONFIG_FALLOCATE_ZERO_RANGE */
1253 else {
1254 ret = -EOPNOTSUPP;
1255 }
1256
1257 return ret < 0 ? ret : 0;
1258 }
1259
1260 /**
1261 * Let clients fsync the exported image.
1262 * Return 0 on success (no 'out' object), and -errno on error.
1263 */
1264 static ssize_t coroutine_fn GRAPH_RDLOCK fuse_co_fsync(FuseExport *exp)
1265 {
1266 return blk_co_flush(exp->common.blk);
1267 }
1268
1269 /**
1270 * Called before an FD to the exported image is closed. (libfuse
1271 * notes this to be a way to return last-minute errors.)
1272 * Return 0 on success (no 'out' object), and -errno on error.
1273 */
1274 static ssize_t coroutine_fn GRAPH_RDLOCK fuse_co_flush(FuseExport *exp)
1275 {
1276 return blk_co_flush(exp->common.blk);
1277 }
1278
#ifdef CONFIG_FUSE_LSEEK
/**
 * Let clients inquire allocation status (SEEK_DATA / SEEK_HOLE).
 *
 * Starting at @offset, walks the block status of the exported image until a
 * range of the requested kind (data or hole) is found, and reports where that
 * range begins.  Any other @whence value yields -EINVAL.
 *
 * Return the number of bytes written to *out on success, and -errno on error.
 */
static ssize_t coroutine_fn GRAPH_RDLOCK
fuse_co_lseek(FuseExport *exp, struct fuse_lseek_out *out,
              uint64_t offset, uint32_t whence)
{
    bool want_data;

    switch (whence) {
    case SEEK_DATA:
        want_data = true;
        break;

    case SEEK_HOLE:
        want_data = false;
        break;

    default:
        return -EINVAL;
    }

    for (;;) {
        int64_t pnum;
        int status;

        status = bdrv_co_block_status_above(blk_bs(exp->common.blk), NULL,
                                            offset, INT64_MAX, &pnum, NULL,
                                            NULL);
        if (status < 0) {
            return status;
        }

        if (pnum == 0 && (status & BDRV_BLOCK_EOF)) {
            int64_t blk_len;

            /*
             * If blk_getlength() rounds (e.g. by sectors), then the
             * export length will be rounded, too.  However,
             * bdrv_block_status_above() may return EOF at unaligned
             * offsets.  We must not let this become visible and thus
             * always simulate a hole between @offset (the real EOF)
             * and @blk_len (the client-visible EOF).
             */
            blk_len = blk_co_getlength(exp->common.blk);
            if (blk_len < 0) {
                return blk_len;
            }

            /* There is no data at EOF, and nothing at all beyond it */
            if (want_data || offset > blk_len) {
                return -ENXIO;
            }

            /* SEEK_HOLE: report the (simulated) hole at EOF */
            break;
        }

        /* Does the range starting at @offset have the kind we look for? */
        if (!!(status & BDRV_BLOCK_DATA) == want_data) {
            break;
        }

        /* Safety check against infinite loops */
        if (pnum == 0) {
            return -ENXIO;
        }

        /* Wrong kind of range, skip it and look at the next one */
        offset += pnum;
    }

    *out = (struct fuse_lseek_out) {
        .offset = offset,
    };
    return sizeof(*out);
}
#endif
1354
1355 /**
1356 * Write a FUSE response to the given @fd.
1357 *
1358 * Effectively, writes out_hdr->common.len bytes of the buffer that is *out_hdr.
1359 *
1360 * @fd: FUSE file descriptor
1361 * @out_hdr: Request response header and request-specific response data
1362 */
1363 static int fuse_write_response(int fd, FuseRequestOutHeader *out_hdr)
1364 {
1365 size_t to_write = out_hdr->common.len;
1366 ssize_t ret;
1367
1368 /* Must at least write fuse_out_header */
1369 assert(to_write >= sizeof(out_hdr->common));
1370
1371 ret = RETRY_ON_EINTR(write(fd, out_hdr, to_write));
1372 if (ret < 0) {
1373 ret = -errno;
1374 error_report("Failed to write to FUSE device: %s", strerror(-ret));
1375 return ret;
1376 }
1377
1378 /* Short writes are unexpected, treat them as errors */
1379 if (ret != to_write) {
1380 error_report("Short write to FUSE device, wrote %zi of %zu bytes",
1381 ret, to_write);
1382 return -EIO;
1383 }
1384
1385 return 0;
1386 }
1387
1388 /**
1389 * Write a FUSE error response to @fd.
1390 *
1391 * @fd: FUSE file descriptor
1392 * @in_hdr: Incoming request header to which to respond
1393 * @err: Error code (-errno, must be negative!)
1394 */
1395 static int fuse_write_err(int fd, const struct fuse_in_header *in_hdr, int err)
1396 {
1397 FuseRequestOutHeader out_hdr = {
1398 .common = {
1399 .len = sizeof(out_hdr.common),
1400 /* FUSE expects negative error values */
1401 .error = err,
1402 .unique = in_hdr->unique,
1403 },
1404 };
1405
1406 return fuse_write_response(fd, &out_hdr);
1407 }
1408
1409 /**
1410 * Write a FUSE response to the given @fd, using separate buffers for the
1411 * response header and data.
1412 *
1413 * In contrast to fuse_write_response(), this function cannot return a full
1414 * FuseRequestOutHeader (i.e. including request-specific response structs),
1415 * but only FuseRequestOutHeader.common. The remaining data must be in
1416 * *buf.
1417 *
1418 * (Total length must be set in out_hdr->len.)
1419 *
1420 * @fd: FUSE file descriptor
1421 * @out_hdr: Request response header
1422 * @buf: Pointer to response data
1423 */
1424 static int fuse_write_buf_response(int fd,
1425 const struct fuse_out_header *out_hdr,
1426 const void *buf)
1427 {
1428 size_t to_write = out_hdr->len;
1429 struct iovec iov[2] = {
1430 { (void *)out_hdr, sizeof(*out_hdr) },
1431 { (void *)buf, to_write - sizeof(*out_hdr) },
1432 };
1433 ssize_t ret;
1434
1435 /* *buf length must not be negative */
1436 assert(to_write >= sizeof(*out_hdr));
1437
1438 ret = RETRY_ON_EINTR(writev(fd, iov, ARRAY_SIZE(iov)));
1439 if (ret < 0) {
1440 ret = -errno;
1441 error_report("Failed to write to FUSE device: %s", strerror(-ret));
1442 return ret;
1443 }
1444
1445 /* Short writes are unexpected, treat them as errors */
1446 if (ret != to_write) {
1447 error_report("Short write to FUSE device, wrote %zi of %zu bytes",
1448 ret, to_write);
1449 return -EIO;
1450 }
1451
1452 return 0;
1453 }
1454
1455 /**
1456 * Process a FUSE request, incl. writing the response.
1457 */
1458 static void coroutine_fn
1459 fuse_co_process_request(FuseQueue *q, const FuseRequestInHeader *in_hdr,
1460 const void *data_buffer)
1461 {
1462 FuseRequestOutHeader out_hdr;
1463 FuseExport *exp = q->exp;
1464 /* For read requests: Data to be returned */
1465 void *out_data_buffer = NULL;
1466 ssize_t ret;
1467
1468 GRAPH_RDLOCK_GUARD();
1469
1470 switch (in_hdr->common.opcode) {
1471 case FUSE_INIT:
1472 ret = fuse_co_init(exp, &out_hdr.init, &in_hdr->init);
1473 break;
1474
1475 case FUSE_DESTROY:
1476 ret = 0;
1477 break;
1478
1479 case FUSE_STATFS:
1480 ret = fuse_co_statfs(exp, &out_hdr.statfs);
1481 break;
1482
1483 case FUSE_OPEN:
1484 ret = fuse_co_open(exp, &out_hdr.open);
1485 break;
1486
1487 case FUSE_RELEASE:
1488 ret = 0;
1489 break;
1490
1491 case FUSE_LOOKUP:
1492 ret = -ENOENT; /* There is no node but the root node */
1493 break;
1494
1495 case FUSE_FORGET:
1496 case FUSE_BATCH_FORGET:
1497 /* These have no response, and there is nothing we need to do */
1498 return;
1499
1500 case FUSE_GETATTR:
1501 ret = fuse_co_getattr(exp, &out_hdr.attr);
1502 break;
1503
1504 case FUSE_SETATTR: {
1505 const struct fuse_setattr_in *in = &in_hdr->setattr;
1506 ret = fuse_co_setattr(exp, &out_hdr.attr,
1507 in->valid, in->size, in->mode, in->uid, in->gid);
1508 break;
1509 }
1510
1511 case FUSE_READ: {
1512 const struct fuse_read_in *in = &in_hdr->read;
1513 ret = fuse_co_read(exp, &out_data_buffer, in->offset, in->size);
1514 break;
1515 }
1516
1517 case FUSE_WRITE: {
1518 const struct fuse_write_in *in = &in_hdr->write;
1519 uint32_t req_len = in_hdr->common.len;
1520
1521 if (unlikely(req_len < sizeof(in_hdr->common) + sizeof(*in) +
1522 in->size)) {
1523 warn_report("FUSE WRITE truncated; received %zu bytes of %" PRIu32,
1524 req_len - sizeof(in_hdr->common) - sizeof(*in),
1525 in->size);
1526 ret = -EINVAL;
1527 break;
1528 }
1529
1530 /*
1531 * co_read_from_fuse_fd() has checked that in_hdr->len matches the
1532 * number of bytes read, which cannot exceed the max_write value we set
1533 * (FUSE_MAX_WRITE_BYTES). So we know that FUSE_MAX_WRITE_BYTES >=
1534 * in_hdr->len >= in->size + X, so this assertion must hold.
1535 */
1536 assert(in->size <= FUSE_MAX_WRITE_BYTES);
1537
1538 ret = fuse_co_write(exp, &out_hdr.write,
1539 in->offset, in->size, data_buffer);
1540 break;
1541 }
1542
1543 case FUSE_FALLOCATE: {
1544 const struct fuse_fallocate_in *in = &in_hdr->fallocate;
1545 ret = fuse_co_fallocate(exp, in->offset, in->length, in->mode);
1546 break;
1547 }
1548
1549 case FUSE_FSYNC:
1550 ret = fuse_co_fsync(exp);
1551 break;
1552
1553 case FUSE_FLUSH:
1554 ret = fuse_co_flush(exp);
1555 break;
1556
1557 #ifdef CONFIG_FUSE_LSEEK
1558 case FUSE_LSEEK: {
1559 const struct fuse_lseek_in *in = &in_hdr->lseek;
1560 ret = fuse_co_lseek(exp, &out_hdr.lseek, in->offset, in->whence);
1561 break;
1562 }
1563 #endif
1564
1565 default:
1566 ret = -ENOSYS;
1567 }
1568
1569 if (ret >= 0) {
1570 out_hdr.common = (struct fuse_out_header) {
1571 .len = sizeof(out_hdr.common) + ret,
1572 .unique = in_hdr->common.unique,
1573 };
1574 } else {
1575 /* fuse_read() must not return a buffer in case of error */
1576 assert(out_data_buffer == NULL);
1577
1578 out_hdr.common = (struct fuse_out_header) {
1579 .len = sizeof(out_hdr.common),
1580 /* FUSE expects negative errno values */
1581 .error = ret,
1582 .unique = in_hdr->common.unique,
1583 };
1584 }
1585
1586 if (out_data_buffer) {
1587 fuse_write_buf_response(q->fuse_fd, &out_hdr.common, out_data_buffer);
1588 qemu_vfree(out_data_buffer);
1589 } else {
1590 fuse_write_response(q->fuse_fd, &out_hdr);
1591 }
1592 }
1593
1594 const BlockExportDriver blk_exp_fuse = {
1595 .type = BLOCK_EXPORT_TYPE_FUSE,
1596 .instance_size = sizeof(FuseExport),
1597 .create = fuse_export_create,
1598 .delete = fuse_export_delete,
1599 .request_shutdown = fuse_export_shutdown,
1600 };