2 * Present a block device as a raw image through FUSE
4 * Copyright (c) 2020, 2025 Hanna Czenczek <hreitz@redhat.com>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; under version 2 or later of the License.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, see <http://www.gnu.org/licenses/>.
19 #define FUSE_USE_VERSION 31
21 #include "qemu/osdep.h"
22 #include "qemu/memalign.h"
24 #include "block/block_int-common.h"
25 #include "block/export.h"
26 #include "block/fuse.h"
27 #include "block/qapi.h"
28 #include "qapi/error.h"
29 #include "qapi/qapi-commands-block.h"
30 #include "qemu/coroutine.h"
31 #include "qemu/error-report.h"
32 #include "qemu/main-loop.h"
33 #include "system/block-backend.h"
34 #include "system/iothread.h"
37 #include <fuse_lowlevel.h>
39 #include "standard-headers/linux/fuse.h"
40 #include <sys/ioctl.h>
42 #if defined(CONFIG_FALLOCATE_ZERO_RANGE)
43 #include <linux/falloc.h>
50 /* Prevent overly long bounce buffer allocations */
51 #define FUSE_MAX_READ_BYTES (MIN(BDRV_REQUEST_MAX_BYTES, 1 * 1024 * 1024))
52 #define FUSE_MAX_WRITE_BYTES (64 * 1024)
55 * fuse_init_in structure before 7.36. We don't need the flags2 field added
56 * there, so we can work with the smaller older structure to stay compatible
59 struct fuse_init_in_compat {
62 uint32_t max_readahead;
66 typedef struct FuseRequestInHeader {
67 struct fuse_in_header common;
68 /* All supported requests */
70 struct fuse_init_in_compat init;
71 struct fuse_open_in open;
72 struct fuse_setattr_in setattr;
73 struct fuse_read_in read;
74 struct fuse_write_in write;
75 struct fuse_fallocate_in fallocate;
76 #ifdef CONFIG_FUSE_LSEEK
77 struct fuse_lseek_in lseek;
80 } FuseRequestInHeader;
82 typedef struct FuseRequestOutHeader {
83 struct fuse_out_header common;
84 /* All supported requests */
86 struct fuse_init_out init;
87 struct fuse_statfs_out statfs;
88 struct fuse_open_out open;
89 struct fuse_attr_out attr;
90 struct fuse_write_out write;
91 #ifdef CONFIG_FUSE_LSEEK
92 struct fuse_lseek_out lseek;
95 } FuseRequestOutHeader;
97 typedef union FuseRequestInHeaderBuf {
98 struct FuseRequestInHeader structured;
101 * Part of the request header that is filled for write requests
102 * (Needed because we want the data to go into a different buffer, to
103 * avoid having to use a bounce buffer)
105 char head[sizeof(struct fuse_in_header) +
106 sizeof(struct fuse_write_in)];
108 * Rest of the request header for requests that have a longer header
109 * than write requests
111 char tail[sizeof(FuseRequestInHeader) -
112 (sizeof(struct fuse_in_header) +
113 sizeof(struct fuse_write_in))];
115 } FuseRequestInHeaderBuf;
117 QEMU_BUILD_BUG_ON(sizeof(FuseRequestInHeaderBuf) !=
118 sizeof(FuseRequestInHeader));
119 QEMU_BUILD_BUG_ON(sizeof(((FuseRequestInHeaderBuf *)0)->head) +
120 sizeof(((FuseRequestInHeaderBuf *)0)->tail) !=
121 sizeof(FuseRequestInHeader));
123 typedef struct FuseExport FuseExport;
126 * One FUSE "queue", representing one FUSE FD from which requests are fetched
127 * and processed. Each queue is tied to an AioContext.
129 typedef struct FuseQueue {
136 * Cached buffer to receive the data of WRITE requests. Cached because:
137 * To read requests, we put a FuseRequestInHeaderBuf (FRIHB) object on the
138 * stack, and a (WRITE data) buffer on the heap. We pass FRIHB.head and the
139 * data buffer to readv(). This way, for WRITE requests, we get exactly
140 * their data in the data buffer and can avoid bounce buffering.
141 * However, for non-WRITE requests, some of the header may end up in the
142 * data buffer, so we will need to copy that back into the FRIHB object, and
143 * then we don't need the heap buffer anymore. That is why we cache it, so
144 * we can trivially reuse it between non-WRITE requests.
146 * Note that these data buffers and thus req_write_data_cached are allocated
147 * via blk_blockalign() and thus need to be freed via qemu_vfree().
149 void *req_write_data_cached;
155 struct fuse_session *fuse_session;
156 unsigned int in_flight; /* atomic */
157 bool mounted, fd_handler_set_up;
160 * Set when there was an unrecoverable error and no requests should be read
161 * from the device anymore (basically only in case of something we would
162 * consider a kernel bug). Access atomically.
169 * True if this export should follow the generic export's AioContext.
170 * Will be false if the queues' AioContexts have been explicitly set by the
171 * user, i.e. are expected to stay in those contexts.
172 * (I.e. is always false if there is more than one queue.)
174 bool follow_aio_context;
179 /* Whether allow_other was used as a mount option or not */
189 * Verify that the size of FuseRequestInHeaderBuf.head plus the data
190 * buffer are big enough to be accepted by the FUSE kernel driver.
192 QEMU_BUILD_BUG_ON(sizeof(((FuseRequestInHeaderBuf *)0)->head) +
193 FUSE_MAX_WRITE_BYTES <
194 FUSE_MIN_READ_BUFFER);
196 static GHashTable *exports;
198 static void fuse_export_shutdown(BlockExport *exp);
199 static void fuse_export_delete(BlockExport *exp);
200 static void fuse_export_halt(FuseExport *exp);
202 static void init_exports_table(void);
204 static int mount_fuse_export(FuseExport *exp, Error **errp);
205 static int clone_fuse_fd(int fd, Error **errp);
207 static bool is_regular_file(const char *path, Error **errp);
209 static void read_from_fuse_fd(void *opaque);
210 static void coroutine_fn
211 fuse_co_process_request(FuseQueue *q, const FuseRequestInHeader *in_hdr,
212 const void *data_buffer);
213 static int fuse_write_err(int fd, const struct fuse_in_header *in_hdr, int err);
215 static void fuse_inc_in_flight(FuseExport *exp)
217 if (qatomic_fetch_inc(&exp->in_flight) == 0) {
218 /* Prevent export from being deleted */
219 blk_exp_ref(&exp->common);
223 static void fuse_dec_in_flight(FuseExport *exp)
225 if (qatomic_fetch_dec(&exp->in_flight) == 1) {
226 /* Wake AIO_WAIT_WHILE() */
229 /* Now the export can be deleted */
230 blk_exp_unref(&exp->common);
235 * Attach FUSE FD read handler.
237 static void fuse_attach_handlers(FuseExport *exp)
239 if (qatomic_read(&exp->halted)) {
243 for (int i = 0; i < exp->num_queues; i++) {
244 aio_set_fd_handler(exp->queues[i].ctx, exp->queues[i].fuse_fd,
245 read_from_fuse_fd, NULL, NULL, NULL,
248 exp->fd_handler_set_up = true;
252 * Detach FUSE FD read handler.
254 static void fuse_detach_handlers(FuseExport *exp)
256 for (int i = 0; i < exp->num_queues; i++) {
257 aio_set_fd_handler(exp->queues[i].ctx, exp->queues[i].fuse_fd,
258 NULL, NULL, NULL, NULL, NULL);
260 exp->fd_handler_set_up = false;
263 static void fuse_export_drained_begin(void *opaque)
265 fuse_detach_handlers(opaque);
268 static void fuse_export_drained_end(void *opaque)
270 FuseExport *exp = opaque;
272 /* Refresh AioContext in case it changed */
273 exp->common.ctx = blk_get_aio_context(exp->common.blk);
274 if (exp->follow_aio_context) {
275 assert(exp->num_queues == 1);
276 exp->queues[0].ctx = exp->common.ctx;
279 fuse_attach_handlers(exp);
282 static bool fuse_export_drained_poll(void *opaque)
284 FuseExport *exp = opaque;
286 return qatomic_read(&exp->in_flight) > 0;
289 static const BlockDevOps fuse_export_blk_dev_ops = {
290 .drained_begin = fuse_export_drained_begin,
291 .drained_end = fuse_export_drained_end,
292 .drained_poll = fuse_export_drained_poll,
295 static int fuse_export_create(BlockExport *blk_exp,
296 BlockExportOptions *blk_exp_args,
297 AioContext *const *multithread,
301 ERRP_GUARD(); /* ensure clean-up even with error_fatal */
302 FuseExport *exp = container_of(blk_exp, FuseExport, common);
303 BlockExportOptionsFuse *args = &blk_exp_args->u.fuse;
307 assert(blk_exp_args->type == BLOCK_EXPORT_TYPE_FUSE);
310 /* Guaranteed by common export code */
311 assert(mt_count >= 1);
313 exp->follow_aio_context = false;
314 exp->num_queues = mt_count;
315 exp->queues = g_new(FuseQueue, mt_count);
317 for (size_t i = 0; i < mt_count; i++) {
318 exp->queues[i] = (FuseQueue) {
320 .ctx = multithread[i],
325 /* Guaranteed by common export code */
326 assert(mt_count == 0);
328 exp->follow_aio_context = true;
330 exp->queues = g_new(FuseQueue, 1);
331 exp->queues[0] = (FuseQueue) {
333 .ctx = exp->common.ctx,
338 /* For growable and writable exports, take the RESIZE permission */
339 if (args->growable || blk_exp_args->writable) {
340 uint64_t blk_perm, blk_shared_perm;
342 blk_get_perm(exp->common.blk, &blk_perm, &blk_shared_perm);
344 ret = blk_set_perm(exp->common.blk, blk_perm | BLK_PERM_RESIZE,
345 blk_shared_perm, errp);
351 blk_set_dev_ops(exp->common.blk, &fuse_export_blk_dev_ops, exp);
354 * We handle draining ourselves using an in-flight counter and by disabling
355 * the FUSE fd handler. Do not queue BlockBackend requests, they need to
356 * complete so the in-flight counter reaches zero.
358 blk_set_disable_request_queuing(exp->common.blk, true);
360 init_exports_table();
363 * It is important to do this check before calling is_regular_file() --
364 * that function will do a stat(), which we would have to handle if we
365 * already exported something on @mountpoint. But we cannot, because
366 * we are currently caught up here.
367 * (Note that ideally we would want to resolve relative paths here,
368 * but bdrv_make_absolute_filename() might do the wrong thing for
369 * paths that contain colons, and realpath() would resolve symlinks,
370 * which we do not want: The mount point is not going to be the
371 * symlink's destination, but the link itself.)
372 * So this will not catch all potential clashes, but hopefully at
373 * least the most common one of specifying exactly the same path
376 if (g_hash_table_contains(exports, args->mountpoint)) {
377 error_setg(errp, "There already is a FUSE export on '%s'",
383 if (!is_regular_file(args->mountpoint, errp)) {
388 exp->mountpoint = g_strdup(args->mountpoint);
389 exp->writable = blk_exp_args->writable;
390 exp->growable = args->growable;
393 if (!args->has_allow_other) {
394 args->allow_other = FUSE_EXPORT_ALLOW_OTHER_AUTO;
397 st_mode = S_IFREG | S_IRUSR;
401 qatomic_set(&exp->st_mode, st_mode);
402 qatomic_set(&exp->st_uid, getuid());
403 qatomic_set(&exp->st_gid, getgid());
405 if (args->allow_other == FUSE_EXPORT_ALLOW_OTHER_AUTO) {
406 /* Try allow_other == true first, ignore errors */
407 exp->allow_other = true;
408 ret = mount_fuse_export(exp, NULL);
410 exp->allow_other = false;
411 ret = mount_fuse_export(exp, errp);
414 exp->allow_other = args->allow_other == FUSE_EXPORT_ALLOW_OTHER_ON;
415 ret = mount_fuse_export(exp, errp);
421 g_hash_table_insert(exports, g_strdup(exp->mountpoint), NULL);
423 assert(exp->num_queues >= 1);
424 exp->queues[0].fuse_fd = fuse_session_fd(exp->fuse_session);
425 ret = qemu_fcntl_addfl(exp->queues[0].fuse_fd, O_NONBLOCK);
427 error_setg_errno(errp, -ret, "Failed to make FUSE FD non-blocking");
431 for (int i = 1; i < exp->num_queues; i++) {
432 int fd = clone_fuse_fd(exp->queues[0].fuse_fd, errp);
437 exp->queues[i].fuse_fd = fd;
440 fuse_attach_handlers(exp);
444 fuse_export_shutdown(blk_exp);
445 fuse_export_delete(blk_exp);
450 * Allocates the global @exports hash table.
452 static void init_exports_table(void)
458 exports = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, NULL);
462 * Create exp->fuse_session and mount it. Expects exp->mountpoint,
463 * exp->writable, and exp->allow_other to be set as intended for the mount.
465 static int mount_fuse_export(FuseExport *exp, Error **errp)
467 const char *fuse_argv[4];
469 struct fuse_args fuse_args;
472 * We just create the session for mounting/unmounting, no need to provide
473 * any operations. However, since libfuse commit 52a633a5d, we have to
474 * provide some op struct and cannot just pass NULL (even though the commit
475 * message ("allow passing ops as NULL") seems to imply the exact opposite,
476 * as does the comment added to fuse_session_new_fn() ("To create a no-op
477 * session just for mounting pass op as NULL.").
478 * This is how said libfuse commit implements a no-op session internally, so
479 * do it the same way.
481 static const struct fuse_lowlevel_ops null_ops = { 0 };
484 * Note that these mount options differ from what we would pass to a direct
486 * - nosuid, nodev, and noatime are not understood by the kernel; libfuse
487 * uses those options to construct the mount flags (MS_*)
488 * - The FUSE kernel driver requires additional options (fd, rootmode,
489 * user_id, group_id); these will be set by libfuse.
490 * Note that max_read is set here, while max_write is set via the FUSE INIT
493 mount_opts = g_strdup_printf("%s,nosuid,nodev,noatime,max_read=%zu,"
494 "default_permissions%s",
495 exp->writable ? "rw" : "ro",
497 exp->allow_other ? ",allow_other" : "");
499 fuse_argv[0] = ""; /* Dummy program name */
501 fuse_argv[2] = mount_opts;
503 fuse_args = (struct fuse_args)FUSE_ARGS_INIT(3, (char **)fuse_argv);
505 exp->fuse_session = fuse_session_new(&fuse_args, &null_ops,
506 sizeof(null_ops), NULL);
508 if (!exp->fuse_session) {
509 error_setg(errp, "Failed to set up FUSE session");
513 ret = fuse_session_mount(exp->fuse_session, exp->mountpoint);
515 error_setg(errp, "Failed to mount FUSE session to export");
524 fuse_session_destroy(exp->fuse_session);
525 exp->fuse_session = NULL;
530 * Allocate a buffer to receive WRITE data, or take the cached one.
532 static void *get_write_data_buffer(FuseQueue *q)
534 if (q->req_write_data_cached) {
535 void *cached = q->req_write_data_cached;
536 q->req_write_data_cached = NULL;
539 return blk_blockalign(q->exp->common.blk, FUSE_MAX_WRITE_BYTES);
544 * Release a WRITE data buffer, possibly reusing it for a subsequent request.
546 static void release_write_data_buffer(FuseQueue *q, void **buffer)
552 if (!q->req_write_data_cached) {
553 q->req_write_data_cached = *buffer;
561 * Return the length of the specific operation's own in_header.
562 * Return -ENOSYS if the operation is not supported.
564 static ssize_t req_op_hdr_len(const FuseRequestInHeader *in_hdr)
566 switch (in_hdr->common.opcode) {
568 return sizeof(in_hdr->init);
570 return sizeof(in_hdr->open);
572 return sizeof(in_hdr->setattr);
574 return sizeof(in_hdr->read);
576 return sizeof(in_hdr->write);
578 return sizeof(in_hdr->fallocate);
579 #ifdef CONFIG_FUSE_LSEEK
581 return sizeof(in_hdr->lseek);
588 case FUSE_BATCH_FORGET:
592 /* These requests don't have their own header or we don't care */
600 * Clone the given /dev/fuse file descriptor, yielding a second FD from which
601 * requests can be pulled for the associated filesystem. Returns an FD on
602 * success, and -errno on error.
604 static int clone_fuse_fd(int fd, Error **errp)
606 uint32_t src_fd = fd;
611 * The name "/dev/fuse" is fixed, see libfuse's lib/fuse_loop_mt.c
612 * (fuse_clone_chan()).
614 new_fd = open("/dev/fuse", O_RDWR | O_CLOEXEC | O_NONBLOCK);
617 error_setg_errno(errp, errno, "Failed to open /dev/fuse");
621 ret = ioctl(new_fd, FUSE_DEV_IOC_CLONE, &src_fd);
624 error_setg_errno(errp, errno, "Failed to clone FUSE FD");
633 * Try to read a single request from the FUSE FD.
634 * Takes a FuseQueue pointer in `opaque`.
636 * Assumes the export's in-flight counter has already been incremented.
638 * If a request is available, process it.
640 static void coroutine_fn co_read_from_fuse_fd(void *opaque)
642 FuseQueue *q = opaque;
643 int fuse_fd = q->fuse_fd;
644 FuseExport *exp = q->exp;
646 FuseRequestInHeaderBuf in_hdr_buf;
647 const FuseRequestInHeader *in_hdr;
648 void *data_buffer = NULL;
652 if (unlikely(qatomic_read(&exp->halted))) {
656 data_buffer = get_write_data_buffer(q);
658 /* Construct the I/O vector to hold the FUSE request */
659 iov[0] = (struct iovec) { &in_hdr_buf.head, sizeof(in_hdr_buf.head) };
660 iov[1] = (struct iovec) { data_buffer, FUSE_MAX_WRITE_BYTES };
661 ret = RETRY_ON_EINTR(readv(fuse_fd, iov, ARRAY_SIZE(iov)));
662 if (ret < 0 && errno == EAGAIN) {
663 /* No request available */
665 } else if (unlikely(ret < 0)) {
666 error_report("Failed to read from FUSE device: %s", strerror(errno));
670 if (unlikely(ret < sizeof(in_hdr->common))) {
671 error_report("Incomplete read from FUSE device, expected at least %zu "
672 "bytes, read %zi bytes; cannot trust subsequent "
673 "requests, halting the export",
674 sizeof(in_hdr->common), ret);
675 fuse_export_halt(exp);
678 in_hdr = &in_hdr_buf.structured;
680 if (unlikely(ret != in_hdr->common.len)) {
681 error_report("Number of bytes read from FUSE device does not match "
682 "request size, expected %" PRIu32 " bytes, read %zi "
683 "bytes; cannot trust subsequent requests, halting the "
685 in_hdr->common.len, ret);
686 fuse_export_halt(exp);
690 op_hdr_len = req_op_hdr_len(in_hdr);
691 if (op_hdr_len < 0) {
692 fuse_write_err(fuse_fd, &in_hdr->common, op_hdr_len);
696 if (unlikely(ret < sizeof(in_hdr->common) + op_hdr_len)) {
697 error_report("FUSE request truncated, expected %zu bytes, read %zi "
699 sizeof(in_hdr->common) + op_hdr_len, ret);
700 fuse_write_err(fuse_fd, &in_hdr->common, -EINVAL);
705 * Only WRITE uses the write data buffer, so for non-WRITE requests longer
706 * than .head, we need to copy any data that spilled into data_buffer into
707 * .tail. Then we can release the write data buffer.
709 if (in_hdr->common.opcode != FUSE_WRITE) {
710 if (ret > sizeof(in_hdr_buf.head)) {
712 /* Limit size to prevent overflow */
713 len = MIN(ret - sizeof(in_hdr_buf.head), sizeof(in_hdr_buf.tail));
714 memcpy(in_hdr_buf.tail, data_buffer, len);
717 release_write_data_buffer(q, &data_buffer);
720 fuse_co_process_request(q, in_hdr, data_buffer);
723 release_write_data_buffer(q, &data_buffer);
724 fuse_dec_in_flight(exp);
728 * Try to read and process a single request from the FUSE FD.
729 * (To be used as a handler for when the FUSE FD becomes readable.)
730 * Takes a FuseQueue pointer in `opaque`.
732 static void read_from_fuse_fd(void *opaque)
734 FuseQueue *q = opaque;
737 co = qemu_coroutine_create(co_read_from_fuse_fd, q);
738 /* Decremented by co_read_from_fuse_fd() */
739 fuse_inc_in_flight(q->exp);
740 qemu_coroutine_enter(co);
743 static void fuse_export_shutdown(BlockExport *blk_exp)
745 FuseExport *exp = container_of(blk_exp, FuseExport, common);
747 if (exp->fd_handler_set_up) {
748 fuse_detach_handlers(exp);
751 if (exp->mountpoint) {
753 * Safe to drop now, because we will not handle any requests for this
754 * export anymore anyway (at least not from the main thread).
756 g_hash_table_remove(exports, exp->mountpoint);
760 static void fuse_export_delete(BlockExport *blk_exp)
762 FuseExport *exp = container_of(blk_exp, FuseExport, common);
764 for (int i = 0; i < exp->num_queues; i++) {
765 FuseQueue *q = &exp->queues[i];
767 /* Queue 0's FD belongs to the FUSE session */
768 if (i > 0 && q->fuse_fd >= 0) {
771 qemu_vfree(q->req_write_data_cached);
775 if (exp->fuse_session) {
777 fuse_session_unmount(exp->fuse_session);
780 fuse_session_destroy(exp->fuse_session);
783 g_free(exp->mountpoint);
787 * Halt the export: Detach FD handlers, and set exp->halted to true, preventing
788 * fuse_attach_handlers() from re-attaching them, therefore stopping all further
789 * request processing.
791 * Call this function when an unrecoverable error happens that makes processing
792 * all future requests unreliable.
794 static void fuse_export_halt(FuseExport *exp)
796 qatomic_set(&exp->halted, true);
797 fuse_detach_handlers(exp);
801 * Check whether @path points to a regular file. If not, put an
802 * appropriate message into *errp.
804 static bool is_regular_file(const char *path, Error **errp)
809 ret = stat(path, &statbuf);
811 error_setg_errno(errp, errno, "Failed to stat '%s'", path);
815 if (!S_ISREG(statbuf.st_mode)) {
816 error_setg(errp, "'%s' is not a regular file", path);
825 * Return the number of bytes written to *out on success, and -errno on error.
827 static ssize_t coroutine_fn GRAPH_RDLOCK
828 fuse_co_init(FuseExport *exp, struct fuse_init_out *out,
829 const struct fuse_init_in_compat *in)
831 const uint32_t supported_flags = FUSE_ASYNC_READ | FUSE_ASYNC_DIO;
833 if (in->major != 7) {
834 error_report("FUSE major version mismatch: We have 7, but kernel has %"
839 /* 2007's 7.9 added fuse_attr.blksize; working around that would be hard */
841 error_report("FUSE minor version too old: 9 required, but kernel has %"
846 *out = (struct fuse_init_out) {
848 .minor = MIN(FUSE_KERNEL_MINOR_VERSION, in->minor),
849 .max_readahead = in->max_readahead,
850 .max_write = FUSE_MAX_WRITE_BYTES,
851 .flags = in->flags & supported_flags,
854 /* libfuse maximum: 2^16 - 1 */
855 .max_background = UINT16_MAX,
857 /* libfuse default: max_background * 3 / 4 */
858 .congestion_threshold = (int)UINT16_MAX * 3 / 4,
860 /* libfuse default: 1 */
864 * probably unneeded without FUSE_MAX_PAGES, but this would be the
867 .max_pages = DIV_ROUND_UP(FUSE_MAX_WRITE_BYTES,
868 qemu_real_host_page_size()),
870 /* Only needed for mappings (i.e. DAX) */
875 * Before 7.23, fuse_init_out is shorter.
876 * Drop the tail (time_gran, max_pages, map_alignment).
878 return out->minor >= 23 ? sizeof(*out) : FUSE_COMPAT_22_INIT_OUT_SIZE;
882 * Return some filesystem information, just to not break e.g. `df`.
884 static ssize_t coroutine_fn GRAPH_RDLOCK
885 fuse_co_statfs(FuseExport *exp, struct fuse_statfs_out *out)
887 BlockDriverState *root_bs;
888 uint32_t opt_transfer = 512;
890 root_bs = blk_bs(exp->common.blk);
892 opt_transfer = root_bs->bl.opt_transfer;
894 opt_transfer = root_bs->bl.request_alignment;
896 opt_transfer = MAX(opt_transfer, 512);
899 *out = (struct fuse_statfs_out) {
900 /* These are the fields libfuse sets by default */
903 .bsize = opt_transfer,
910 * Let clients get file attributes (i.e., stat() the file).
911 * Return the number of bytes written to *out on success, and -errno on error.
913 static ssize_t coroutine_fn GRAPH_RDLOCK
914 fuse_co_getattr(FuseExport *exp, struct fuse_attr_out *out)
916 int64_t length, allocated_blocks;
917 time_t now = time(NULL);
919 length = blk_co_getlength(exp->common.blk);
924 allocated_blocks = bdrv_co_get_allocated_file_size(blk_bs(exp->common.blk));
925 if (allocated_blocks <= 0) {
926 allocated_blocks = DIV_ROUND_UP(length, 512);
928 allocated_blocks = DIV_ROUND_UP(allocated_blocks, 512);
931 *out = (struct fuse_attr_out) {
935 .mode = qatomic_read(&exp->st_mode),
937 .uid = qatomic_read(&exp->st_uid),
938 .gid = qatomic_read(&exp->st_gid),
940 .blksize = blk_bs(exp->common.blk)->bl.request_alignment,
941 .blocks = allocated_blocks,
951 static int coroutine_fn GRAPH_RDLOCK
952 fuse_co_do_truncate(const FuseExport *exp, int64_t size, bool req_zero_write,
953 PreallocMode prealloc)
955 BdrvRequestFlags truncate_flags = 0;
957 if (req_zero_write) {
958 truncate_flags |= BDRV_REQ_ZERO_WRITE;
961 return blk_co_truncate(exp->common.blk, size, true, prealloc,
962 truncate_flags, NULL);
966 * Let clients set file attributes. Only resizing and changing
967 * permissions (st_mode, st_uid, st_gid) is allowed.
968 * Changing permissions is only allowed as far as it will actually
969 * permit access: Read-only exports cannot be given +w, and exports
970 * without allow_other cannot be given a different UID or GID, and
971 * they cannot be given non-owner access.
972 * Return the number of bytes written to *out on success, and -errno on error.
974 static ssize_t coroutine_fn GRAPH_RDLOCK
975 fuse_co_setattr(FuseExport *exp, struct fuse_attr_out *out, uint32_t to_set,
976 uint64_t size, uint32_t mode, uint32_t uid, uint32_t gid)
981 /* SIZE and MODE are actually supported, the others can be safely ignored */
982 supported_attrs = FATTR_SIZE | FATTR_MODE |
983 FATTR_FH | FATTR_LOCKOWNER | FATTR_KILL_SUIDGID;
984 if (exp->allow_other) {
985 supported_attrs |= FATTR_UID | FATTR_GID;
988 if (to_set & ~supported_attrs) {
992 /* Do some argument checks first before committing to anything */
993 if (to_set & FATTR_MODE) {
995 * Without allow_other, non-owners can never access the export, so do
996 * not allow setting permissions for them
998 if (!exp->allow_other && (mode & (S_IRWXG | S_IRWXO)) != 0) {
1002 /* +w for read-only exports makes no sense, disallow it */
1003 if (!exp->writable && (mode & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0) {
1008 if (to_set & FATTR_SIZE) {
1009 if (!exp->writable) {
1013 ret = fuse_co_do_truncate(exp, size, true, PREALLOC_MODE_OFF);
1019 if (to_set & FATTR_MODE) {
1020 /* Ignore FUSE-supplied file type, only change the mode */
1021 qatomic_set(&exp->st_mode, (mode & 07777) | S_IFREG);
1024 if (to_set & FATTR_UID) {
1025 qatomic_set(&exp->st_uid, uid);
1028 if (to_set & FATTR_GID) {
1029 qatomic_set(&exp->st_gid, gid);
1032 return fuse_co_getattr(exp, out);
1036 * Open an inode. We only have a single inode in our exported filesystem, so we
1037 * just acknowledge the request.
1038 * Return the number of bytes written to *out on success, and -errno on error.
1040 static ssize_t coroutine_fn GRAPH_RDLOCK
1041 fuse_co_open(FuseExport *exp, struct fuse_open_out *out)
1043 *out = (struct fuse_open_out) {
1044 .open_flags = FOPEN_DIRECT_IO | FOPEN_PARALLEL_DIRECT_WRITES,
1046 return sizeof(*out);
1050 * Handle client reads from the exported image. Allocates *bufptr and reads
1051 * data from the block device into that buffer.
1052 * Returns the buffer (read) size on success, and -errno on error.
1053 * Note: If the returned size is 0, *bufptr will be set to NULL.
1054 * After use, *bufptr must be freed via qemu_vfree().
1056 static ssize_t coroutine_fn GRAPH_RDLOCK
1057 fuse_co_read(FuseExport *exp, void **bufptr, uint64_t offset, uint32_t size)
1063 /* Limited by max_read, should not happen */
1064 if (size > FUSE_MAX_READ_BYTES) {
1069 * Clients will expect short reads at EOF, so we have to limit
1070 * offset+size to the image length.
1072 blk_len = blk_co_getlength(exp->common.blk);
1077 if (offset >= blk_len) {
1078 /* Explicitly set to NULL because we return success here */
1083 if (offset + size > blk_len) {
1084 size = blk_len - offset;
1087 buf = qemu_try_blockalign(blk_bs(exp->common.blk), size);
1092 ret = blk_co_pread(exp->common.blk, offset, size, buf, 0);
1103 * Handle client writes to the exported image. @buf has the data to be written.
1104 * Return the number of bytes written to *out on success, and -errno on error.
1106 static ssize_t coroutine_fn GRAPH_RDLOCK
1107 fuse_co_write(FuseExport *exp, struct fuse_write_out *out,
1108 uint64_t offset, uint32_t size, const void *buf)
1113 QEMU_BUILD_BUG_ON(FUSE_MAX_WRITE_BYTES > BDRV_REQUEST_MAX_BYTES);
1114 /* Limited by max_write, should not happen */
1115 if (size > FUSE_MAX_WRITE_BYTES) {
1119 if (!exp->writable) {
1124 * Clients will expect short writes at EOF, so we have to limit
1125 * offset+size to the image length.
1127 blk_len = blk_co_getlength(exp->common.blk);
1132 if (offset >= blk_len && !exp->growable) {
1133 *out = (struct fuse_write_out) {
1136 return sizeof(*out);
1139 if (offset + size < offset) {
1141 } else if (offset + size > blk_len) {
1142 if (exp->growable) {
1143 ret = fuse_co_do_truncate(exp, offset + size, true,
1149 size = blk_len - offset;
1153 ret = blk_co_pwrite(exp->common.blk, offset, size, buf, 0);
1158 *out = (struct fuse_write_out) {
1161 return sizeof(*out);
1165 * Let clients perform various fallocate() operations.
1166 * Return 0 on success (no 'out' object), and -errno on error.
1168 static ssize_t coroutine_fn GRAPH_RDLOCK
1169 fuse_co_fallocate(FuseExport *exp,
1170 uint64_t offset, uint64_t length, uint32_t mode)
1175 if (!exp->writable) {
1179 blk_len = blk_co_getlength(exp->common.blk);
1184 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1185 if (mode & FALLOC_FL_KEEP_SIZE) {
1186 length = MIN(length, blk_len - offset);
1188 #endif /* CONFIG_FALLOCATE_PUNCH_HOLE */
1191 /* We can only fallocate at the EOF with a truncate */
1192 if (offset < blk_len) {
1196 if (offset > blk_len) {
1197 /* No preallocation needed here */
1198 ret = fuse_co_do_truncate(exp, offset, true, PREALLOC_MODE_OFF);
1204 ret = fuse_co_do_truncate(exp, offset + length, true,
1205 PREALLOC_MODE_FALLOC);
1207 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1208 else if (mode & FALLOC_FL_PUNCH_HOLE) {
1209 if (!(mode & FALLOC_FL_KEEP_SIZE)) {
1214 int size = MIN(length, BDRV_REQUEST_MAX_BYTES);
1216 ret = blk_co_pwrite_zeroes(exp->common.blk, offset, size,
1217 BDRV_REQ_MAY_UNMAP |
1218 BDRV_REQ_NO_FALLBACK);
1219 if (ret == -ENOTSUP) {
1221 * fallocate() specifies to return EOPNOTSUPP for unsupported
1229 } while (ret == 0 && length > 0);
1231 #endif /* CONFIG_FALLOCATE_PUNCH_HOLE */
1232 #ifdef CONFIG_FALLOCATE_ZERO_RANGE
1233 else if (mode & FALLOC_FL_ZERO_RANGE) {
1234 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + length > blk_len) {
1235 /* No need for zeroes, we are going to write them ourselves */
1236 ret = fuse_co_do_truncate(exp, offset + length, false,
1244 int size = MIN(length, BDRV_REQUEST_MAX_BYTES);
1246 ret = blk_co_pwrite_zeroes(exp->common.blk,
1250 } while (ret == 0 && length > 0);
1252 #endif /* CONFIG_FALLOCATE_ZERO_RANGE */
1257 return ret < 0 ? ret : 0;
1261 * Let clients fsync the exported image.
1262 * Return 0 on success (no 'out' object), and -errno on error.
1264 static ssize_t coroutine_fn GRAPH_RDLOCK fuse_co_fsync(FuseExport *exp)
1266 return blk_co_flush(exp->common.blk);
1270 * Called before an FD to the exported image is closed. (libfuse
1271 * notes this to be a way to return last-minute errors.)
1272 * Return 0 on success (no 'out' object), and -errno on error.
1274 static ssize_t coroutine_fn GRAPH_RDLOCK fuse_co_flush(FuseExport *exp)
1276 return blk_co_flush(exp->common.blk);
1279 #ifdef CONFIG_FUSE_LSEEK
1281 * Let clients inquire allocation status.
1282 * Return the number of bytes written to *out on success, and -errno on error.
1284 static ssize_t coroutine_fn GRAPH_RDLOCK
1285 fuse_co_lseek(FuseExport *exp, struct fuse_lseek_out *out,
1286 uint64_t offset, uint32_t whence)
1288 if (whence != SEEK_HOLE && whence != SEEK_DATA) {
1296 ret = bdrv_co_block_status_above(blk_bs(exp->common.blk), NULL,
1297 offset, INT64_MAX, &pnum, NULL, NULL);
1302 if (!pnum && (ret & BDRV_BLOCK_EOF)) {
1306 * If blk_getlength() rounds (e.g. by sectors), then the
1307 * export length will be rounded, too. However,
1308 * bdrv_block_status_above() may return EOF at unaligned
1309 * offsets. We must not let this become visible and thus
1310 * always simulate a hole between @offset (the real EOF)
1311 * and @blk_len (the client-visible EOF).
1314 blk_len = blk_co_getlength(exp->common.blk);
1319 if (offset > blk_len || whence == SEEK_DATA) {
1323 *out = (struct fuse_lseek_out) {
1326 return sizeof(*out);
1329 if (ret & BDRV_BLOCK_DATA) {
1330 if (whence == SEEK_DATA) {
1331 *out = (struct fuse_lseek_out) {
1334 return sizeof(*out);
1337 if (whence == SEEK_HOLE) {
1338 *out = (struct fuse_lseek_out) {
1341 return sizeof(*out);
1345 /* Safety check against infinite loops */
1356 * Write a FUSE response to the given @fd.
1358 * Effectively, writes out_hdr->common.len bytes of the buffer that is *out_hdr.
1360 * @fd: FUSE file descriptor
1361 * @out_hdr: Request response header and request-specific response data
1363 static int fuse_write_response(int fd, FuseRequestOutHeader *out_hdr)
1365 size_t to_write = out_hdr->common.len;
1368 /* Must at least write fuse_out_header */
1369 assert(to_write >= sizeof(out_hdr->common));
1371 ret = RETRY_ON_EINTR(write(fd, out_hdr, to_write));
1374 error_report("Failed to write to FUSE device: %s", strerror(-ret));
1378 /* Short writes are unexpected, treat them as errors */
1379 if (ret != to_write) {
1380 error_report("Short write to FUSE device, wrote %zi of %zu bytes",
1389 * Write a FUSE error response to @fd.
1391 * @fd: FUSE file descriptor
1392 * @in_hdr: Incoming request header to which to respond
1393 * @err: Error code (-errno, must be negative!)
/*
 * Builds a response that consists of the common header only (len is
 * sizeof(out_hdr.common)), copies in_hdr->unique into it, and passes it to
 * fuse_write_response().  The result of fuse_write_response() is returned
 * unchanged.
 *
 * NOTE(review): the initializer line that stores @err in the header is not
 * visible in this listing.
 */
1395 static int fuse_write_err(int fd, const struct fuse_in_header *in_hdr, int err)
1397 FuseRequestOutHeader out_hdr = {
/* No payload follows the header in an error reply. */
1399 .len = sizeof(out_hdr.common),
1400 /* FUSE expects negative error values */
/* Same identifier as in the request header being answered. */
1402 .unique = in_hdr->unique,
1406 return fuse_write_response(fd, &out_hdr);
1410 * Write a FUSE response to the given @fd, using separate buffers for the
1411 * response header and data.
1413 * In contrast to fuse_write_response(), this function cannot return a full
1414 * FuseRequestOutHeader (i.e. including request-specific response structs),
1415 * but only FuseRequestOutHeader.common. The remaining data must be in *buf.
1418 * (Total length must be set in out_hdr->len.)
1420 * @fd: FUSE file descriptor
1421 * @out_hdr: Request response header
1422 * @buf: Pointer to response data
/*
 * Sends a response whose header and payload live in two separate buffers,
 * using one writev() call (wrapped in RETRY_ON_EINTR) over a two-element
 * iovec: the header first, then out_hdr->len - sizeof(*out_hdr) bytes of
 * payload.  Write failures and short writes are reported through
 * error_report().
 *
 * NOTE(review): the remaining parameter line and the return statements are
 * not visible in this listing.
 */
1424 static int fuse_write_buf_response(int fd,
1425 const struct fuse_out_header *out_hdr,
/* Total response size, i.e. header plus payload. */
1428 size_t to_write = out_hdr->len;
1429 struct iovec iov[2] = {
/* Casts drop const because iovec.iov_base is a plain void pointer. */
1430 { (void *)out_hdr, sizeof(*out_hdr) },
/*
 * This size_t subtraction is evaluated before the assert below; it would
 * wrap around if out_hdr->len were smaller than the header, which the
 * assert rules out before writev() is reached.
 */
1431 { (void *)buf, to_write - sizeof(*out_hdr) },
1435 /* *buf length must not be negative */
1436 assert(to_write >= sizeof(*out_hdr));
1438 ret = RETRY_ON_EINTR(writev(fd, iov, ARRAY_SIZE(iov)));
/* strerror(-ret) implies that ret holds a negated errno on this path. */
1441 error_report("Failed to write to FUSE device: %s", strerror(-ret));
1445 /* Short writes are unexpected, treat them as errors */
1446 if (ret != to_write) {
1447 error_report("Short write to FUSE device, wrote %zi of %zu bytes",
1456 * Process a FUSE request, incl. writing the response.
/*
 * Dispatches on in_hdr->common.opcode, lets the matching fuse_co_*()
 * handler fill the opcode-specific part of out_hdr (or out_data_buffer for
 * reads), then builds the common response header and writes the response
 * to q->fuse_fd.
 *
 * @q: provides the export (q->exp) and the FUSE file descriptor
 *     (q->fuse_fd)
 * @in_hdr: request header together with the opcode-specific input struct
 * @data_buffer: passed to fuse_co_write() as the data of a write request
 *
 * NOTE(review): most case labels, the break statements and the branch that
 * selects between the success header and the error header are not visible
 * in this listing; the comments below describe the visible lines only.
 */
1458 static void coroutine_fn
1459 fuse_co_process_request(FuseQueue *q, const FuseRequestInHeader *in_hdr,
1460 const void *data_buffer)
1462 FuseRequestOutHeader out_hdr;
1463 FuseExport *exp = q->exp;
1464 /* For read requests: Data to be returned */
1465 void *out_data_buffer = NULL;
/*
 * fuse_co_lseek() is declared GRAPH_RDLOCK; this guard presumably provides
 * the lock that such callees require for the rest of the function.
 */
1468 GRAPH_RDLOCK_GUARD();
1470 switch (in_hdr->common.opcode) {
1472 ret = fuse_co_init(exp, &out_hdr.init, &in_hdr->init);
1480 ret = fuse_co_statfs(exp, &out_hdr.statfs);
1484 ret = fuse_co_open(exp, &out_hdr.open);
1492 ret = -ENOENT; /* There is no node but the root node */
1496 case FUSE_BATCH_FORGET:
1497 /* These have no response, and there is nothing we need to do */
1501 ret = fuse_co_getattr(exp, &out_hdr.attr);
1504 case FUSE_SETATTR: {
1505 const struct fuse_setattr_in *in = &in_hdr->setattr;
1506 ret = fuse_co_setattr(exp, &out_hdr.attr,
1507 in->valid, in->size, in->mode, in->uid, in->gid);
1512 const struct fuse_read_in *in = &in_hdr->read;
/* The handler receives the address of out_data_buffer to hand back data. */
1513 ret = fuse_co_read(exp, &out_data_buffer, in->offset, in->size);
1518 const struct fuse_write_in *in = &in_hdr->write;
1519 uint32_t req_len = in_hdr->common.len;
/*
 * Length check on the request.  The last operand of the comparison is not
 * visible in this listing; the warning text suggests it is the payload
 * size announced by the request (TODO confirm).
 */
1521 if (unlikely(req_len < sizeof(in_hdr->common) + sizeof(*in) +
1523 warn_report("FUSE WRITE truncated; received %zu bytes of %" PRIu32,
1524 req_len - sizeof(in_hdr->common) - sizeof(*in),
1531 * co_read_from_fuse_fd() has checked that in_hdr->len matches the
1532 * number of bytes read, which cannot exceed the max_write value we set
1533 * (FUSE_MAX_WRITE_BYTES). So we know that FUSE_MAX_WRITE_BYTES >=
1534 * in_hdr->len >= in->size + X, so this assertion must hold.
1536 assert(in->size <= FUSE_MAX_WRITE_BYTES);
1538 ret = fuse_co_write(exp, &out_hdr.write,
1539 in->offset, in->size, data_buffer);
1543 case FUSE_FALLOCATE: {
1544 const struct fuse_fallocate_in *in = &in_hdr->fallocate;
1545 ret = fuse_co_fallocate(exp, in->offset, in->length, in->mode);
1550 ret = fuse_co_fsync(exp);
1554 ret = fuse_co_flush(exp);
/* The lseek handler is only compiled in when CONFIG_FUSE_LSEEK is defined. */
1557 #ifdef CONFIG_FUSE_LSEEK
1559 const struct fuse_lseek_in *in = &in_hdr->lseek;
1560 ret = fuse_co_lseek(exp, &out_hdr.lseek, in->offset, in->whence);
/*
 * Header for the path on which ret is used as the number of payload bytes
 * that follow the common header.
 */
1570 out_hdr.common = (struct fuse_out_header) {
1571 .len = sizeof(out_hdr.common) + ret,
1572 .unique = in_hdr->common.unique,
1575 /* fuse_co_read() must not return a buffer in case of error */
1576 assert(out_data_buffer == NULL);
/* Error header: len covers the common header only, no payload follows. */
1578 out_hdr.common = (struct fuse_out_header) {
1579 .len = sizeof(out_hdr.common),
1580 /* FUSE expects negative errno values */
1582 .unique = in_hdr->common.unique,
/*
 * With a separate data buffer, header and data are written from two
 * buffers and the data buffer is released afterwards.  The results of both
 * write helpers are not used here; the helpers call error_report()
 * themselves on failure.
 */
1586 if (out_data_buffer) {
1587 fuse_write_buf_response(q->fuse_fd, &out_hdr.common, out_data_buffer);
1588 qemu_vfree(out_data_buffer);
/* Presumably the branch without a data buffer: out_hdr is sent as one block. */
1590 fuse_write_response(q->fuse_fd, &out_hdr);
/*
 * Export driver description for BLOCK_EXPORT_TYPE_FUSE: per-export state
 * has the size of FuseExport, and creation, deletion and shutdown requests
 * are routed to the fuse_export_*() functions named below.
 */
1594 const BlockExportDriver blk_exp_fuse = {
1595 .type = BLOCK_EXPORT_TYPE_FUSE,
1596 .instance_size = sizeof(FuseExport),
1597 .create = fuse_export_create,
1598 .delete = fuse_export_delete,
1599 .request_shutdown = fuse_export_shutdown,