git.ipfire.org Git - thirdparty/linux.git/commitdiff
exfat: add iomap direct I/O support
author Namjae Jeon <linkinjeon@kernel.org>
Sat, 23 May 2026 04:56:43 +0000 (13:56 +0900)
committer Namjae Jeon <linkinjeon@kernel.org>
Mon, 15 Jun 2026 11:00:40 +0000 (20:00 +0900)
Add iomap-based direct I/O support to the exfat filesystem. This replaces
the previous exfat_direct_IO() implementation, which used
blockdev_direct_IO(), with one built on the iomap_dio_rw() interface.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Christoph Hellwig <hch@lst.de>
Acked-by: "Darrick J. Wong" <djwong@kernel.org>
Signed-off-by: Namjae Jeon <linkinjeon@kernel.org>
fs/exfat/Kconfig
fs/exfat/exfat_fs.h
fs/exfat/file.c
fs/exfat/inode.c
fs/exfat/iomap.c
fs/exfat/iomap.h

index e0b200902253983a94015d604337a4f16f387c2d..1fcb10c8d7bc9afacc5f9b8c711a1e580d77b8e9 100644 (file)
@@ -4,7 +4,6 @@ config EXFAT_FS
        tristate "exFAT filesystem support"
        select BUFFER_HEAD
        select NLS
-       select LEGACY_DIRECT_IO
        select FS_IOMAP
        help
          This allows you to mount devices formatted with the exFAT file system.
index 5f36c6892c8a551f7b4b8e30f17ab9b32fb66e14..2607e51804b25f90d865e48726dcdca8ccb9d028 100644 (file)
@@ -557,7 +557,6 @@ int exfat_trim_fs(struct inode *inode, struct fstrim_range *range);
 /* file.c */
 extern const struct file_operations exfat_file_operations;
 int __exfat_truncate(struct inode *inode);
-void exfat_truncate(struct inode *inode);
 int exfat_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                  struct iattr *attr);
 int exfat_getattr(struct mnt_idmap *idmap, const struct path *path,
index 5b667077865d4e0c61a4056f7cca95686a5ca90c..9cd34149a1886bac92936e519c6349936f77e172 100644 (file)
@@ -293,7 +293,7 @@ int __exfat_truncate(struct inode *inode)
        return 0;
 }
 
-void exfat_truncate(struct inode *inode)
+static void exfat_truncate(struct inode *inode)
 {
        struct super_block *sb = inode->i_sb;
        struct exfat_sb_info *sbi = EXFAT_SB(sb);
@@ -672,6 +672,56 @@ static int exfat_extend_valid_size(struct inode *inode, loff_t new_valid_size)
        return ret;
 }
 
+/*
+ * exfat_fallback_buffered_write - complete a direct write via the page cache
+ * @iocb: kiocb of the write; IOCB_DIRECT is cleared on it
+ * @from: data still to be written
+ *
+ * Writes @from with iomap_file_buffered_write(), then writes the affected
+ * byte range back and invalidates the page cache pages covering it.
+ *
+ * Return: number of bytes written, the negative error of the buffered
+ * write, or -EIO if writing the range back failed.
+ */
+static ssize_t exfat_fallback_buffered_write(struct kiocb *iocb,
+               struct iov_iter *from)
+{
+       loff_t offset = iocb->ki_pos, end;
+       ssize_t written;
+       int ret;
+
+       iocb->ki_flags &= ~IOCB_DIRECT;
+
+       written = iomap_file_buffered_write(iocb, from, &exfat_write_iomap_ops,
+                       NULL, NULL);
+       if (written < 0)
+               return written;
+
+       /*
+        * iomap_file_buffered_write() has already advanced iocb->ki_pos past
+        * the data it wrote, so the last byte must be derived from the saved
+        * start offset; ki_pos + written would stretch the flushed and
+        * invalidated range by another @written bytes.
+        */
+       end = offset + written - 1;
+       ret = filemap_write_and_wait_range(iocb->ki_filp->f_mapping,
+                       offset, end);
+       if (ret)
+               return -EIO;
+
+       invalidate_mapping_pages(iocb->ki_filp->f_mapping,
+                       offset >> PAGE_SHIFT,
+                       end >> PAGE_SHIFT);
+
+       return written;
+}
+
+/*
+ * exfat_dio_write_iter - direct I/O write with a buffered fallback
+ * @iocb: kiocb of the write
+ * @from: data to write
+ *
+ * Issues the write with iomap_dio_rw().  -ENOTBLK counts as "nothing
+ * written"; any other error is returned unchanged.  Whatever is still left
+ * in @from afterwards is handed to exfat_fallback_buffered_write().
+ *
+ * Return: bytes written by both paths together, or a negative error.
+ */
+static ssize_t exfat_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+       ssize_t direct, buffered;
+
+       direct = iomap_dio_rw(iocb, from, &exfat_write_iomap_ops,
+                       &exfat_write_dio_ops, 0, NULL, 0);
+       if (direct == -ENOTBLK)
+               direct = 0;
+       if (direct < 0)
+               return direct;
+
+       /* Everything went out as direct I/O, no fallback needed. */
+       if (!iov_iter_count(from))
+               return direct;
+
+       buffered = exfat_fallback_buffered_write(iocb, from);
+       if (buffered < 0)
+               return buffered;
+
+       return direct + buffered;
+}
+
 static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
 {
        ssize_t ret;
@@ -696,16 +746,6 @@ static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
        if (ret <= 0)
                goto unlock;
 
-       if (iocb->ki_flags & IOCB_DIRECT) {
-               unsigned long align = pos | iov_iter_alignment(iter);
-
-               if (!IS_ALIGNED(align, i_blocksize(inode)) &&
-                   !IS_ALIGNED(align, bdev_logical_block_size(inode->i_sb->s_bdev))) {
-                       ret = -EINVAL;
-                       goto unlock;
-               }
-       }
-
        err = file_modified(iocb->ki_filp);
        if (err) {
                ret = err;
@@ -724,7 +764,7 @@ static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
        }
 
        if (iocb->ki_flags & IOCB_DIRECT)
-               ret = __generic_file_write_iter(iocb, iter);
+               ret = exfat_dio_write_iter(iocb, iter);
        else
                ret = iomap_file_buffered_write(iocb, iter,
                                &exfat_write_iomap_ops, NULL, NULL);
@@ -754,11 +794,24 @@ unlock:
 static ssize_t exfat_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 {
        struct inode *inode = file_inode(iocb->ki_filp);
+       ssize_t ret;
 
        if (unlikely(exfat_forced_shutdown(inode->i_sb)))
                return -EIO;
 
-       return generic_file_read_iter(iocb, iter);
+       inode_lock_shared(inode);
+
+       if (iocb->ki_flags & IOCB_DIRECT) {
+               file_accessed(iocb->ki_filp);
+               ret = iomap_dio_rw(iocb, iter, &exfat_iomap_ops, NULL, 0,
+                               NULL, 0);
+       } else {
+               ret = generic_file_read_iter(iocb, iter);
+       }
+
+       inode_unlock_shared(inode);
+
+       return ret;
 }
 
 static vm_fault_t exfat_page_mkwrite(struct vm_fault *vmf)
@@ -859,10 +912,18 @@ static ssize_t exfat_splice_read(struct file *in, loff_t *ppos,
 
 static int exfat_file_open(struct inode *inode, struct file *filp)
 {
+       int err;
+
        if (unlikely(exfat_forced_shutdown(inode->i_sb)))
                return -EIO;
 
-       return generic_file_open(inode, filp);
+       err = generic_file_open(inode, filp);
+       if (err)
+               return err;
+
+       filp->f_mode |= FMODE_CAN_ODIRECT;
+
+       return 0;
 }
 
 const struct file_operations exfat_file_operations = {
index 96ea243c67dbcd4c87c50388b8dd6559ae6e7f16..8e8d94319c3c2a0bb68514c0d0a311233e6df6ea 100644 (file)
@@ -72,14 +72,27 @@ int __exfat_write_inode(struct inode *inode, int sync)
                             &ep->dentry.file.access_date,
                             NULL);
 
-       /* File size should be zero if there is no cluster allocated */
-       on_disk_size = i_size_read(inode);
+       /*
+        * During a DIO write, valid_size is updated eagerly in iomap_end (so
+        * that concurrent buffered reads see IOMAP_MAPPED) while i_size is
+        * updated asynchronously in end_io.  The FAT chain was already
+        * extended to cover ceil(valid_size/cluster_size) clusters.  Use the
+        * maximum so the on-disk size field always covers the FAT chain,
+        * preventing fsck from reporting "more clusters are allocated".
+        */
+       on_disk_size = max_t(unsigned long long, i_size_read(inode),
+                       ei->valid_size);
 
        if (ei->start_clu == EXFAT_EOF_CLUSTER)
                on_disk_size = 0;
-       /* valid_size must not exceed size in the on-disk stream entry. */
+       /*
+        * valid_size on disk must reflect only confirmed data (up to i_size)
+        * and must not exceed on_disk_size.
+        */
        on_disk_valid_size = min_t(unsigned long long, ei->valid_size,
-                       on_disk_size);
+                       i_size_read(inode));
+       if (ei->start_clu == EXFAT_EOF_CLUSTER)
+               on_disk_valid_size = 0;
 
        ep2->dentry.stream.size = cpu_to_le64(on_disk_size);
        ep2->dentry.stream.valid_size = cpu_to_le64(on_disk_valid_size);
@@ -228,151 +241,6 @@ int exfat_map_cluster(struct inode *inode, unsigned int clu_offset,
        return 0;
 }
 
-static int exfat_get_block(struct inode *inode, sector_t iblock,
-               struct buffer_head *bh_result, int create)
-{
-       struct exfat_inode_info *ei = EXFAT_I(inode);
-       struct super_block *sb = inode->i_sb;
-       struct exfat_sb_info *sbi = EXFAT_SB(sb);
-       unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
-       int err = 0;
-       unsigned long mapped_blocks = 0;
-       unsigned int cluster, sec_offset, count;
-       sector_t last_block;
-       sector_t phys = 0;
-       sector_t valid_blks;
-       loff_t i_size;
-
-       mutex_lock(&sbi->s_lock);
-       i_size = i_size_read(inode);
-       last_block = exfat_bytes_to_block_round_up(sb, i_size);
-       if (iblock >= last_block && !create)
-               goto done;
-
-       /* Is this block already allocated? */
-       count = exfat_bytes_to_cluster_round_up(sbi, bh_result->b_size);
-       err = exfat_map_cluster(inode, iblock >> sbi->sect_per_clus_bits,
-                       &cluster, &count, create, NULL);
-       if (err) {
-               if (err != -ENOSPC)
-                       exfat_fs_error_ratelimit(sb,
-                               "failed to bmap (inode : %p iblock : %llu, err : %d)",
-                               inode, (unsigned long long)iblock, err);
-               goto unlock_ret;
-       }
-
-       if (cluster == EXFAT_EOF_CLUSTER)
-               goto done;
-
-       /* sector offset in cluster */
-       sec_offset = iblock & (sbi->sect_per_clus - 1);
-
-       phys = exfat_cluster_to_sector(sbi, cluster) + sec_offset;
-       mapped_blocks = ((unsigned long)count << sbi->sect_per_clus_bits) - sec_offset;
-       max_blocks = min(mapped_blocks, max_blocks);
-
-       map_bh(bh_result, sb, phys);
-       if (buffer_delay(bh_result))
-               clear_buffer_delay(bh_result);
-
-       /*
-        * In most cases, we just need to set bh_result to mapped, unmapped
-        * or new status as follows:
-        *  1. i_size == valid_size
-        *  2. write case (create == 1)
-        *  3. direct_read (!bh_result->b_folio)
-        *     -> the unwritten part will be zeroed in exfat_direct_IO()
-        *
-        * Otherwise, in the case of buffered read, it is necessary to take
-        * care the last nested block if valid_size is not equal to i_size.
-        */
-       if (i_size == ei->valid_size || create || !bh_result->b_folio)
-               valid_blks = exfat_bytes_to_block_round_up(sb, ei->valid_size);
-       else
-               valid_blks = exfat_bytes_to_block(sb, ei->valid_size);
-
-       /* The range has been fully written, map it */
-       if (iblock + max_blocks < valid_blks)
-               goto done;
-
-       /* The range has been partially written, map the written part */
-       if (iblock < valid_blks) {
-               max_blocks = valid_blks - iblock;
-               goto done;
-       }
-
-       /* The area has not been written, map and mark as new for create case */
-       if (create) {
-               set_buffer_new(bh_result);
-               ei->valid_size = exfat_block_to_bytes(sb, iblock + max_blocks);
-               mark_inode_dirty(inode);
-               goto done;
-       }
-
-       /*
-        * The area has just one block partially written.
-        * In that case, we should read and fill the unwritten part of
-        * a block with zero.
-        */
-       if (bh_result->b_folio && iblock == valid_blks &&
-           (ei->valid_size & (sb->s_blocksize - 1))) {
-               loff_t size, pos;
-               void *addr;
-
-               max_blocks = 1;
-
-               /*
-                * No buffer_head is allocated.
-                * (1) bmap: It's enough to set blocknr without I/O.
-                * (2) read: The unwritten part should be filled with zero.
-                *           If a folio does not have any buffers,
-                *           let's returns -EAGAIN to fallback to
-                *           block_read_full_folio() for per-bh IO.
-                */
-               if (!folio_buffers(bh_result->b_folio)) {
-                       err = -EAGAIN;
-                       goto done;
-               }
-
-               pos = exfat_block_to_bytes(sb, iblock);
-               size = ei->valid_size - pos;
-               addr = folio_address(bh_result->b_folio) +
-                       offset_in_folio(bh_result->b_folio, pos);
-
-               /* Check if bh->b_data points to proper addr in folio */
-               if (bh_result->b_data != addr) {
-                       exfat_fs_error_ratelimit(sb,
-                                       "b_data(%p) != folio_addr(%p)",
-                                       bh_result->b_data, addr);
-                       err = -EINVAL;
-                       goto done;
-               }
-
-               /* Read a block */
-               err = bh_read(bh_result, 0);
-               if (err < 0)
-                       goto done;
-
-               /* Zero unwritten part of a block */
-               memset(bh_result->b_data + size, 0, bh_result->b_size - size);
-               err = 0;
-               goto done;
-       }
-
-       /*
-        * The area has not been written, clear mapped for read/bmap cases.
-        * If so, it will be filled with zero without reading from disk.
-        */
-       clear_buffer_mapped(bh_result);
-done:
-       bh_result->b_size = exfat_block_to_bytes(sb, max_blocks);
-       if (err < 0)
-               clear_buffer_mapped(bh_result);
-unlock_ret:
-       mutex_unlock(&sbi->s_lock);
-       return err;
-}
-
 static int exfat_read_folio(struct file *file, struct folio *folio)
 {
        struct iomap_read_folio_ctx ctx = {
@@ -419,60 +287,6 @@ static int exfat_writepages(struct address_space *mapping,
        return iomap_writepages(&wpc);
 }
 
-static void exfat_write_failed(struct address_space *mapping, loff_t to)
-{
-       struct inode *inode = mapping->host;
-
-       if (to > i_size_read(inode)) {
-               truncate_pagecache(inode, i_size_read(inode));
-               inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
-               exfat_truncate(inode);
-       }
-}
-
-static ssize_t exfat_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
-{
-       struct address_space *mapping = iocb->ki_filp->f_mapping;
-       struct inode *inode = mapping->host;
-       struct exfat_inode_info *ei = EXFAT_I(inode);
-       loff_t pos = iocb->ki_pos;
-       loff_t size = pos + iov_iter_count(iter);
-       int rw = iov_iter_rw(iter);
-       ssize_t ret;
-
-       /*
-        * Need to use the DIO_LOCKING for avoiding the race
-        * condition of exfat_get_block() and ->truncate().
-        */
-       ret = blockdev_direct_IO(iocb, inode, iter, exfat_get_block);
-       if (ret < 0) {
-               if (rw == WRITE && ret != -EIOCBQUEUED)
-                       exfat_write_failed(mapping, size);
-
-               return ret;
-       }
-
-       size = pos + ret;
-
-       if (rw == WRITE) {
-               /*
-                * If the block had been partially written before this write,
-                * ->valid_size will not be updated in exfat_get_block(),
-                * update it here.
-                */
-               if (ei->valid_size < size) {
-                       ei->valid_size = size;
-                       mark_inode_dirty(inode);
-               }
-       } else if (pos < ei->valid_size && ei->valid_size < size) {
-               /* zero the unwritten part in the partially written block */
-               iov_iter_revert(iter, size - ei->valid_size);
-               iov_iter_zero(size - ei->valid_size, iter);
-       }
-
-       return ret;
-}
-
 static sector_t exfat_aop_bmap(struct address_space *mapping, sector_t block)
 {
        sector_t blocknr;
@@ -495,7 +309,6 @@ static const struct address_space_operations exfat_aops = {
        .error_remove_folio     = generic_error_remove_folio,
        .release_folio          = iomap_release_folio,
        .invalidate_folio       = iomap_invalidate_folio,
-       .direct_IO              = exfat_direct_IO,
 };
 
 static inline unsigned long exfat_hash(loff_t i_pos)
index 188df8cfac9a68cc46f434c077d8189a8ece7abf..7ad94d5806d9b89ff410aa3aef1b6d7ba3bc3830 100644 (file)
 #include "exfat_fs.h"
 #include "iomap.h"
 
+/*
+ * exfat_file_write_dio_end_io - ->end_io hook for direct I/O writes
+ * @iocb:  kiocb of the write; ki_pos is taken as the start offset
+ * @size:  byte count reported for the I/O
+ * @error: I/O status; a non-zero value is returned unchanged
+ * @flags: not used
+ *
+ * Raises i_size to ki_pos + @size and marks the inode dirty when the write
+ * ended beyond the current i_size.
+ *
+ * Return: @error if it is non-zero, otherwise 0.
+ */
+static int exfat_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
+               int error, unsigned int flags)
+{
+       struct inode *inode = file_inode(iocb->ki_filp);
+       loff_t new_size;
+
+       if (error)
+               return error;
+
+       new_size = iocb->ki_pos + size;
+       /* Nothing to do unless the write grew the file. */
+       if (!size || new_size <= i_size_read(inode))
+               return 0;
+
+       i_size_write(inode, new_size);
+       mark_inode_dirty(inode);
+
+       return 0;
+}
+
+/* Direct I/O write callbacks; only the ->end_io completion hook is set. */
+const struct iomap_dio_ops exfat_write_dio_ops = {
+       .end_io = exfat_file_write_dio_end_io,
+};
+
 static int __exfat_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
                unsigned int flags, struct iomap *iomap, bool may_alloc)
 {
index 7f8dcbe20a174a5bb479b83c6449ea612c31732b..830388f386f4efdf591956c10a55b0c4ed83319f 100644 (file)
@@ -6,6 +6,7 @@
 #ifndef _LINUX_EXFAT_IOMAP_H
 #define _LINUX_EXFAT_IOMAP_H
 
+extern const struct iomap_dio_ops exfat_write_dio_ops;
 extern const struct iomap_ops exfat_iomap_ops;
 extern const struct iomap_ops exfat_write_iomap_ops;
 extern const struct iomap_writeback_ops exfat_writeback_ops;